Unverified Commit aad33aab by Roman Lebedev Committed by GitHub

[Tooling] Rewrite generate_difference_report(). (#678)

My knowledge of Python is not great, so this is kinda horrible. Two things are fixed here:

1. If there were repetitions, for the RHS (i.e. the new value) we were always using the first repetition, which naturally results in incorrect change reports for the second and following repetitions. And what is even worse, it completely broke the U test. :(
2. Better support for differing repetition counts in the U test was missing. That is important if we are to be able to report 'iteration as repetition', since it is rather likely that the iteration counts will mismatch.

The rough idea of how this is implemented now (I think this is the right solution; see the sketch after the commit metadata below):

1. Get all benchmark names (in order) from the lhs benchmark.
2. While preserving the order, keep only the unique names.
3. Get all benchmark names (in order) from the rhs benchmark.
4. While preserving the order, keep only the unique names.
5. Intersect `2.` and `4.` to get the list of unique benchmark names that exist on both sides.
6. Group (partition) all the benchmarks with the same name:
   ```
   BM_FOO:
     [lhs]: BM_FOO/repetition0 BM_FOO/repetition1
     [rhs]: BM_FOO/repetition0 BM_FOO/repetition1 BM_FOO/repetition2 ...
   ```
   We also drop `time_unit` mismatches here. _(Whose bright idea was it to store arbitrarily scaled timers in JSON **?!**)_
7. Iterate over each partition:
   7.1. Conditionally, diff the overlapping repetitions (the repetition counts may differ).
   7.2. Conditionally, do the U test:
        7.2.1. Get **all** the values of the `"real_time"` field from the lhs benchmark.
        7.2.2. Get **all** the values of the `"cpu_time"` field from the lhs benchmark.
        7.2.3. Get **all** the values of the `"real_time"` field from the rhs benchmark.
        7.2.4. Get **all** the values of the `"cpu_time"` field from the rhs benchmark.
        NOTE: the repetition counts may differ, but we want *all* the values!
        7.2.5. Do the rest of the U test computation.
        7.2.6. Print the U test result.
8. ???
9. **PROFIT**!

Fixes #677
parent 439d6b1c
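To make the grouping and the unequal-repetition U test concrete, here is a minimal, self-contained sketch of the idea described in the commit message. It is illustrative only: the helper names `unique_names_in_order` and `group_by_name` and the sample `lhs`/`rhs` data are hypothetical stand-ins (the committed code uses `get_unique_benchmark_names`, `partition_benchmarks`, `extract_field`, and `print_utest`, shown in the diff below), it skips the `time_unit` filtering and colored output of the real script, and it assumes SciPy's `scipy.stats.mannwhitneyu` is available.

```python
# A minimal sketch (not the committed code) of the idea described above:
# group benchmark runs by name on both sides, then run the Mann-Whitney U
# test over *all* repetitions, even when the repetition counts differ.
from scipy.stats import mannwhitneyu  # assumed available, as in the real script


def unique_names_in_order(benchmarks):
    """Return benchmark names in first-seen order, without duplicates."""
    seen = set()
    names = []
    for b in benchmarks:
        if b['name'] not in seen:
            seen.add(b['name'])
            names.append(b['name'])
    return names


def group_by_name(lhs_benchmarks, rhs_benchmarks):
    """Pair up the runs that share a name on both sides (order preserved)."""
    common = [n for n in unique_names_in_order(lhs_benchmarks)
              if n in unique_names_in_order(rhs_benchmarks)]
    return [([b for b in lhs_benchmarks if b['name'] == name],
             [b for b in rhs_benchmarks if b['name'] == name])
            for name in common]


# Example with mismatched repetition counts: 2 runs on the left, 3 on the right.
lhs = [{'name': 'BM_FOO', 'real_time': 8.0},
       {'name': 'BM_FOO', 'real_time': 9.0}]
rhs = [{'name': 'BM_FOO', 'real_time': 7.0},
       {'name': 'BM_FOO', 'real_time': 7.5},
       {'name': 'BM_FOO', 'real_time': 6.9}]

for lhs_runs, rhs_runs in group_by_name(lhs, rhs):
    # The U test takes the full samples; it does not require equal sizes.
    lhs_times = [b['real_time'] for b in lhs_runs]
    rhs_times = [b['real_time'] for b in rhs_runs]
    result = mannwhitneyu(lhs_times, rhs_times, alternative='two-sided')
    print('{}: U Test, Repetitions: {} vs {}, p-value {:.4f}'.format(
        lhs_runs[0]['name'], len(lhs_times), len(rhs_times), result.pvalue))
```

The key point the sketch demonstrates is that `mannwhitneyu` accepts samples of different sizes, so all repetitions from both sides can be fed to the test even when their counts mismatch.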
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
"name": "BM_Two", "name": "BM_Two",
"iterations": 1000, "iterations": 1000,
"real_time": 8, "real_time": 8,
"cpu_time": 80, "cpu_time": 86,
"time_unit": "ns" "time_unit": "ns"
}, },
{ {
...@@ -38,12 +38,28 @@ ...@@ -38,12 +38,28 @@
"time_unit": "ns" "time_unit": "ns"
}, },
{ {
"name": "short",
"run_type": "aggregate",
"iterations": 1000,
"real_time": 8,
"cpu_time": 77,
"time_unit": "ns"
},
{
"name": "medium", "name": "medium",
"run_type": "iteration", "run_type": "iteration",
"iterations": 1000, "iterations": 1000,
"real_time": 8, "real_time": 8,
"cpu_time": 80, "cpu_time": 80,
"time_unit": "ns" "time_unit": "ns"
},
{
"name": "medium",
"run_type": "iteration",
"iterations": 1000,
"real_time": 9,
"cpu_time": 82,
"time_unit": "ns"
} }
] ]
} }
...@@ -26,15 +26,31 @@ ...@@ -26,15 +26,31 @@
"name": "BM_Two", "name": "BM_Two",
"iterations": 1000, "iterations": 1000,
"real_time": 7, "real_time": 7,
"cpu_time": 70, "cpu_time": 72,
"time_unit": "ns" "time_unit": "ns"
}, },
{ {
"name": "short", "name": "short",
"run_type": "aggregate", "run_type": "aggregate",
"iterations": 1000, "iterations": 1000,
"real_time": 8, "real_time": 7,
"cpu_time": 80, "cpu_time": 75,
"time_unit": "ns"
},
{
"name": "short",
"run_type": "aggregate",
"iterations": 762,
"real_time": 4.54,
"cpu_time": 66.6,
"time_unit": "ns"
},
{
"name": "short",
"run_type": "iteration",
"iterations": 1000,
"real_time": 800,
"cpu_time": 1,
"time_unit": "ns" "time_unit": "ns"
}, },
{ {
......
...@@ -36,6 +36,7 @@ BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m') ...@@ -36,6 +36,7 @@ BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
UTEST_MIN_REPETITIONS = 2 UTEST_MIN_REPETITIONS = 2
UTEST_OPTIMAL_REPETITIONS = 9 # Lowest reasonable number, More is better. UTEST_OPTIMAL_REPETITIONS = 9 # Lowest reasonable number, More is better.
UTEST_COL_NAME = "_pvalue"
def color_format(use_color, fmt_str, *args, **kwargs): def color_format(use_color, fmt_str, *args, **kwargs):
...@@ -93,6 +94,99 @@ def filter_benchmark(json_orig, family, replacement=""): ...@@ -93,6 +94,99 @@ def filter_benchmark(json_orig, family, replacement=""):
return filtered return filtered
def get_unique_benchmark_names(json):
"""
While *keeping* the order, give all the unique 'names' used for benchmarks.
"""
seen = set()
uniqued = [x['name'] for x in json['benchmarks']
if x['name'] not in seen and
(seen.add(x['name']) or True)]
return uniqued
def intersect(list1, list2):
"""
Given two lists, get a new list consisting of the elements only contained
in *both of the input lists*, while preserving the ordering.
"""
return [x for x in list1 if x in list2]
def partition_benchmarks(json1, json2):
"""
While preserving the ordering, find benchmarks with the same names in
both of the inputs, and group them.
(i.e. partition/filter into groups with common name)
"""
json1_unique_names = get_unique_benchmark_names(json1)
json2_unique_names = get_unique_benchmark_names(json2)
names = intersect(json1_unique_names, json2_unique_names)
partitions = []
for name in names:
# Pick the time unit from the first entry of the lhs benchmark.
time_unit = (x['time_unit']
for x in json1['benchmarks'] if x['name'] == name).next()
# Filter by name and time unit.
lhs = [x for x in json1['benchmarks'] if x['name'] == name and
x['time_unit'] == time_unit]
rhs = [x for x in json2['benchmarks'] if x['name'] == name and
x['time_unit'] == time_unit]
partitions.append([lhs, rhs])
return partitions
def extract_field(partition, field_name):
# The count of elements may be different. We want *all* of them.
lhs = [x[field_name] for x in partition[0]]
rhs = [x[field_name] for x in partition[1]]
return [lhs, rhs]
def print_utest(partition, utest_alpha, first_col_width, use_color=True):
timings_time = extract_field(partition, 'real_time')
timings_cpu = extract_field(partition, 'cpu_time')
min_rep_cnt = min(len(timings_time[0]),
len(timings_time[1]),
len(timings_cpu[0]),
len(timings_cpu[1]))
# Does *everything* have at least UTEST_MIN_REPETITIONS repetitions?
if min_rep_cnt < UTEST_MIN_REPETITIONS:
return []
def get_utest_color(pval):
return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
time_pvalue = mannwhitneyu(
timings_time[0], timings_time[1], alternative='two-sided').pvalue
cpu_pvalue = mannwhitneyu(
timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue
dsc = "U Test, Repetitions: {} vs {}".format(
len(timings_cpu[0]), len(timings_cpu[1]))
dsc_color = BC_OKGREEN
if min_rep_cnt < UTEST_OPTIMAL_REPETITIONS:
dsc_color = BC_WARNING
dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
UTEST_OPTIMAL_REPETITIONS)
special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{} {}"
last_name = partition[0][0]['name']
return [color_format(use_color,
special_str,
BC_HEADER,
"{}{}".format(last_name, UTEST_COL_NAME),
first_col_width,
get_utest_color(time_pvalue), time_pvalue,
get_utest_color(cpu_pvalue), cpu_pvalue,
dsc_color, dsc,
endc=BC_ENDC)]
def generate_difference_report( def generate_difference_report(
json1, json1,
json2, json2,
...@@ -113,71 +207,29 @@ def generate_difference_report( ...@@ -113,71 +207,29 @@ def generate_difference_report(
return b return b
return None return None
utest_col_name = "_pvalue"
first_col_width = max( first_col_width = max(
first_col_width, first_col_width,
len('Benchmark')) len('Benchmark'))
first_col_width += len(utest_col_name) first_col_width += len(UTEST_COL_NAME)
first_line = "{:<{}s}Time CPU Time Old Time New CPU Old CPU New".format( first_line = "{:<{}s}Time CPU Time Old Time New CPU Old CPU New".format(
'Benchmark', 12 + first_col_width) 'Benchmark', 12 + first_col_width)
output_strs = [first_line, '-' * len(first_line)] output_strs = [first_line, '-' * len(first_line)]
last_name = None partitions = partition_benchmarks(json1, json2)
timings_time = [[], []] for partition in partitions:
timings_cpu = [[], []] # Careful, we may have different repetition count.
for i in range(min(len(partition[0]), len(partition[1]))):
gen = (bn for bn in json1['benchmarks'] bn = partition[0][i]
if 'real_time' in bn and 'cpu_time' in bn) other_bench = partition[1][i]
for bn in gen:
fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{} {}"
if last_name is None:
last_name = bn['name']
if last_name != bn['name']:
if ((len(timings_time[0]) >= UTEST_MIN_REPETITIONS) and
(len(timings_time[1]) >= UTEST_MIN_REPETITIONS) and
(len(timings_cpu[0]) >= UTEST_MIN_REPETITIONS) and
(len(timings_cpu[1]) >= UTEST_MIN_REPETITIONS)):
if utest:
def get_utest_color(pval):
if pval >= utest_alpha:
return BC_FAIL
else:
return BC_OKGREEN
time_pvalue = mannwhitneyu(
timings_time[0], timings_time[1], alternative='two-sided').pvalue
cpu_pvalue = mannwhitneyu(
timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue
dsc = "U Test, Repetitions: {}".format(len(timings_cpu[0]))
dsc_color = BC_OKGREEN
if len(timings_cpu[0]) < UTEST_OPTIMAL_REPETITIONS:
dsc_color = BC_WARNING
dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
UTEST_OPTIMAL_REPETITIONS)
output_strs += [color_format(use_color,
special_str,
BC_HEADER,
"{}{}".format(last_name,
utest_col_name),
first_col_width,
get_utest_color(time_pvalue),
time_pvalue,
get_utest_color(cpu_pvalue),
cpu_pvalue,
dsc_color,
dsc,
endc=BC_ENDC)]
last_name = bn['name']
timings_time = [[], []]
timings_cpu = [[], []]
other_bench = find_test(bn['name']) # *If* we were asked to only display aggregates,
if not other_bench: # and if it is non-aggregate, then skip it.
if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench:
assert bn['run_type'] == other_bench['run_type']
if bn['run_type'] != 'aggregate':
continue continue
if bn['time_unit'] != other_bench['time_unit']: fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
continue
def get_color(res): def get_color(res):
if res > 0.05: if res > 0.05:
...@@ -187,20 +239,8 @@ def generate_difference_report( ...@@ -187,20 +239,8 @@ def generate_difference_report(
else: else:
return BC_CYAN return BC_CYAN
timings_time[0].append(bn['real_time']) tres = calculate_change(bn['real_time'], other_bench['real_time'])
timings_time[1].append(other_bench['real_time']) cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
timings_cpu[0].append(bn['cpu_time'])
timings_cpu[1].append(other_bench['cpu_time'])
# *After* recording this run for u test, *if* we were asked to only
# display aggregates, and if it is non-aggregate, then skip it.
if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench:
assert bn['run_type'] == other_bench['run_type']
if bn['run_type'] != 'aggregate':
continue;
tres = calculate_change(timings_time[0][-1], timings_time[1][-1])
cpures = calculate_change(timings_cpu[0][-1], timings_cpu[1][-1])
output_strs += [color_format(use_color, output_strs += [color_format(use_color,
fmt_str, fmt_str,
BC_HEADER, BC_HEADER,
...@@ -210,13 +250,22 @@ def generate_difference_report( ...@@ -210,13 +250,22 @@ def generate_difference_report(
tres, tres,
get_color(cpures), get_color(cpures),
cpures, cpures,
timings_time[0][-1], bn['real_time'],
timings_time[1][-1], other_bench['real_time'],
timings_cpu[0][-1], bn['cpu_time'],
timings_cpu[1][-1], other_bench['cpu_time'],
endc=BC_ENDC)] endc=BC_ENDC)]
# After processing the whole partition, if requested, do the U test.
if utest:
output_strs += print_utest(partition,
utest_alpha=utest_alpha,
first_col_width=first_col_width,
use_color=use_color)
return output_strs return output_strs
############################################################################### ###############################################################################
# Unit tests # Unit tests
...@@ -224,6 +273,33 @@ def generate_difference_report( ...@@ -224,6 +273,33 @@ def generate_difference_report(
import unittest import unittest
class TestGetUniqueBenchmarkNames(unittest.TestCase):
def load_results(self):
import json
testInputs = os.path.join(
os.path.dirname(
os.path.realpath(__file__)),
'Inputs')
testOutput = os.path.join(testInputs, 'test3_run0.json')
with open(testOutput, 'r') as f:
json = json.load(f)
return json
def test_basic(self):
expect_lines = [
'BM_One',
'BM_Two',
'short', # These two are not sorted
'medium', # These two are not sorted
]
json = self.load_results()
output_lines = get_unique_benchmark_names(json)
print("\n")
print("\n".join(output_lines))
self.assertEqual(len(output_lines), len(expect_lines))
for i in range(0, len(output_lines)):
self.assertEqual(expect_lines[i], output_lines[i])
class TestReportDifference(unittest.TestCase): class TestReportDifference(unittest.TestCase):
def load_results(self): def load_results(self):
import json import json
...@@ -267,7 +343,7 @@ class TestReportDifference(unittest.TestCase): ...@@ -267,7 +343,7 @@ class TestReportDifference(unittest.TestCase):
for i in range(0, len(output_lines)): for i in range(0, len(output_lines)):
parts = [x for x in output_lines[i].split(' ') if x] parts = [x for x in output_lines[i].split(' ') if x]
self.assertEqual(len(parts), 7) self.assertEqual(len(parts), 7)
self.assertEqual(parts, expect_lines[i]) self.assertEqual(expect_lines[i], parts)
class TestReportDifferenceBetweenFamilies(unittest.TestCase): class TestReportDifferenceBetweenFamilies(unittest.TestCase):
...@@ -301,7 +377,7 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase): ...@@ -301,7 +377,7 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
for i in range(0, len(output_lines)): for i in range(0, len(output_lines)):
parts = [x for x in output_lines[i].split(' ') if x] parts = [x for x in output_lines[i].split(' ') if x]
self.assertEqual(len(parts), 7) self.assertEqual(len(parts), 7)
self.assertEqual(parts, expect_lines[i]) self.assertEqual(expect_lines[i], parts)
class TestReportDifferenceWithUTest(unittest.TestCase): class TestReportDifferenceWithUTest(unittest.TestCase):
...@@ -324,13 +400,15 @@ class TestReportDifferenceWithUTest(unittest.TestCase): ...@@ -324,13 +400,15 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
expect_lines = [ expect_lines = [
['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
['BM_Two', '+0.2500', '+0.1125', '8', '10', '80', '89'], ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
['BM_Two_pvalue', ['BM_Two_pvalue',
'0.2207', '0.6985',
'0.6831', '0.6985',
'U', 'U',
'Test,', 'Test,',
'Repetitions:', 'Repetitions:',
'2',
'vs',
'2.', '2.',
'WARNING:', 'WARNING:',
'Results', 'Results',
...@@ -338,7 +416,23 @@ class TestReportDifferenceWithUTest(unittest.TestCase): ...@@ -338,7 +416,23 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
'9+', '9+',
'repetitions', 'repetitions',
'recommended.'], 'recommended.'],
['short', '+0.0000', '+0.0000', '8', '8', '80', '80'], ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
['short_pvalue',
'0.7671',
'0.1489',
'U',
'Test,',
'Repetitions:',
'2',
'vs',
'3.',
'WARNING:',
'Results',
'unreliable!',
'9+',
'repetitions',
'recommended.'],
['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'], ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
] ]
json1, json2 = self.load_results() json1, json2 = self.load_results()
...@@ -350,10 +444,11 @@ class TestReportDifferenceWithUTest(unittest.TestCase): ...@@ -350,10 +444,11 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
self.assertEqual(len(output_lines), len(expect_lines)) self.assertEqual(len(output_lines), len(expect_lines))
for i in range(0, len(output_lines)): for i in range(0, len(output_lines)):
parts = [x for x in output_lines[i].split(' ') if x] parts = [x for x in output_lines[i].split(' ') if x]
self.assertEqual(parts, expect_lines[i]) self.assertEqual(expect_lines[i], parts)
class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(unittest.TestCase): class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
unittest.TestCase):
def load_results(self): def load_results(self):
import json import json
testInputs = os.path.join( testInputs = os.path.join(
...@@ -373,13 +468,15 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(unittest.TestCa ...@@ -373,13 +468,15 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(unittest.TestCa
expect_lines = [ expect_lines = [
['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
['BM_Two', '+0.2500', '+0.1125', '8', '10', '80', '89'], ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
['BM_Two_pvalue', ['BM_Two_pvalue',
'0.2207', '0.6985',
'0.6831', '0.6985',
'U', 'U',
'Test,', 'Test,',
'Repetitions:', 'Repetitions:',
'2',
'vs',
'2.', '2.',
'WARNING:', 'WARNING:',
'Results', 'Results',
...@@ -387,7 +484,23 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(unittest.TestCa ...@@ -387,7 +484,23 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(unittest.TestCa
'9+', '9+',
'repetitions', 'repetitions',
'recommended.'], 'recommended.'],
['short', '+0.0000', '+0.0000', '8', '8', '80', '80'], ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
['short_pvalue',
'0.7671',
'0.1489',
'U',
'Test,',
'Repetitions:',
'2',
'vs',
'3.',
'WARNING:',
'Results',
'unreliable!',
'9+',
'repetitions',
'recommended.'],
] ]
json1, json2 = self.load_results() json1, json2 = self.load_results()
output_lines_with_header = generate_difference_report( output_lines_with_header = generate_difference_report(
...@@ -399,7 +512,7 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(unittest.TestCa ...@@ -399,7 +512,7 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(unittest.TestCa
self.assertEqual(len(output_lines), len(expect_lines)) self.assertEqual(len(output_lines), len(expect_lines))
for i in range(0, len(output_lines)): for i in range(0, len(output_lines)):
parts = [x for x in output_lines[i].split(' ') if x] parts = [x for x in output_lines[i].split(' ') if x]
self.assertEqual(parts, expect_lines[i]) self.assertEqual(expect_lines[i], parts)
if __name__ == '__main__': if __name__ == '__main__':
......