py/makecompresseddata.py: Make compression deterministic.
Error string compression is not deterministic in certain cases: it depends on the Python version (whether dicts are ordered by default or not) and probably also on the order in which files are passed to this script, leading to a difference in which words are included in the top 128 most common. The changes in this commit use OrderedDict to keep parsed lines in a known order and, when computing how many bytes are saved by a given word, use the word itself to break ties (which would otherwise be "random").
This commit is contained in:
parent
1b1ceb67b2
commit
388d419ba3
|
@ -51,9 +51,10 @@ def word_compression(error_strings):
|
||||||
topn[word] += 1
|
topn[word] += 1
|
||||||
|
|
||||||
# Order not just by frequency, but by expected saving. i.e. prefer a longer string that is used less frequently.
|
# Order not just by frequency, but by expected saving. i.e. prefer a longer string that is used less frequently.
|
||||||
|
# Use the word itself for ties so that compression is deterministic.
|
||||||
def bytes_saved(item):
|
def bytes_saved(item):
|
||||||
w, n = item
|
w, n = item
|
||||||
return -((len(w) + 1) * (n - 1))
|
return -((len(w) + 1) * (n - 1)), w
|
||||||
|
|
||||||
top128 = sorted(topn.items(), key=bytes_saved)[:128]
|
top128 = sorted(topn.items(), key=bytes_saved)[:128]
|
||||||
|
|
||||||
|
@ -143,7 +144,7 @@ def ngram_compression(error_strings):
|
||||||
|
|
||||||
|
|
||||||
def main(collected_path, fn):
|
def main(collected_path, fn):
|
||||||
error_strings = {}
|
error_strings = collections.OrderedDict()
|
||||||
max_uncompressed_len = 0
|
max_uncompressed_len = 0
|
||||||
num_uses = 0
|
num_uses = 0
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue