From 388d419ba39b061923d2568814195e8bf73330d4 Mon Sep 17 00:00:00 2001
From: Damien George <damien.p.george@gmail.com>
Date: Sun, 19 Apr 2020 23:47:22 +1000
Subject: [PATCH] py/makecompresseddata.py: Make compression deterministic.

Error string compression is not deterministic in certain cases: it depends
on the Python version (whether dicts are ordered by default or not) and
probably also on the order in which files are passed to this script,
leading to a difference in which words are included in the top 128 most
common.

The changes in this commit use OrderedDict to keep parsed lines in a known
order and, when computing how many bytes are saved by a given word, use the
word itself to break ties (which would otherwise be resolved "randomly").
---
 py/makecompresseddata.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/py/makecompresseddata.py b/py/makecompresseddata.py
index 28223a6d9c..9603de8713 100644
--- a/py/makecompresseddata.py
+++ b/py/makecompresseddata.py
@@ -51,9 +51,10 @@ def word_compression(error_strings):
             topn[word] += 1
 
     # Order not just by frequency, but by expected saving. i.e. prefer a longer string that is used less frequently.
+    # Use the word itself for ties so that compression is deterministic.
     def bytes_saved(item):
         w, n = item
-        return -((len(w) + 1) * (n - 1))
+        return -((len(w) + 1) * (n - 1)), w
 
     top128 = sorted(topn.items(), key=bytes_saved)[:128]
 
@@ -143,7 +144,7 @@ def ngram_compression(error_strings):
 
 
 def main(collected_path, fn):
-    error_strings = {}
+    error_strings = collections.OrderedDict()
     max_uncompressed_len = 0
     num_uses = 0