#!/usr/bin/env python3
"""Plan the order in which letters are introduced, one lesson at a time.

The plan starts with a set of three letters and then adds one letter per
lesson until all 26 letters are in use.  Word counts are looked up in a
binary database that holds one unsigned integer per letter-set bitmask.
"""
import argparse
import mmap
import json
from itertools import combinations

from letters_rare_first import from_lettercount_file
from letter2bitmask import Letter2Bitmask

# One bit per letter in every letter-set bitmask.
_LETTER_COUNT = 26
# The database is indexed by bitmask, so it has one entry per possible set.
_TABLE_ENTRIES = 2 ** _LETTER_COUNT
# Bytes per database entry (the mapping is viewed as unsigned ints below).
_ENTRY_BYTES = 4


def _find_best_index(count_and_mask, min_words):
    """Pick the (count, bitmask) pair to use for the next lesson.

    This is the (crude) way of choosing what to learn next.

    Args:
        count_and_mask: Non-empty list of ``(word_count, bitmask)`` pairs.
        min_words: Word count a pair needs in order to be considered "rich".

    Returns:
        One pair from ``count_and_mask``:

        * If at least one pair is rich (``word_count >= min_words``), the
          rich pair with the smallest bitmask.
        * Otherwise, among the pairs sharing the largest word count, the one
          with the smallest bitmask.

        Per the original author's note, rare letters are mapped to small
        bitmask values, so preferring small bitmasks presumably introduces
        rare letters early and gives them more practice.

    Raises:
        ValueError: If ``count_and_mask`` is empty.
    """
    if not count_and_mask:
        raise ValueError("no candidate letter sets to choose from")

    # A bitmask is "rich" if it allows enough new words.
    pool = [pair for pair in count_and_mask if min_words <= pair[0]]
    if not pool:
        # No rich bitmask: fall back to the ones that allow the most words.
        # Several bitmasks may tie on that count.
        largest_count = max(count for count, _ in count_and_mask)
        pool = [pair for pair in count_and_mask if pair[0] == largest_count]

    # Prefer the smallest bitmask; the count only breaks exact-bitmask ties.
    return min(pool, key=lambda pair: (pair[1], pair[0]))


def find(lettercount_file, min_words_new_letter, db_file):
    """Compute the lesson plan as a JSON-serialisable dict.

    Args:
        lettercount_file: Open lettercount file; it is handed to
            ``from_lettercount_file`` to build the letter/bit mapping.
        min_words_new_letter: Number of new words a candidate must unlock to
            be preferred (see ``_find_best_index``).
        db_file: Open binary database file.  Only ``fileno()`` is used; the
            first ``2 ** 26 * 4`` bytes are memory-mapped read-only.

    Returns:
        A dict keyed ``"lesson_01"``, ``"lesson_02"``, ...  The first lesson
        has the keys ``letters``, ``new_words`` and ``words_total``; every
        later lesson has ``new_letter``, ``old_letters``, ``new_words`` and
        ``words_total``.
    """
    l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))

    # The 26 letters, as a list of one-bit bitmasks.
    single_bits = [1 << i for i in range(_LETTER_COUNT)]
    all_bits = _TABLE_ENTRIES - 1

    # ACCESS_READ requests the same read-only mapping as prot=PROT_READ but
    # is also available on platforms that lack the PROT_* constants.
    with mmap.mmap(db_file.fileno(), _TABLE_ENTRIES * _ENTRY_BYTES,
                   access=mmap.ACCESS_READ) as mm:
        # NOTE(review): "I" is the platform's native unsigned int in native
        # byte order; this assumes it is 4 bytes wide and matches the way
        # the database was written -- confirm against the generator.
        with memoryview(mm).cast("I") as view:
            # First step: try all combinations of three letters (bits,
            # really) and simply brute-force which combination to start with.
            starter_masks = (a | b | c
                             for a, b, c in combinations(single_bits, 3))
            starters = [(view[mask], mask) for mask in starter_masks]
            count_so_far, bits_so_far = _find_best_index(
                starters, min_words_new_letter)

            result = {
                "lesson_01": {
                    "letters": l2b.chars(bits_so_far),
                    "new_words": count_so_far,
                    "words_total": count_so_far,
                }
            }

            # Now consecutively add one bit (letter) at a time.
            lesson_number = 2
            while bits_so_far != all_bits:
                # For every letter not yet learned: how many additional
                # words does the database report once it is added?
                candidates = [
                    (view[bits_so_far | bit] - count_so_far, bit)
                    for bit in single_bits
                    if not bits_so_far & bit
                ]
                new_count, new_bit = _find_best_index(
                    candidates, min_words_new_letter)

                result[f"lesson_{lesson_number:02d}"] = {
                    "new_letter": l2b.chars(new_bit)[0],
                    "old_letters": l2b.chars(bits_so_far),
                    "new_words": new_count,
                    "words_total": count_so_far + new_count,
                }

                bits_so_far |= new_bit
                count_so_far += new_count
                lesson_number += 1

            return result


def main():
    """Parse the command line, compute the plan and print it as JSON."""
    parser = argparse.ArgumentParser(description="Find choices for initial three letters, given a binfile.")
    parser.add_argument("--binfile",
                        default="letterset2count.kmrw",
                        type=argparse.FileType(mode="rb", bufsize=0),
                        help="The database file used as a basis.")
    parser.add_argument("--lettercount",
                        default="lettercount.txt",
                        type=argparse.FileType("r", encoding="UTF8"),
                        help="The input lettercount file as generated by lettercount")
    parser.add_argument("--words",
                        default=120,
                        type=int,
                        help="Number of new words needed for a new letter to be learnable")
    args = parser.parse_args()

    # argparse opened both files; close them once the plan is computed.
    with args.lettercount, args.binfile:
        lessons = find(args.lettercount, args.words, args.binfile)
    print(json.dumps(lessons, indent=2))


if __name__ == "__main__":
    main()