kopia lustrzana https://gitlab.com/4ham/koch-method-real-words
85 wiersze
3.0 KiB
Python
Executable File
85 wiersze
3.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import sys
|
|
import re
|
|
import subprocess
|
|
|
|
def raw_words(dic_file, aff_file, encoding):
    """Yield each line printed by hunspell's `unmunch`, lowercased.

    Runs ``unmunch dic_file aff_file`` and decodes its stdout and stderr
    using `encoding`.  The captured stderr text is echoed to our own stderr
    before the exit status is inspected, so on failure the diagnostics are
    shown first and ``subprocess.CalledProcessError`` is raised afterwards.

    The output is split on newlines as-is, so empty strings (e.g. from a
    trailing newline) are yielded too.
    """
    command = ["unmunch", dic_file, aff_file]
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=False,
        encoding=encoding,
    )
    # Capturing stderr and re-printing it (instead of letting it pass
    # through) provides transcoding from ISO-8859-1 to UTF-8 when needed.
    print(result.stderr, file=sys.stderr, flush=True)
    result.check_returncode()
    yield from map(str.lower, result.stdout.split("\n"))
|
|
|
|
def good_words(words):
    """Yield only the words made of two or more letters.

    A word is dropped when it is shorter than two characters or when it
    contains anything that is not a letter, such as a decimal digit, an
    underscore, whitespace or punctuation.
    """
    # `[^\W\d_]` reads "a word character that is neither a decimal digit nor
    # an underscore".  A bare `\w` would let digits and underscores through.
    # The raw string avoids the invalid-escape warning for `\W` and `\d`.
    letters = re.compile(r"[^\W\d_]{2,}")
    for word in words:
        if letters.fullmatch(word):
            yield word
|
|
|
|
def ascii_words(words):
    """Transliterate German umlauts and sharp s, then keep plain a-z words.

    Each word has 'ä', 'ö', 'ü' and 'ß' replaced by 'ae', 'oe', 'ue' and
    'ss'.  The result is yielded only if it consists entirely of the
    lowercase letters a-z and is not made up solely of the letters
    l, x, v and i (which is how roman numbers are recognised here).
    """
    umlaut_table = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})
    only_a_to_z = re.compile("[a-z]+")
    roman_number = re.compile("[lxvi]+")
    for original in words:
        candidate = original.translate(umlaut_table)
        if only_a_to_z.fullmatch(candidate) is None:
            # Still contains something outside a-z (or is empty).
            continue
        if roman_number.fullmatch(candidate) is not None:
            continue
        yield candidate
|
|
|
|
def short_ascii_words(words, max_word_length):
    """Yield only the words that are at most `max_word_length` characters long."""
    yield from (
        candidate for candidate in words
        if len(candidate) <= max_word_length
    )
|
|
|
|
def generate(outfile, language, max_word_length):
    """Write the sorted, de-duplicated word list to `outfile`, one word per line.

    `outfile` only needs a `write` method that accepts strings.

    `language` selects the hunspell dictionary under /usr/share/hunspell and
    the encoding used to read it; it can be de_DE, en_US, or en_GB (any other
    value raises KeyError).  Words longer than `max_word_length` are left out.
    """
    encodings = {
        "de_DE": "ISO-8859-1",
        "en_US": "UTF-8",
        "en_GB": "UTF-8",
    }
    encoding = encodings[language]

    dic_file = f"/usr/share/hunspell/{language}.dic"
    aff_file = f"/usr/share/hunspell/{language}.aff"

    # Lazy filter pipeline: expand -> transliterate -> drop junk -> cap length.
    expanded = raw_words(dic_file, aff_file, encoding)
    transliterated = ascii_words(expanded)
    multi_letter = good_words(transliterated)
    short_enough = short_ascii_words(multi_letter, max_word_length)

    unique_words = set(short_enough)
    for word in sorted(unique_words):
        outfile.write(f"{word}\n")
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options, then write the word list.
    parser = argparse.ArgumentParser(description="Fetch words from hunspell's unmunch (default German FraMi), filter out junk, and write list.")
    parser.add_argument("--lang", default="de_DE",
                        help="language to use",
                        choices=["de_DE", "en_GB", "en_US"])

    parser.add_argument("--out",
                        type=argparse.FileType("w", encoding="UTF-8"),
                        default="wordlist.txt",
                        help="The output file of one word per line to be written (in UTF-8 encoding).")
    parser.add_argument("--max-word-length",
                        type=int,
                        default=6,
                        help="Do not use words longer than this many characters.")
    args = parser.parse_args()

    # argparse.FileType has already opened the output file (the string
    # default is converted too).  The `with` block guarantees it is flushed
    # and closed even when word generation raises, instead of relying on
    # interpreter shutdown.
    with args.out as outfile:
        generate(outfile, args.lang, args.max_word_length)
|
|
|