from Alphabet import Alphabet


class Detector:
    """Classify the characters of a text by the alphabets that contain them."""

    def __init__(self):
        # Start with an empty list (not None) so that match_text works on a
        # detector that has had no alphabets registered yet.
        self._alphs = []

    def add_alphabet(self, alph: Alphabet):
        """
        Register an alphabet to be checked by match_text.

        :param alph: alphabet object; match_text calls its get_name() and
                     get_data() methods
        """
        self._alphs.append(alph)

    def match_text(self, text: str) -> dict:
        """
        Determine if a given text uses some characters from any alphabet

        Every character of the text except "\\n" is compared against the
        'Char' field of each registered alphabet's entries.  A character can
        match several alphabets; characters matching none are collected
        under the 'Unknown' key.

        :param text: the text to analyse
        :return: dict mapping each alphabet name (plus 'Unknown') to a
                 two-element list ``[matches, ratio]`` where ``matches`` is
                 the list of matched characters in text order and ``ratio``
                 is ``len(matches) / len(text)`` (0.0 for an empty text)
        """
        results = {}
        # Build one lookup set per alphabet up front instead of rescanning
        # every entry for every character of the text.  This also means a
        # character is recorded once per alphabet even when the alphabet's
        # data lists it more than once.
        char_sets = []
        for alph in self._alphs:
            name = alph.get_name()
            results[name] = [[]]  # [matches], percentage
            char_sets.append(
                (name, {entry['Char'] for entry in alph.get_data()})
            )
        results['Unknown'] = [[]]

        for c in text:
            if c == "\n":  # newlines are not classified
                continue
            found = False
            for name, chars in char_sets:
                if c in chars:
                    results[name][0].append(c)
                    found = True
            if not found:
                results['Unknown'][0].append(c)

        # Create statistics.  The ratio is taken over the full text length
        # (skipped newlines included); an empty text yields 0.0 instead of a
        # ZeroDivisionError.
        total = len(text)
        for entry in results.values():
            entry.append(len(entry[0]) / float(total) if total else 0.0)
        return results

    @staticmethod
    def pretty_result(result: dict, small: bool=False):
        """
        Render a match_text result as aligned, newline-terminated lines.

        The passed-in result is not modified.

        :param result: dict of ``name -> [matches, ratio]`` as returned by
                       match_text
        :param small: when True, each line holds only the name and the
                      percentage; otherwise the matched characters are
                      included as well
        :return: the formatted string (empty if result is empty)
        """
        if not result:
            return ""

        longest_name = max(len(key) for key in result)
        longest_match = max(len(entry[0]) for entry in result.values())

        lines = []
        for key, entry in result.items():
            # "<name>: " padded so that every line's next column starts at
            # the same position (one space past the longest name).
            name = (key + ": ").ljust(longest_name + 3)
            percentage = round(entry[1] * 100, 2)
            if small:
                lines.append(name + str(percentage) + "\n")
                continue
            match_list = entry[0]
            # Pad with one space per missing match (plus one) on the
            # rendered string only, leaving the caller's list untouched.
            padding = " " * (longest_match + 1 - len(match_list))
            matches = "".join(match_list) + padding
            lines.append(name + "Matches: " + matches
                         + " MatchPercent: " + str(percentage) + "\n")
        return "".join(lines)


if __name__ == "__main__":
    kanji = Alphabet("Kanji (small)", "kanji.csv")
    kanji2 = Alphabet("Kanji (big)", "kanji2.csv")
    hiragana = Alphabet("Hiragana", "hiragana.csv")
    katakana = Alphabet("Katakana", "katakana.csv")
    alphabets = (kanji, kanji2, hiragana, katakana)

    # Show how many entries each alphabet provides.
    for alph in alphabets:
        print(alph.get_name(), len(alph.get_data()))
    print()

    d = Detector()
    for alph in alphabets:
        d.add_alphabet(alph)

    # Use a context manager so the file handle is closed after reading.
    with open("./sample-text.txt", 'r') as sample:
        text = sample.read()

    print("Matches: \n{0}".format(d.pretty_result(d.match_text(text), small=True)))