You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
94 lines
2.7 KiB
Python
94 lines
2.7 KiB
Python
from Alphabet import Alphabet
|
|
|
|
|
|
class Detector:
    """Classify the characters of a text by the alphabets they appear in.

    Alphabets are registered with :meth:`add_alphabet`. :meth:`match_text`
    then reports, for every registered alphabet, which characters of a text
    were found in it, and :meth:`pretty_result` renders that report as text.
    """

    def __init__(self):
        # An empty list (rather than None) lets match_text() run even when
        # no alphabet has been registered yet.
        self._alphs = []

    def add_alphabet(self, alph: "Alphabet"):
        """Register an alphabet to be considered by :meth:`match_text`.

        :param alph: object providing ``get_name()`` and ``get_data()``;
            every entry of ``get_data()`` is read through ``entry['Char']``.
        """
        self._alphs.append(alph)

    def match_text(self, text: str) -> dict:
        """
        Determine which characters of a text belong to the registered alphabets.

        Newline characters are skipped while matching, but they still count
        towards ``len(text)``, which is the denominator of the ratios. A
        character found in several alphabets is recorded under each of them;
        a character found in none is recorded under ``'Unknown'``.

        :param text: the text to analyse
        :return: dict mapping every alphabet name, plus ``'Unknown'``, to
            ``[matches, ratio]`` where ``matches`` lists the matched
            characters in text order and ``ratio`` is
            ``len(matches) / len(text)`` (``0.0`` for an empty text)
        """
        # Build one lookup set per alphabet up front: membership tests are
        # then O(1) per character instead of a scan over the whole alphabet,
        # and a character listed twice in an alphabet is counted only once.
        lookups = [
            (alph.get_name(), {entry['Char'] for entry in alph.get_data()})
            for alph in self._alphs
        ]

        results = {name: [[]] for name, _ in lookups}
        results['Unknown'] = [[]]

        for c in text:
            if c == "\n":  # line breaks are not classified at all
                continue
            found = False
            for name, chars in lookups:
                if c in chars:
                    results[name][0].append(c)
                    found = True
            if not found:
                results['Unknown'][0].append(c)

        # Create statistics; guard against division by zero on empty input.
        total = len(text)
        for entry in results.values():
            entry.append(len(entry[0]) / total if total else 0.0)
        return results

    @staticmethod
    def pretty_result(result: dict, small: bool=False):
        """Render a :meth:`match_text` result as aligned text, one line per key.

        The ``result`` argument is not modified, so the same result can be
        rendered repeatedly with identical output.

        :param result: dict mapping a name to ``[matches, ratio]``
        :param small: when true, print only the name and the percentage;
            otherwise also print the matched characters
        :return: the formatted report (an empty string for an empty result)
        """
        if not result:
            return ""

        # Names are padded to the longest "name: " plus one extra space, and
        # match columns to the longest match list plus one extra space.
        name_width = max(len(key) for key in result) + 3
        match_width = max(len(entry[0]) for entry in result.values()) + 1

        lines = []
        for key, entry in result.items():
            name = (key + ": ").ljust(name_width)
            percentage = round(entry[1] * 100, 2)
            if small:
                lines.append(name + str(percentage) + "\n")
                continue

            match_list = entry[0]
            # Pad with one space per missing element instead of appending
            # filler items to the caller's list.
            matches = "".join(match_list) + " " * (match_width - len(match_list))
            lines.append(
                name + "Matches: " + matches
                + " MatchPercent: " + str(percentage) + "\n"
            )

        return "".join(lines)
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo: load the bundled alphabets, report their sizes, then classify
    # the characters of the sample text.
    alphabets = [
        Alphabet("Kanji (small)", "kanji.csv"),
        Alphabet("Kanji (big)", "kanji2.csv"),
        Alphabet("Hiragana", "hiragana.csv"),
        Alphabet("Katakana", "katakana.csv"),
    ]
    for alphabet in alphabets:
        print(alphabet.get_name(), len(alphabet.get_data()))
    print()

    d = Detector()
    for alphabet in alphabets:
        d.add_alphabet(alphabet)

    # Read the sample in one go; the context manager guarantees the file
    # handle is closed, which the bare open() in a for-loop did not.
    with open("./sample-text.txt", 'r') as sample_file:
        text = sample_file.read()

    print("Matches: \n{0}".format(d.pretty_result(d.match_text(text), small=True)))
|