
quiz-01-response-AnastasiaKravtsova #9


Open · wants to merge 9 commits into master
1,333 changes: 1,333 additions & 0 deletions 2018-komp-ling/practicals/segmentation/segmentation-response.md

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions 2018-komp-ling/practicals/segmentation/wiki_paragraphs.txt

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions 2018-komp-ling/practicals/tokenisation/maxmatch.py
@@ -0,0 +1,16 @@
def max_match(sentence, dictionary):
    """
    sentence   - a sentence to tokenise
    dictionary - a collection of unique words
    """
    # base case: nothing left to tokenise
    if not sentence:
        return []

    # try the longest prefix first, shrinking it one character at a time
    for i in range(len(sentence) - 1, -1, -1):
        first_word = sentence[0:i + 1]
        keeper = sentence[i + 1:]
        if first_word in dictionary:
            return [first_word] + max_match(keeper, dictionary)

    # no dictionary word matches: emit a single character and move on
    first_word = sentence[0]
    keeper = sentence[1:]

    return [first_word] + max_match(keeper, dictionary)
32 changes: 32 additions & 0 deletions 2018-komp-ling/practicals/tokenisation/tokenisation-response.md
@@ -0,0 +1,32 @@

# Tokenisation report
For this task the MaxMatch algorithm was implemented (you can find it in the same folder as this report).

As a prerequisite, a list of unique words has to be prepared.

The algorithm takes two inputs (a toy call is sketched after this list):
- a sentence to tokenise
- a list of unique words.
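
As an illustration, here is a hypothetical toy call in English; the dictionary below is made up, while the real runs used a word list built from the corpus:

```python
from maxmatch import max_match

# hypothetical toy dictionary (real runs use a corpus-derived word list)
dictionary = {'we', 'can', 'canon', 'only', 'see'}

print(max_match('wecanonlysee', dictionary))
# MaxMatch greedily takes the longest matching prefix, so it picks
# 'canon' over 'can' and returns ['we', 'canon', 'l', 'y', 'see']
# rather than the intended ['we', 'can', 'only', 'see'].
```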

### Examples
Original sentence:
> これに不快感を示す住民はいましたが,現在,表立って反対や抗議の声を挙げている住民はいないようです。

Original tokenisation:
> 'これ', 'に', '不快感', 'を', '示す', '住民', 'は', 'い', 'まし', 'た', 'が', ',', '現在', ',', '表立っ', 'て', '反対', 'や', '抗議', 'の', '声', 'を', '挙げ', 'て', 'いる', '住民', 'は', 'い', 'ない', 'よう', 'です', '。'

Maxmatch tokenisation:
> 'これ', 'に', '不快', '感', 'を', '示す', '住民', 'は', 'いま', 'し', 'たが', ',', '現在', ',', '表', '立っ', 'て', '反対', 'や', '抗議', 'の', '声', 'を', '挙げて', 'いる', '住民', 'は', 'い', 'ない', 'ようで', 'す', '。'

These are the tokenisation errors for this sentence:
- '不快' and '感' instead of '不快感' (*oversegmentation*)
- 'いま' and 'し' instead of 'い' and 'まし'
- 'たが' instead of 'た' and 'が'
- '表' and '立っ' instead of '表立っ'
- '挙げて' instead of '挙げ' and 'て'
- 'ようで' and 'す' instead of 'よう' and 'です'

### Evaluation

WER (Word Error Rate) was used as the evaluation metric.
The WER over the first 100 sentences of the test dataset was about 23%.
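
For reference, below is a minimal sketch of how WER can be computed for a pair of token sequences via Levenshtein (edit) distance. The function is illustrative, not the exact evaluation script behind the 23% figure above:

```python
def wer(reference, hypothesis):
    """Word Error Rate: edit distance between two token lists,
    normalised by the reference length (assumes a non-empty reference)."""
    # dp[i][j] = edit distance between reference[:i] and hypothesis[:j]
    dp = [[0] * (len(hypothesis) + 1) for _ in range(len(reference) + 1)]
    for i in range(len(reference) + 1):
        dp[i][0] = i
    for j in range(len(hypothesis) + 1):
        dp[0][j] = j
    for i in range(1, len(reference) + 1):
        for j in range(1, len(hypothesis) + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[-1][-1] / len(reference)

ref = ['これ', 'に', '不快感']
hyp = ['これ', 'に', '不快', '感']
print(wer(ref, hyp))  # 2 edits over 3 reference tokens, about 0.67
```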
36 changes: 36 additions & 0 deletions 2018-komp-ling/practicals/transliteration/freq.py
@@ -0,0 +1,36 @@
import sys

vocab = {}  # dict to store the frequency list

# for each line of CoNLL-U input on stdin
for line in sys.stdin.readlines():
    # if there is no tab character, skip the line
    if '\t' not in line:
        continue
    # make a list of the cells in the row
    row = line.split('\t')
    # if there are not 10 cells, skip the line
    if len(row) != 10:
        continue
    # if the token is a punctuation mark, skip the line
    if row[3] == 'PUNCT':
        continue
    # the form is the value of the second cell
    form = row[1].lower()
    # if we haven't seen it yet, set the frequency count to 0
    if form not in vocab:
        vocab[form] = 0
    vocab[form] += 1

# sort by frequency, most frequent first
freq = []
for w in vocab:
    freq.append((vocab[w], w))
freq.sort(reverse=True)

# write the sorted list to a file, one "count<TAB>form" pair per line
with open('freq_sort.txt', 'w') as fs:
    for count, form in freq:
        print('%d\t%s' % (count, form), file=fs)
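
Assuming the treebank is a CoNLL-U file (the filename corpus.conllu below is hypothetical), the script can be run as `python3 freq.py < corpus.conllu`; the frequency-sorted word list is then written to freq_sort.txt.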