diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index 598c3cc2..ae3dc808 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -43,6 +43,8 @@
["spelling_check"],
["max_abstract_size_check"],
["theme_in_report_check"],
+ ["compare_goal_and_content_check"],
+ ["compare_tasks_and_content_check"],
]
DEFAULT_TYPE = 'pres'
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
index 50729335..4a12c868 100644
--- a/app/main/checks/report_checks/__init__.py
+++ b/app/main/checks/report_checks/__init__.py
@@ -22,5 +22,7 @@
from .sections_check import LRReportSectionCheck
from .style_check import ReportStyleCheck
from .spelling_check import SpellingCheck
+from .compare_goal_and_content import CompareGoalAndContentCheck
+from .compare_tasks_and_content import CompareTasksAndContentCheck
from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
-from .template_name import ReportTemplateNameCheck
\ No newline at end of file
+from .template_name import ReportTemplateNameCheck
diff --git a/app/main/checks/report_checks/compare_goal_and_content.py b/app/main/checks/report_checks/compare_goal_and_content.py
new file mode 100644
index 00000000..263386a8
--- /dev/null
+++ b/app/main/checks/report_checks/compare_goal_and_content.py
@@ -0,0 +1,87 @@
+from ..base_check import BaseReportCriterion, answer
+
+import app.nlp.text_similarity as ts
+
+
+class CompareGoalAndContentCheck(BaseReportCriterion):
+ label = "Проверка соответствия цели и содержания"
+ description = "Степень раскрытия цели в содержании"
+ id = 'compare_goal_and_content_check'
+
+ def __init__(self, file_info):
+ super().__init__(file_info)
+ self.headers = []
+ self.goal = ""
+ self.chapters = {}
+ self.weights = {}
+ self.to_pass = 0
+ self.to_ignore = []
+
+ def late_init(self):
+ self.headers = self.file.make_chapters(self.file_type['report_type'])
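+        # Chapter weights are matched as header prefixes: numeric keys against the
+        # chapter number, named keys against the introduction/conclusion headers;
+        # chapter "3" (presumably the core chapter of the report) counts most.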
+ self.weights = {
+ "ВВЕДЕНИЕ": 1,
+ "1": 2,
+ "2": 2,
+ "3": 5,
+ "4": 2,
+ "5": 1,
+ "ЗАКЛЮЧЕНИЕ": 1
+ }
+ self.to_pass = 0.1
+ self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]
+
+ def check(self):
+ self.late_init()
+ if self.file.page_counter() < 4:
+ return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
+ result = ""
+ intro_text = ""
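+        # Heuristic: the goal is assumed to appear in the introduction as a
+        # sentence starting with "Цель", cut at the first period after it.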
+ for header in self.headers:
+ if header["text"].lower().find("введение") >= 0:
+ for child in header["child"]:
+ intro_text += child["text"]
+ goal_index = intro_text.find("Цель")
+        if goal_index >= 0:
+ goal_start = goal_index + len("Цель") + 1
+ goal_end = intro_text.find(".", goal_start)
+ self.goal = intro_text[goal_start:goal_end]
+ else:
+            return answer(False, "Во введении не найдена цель работы")
+ for header in self.headers:
+ if any(ignore_phrase in header["text"] for ignore_phrase in self.to_ignore):
+ continue
+ text = ""
+ for child in header["child"]:
+ text += child['text']
+ self.chapters[header["text"]] = text
+ self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
+        nlp_processor = ts.NLPProcessor()
+        calculate_result = nlp_processor.calculate_cosine_similarity(self.goal, self.chapters)
+ max_result = max(calculate_result.values())
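+        # Scale each chapter's similarity by its weight, then normalise by the
+        # pre-weighting maximum (assumes at least one chapter has a non-zero
+        # similarity, otherwise this division fails).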
+ for k, v in calculate_result.items():
+ for chapter, weight in self.weights.items():
+ if k.find(chapter) == 0:
+ calculate_result[k] = v * weight
+ break
+ calculate_result[k] = calculate_result[k] / max_result
+ avg = round(sum(calculate_result.values()) / len(calculate_result.values()), 3)
+ if avg < self.to_pass:
+ return answer(False,
+ f"Цель недостаточно раскрыта в содержании (нужно {self.to_pass * 100}%, набрано {avg * 100}%)")
+        result += f"<br>Тема раскрыта на {avg * 100}%<br>"
+ sorted_chapters = dict(sorted(calculate_result.items(), key=lambda item: item[1], reverse=True))
+        result += f"<br>7 разделов, наиболее раскрывающих тему:<br>"
+ for i, key in enumerate(sorted_chapters.keys()):
+ if i >= 7:
+ break
+            result += f"<br>\"{key}\", {round(self.__output(sorted_chapters[key], sum(sorted_chapters.values())), 3)}% текста раскрывают тему<br>"
+        result += f"<br>7 разделов, наименее раскрывающих тему:<br>"
+ for i, key in enumerate(sorted_chapters.keys()):
+ if i < len(sorted_chapters) - 7:
+ continue
+            result += f"<br>\"{key}\", {self.__output(sorted_chapters[key], sum(sorted_chapters.values()))}% текста раскрывают тему<br>"
+ return answer(True, result)
+
+    def __output(self, value, summ):
+        return round(value / summ * 100, 3)
diff --git a/app/main/checks/report_checks/compare_tasks_and_content.py b/app/main/checks/report_checks/compare_tasks_and_content.py
new file mode 100644
index 00000000..06d757ad
--- /dev/null
+++ b/app/main/checks/report_checks/compare_tasks_and_content.py
@@ -0,0 +1,109 @@
+from ..base_check import BaseReportCriterion, answer
+
+import app.nlp.text_similarity as ts
+
+
+class CompareTasksAndContentCheck(BaseReportCriterion):
+ label = "Проверка соответствия задач и содержания"
+ description = "Степень раскрытия задач в содержании"
+ id = 'compare_tasks_and_content_check'
+
+ def __init__(self, file_info):
+ super().__init__(file_info)
+ self.headers = []
+ self.tasks = []
+ self.chapters = {}
+ self.weights = {}
+ self.all_to_pass = 0
+ self.specific_to_pass = 0
+ self.to_ignore = []
+ self.minimum_tasks = 0
+
+ def late_init(self):
+ self.headers = self.file.make_chapters(self.file_type['report_type'])
+ self.weights = {
+ "ВВЕДЕНИЕ": 1,
+ "1": 2,
+ "2": 2,
+ "3": 5,
+ "4": 2,
+ "5": 1,
+ "ЗАКЛЮЧЕНИЕ": 1
+ }
+ self.all_to_pass = 0.15
+ self.specific_to_pass = 0.05
+ self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]
+ self.minimum_tasks = 3
+
+ def check(self):
+ self.late_init()
+ if self.file.page_counter() < 4:
+ return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
+ result = ""
+ possible_tasks = []
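+        # Heuristic: tasks are assumed to be listed in the introduction between a
+        # sentence mentioning "задачи" and the "объект исследования" sentence;
+        # earlier "задачи" mentions are retried while too few items are found.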
+ for header in self.headers:
+ if header["text"].lower().find("введение") >= 0:
+ for i, child in enumerate(header["child"]):
+ if child["text"].lower().find("задачи") >= 0:
+ possible_tasks.append(i)
+ if child["text"].lower().find("объект") >= 0 and child["text"].lower().find("исследования") > 0:
+ if not possible_tasks:
+                            return answer(False, "Во введении не найдены задачи работы")
+ tasks = header["child"][max(possible_tasks) + 1:i]
+                        while len(tasks) < self.minimum_tasks:
+                            try:
+                                possible_tasks.remove(max(possible_tasks))
+                                tasks = header["child"][max(possible_tasks) + 1:i]
+                            except ValueError:
+                                return answer(False, f"Во введении меньше {self.minimum_tasks} задач, что меньше необходимого минимума")
+ self.tasks = [task["text"] for task in tasks]
+ break
+ if any(ignore_phrase in header["text"] for ignore_phrase in self.to_ignore):
+ continue
+ text = ""
+ for child in header["child"]:
+ text += child['text']
+ self.chapters[header["text"]] = text
+ self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
+        if not self.tasks:
+            return answer(False, "Во введении не найдены задачи работы")
+        nlp_processor = ts.NLPProcessor()
+        all_tasks_result = nlp_processor.calculate_cosine_similarity(" ".join(self.tasks), self.chapters)
+ max_result = max(all_tasks_result.values())
+ for k, v in all_tasks_result.items():
+ for chapter, weight in self.weights.items():
+ if k.find(chapter) == 0:
+ all_tasks_result[k] = v * weight
+ break
+ all_tasks_result[k] = round(all_tasks_result[k] / max_result, 3)
+ avg = round(sum(all_tasks_result.values()) / len(all_tasks_result.values()), 3)
+ if avg < self.all_to_pass:
+ return answer(False, f"Задачи недостаточно раскрыты в содержании (нужно {self.all_to_pass * 100}%, набрано {avg * 100}%)")
+        result += f"<br>Задачи раскрыты на {avg * 100}%<br>"
+ for task in self.tasks:
+            cur_task = nlp_processor.calculate_cosine_similarity(task, self.chapters)
+ max_result = max(cur_task.values())
+ for k, v in cur_task.items():
+ for chapter, weight in self.weights.items():
+ if k.find(chapter) == 0:
+ cur_task[k] = v * weight
+ break
+ cur_task[k] = cur_task[k] / max_result
+ sorted_chapters = dict(sorted(cur_task.items(), key=lambda item: item[1], reverse=True))
+ specific_avg = sum(sorted_chapters.values()) / len(sorted_chapters.values())
+ specific_avg = round(specific_avg, 3)
+ if specific_avg < self.specific_to_pass:
+                return answer(False, f"<br>Задача \"{task}\" недостаточно раскрыта<br>")
+            result += f"<br>Задача \"{task}\" раскрыта на {round(specific_avg * 100, 2)}%<br>Задачу \"{task}\" наиболее раскрывают разделы:<br>"
+ for i, key in enumerate(sorted_chapters.keys()):
+ if i >= 3:
+ break
+                result += f"<br>\"{key}\", {round(self.__output(sorted_chapters[key], sum(sorted_chapters.values())), 3)}% текста раскрывают задачу<br>"
+ all_tasks_result = dict(sorted(all_tasks_result.items(), key=lambda item: item[1], reverse=True))
+        result += f"<br>Разделы, наименее раскрывающие задачи:<br>"
+ for i, key in enumerate(all_tasks_result.keys()):
+ if i < len(all_tasks_result.keys()) - 5:
+ continue
+            result += f"<br>{key}: {round(all_tasks_result[key] * 100, 3)}%<br>"
+ return answer(True, result)
+
+ def __output(self, value, summ):
+ return (value / summ) * 100
diff --git a/app/nlp/text_similarity.py b/app/nlp/text_similarity.py
new file mode 100644
index 00000000..ce8be923
--- /dev/null
+++ b/app/nlp/text_similarity.py
@@ -0,0 +1,97 @@
+from collections import defaultdict
+
+import nltk
+import numpy as np
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import SnowballStemmer
+from nltk.util import ngrams
+import string
+
+
+class NLPProcessor:
+ def __init__(self, language='russian'):
+ nltk.download('punkt')
+ nltk.download('stopwords')
+ self.stop_words = set(stopwords.words(language))
+ self.stemmer = SnowballStemmer(language)
+
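+    # Normalise text: strip punctuation, tokenise, drop stop words, apply the
+    # Snowball stemmer.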
+ def preprocessing(self, text):
+ text = text.translate(str.maketrans('', '', string.punctuation))
+ tokens = word_tokenize(text)
+ tokens = [word for word in tokens if word.lower() not in self.stop_words]
+ return [self.stemmer.stem(token) for token in tokens]
+
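+    # Collect all 1..n-grams of the token list as space-joined strings
+    # (unigrams and bigrams by default).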
+ def get_ngrams(self, tokens, n=2):
+ result = []
+ for i in range(n):
+ n_grams = ngrams(tokens, i + 1)
+ result.extend([' '.join(grams) for grams in n_grams])
+ return result
+
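+    # Build the vocabulary: map each distinct n-gram of the corpus to a vector index.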
+ def get_bag_of_n_gramms(self, corpus):
+ new_corpus = []
+ for item in corpus:
+ for n_gramm in item:
+ new_corpus.append(n_gramm)
+ index_word = {}
+ i = 0
+ for word in new_corpus:
+ if word in index_word.keys():
+ continue
+ index_word[word] = i
+ i += 1
+ return index_word
+
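+    # TF-IDF vector over the shared vocabulary: tf = count / len(doc),
+    # idf = log10(len(docs) / (df + 1)). Note the score is accumulated once per
+    # occurrence, so frequent n-grams are weighted more strongly than plain tf-idf.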
+ def get_vector_by_BOW(self, bag_of_ngramms, doc, docs):
+ def tf(word, doc):
+ return doc.count(word) / len(doc)
+
+ def idf(word, docs):
+ word_in_docs = 0
+ for item in docs:
+ if word in item:
+ word_in_docs += 1
+ return np.log10(len(docs) / (word_in_docs + 1))
+
+ def tf_idf(word, doc, docs):
+ return tf(word, doc) * idf(word, docs)
+
+ count_dict = defaultdict(int)
+ vec = np.zeros(len(bag_of_ngramms))
+ for word in doc:
+ count_dict[word] += tf_idf(word, doc, docs)
+
+ for key, item in count_dict.items():
+ vec[bag_of_ngramms[key]] = item
+ return vec
+
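+    # Cosine similarity with a guard against zero-length vectors.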
+ def cosine_similarity(self, vector1, vector2):
+ norm1 = np.linalg.norm(vector1)
+ norm2 = np.linalg.norm(vector2)
+ dot_product = np.dot(vector1, vector2)
+ if norm1 == 0.0 or norm2 == 0.0:
+ return 0
+ cosine_sim = dot_product / (norm1 * norm2)
+ return round(cosine_sim, 3)
+
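+    # Full pipeline: preprocess the goal and each chapter, build a shared n-gram
+    # vocabulary, vectorise with TF-IDF and return {chapter: cosine similarity}.
+    # Hypothetical example: calculate_cosine_similarity("повысить точность",
+    # {"1 Обзор": "...", "2 Метод": "..."}) -> {"1 Обзор": 0.12, "2 Метод": 0.34}.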
+ def calculate_cosine_similarity(self, goal, texts: dict):
+        if not (goal and texts):
+            return {}
+ corpus = []
+ text1_n_grams = self.get_ngrams(self.preprocessing(goal))
+ text2_n_grams = {}
+ for chapter in texts.keys():
+ text2_n_grams[chapter] = self.get_ngrams(self.preprocessing(texts[chapter]))
+ corpus.append(text1_n_grams)
+ corpus.extend(text2_n_grams.values())
+ bag_of_n_grams = self.get_bag_of_n_gramms(corpus)
+ goal_vector = self.get_vector_by_BOW(bag_of_n_grams, text1_n_grams, corpus)
+ text_vectors = {}
+ for chapter, text in text2_n_grams.items():
+ text_vectors[chapter] = self.get_vector_by_BOW(bag_of_n_grams, text, corpus)
+ result = {}
+ for chapter in text_vectors.keys():
+ text_vector = text_vectors[chapter]
+ result[chapter] = self.cosine_similarity(goal_vector, text_vector)
+ return result
\ No newline at end of file