diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index 598c3cc2..ae3dc808 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -43,6 +43,8 @@
     ["spelling_check"],
     ["max_abstract_size_check"],
     ["theme_in_report_check"],
+    ["compare_goal_and_content_check"],
+    ["compare_tasks_and_content_check"],
 ]
 
 DEFAULT_TYPE = 'pres'
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
index 50729335..4a12c868 100644
--- a/app/main/checks/report_checks/__init__.py
+++ b/app/main/checks/report_checks/__init__.py
@@ -22,5 +22,7 @@
 from .sections_check import LRReportSectionCheck
 from .style_check import ReportStyleCheck
 from .spelling_check import SpellingCheck
+from .compare_goal_and_content import CompareGoalAndContentCheck
+from .compare_tasks_and_content import CompareTasksAndContentCheck
 from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
-from .template_name import ReportTemplateNameCheck
\ No newline at end of file
+from .template_name import ReportTemplateNameCheck
diff --git a/app/main/checks/report_checks/compare_goal_and_content.py b/app/main/checks/report_checks/compare_goal_and_content.py
new file mode 100644
index 00000000..263386a8
--- /dev/null
+++ b/app/main/checks/report_checks/compare_goal_and_content.py
@@ -0,0 +1,87 @@
+from ..base_check import BaseReportCriterion, answer
+
+import app.nlp.text_similarity as ts
+
+
+class CompareGoalAndContentCheck(BaseReportCriterion):
+    label = "Проверка соответствия цели и содержания"
+    description = "Степень раскрытия цели в содержании"
+    id = 'compare_goal_and_content_check'
+
+    def __init__(self, file_info):
+        super().__init__(file_info)
+        self.headers = []
+        self.goal = ""
+        self.chapters = {}
+        self.weights = {}
+        self.to_pass = 0
+        self.to_ignore = []
+
+    def late_init(self):
+        self.headers = self.file.make_chapters(self.file_type['report_type'])
+        self.weights = {
+            "ВВЕДЕНИЕ": 1,
+            "1": 2,
+            "2": 2,
+            "3": 5,
+            "4": 2,
+            "5": 1,
+            "ЗАКЛЮЧЕНИЕ": 1
+        }
+        self.to_pass = 0.1
+        self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]
+
+    def check(self):
+        self.late_init()
+        if self.file.page_counter() < 4:
+            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
+        result = ""
+        intro_text = ""
+        for header in self.headers:
+            if header["text"].lower().find("введение") >= 0:
+                for child in header["child"]:
+                    intro_text += child["text"]
+        goal_index = intro_text.find("Цель")
+        if goal_index >= 0:
+            goal_start = goal_index + len("Цель") + 1
+            goal_end = intro_text.find(".", goal_start)
+            self.goal = intro_text[goal_start:goal_end]
+        else:
+            return answer(False, "Во введении не найдена цель работы")
+        for header in self.headers:
+            if any(ignore_phrase in header["text"] for ignore_phrase in self.to_ignore):
+                continue
+            text = ""
+            for child in header["child"]:
+                text += child['text']
+            self.chapters[header["text"]] = text
+        self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
+        nlp_processor = ts.NLPProcessor()
+        calculate_result = nlp_processor.calculate_cosine_similarity(self.goal, self.chapters)
+        max_result = max(calculate_result.values())
+        for k, v in calculate_result.items():
+            for chapter, weight in self.weights.items():
+                if k.find(chapter) == 0:
+                    calculate_result[k] = v * weight
+                    break
+            calculate_result[k] = calculate_result[k] / max_result
+        avg = round(sum(calculate_result.values()) / len(calculate_result.values()), 3)
+        if avg < self.to_pass:
+            return answer(False,
+                          f"Цель недостаточно раскрыта в содержании (нужно {self.to_pass * 100}%, набрано {avg * 100}%)")
+        result += f"\nТема раскрыта на {avg * 100}%\n"
+        sorted_chapters = dict(sorted(calculate_result.items(), key=lambda item: item[1], reverse=True))
+        result += f"\n7 разделов, наиболее раскрывающих тему:\n"
+        for i, key in enumerate(sorted_chapters.keys()):
+            if i >= 7:
+                break
+            result += f"\n\"{key}\", {round(self.__output(sorted_chapters[key], sum(sorted_chapters.values())), 3)}% текста раскрывают тему\n"
+        result += f"\n7 разделов, наименее раскрывающих тему:\n"
+        for i, key in enumerate(sorted_chapters.keys()):
+            if i < len(sorted_chapters) - 7:
+                continue
+            result += f"\n\"{key}\", {self.__output(sorted_chapters[key], sum(sorted_chapters.values()))}% текста раскрывают тему\n"
+        return answer(True, result)
+
+    def __output(self, value, summ):
+        return round(value / summ, 3) * 100
diff --git a/app/main/checks/report_checks/compare_tasks_and_content.py b/app/main/checks/report_checks/compare_tasks_and_content.py
new file mode 100644
index 00000000..06d757ad
--- /dev/null
+++ b/app/main/checks/report_checks/compare_tasks_and_content.py
@@ -0,0 +1,109 @@
+from ..base_check import BaseReportCriterion, answer
+
+import app.nlp.text_similarity as ts
+
+
+class CompareTasksAndContentCheck(BaseReportCriterion):
+    label = "Проверка соответствия задач и содержания"
+    description = "Степень раскрытия задач в содержании"
+    id = 'compare_tasks_and_content_check'
+
+    def __init__(self, file_info):
+        super().__init__(file_info)
+        self.headers = []
+        self.tasks = []
+        self.chapters = {}
+        self.weights = {}
+        self.all_to_pass = 0
+        self.specific_to_pass = 0
+        self.to_ignore = []
+        self.minimum_tasks = 0
+
+    def late_init(self):
+        self.headers = self.file.make_chapters(self.file_type['report_type'])
+        self.weights = {
+            "ВВЕДЕНИЕ": 1,
+            "1": 2,
+            "2": 2,
+            "3": 5,
+            "4": 2,
+            "5": 1,
+            "ЗАКЛЮЧЕНИЕ": 1
+        }
+        self.all_to_pass = 0.15
+        self.specific_to_pass = 0.05
+        self.to_ignore = ["СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ", "ПРИЛОЖЕНИЕ"]
+        self.minimum_tasks = 3
+
+    def check(self):
+        self.late_init()
+        if self.file.page_counter() < 4:
+            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
+        result = ""
+        possible_tasks = []
+        for header in self.headers:
+            if header["text"].lower().find("введение") >= 0:
+                for i, child in enumerate(header["child"]):
+                    if child["text"].lower().find("задачи") >= 0:
+                        possible_tasks.append(i)
+                    if child["text"].lower().find("объект") >= 0 and child["text"].lower().find("исследования") > 0:
+                        if not possible_tasks:
+                            return answer(False, "Во введении не найдены задачи работы")
+                        tasks = header["child"][max(possible_tasks) + 1:i]
+                        while len(tasks) <= self.minimum_tasks:
+                            try:
+                                possible_tasks.remove(max(possible_tasks))
+                                tasks = header["child"][max(possible_tasks) + 1:i]
+                            except ValueError:
+                                return answer(False, f"Во введении меньше {self.minimum_tasks} задач, что меньше необходимого минимума")
+                        self.tasks = [task["text"] for task in tasks]
+                        break
+            if any(ignore_phrase in header["text"] for ignore_phrase in self.to_ignore):
+                continue
+            text = ""
+            for child in header["child"]:
+                text += child['text']
+            self.chapters[header["text"]] = text
+        self.chapters = {k: v for k, v in self.chapters.items() if v and v.strip()}
+        nlp_processor = ts.NLPProcessor()
+        all_tasks_result = nlp_processor.calculate_cosine_similarity(" ".join(self.tasks), self.chapters)
+        max_result = max(all_tasks_result.values())
+        for k, v in all_tasks_result.items():
+            for chapter, weight in self.weights.items():
+                if k.find(chapter) == 0:
+                    all_tasks_result[k] = v * weight
+                    break
+            all_tasks_result[k] = round(all_tasks_result[k] / max_result, 3)
+        avg = round(sum(all_tasks_result.values()) / len(all_tasks_result.values()), 3)
+        if avg < self.all_to_pass:
+            return answer(False, f"Задачи недостаточно раскрыты в содержании (нужно {self.all_to_pass * 100}%, набрано {avg * 100}%)")
+        result += f"\nЗадачи раскрыты на {avg * 100}%\n"
+        for task in self.tasks:
+            cur_task = nlp_processor.calculate_cosine_similarity(task, self.chapters)
+            max_result = max(cur_task.values())
+            for k, v in cur_task.items():
+                for chapter, weight in self.weights.items():
+                    if k.find(chapter) == 0:
+                        cur_task[k] = v * weight
+                        break
+                cur_task[k] = cur_task[k] / max_result
+            sorted_chapters = dict(sorted(cur_task.items(), key=lambda item: item[1], reverse=True))
+            specific_avg = sum(sorted_chapters.values()) / len(sorted_chapters.values())
+            specific_avg = round(specific_avg, 3)
+            if specific_avg < self.specific_to_pass:
+                return answer(False, f"\nЗадача \"{task}\" недостаточно раскрыта\n")
+            result += f"\nЗадача \"{task}\" раскрыта на {round(specific_avg * 100, 2)}%\n\nЗадачу \"{task}\" наиболее раскрывают разделы:\n"
+            for i, key in enumerate(sorted_chapters.keys()):
+                if i >= 3:
+                    break
+                result += f"\n\"{key}\", {round(self.__output(sorted_chapters[key], sum(sorted_chapters.values())), 3)}% текста раскрывают задачу\n"
+        all_tasks_result = dict(sorted(all_tasks_result.items(), key=lambda item: item[1], reverse=True))
+        result += f"\nРазделы, наименее раскрывающие задачи:\n"
+        for i, key in enumerate(all_tasks_result.keys()):
+            if i < len(all_tasks_result.keys()) - 5:
+                continue
+            result += f"\n{key}: {round(all_tasks_result[key] * 100, 3)}%\n"
+        return answer(True, result)
+
+    def __output(self, value, summ):
+        return (value / summ) * 100
diff --git a/app/nlp/text_similarity.py b/app/nlp/text_similarity.py
new file mode 100644
index 00000000..ce8be923
--- /dev/null
+++ b/app/nlp/text_similarity.py
@@ -0,0 +1,97 @@
+from collections import defaultdict
+
+import nltk
+import numpy as np
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import SnowballStemmer
+from nltk.util import ngrams
+import string
+
+
+class NLPProcessor:
+    def __init__(self, language='russian'):
+        nltk.download('punkt')
+        nltk.download('stopwords')
+        self.stop_words = set(stopwords.words(language))
+        self.stemmer = SnowballStemmer(language)
+
+    def preprocessing(self, text):
+        text = text.translate(str.maketrans('', '', string.punctuation))
+        tokens = word_tokenize(text)
+        tokens = [word for word in tokens if word.lower() not in self.stop_words]
+        return [self.stemmer.stem(token) for token in tokens]
+
+    def get_ngrams(self, tokens, n=2):
+        result = []
+        for i in range(n):
+            n_grams = ngrams(tokens, i + 1)
+            result.extend([' '.join(grams) for grams in n_grams])
+        return result
+
+    def get_bag_of_n_gramms(self, corpus):
+        new_corpus = []
+        for item in corpus:
+            for n_gramm in item:
+                new_corpus.append(n_gramm)
+        index_word = {}
+        i = 0
+        for word in new_corpus:
+            if word in index_word:
+                continue
+            index_word[word] = i
+            i += 1
+        return index_word
+
+    def get_vector_by_BOW(self, bag_of_ngramms, doc, docs):
+        def tf(word, doc):
+            return doc.count(word) / len(doc)
+
+        def idf(word, docs):
+            word_in_docs = 0
+            for item in docs:
+                if word in item:
+                    word_in_docs += 1
+            return np.log10(len(docs) / (word_in_docs + 1))
+
+        def tf_idf(word, doc, docs):
+            return tf(word, doc) * idf(word, docs)
+
+        count_dict = defaultdict(int)
+        vec = np.zeros(len(bag_of_ngramms))
+        for word in doc:
+            count_dict[word] += tf_idf(word, doc, docs)
+
+        for key, item in count_dict.items():
+            vec[bag_of_ngramms[key]] = item
+        return vec
+
+    def cosine_similarity(self, vector1, vector2):
+        norm1 = np.linalg.norm(vector1)
+        norm2 = np.linalg.norm(vector2)
+        dot_product = np.dot(vector1, vector2)
+        if norm1 == 0.0 or norm2 == 0.0:
+            return 0
+        cosine_sim = dot_product / (norm1 * norm2)
+        return round(cosine_sim, 3)
+
+    def calculate_cosine_similarity(self, goal, texts: dict):
+        if not goal or not texts:
+            return {}
+        corpus = []
+        text1_n_grams = self.get_ngrams(self.preprocessing(goal))
+        text2_n_grams = {}
+        for chapter in texts.keys():
+            text2_n_grams[chapter] = self.get_ngrams(self.preprocessing(texts[chapter]))
+        corpus.append(text1_n_grams)
+        corpus.extend(text2_n_grams.values())
+        bag_of_n_grams = self.get_bag_of_n_gramms(corpus)
+        goal_vector = self.get_vector_by_BOW(bag_of_n_grams, text1_n_grams, corpus)
+        text_vectors = {}
+        for chapter, text in text2_n_grams.items():
+            text_vectors[chapter] = self.get_vector_by_BOW(bag_of_n_grams, text, corpus)
+        result = {}
+        for chapter in text_vectors.keys():
+            text_vector = text_vectors[chapter]
+            result[chapter] = self.cosine_similarity(goal_vector, text_vector)
+        return result
\ No newline at end of file