diff --git a/Dockerfile b/Dockerfile index 00795845..a4182c9e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,10 +7,10 @@ RUN npm install && npm install webpack ADD ./assets ./assets RUN npm run build -FROM dvivanov/dis-base:v0.4 +FROM dvivanov/dis-base:v0.5 LABEL project='dis' -LABEL version='0.4' +LABEL version='0.5' WORKDIR /usr/src/project diff --git a/Dockerfile_base b/Dockerfile_base index 0f196f92..d1724abe 100644 --- a/Dockerfile_base +++ b/Dockerfile_base @@ -1,14 +1,19 @@ FROM python:3.10-slim-bullseye LABEL project='dis' -LABEL version='0.4-base' +LABEL version='0.5-base' ENV LANG en_US.UTF-8 ENV TZ=Europe/Moscow RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone -RUN apt update && apt install -y libreoffice-writer libreoffice-impress default-jre +RUN apt-get update && apt-get install -y \ + libreoffice-writer \ + libreoffice-impress \ + default-jre \ + tesseract-ocr \ + tesseract-ocr-rus ADD requirements.txt . RUN python3 -m pip install -r requirements.txt --no-cache-dir diff --git a/app/db/db_methods.py b/app/db/db_methods.py index 215c6e4c..3b797104 100644 --- a/app/db/db_methods.py +++ b/app/db/db_methods.py @@ -7,7 +7,7 @@ from pymongo import MongoClient from utils import convert_to -from .db_types import User, Presentation, Check, Consumers, Logs +from .db_types import User, Presentation, Check, Consumers, Logs, Image client = MongoClient("mongodb://mongodb:27017") db = client['pres-parser-db'] @@ -18,14 +18,62 @@ checks_collection = db['checks'] consumers_collection = db['consumers'] criteria_pack_collection = db['criteria_pack'] +parsed_texts_collection = db['parsed_texts'] logs_collection = db.create_collection( 'logs', capped=True, size=5242880) if not db['logs'] else db['logs'] celery_check_collection = db['celery_check'] # collection for mapping celery_task to check +celery_tesseract_collection = db['celery_tesseract'] +images_collection = db['images'] # коллекция для хранения изображений def get_client(): return 
# Additions to app/db/db_methods.py: persistence helpers for extracted images
# and parsed texts. They use that module's globals (images_collection,
# parsed_texts_collection, files_info_collection, checks_collection, Image).


def get_image(image_id):
    """Return the image document with ``_id == image_id`` wrapped in ``Image``.

    Returns None when no such document exists.
    """
    # find() returns a cursor (never None, and not a document);
    # find_one() returns the document itself or None.
    document = images_collection.find_one({'_id': image_id})
    if document is None:
        return None
    return Image(document)


def get_images(check_id):
    """Return all images stored for ``check_id`` as a list of ``Image`` objects.

    ``check_id`` is matched as a string; the list is empty when nothing matches.
    """
    cursor = images_collection.find({'check_id': str(check_id)})
    return [Image(document) for document in cursor]


def save_image_to_db(check_id, image_data, caption, image_size, text=None, page=None):
    """Insert a new image document and return its ``_id``."""
    image = Image({
        'check_id': check_id,
        'image_data': image_data,
        'caption': caption,
        'image_size': image_size,
        'text': text,
        'page': page,
    })
    return images_collection.insert_one(image.pack()).inserted_id


def update_image(image):
    """Replace the stored document of ``image``; return True if it existed."""
    return bool(images_collection.find_one_and_replace({'_id': image._id}, image.pack()))


def add_image_text(image_id, new_text):
    """Set the ``text`` field of an image; return True if the image was found."""
    result = images_collection.update_one(
        {'_id': image_id},
        {'$set': {'text': new_text}}
    )
    return result.matched_count > 0


def add_image_page(image_id, page):
    """Set the ``page`` field of an image; return True if the image was found."""
    result = images_collection.update_one(
        {'_id': image_id},
        {'$set': {'page': page}}
    )
    return result.matched_count > 0


def update_check(check):
    """Replace the stored document of ``check``; return True if it existed."""
    return bool(checks_collection.find_one_and_replace({'_id': check._id}, check.pack()))


def add_parsed_text(check_id, parsed_text):
    """Upsert ``parsed_text`` by filename and return the document's ``_id``.

    The id is also pushed onto the ``parsed_texts`` array of the
    files_info document whose ``_id`` equals ``check_id``.
    """
    query = {'filename': parsed_text.filename}
    result = parsed_texts_collection.update_one(query, {'$set': parsed_text.pack()}, upsert=True)
    if result.upserted_id:
        parsed_texts_id = result.upserted_id
    else:
        # An existing document was updated: fetch its id by the same key.
        parsed_texts_id = parsed_texts_collection.find_one(query)['_id']
    # NOTE(review): $push appends the id again on every re-run for the same
    # file — confirm whether duplicates are acceptable here.
    files_info_collection.update_one({'_id': check_id}, {"$push": {'parsed_texts': parsed_texts_id}})
    return parsed_texts_id
# Additions to app/db/db_methods.py: bookkeeping for Celery check tasks and
# Tesseract OCR tasks (uses celery_check_collection, celery_tesseract_collection).


def get_celery_task_by_check(check_id):
    """Return the celery_check document recorded for ``check_id`` (or None)."""
    return celery_check_collection.find_one({'check_id': check_id})


def get_celery_task_status_by_check(check_id):
    """Return True when the check's Celery task has a ``finished_at`` stamp."""
    task = get_celery_task_by_check(check_id)
    return bool(task) and 'finished_at' in task


def add_celery_tesseract_task(celery_tesseract_task_id, check_id):
    """Record a started Tesseract task for ``check_id``; return the inserted id."""
    record = {
        'celery_tesseract_task_id': celery_tesseract_task_id,
        'check_id': check_id,
        'started_at': datetime.now(),
    }
    return celery_tesseract_collection.insert_one(record).inserted_id


def get_celery_tesseract_task_status_by_check(check_id):
    """Return True when the check's Tesseract task has a ``finished_at`` stamp."""
    task = get_celery_tesseract_task_by_check(check_id)
    return bool(task) and 'finished_at' in task


def mark_celery_tesseract_task_as_finished_by_check(check_id, tesseract_result, finished_time=None):
    """Store the OCR result and timing on the Tesseract task of ``check_id``.

    Does nothing (returns None) when no task is recorded for the check;
    ``finished_time`` defaults to the current time.
    """
    task = get_celery_tesseract_task_by_check(check_id)
    if not task:
        return
    finished_at = datetime.now() if finished_time is None else finished_time
    elapsed_seconds = (finished_at - task['started_at']).total_seconds()
    changes = {
        'finished_at': finished_at,
        'tesseract_result': tesseract_result,
        'processing_time': elapsed_seconds,
    }
    return celery_tesseract_collection.update_one({'check_id': check_id}, {'$set': changes})


def get_celery_tesseract_task(celery_tesseract_task_id):
    """Return the Tesseract task document with the given Celery task id (or None)."""
    return celery_tesseract_collection.find_one({'celery_tesseract_task_id': celery_tesseract_task_id})


def get_celery_tesseract_task_by_check(check_id):
    """Return the Tesseract task document recorded for ``check_id`` (or None)."""
    return celery_tesseract_collection.find_one({'check_id': check_id})
# Additions to app/db/db_types.py (PackableWithId is defined earlier in that file).


class Image(PackableWithId):
    """An image extracted from an uploaded document, bound to one check."""

    def __init__(self, dictionary=None):
        super().__init__(dictionary)
        source = dictionary or {}
        self.check_id = source.get('check_id')      # check this image belongs to
        self.caption = source.get('caption', '')    # caption of the image
        self.image_data = source.get('image_data')  # image file as binary data
        self.image_size = source.get('image_size')  # image size in centimetres
        self.text = source.get('text', None)
        self.page = source.get('page', None)

    def pack(self):
        """Return the dict stored in the database; ``check_id`` is stringified."""
        package = super().pack()
        package.update(
            check_id=str(self.check_id),
            caption=self.caption,
            image_data=self.image_data,
            image_size=self.image_size,
            text=self.text,
            page=self.page,
        )
        return package


class ParsedText(PackableWithId):
    """Parsed chapters of an uploaded file, keyed by its filename."""

    def __init__(self, dictionary=None):
        super().__init__(dictionary)
        source = dictionary or {}
        self.filename = source.get('filename', '')
        self.parsed_chapters = source.get('parsed_chapters', [])
# --- app/main/checks/report_checks/image_quality_check.py ---
from ..base_check import BaseReportCriterion, answer
import cv2
import numpy as np


class ImageQualityCheck(BaseReportCriterion):
    """Flag report images that look blurred or carry too little information."""

    label = "Проверка качества изображений"
    description = ''
    id = 'image_quality_check'

    # min_laplacian and min_entropy still have to be tuned
    def __init__(self, file_info, min_laplacian=100, min_entropy=1):
        super().__init__(file_info)
        self.images = self.file.images
        self.min_laplacian = min_laplacian
        self.min_entropy = min_entropy
        self.laplacian_score = None
        self.entropy_score = None

    def check(self):
        """Return a passing answer unless some image falls below the thresholds."""
        if not self.images:
            return answer(True, 'Изображения не найдены!')

        deny_list = []
        for img in self.images:
            image_array = np.frombuffer(img.image_data, dtype=np.uint8)
            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)

            if img_cv is None:
                deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.\n")
                continue

            self.find_params(img_cv)

            if self.laplacian_score is None or self.entropy_score is None:
                deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.\n")
                continue

            if self.laplacian_score < self.min_laplacian:
                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score:.2f} (минимум {self.min_laplacian:.2f}).\n")

            if self.entropy_score < self.min_entropy:
                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score:.2f} (минимум {self.min_entropy:.2f}).\n")

        if deny_list:
            return answer(False, f'Изображения нечитаемы!\nПопробуйте улучшить качество изображений, возможно они слишком размыты или зашумлены.\n{"".join(deny_list)}')
        return answer(True, 'Изображения корректны!')

    def find_params(self, image):
        """Measure ``image`` and store the scores on the instance.

        Sets ``laplacian_score`` (variance of the Laplacian of the grayscale
        image) and ``entropy_score`` (entropy of the 256-bin grayscale
        histogram). Both are reset to None first, so an image that cannot be
        measured never inherits the scores of the previous one.

        Returns the ``(laplacian_score, entropy_score)`` pair, which is
        ``(None, None)`` for a missing or empty image.
        """
        self.laplacian_score = None
        self.entropy_score = None
        if image is None or image.size == 0:
            return None, None
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()
        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
        hist = hist / hist.sum()
        # The small epsilon keeps log2 finite for empty histogram bins.
        self.entropy_score = -np.sum(hist * np.log2(hist + 1e-10))
        return self.laplacian_score, self.entropy_score


# --- app/main/checks/report_checks/image_text_check.py ---
from ..base_check import BaseReportCriterion, answer


class ImageTextCheck(BaseReportCriterion):
    """Queue OCR-based validation of the text found on report images."""

    label = "Проверка текста, считанного с изображений"
    description = ''
    id = 'image_text_check'

    # symbols_set, max_symbols_percentage and max_text_density still have to be tuned
    def __init__(self, file_info, symbols_set=None, max_symbols_percentage=0, max_text_density=4):
        super().__init__(file_info)
        self.images = self.file.images
        # None stands for the default ['%', '1']; a fresh list per instance
        # avoids sharing one mutable default between all instances.
        self.symbols_set = ['%', '1'] if symbols_set is None else symbols_set
        self.max_symbols_percentage = max_symbols_percentage
        self.max_text_density = max_text_density

    def check(self):
        """Start the Tesseract task for this check's images and answer at once.

        The answer returned here is provisional; the task's callbacks are
        linked to ``callback_task`` for both success and failure.
        """
        # Imported here rather than at module level, as in the original code.
        from app.tesseract_tasks import tesseract_recognize, callback_task
        from db.db_methods import add_celery_tesseract_task

        if not self.images:
            return answer(True, 'Изображения не найдены!')

        check_id = self.images[0].check_id
        tesseract_task = tesseract_recognize.apply_async(
            args=[check_id, self.symbols_set, self.max_symbols_percentage, self.max_text_density],
            link=callback_task.s(check_id),
            link_error=callback_task.s(check_id)
        )
        add_celery_tesseract_task(tesseract_task.id, check_id)
        return answer(True, 'Изображения проверяются!')
# --- app/main/presentations/pptx/presentation_pptx.py ---
# Method of PresentationPPTX.
def extract_images_with_captions(self, check_id):
    """Store every picture found on the slides together with a caption.

    The caption is the first non-empty text frame on the same slide;
    pictures on slides without any text are not stored. The size passed to
    ``save_image_to_db`` is ``(width_cm, height_cm)``, with None for a
    dimension the shape does not report.
    """
    from pptx.enum.shapes import MSO_SHAPE_TYPE

    from app.db.db_methods import save_image_to_db

    emu_per_cm = 360000  # same EMU-to-centimetre factor as the DOCX extractor

    for slide in self.slides:
        # slide.slide gives access to the underlying slide object
        shapes = list(slide.slide.shapes)
        pictures = [shape for shape in shapes if shape.shape_type == MSO_SHAPE_TYPE.PICTURE]
        if not pictures:
            continue

        caption_text = next(
            (shape.text.strip() for shape in shapes
             if shape.has_text_frame and shape.text.strip()),
            None,
        )
        if caption_text is None:
            continue

        # Every picture is saved (previously only the last one per slide was),
        # and image_size is supplied: it is a required argument of
        # save_image_to_db, so the earlier three-argument call raised TypeError.
        for picture in pictures:
            width, height = picture.width, picture.height
            image_size = (
                width / emu_per_cm if width is not None else None,
                height / emu_per_cm if height is not None else None,
            )
            save_image_to_db(check_id, picture.image.blob, caption_text, image_size)


# --- app/main/reports/docx_uploader/docx_uploader.py ---
# Method of DocxUploader.
def extract_images_with_captions(self, check_id):
    """Store the document's inline images with captions and load them into ``self.images``.

    For each paragraph containing an embedded picture, the following
    paragraphs are scanned for one in the caption style that contains
    'Рисунок'; the scan stops at the next paragraph that holds a graphic.
    Images without such a caption get the placeholder caption.
    Extraction is skipped when ``self.images`` is already populated.
    """
    from app.db.db_methods import save_image_to_db, get_images

    emu_to_cm = 360000
    drawing_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    wp_ns = {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'
    image_style = "ВКР_Подпись для рисунков".lower()

    if not self.images:
        # Read the paragraph list once instead of on every loop step.
        paragraphs = self.file.paragraphs
        for i, paragraph in enumerate(paragraphs):
            image_found = False
            image_data = None
            width_cm = height_cm = None
            for run in paragraph.runs:
                if "graphic" not in run._element.xml:
                    continue
                for image_stream in run._element.findall('.//a:blip', namespaces=drawing_ns):
                    embed_id = image_stream.get(embed_attr)
                    if not embed_id:
                        continue
                    image_found = True
                    image_data = self.file.part.related_parts[embed_id].blob
                    extent = run._element.find('.//wp:extent', namespaces=wp_ns)
                    width_cm = height_cm = None
                    if extent is not None:
                        width_cm = int(extent.get('cx')) / emu_to_cm
                        height_cm = int(extent.get('cy')) / emu_to_cm

            # NOTE(review): one image is stored per paragraph (the last blip
            # found in it); the original nesting could not be recovered from
            # the collapsed diff — confirm against the repository.
            if not image_found:
                continue

            caption = "picture without caption"
            for next_paragraph in paragraphs[i + 1:]:
                style_name = next_paragraph.style.name.lower()
                next_text = next_paragraph.text.strip()
                if any("graphic" in r._element.xml for r in next_paragraph.runs):
                    break
                if next_text and style_name == image_style and 'Рисунок' in next_text:
                    caption = next_text
                    break
            save_image_to_db(check_id, image_data, caption, (width_cm, height_cm))

    self.images = get_images(check_id)
# --- app/main/reports/pasre_file/parse_file.py ---
import re


def parse_headers_and_pages_and_images(chapters, docx):
    """Fill in start pages and image ids for ``chapters`` and return them.

    Each chapter gets ``start_page`` set to the last page (in the order given
    by ``docx.pdf_file.get_text_on_page()``) whose text contains its header;
    pages containing "СОДЕРЖАНИЕ" are skipped. An image is attached to a
    page / chapter when its caption occurs in the page / chapter text, and
    the page is persisted through ``db_methods.add_image_page``.

    Images with an empty caption are ignored: an empty string is contained
    in every text and would otherwise match every page and every chapter.
    """
    text_on_page = docx.pdf_file.get_text_on_page()
    captioned_images = [image for image in docx.images if image.caption]

    image_pages = {}
    for page, text in text_on_page.items():
        text = re.sub(r"(-\n)", "", text)   # re-join words hyphenated at line ends
        text = re.sub(r"\s\n", " ", text)
        if "СОДЕРЖАНИЕ" in text:
            continue
        for chapter in chapters:
            if chapter["header"] in text:
                chapter["start_page"] = page
        for image in captioned_images:
            if image.caption in text:
                # Later pages overwrite earlier ones, as repeated updates did.
                image_pages[image._id] = page

    if image_pages:
        # Imported on demand; one write per image instead of one per page hit.
        from db import db_methods
        for image_id, page in image_pages.items():
            db_methods.add_image_page(image_id, page)

    for chapter in chapters:
        chapter["images"].extend(
            image._id for image in captioned_images if image.caption in chapter["text"]
        )
    return chapters


def parse_chapters(docx):
    """Build chapter dicts from the heading-styled entries of ``docx.chapters``.

    Only entries that have children and whose style contains "heading" are
    kept. The chapter text is the concatenation of the children's texts;
    headers containing "ПРИЛОЖЕНИЕ" are cut at the first dot.
    """
    chapters = []
    for chapter in docx.chapters:
        head = chapter["styled_text"]["text"]
        if "ПРИЛОЖЕНИЕ" in head:
            head = head.split(".")[0]
        if chapter["child"] != [] and "heading" in chapter["style"]:
            text = "".join(child["styled_text"]["text"] for child in chapter["child"])
            chapters.append({"header": head, "start_page": 0, "text": text, "images": []})
    return chapters
import Check, ParsedText from main.checker import check from main.parser import parse from main.check_packs import BASE_PACKS from root_logger import get_root_logger +from tesseract_tasks import update_tesseract_criteria_result config = ConfigParser() config.read('app/config.ini') @@ -52,9 +54,18 @@ def create_task(self, check_info): original_filepath = join(FILES_FOLDER, f"{check_id}.{check_obj.filename.rsplit('.', 1)[-1]}") pdf_filepath = join(FILES_FOLDER, f"{check_id}.pdf") try: - updated_check = check(parse(original_filepath, pdf_filepath), check_obj) - updated_check.is_ended = True + parsed_file_object = parse(original_filepath, pdf_filepath, check_id) + parsed_file_object.make_chapters(check_obj.file_type['report_type']) + parsed_file_object.make_headers(check_obj.file_type['report_type']) + chapters = parse_file.parse_chapters(parsed_file_object) + + updated_check = check(parsed_file_object, check_obj) updated_check.is_failed = False + parsed_text = ParsedText(dict(filename=check_info['filename'])) + parsed_text.parsed_chapters = parse_file.parse_headers_and_pages_and_images(chapters, parsed_file_object) + db_methods.add_parsed_text(check_id, parsed_text) + if db_methods.get_celery_tesseract_task_status_by_check(check_id): + update_tesseract_criteria_result(updated_check) db_methods.update_check(updated_check) # save to db db_methods.mark_celery_task_as_finished(self.request.id) diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py new file mode 100644 index 00000000..e6175243 --- /dev/null +++ b/app/tesseract_tasks.py @@ -0,0 +1,149 @@ +import os +import time +from celery import Celery +from celery.exceptions import SoftTimeLimitExceeded, MaxRetriesExceededError +import pytesseract +import cv2 +import numpy as np +from root_logger import get_root_logger +from db import db_methods +import re +from bson import ObjectId +from main.check_packs.pack_config import BASE_REPORT_CRITERION + +TASK_RETRY_COUNTDOWN = 30 +SOFT_TIME_LIMIT_FOR_CALLBACK = 30 
MAX_RETRIES = 1
TASK_SOFT_TIME_LIMIT = 120

logger = get_root_logger('tesseract_tasks')

celery = Celery(__name__)
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://redis:6379")
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://redis:6379")

celery.conf.timezone = 'Europe/Moscow'

TESSERACT_CONFIG = {
    'lang': 'rus+eng',
    'config': '--psm 6',
}


@celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True, max_retries=MAX_RETRIES, soft_time_limit=TASK_SOFT_TIME_LIMIT)
def tesseract_recognize(self, check_id, symbols_set, max_symbols_percentage, max_text_density):
    """OCR every stored image of a check, save the text and evaluate it.

    Images that already have text are not recognized again. On a soft time
    limit or any other error the task is retried; once retries are
    exhausted a failing result is recorded via ``add_tesseract_result``.
    """
    try:
        images = db_methods.get_images(check_id)
        # NOTE(review): the original nesting was lost in the collapsed diff;
        # the evaluation step is assumed to run only when images exist.
        if images:
            for image in images:
                image_array = np.frombuffer(image.image_data, dtype=np.uint8)
                img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
                if img_cv is None:
                    raise ValueError(f"Не удалось декодировать изображение с подписью '{image.caption}' из двоичных данных")
                text = image.text
                if not text:
                    text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
                    if text.strip():
                        logger.info(f"Текст успешно распознан для изображения с подписью '{image.caption}'")
                    else:
                        logger.info(f"Текст для изображения с подписью '{image.caption}' пустой.")
                try:
                    # Whitespace runs are collapsed before the text is stored.
                    db_methods.add_image_text(image._id, (re.sub(r'\s+', ' ', text)).strip())
                except Exception as e:
                    raise ValueError(f"Ошибка при сохранении текста для изображения с подписью '{image.caption}': {e}") from e
            try:
                update_ImageTextCheck(check_id, symbols_set, max_symbols_percentage, max_text_density)
            except Exception as e:
                raise ValueError(f"Ошибка во время проверки текста: {e}") from e
    except SoftTimeLimitExceeded:
        logger.warning(f"Превышен мягкий лимит времени для check_id: {check_id}. Задача будет перезапущена.")
        try:
            self.retry(countdown=TASK_RETRY_COUNTDOWN)
        except MaxRetriesExceededError:
            logger.error(f"Достигнут лимит повторных попыток для check_id: {check_id}")
            add_tesseract_result(check_id, [["Превышен лимит времени и попыток"], 0])
    except Exception as e:
        logger.error(f"Ошибка при распознавании текста для check_id: {check_id}: {e}", exc_info=True)
        try:
            self.retry(countdown=TASK_RETRY_COUNTDOWN)
        except MaxRetriesExceededError:
            logger.error(f"Достигнут лимит повторных попыток для check_id: {check_id}")
            add_tesseract_result(check_id, [[f"Ошибка при распознавании текста: {e}"], 0])


@celery.task(name="callback_task", queue='callback-queue', soft_time_limit=SOFT_TIME_LIMIT_FOR_CALLBACK)
def callback_task(result, check_id):
    """Apply the OCR verdict to the check if the main task has already finished.

    Linked to ``tesseract_recognize`` for both success and failure. Waits ten
    seconds first, then updates the check only when the main check task is
    finished and the check is not marked as ended yet.
    """
    try:
        time.sleep(10)
        check = db_methods.get_check(ObjectId(check_id))
        if db_methods.get_celery_task_status_by_check(ObjectId(check_id)):
            if check.is_ended:
                logger.info(f"Проверка успешно завершена для check_id: {check_id}")
                return
            update_tesseract_criteria_result(check)
            db_methods.update_check(check)
            logger.info(f"Проверка успешно обновлена для check_id: {check_id}")
            return
        else:
            logger.info(f"Задачи create_task и tesseract_recognize для check_id: {check_id} обрабатываются корректно. Состояние гонки исключено.")
            return
    except SoftTimeLimitExceeded:
        logger.warning(f"Превышен мягкий лимит времени для callback_task с check_id: {check_id}.")
    except Exception as e:
        logger.error(f"Ошибка в callback_task для check_id: {check_id}: {e}")


def update_ImageTextCheck(check_id, symbols_set, max_symbols_percentage, max_text_density):
    """Evaluate the recognized text of all images of a check and record the result.

    An image is reported when its text density (non-space characters per unit
    of image area) or its percentage of characters from ``symbols_set``
    exceeds the given maximum. Images without text are skipped; the density
    test is skipped for images whose size is unknown.
    """
    images = db_methods.get_images(check_id)
    deny_list = []
    for image in images:
        if not image.text:
            continue
        size = image.image_size
        # The size may be missing or hold None when the document gave no
        # extent; multiplying it would raise TypeError.
        if size and None not in size:
            width, height = size
            text_density = calculate_text_density(image.text, width * height)
            if text_density > max_text_density:
                deny_list.append(
                    f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
                    f"{text_density:.2f} (максимум {max_text_density:.2f}). Это может означать, что текст нечитаем.\n"
                )
        symbols_count = count_symbols_in_text(image.text, symbols_set)
        text_length = len(image.text)
        symbols_percentage = (symbols_count / text_length) * 100
        if symbols_percentage > max_symbols_percentage:
            deny_list.append(
                f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
                f"{symbols_percentage:.2f}% (максимум {max_symbols_percentage:.2f}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.\n"
            )
    if deny_list:
        result = [[f'Проблемы с текстом на изображениях!\n{"".join(deny_list)}'], 0]
    else:
        result = [['Текст на изображениях корректен!'], 1]
    add_tesseract_result(check_id, result)


def add_tesseract_result(check_id, result):
    """Mark the Tesseract task as finished and, if possible, update the check.

    ``result`` is a ``[verdict, score]`` pair. The check is updated only when
    its main Celery task is already recorded as finished.
    """
    updated_check = db_methods.get_check(ObjectId(check_id))
    db_methods.mark_celery_tesseract_task_as_finished_by_check(check_id, result)
    # NOTE(review): callback_task queries this status with ObjectId(check_id),
    # here the raw check_id is used — confirm which type celery_check stores.
    if db_methods.get_celery_task_status_by_check(check_id):
        update_tesseract_criteria_result(updated_check)
        db_methods.update_check(updated_check)


def update_tesseract_criteria_result(check):
    """Copy the stored OCR verdict and score into the check's image_text_check entry.

    The check's total score is reduced by the lost share of this criterion
    (relative to the number of base report criteria) and the check is marked
    as ended.
    """
    tesseract_task = db_methods.get_celery_tesseract_task_by_check(str(check._id))
    for criteria in check.enabled_checks:
        if criteria["id"] == 'image_text_check':
            verdict, score = tesseract_task['tesseract_result'][0], tesseract_task['tesseract_result'][1]
            criteria["verdict"] = verdict
            criteria["score"] = score
            check.score = max(0, round(check.score - (1 - score) / len(BASE_REPORT_CRITERION), 3))
            # NOTE(review): original nesting was lost; is_ended is assumed to
            # be set together with the criterion update.
            check.is_ended = True
            return


def count_symbols_in_text(text, symbols_set):
    """Return how many characters of ``text`` are members of ``symbols_set``."""
    return sum(1 for char in text if char in symbols_set)


def calculate_text_density(text, image_area):
    """Return non-whitespace characters of ``text`` per unit of ``image_area`` (0 for zero area)."""
    if image_area == 0:
        return 0
    return len(''.join(text.split())) / image_area
${CONTAINER_CPU:-0-1} + mem_limit: ${WORKER_MEMORY:-1G} volumes: flower_data: diff --git a/requirements.txt b/requirements.txt index 8710f80b..9228afdf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,5 @@ filetype==1.2.0 language-tool-python==2.8.1 markdown==3.4.4 md2pdf==1.0.1 +opencv-python==4.5.5.64 +pytesseract==0.3.10 \ No newline at end of file