diff --git a/Dockerfile b/Dockerfile
index 00795845..a4182c9e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,10 +7,10 @@ RUN npm install && npm install webpack
ADD ./assets ./assets
RUN npm run build
-FROM dvivanov/dis-base:v0.4
+FROM dvivanov/dis-base:v0.5
LABEL project='dis'
-LABEL version='0.4'
+LABEL version='0.5'
WORKDIR /usr/src/project
diff --git a/Dockerfile_base b/Dockerfile_base
index 0f196f92..d1724abe 100644
--- a/Dockerfile_base
+++ b/Dockerfile_base
@@ -1,14 +1,19 @@
FROM python:3.10-slim-bullseye
LABEL project='dis'
-LABEL version='0.4-base'
+LABEL version='0.5-base'
ENV LANG en_US.UTF-8
ENV TZ=Europe/Moscow
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-RUN apt update && apt install -y libreoffice-writer libreoffice-impress default-jre
+# Index refresh, install and cleanup share one layer so the apt lists never
+# end up in the image. tesseract-ocr-rus supplies the 'rus' language data that
+# app/tesseract_tasks.py requests via lang='rus+eng'.
+RUN apt-get update && apt-get install -y \
+    default-jre \
+    libreoffice-impress \
+    libreoffice-writer \
+    tesseract-ocr \
+    tesseract-ocr-rus \
+ && rm -rf /var/lib/apt/lists/*
ADD requirements.txt .
RUN python3 -m pip install -r requirements.txt --no-cache-dir
diff --git a/app/db/db_methods.py b/app/db/db_methods.py
index 215c6e4c..3b797104 100644
--- a/app/db/db_methods.py
+++ b/app/db/db_methods.py
@@ -7,7 +7,7 @@
from pymongo import MongoClient
from utils import convert_to
-from .db_types import User, Presentation, Check, Consumers, Logs
+from .db_types import User, Presentation, Check, Consumers, Logs, Image
client = MongoClient("mongodb://mongodb:27017")
db = client['pres-parser-db']
@@ -18,14 +18,62 @@
checks_collection = db['checks']
consumers_collection = db['consumers']
criteria_pack_collection = db['criteria_pack']
+parsed_texts_collection = db['parsed_texts']
logs_collection = db.create_collection(
'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
celery_check_collection = db['celery_check'] # collection for mapping celery_task to check
+celery_tesseract_collection = db['celery_tesseract']  # one record per tesseract celery task, keyed by check_id
+images_collection = db['images']  # collection that stores the extracted images
def get_client():
return client
+def get_image(image_id):
+    """Fetch one stored image by its ``_id``.
+
+    Returns an ``Image`` built from the matching document, or ``None`` when
+    the ``images`` collection holds no document with that id.
+    """
+    # find_one() yields the document itself or None. find() returns a cursor,
+    # which is never None and is not a dictionary Image(...) can read.
+    document = images_collection.find_one({'_id': image_id})
+    if document is None:
+        return None
+    return Image(document)
+
+def get_images(check_id):
+    """Return every image stored for a check as a list of ``Image`` objects.
+
+    ``check_id`` is converted with ``str`` before querying, which matches how
+    ``Image.pack`` writes the field. The list is empty when nothing is stored.
+    """
+    # find() always returns a cursor (never None), so the former
+    # "else: return None" branch could not be reached and has been dropped.
+    cursor = images_collection.find({'check_id': str(check_id)})
+    return [Image(document) for document in cursor]
+
+def save_image_to_db(check_id, image_data, caption, image_size=None, text=None, page=None):
+    """Insert one extracted image into the ``images`` collection.
+
+    Parameters:
+        check_id: id of the check the image belongs to.
+        image_data: raw bytes of the image file.
+        caption: caption text associated with the image.
+        image_size: optional size of the image; the DOCX extractor passes a
+            ``(width_cm, height_cm)`` tuple. It defaults to ``None`` because
+            the PPTX extractor calls this function with three arguments only,
+            which raised TypeError while the parameter was required.
+        text: optional text already recognised on the image.
+        page: optional page the image was found on.
+
+    Returns the ``_id`` of the inserted document.
+    """
+    image = Image({
+        'check_id': check_id,
+        'image_data': image_data,
+        'caption': caption,
+        'image_size': image_size,
+        'text': text,
+        'page': page,
+    })
+    result = images_collection.insert_one(image.pack())
+    return result.inserted_id
+
+def update_image(image):
+    """Replace the stored document of ``image``; True when a document with its ``_id`` existed."""
+    previous = images_collection.find_one_and_replace({'_id': image._id}, image.pack())
+    return bool(previous)
+
+def add_image_text(image_id, new_text):
+    """Set the ``text`` field of an image; True when a document with that id was matched."""
+    selector = {'_id': image_id}
+    change = {'$set': {'text': new_text}}
+    outcome = images_collection.update_one(selector, change)
+    return outcome.matched_count > 0
+
+def add_image_page(image_id, page):
+    """Set the ``page`` field of an image; True when a document with that id was matched."""
+    selector = {'_id': image_id}
+    change = {'$set': {'page': page}}
+    outcome = images_collection.update_one(selector, change)
+    return outcome.matched_count > 0
# Returns user if user was created and None if already exists
def add_user(username, password_hash='', is_LTI=False):
@@ -145,6 +193,12 @@ def add_check(file_id, check):
def update_check(check):
return bool(checks_collection.find_one_and_replace({'_id': check._id}, check.pack()))
+def add_parsed_text(check_id, parsed_text):
+    """Upsert ``parsed_text`` by filename and link it to a ``files_info`` document.
+
+    The packed object is written into ``parsed_texts`` under its ``filename``;
+    the id of that document is then pushed onto the ``parsed_texts`` array of
+    the ``files_info`` document whose ``_id`` equals ``check_id`` and returned.
+    """
+    # NOTE(review): presumably check_id has the same type as files_info _id values - confirm with the caller.
+    selector = {'filename': parsed_text.filename}
+    outcome = parsed_texts_collection.update_one(selector, {'$set': parsed_text.pack()}, upsert=True)
+    parsed_texts_id = outcome.upserted_id
+    if not parsed_texts_id:
+        # Nothing was inserted, so the document already existed: look its id up.
+        parsed_texts_id = parsed_texts_collection.find_one(selector)['_id']
+    files_info_collection.update_one({'_id': check_id}, {"$push": {'parsed_texts': parsed_texts_id}})
+    return parsed_texts_id
def write_pdf(filename, filepath):
converted_filepath = convert_to(filepath, target_format='pdf')
@@ -443,3 +497,40 @@ def get_celery_task(celery_task_id):
def get_celery_task_by_check(check_id):
return celery_check_collection.find_one({'check_id': check_id})
+
+
+def get_celery_task_status_by_check(check_id):
+    """True when the celery task recorded for ``check_id`` carries a ``finished_at`` field."""
+    record = get_celery_task_by_check(check_id)
+    if not record:
+        return False
+    return 'finished_at' in record
+
+
+def add_celery_tesseract_task(celery_tesseract_task_id, check_id):
+    """Record a started tesseract task for ``check_id`` and return the new document id."""
+    record = {
+        'celery_tesseract_task_id': celery_tesseract_task_id,
+        'check_id': check_id,
+        'started_at': datetime.now(),
+    }
+    return celery_tesseract_collection.insert_one(record).inserted_id
+
+
+def get_celery_tesseract_task_status_by_check(check_id):
+    """True when the tesseract task recorded for ``check_id`` carries a ``finished_at`` field."""
+    record = get_celery_tesseract_task_by_check(check_id)
+    if not record:
+        return False
+    return 'finished_at' in record
+
+
+def mark_celery_tesseract_task_as_finished_by_check(check_id, tesseract_result, finished_time=None):
+    """Store the result of the tesseract task of ``check_id`` and mark it finished.
+
+    Does nothing (returns ``None``) when no task record exists for the check.
+    ``finished_time`` defaults to the current time; ``processing_time`` is the
+    number of seconds between the recorded ``started_at`` and that moment.
+    Returns the result of the ``update_one`` call.
+    """
+    record = get_celery_tesseract_task_by_check(check_id)
+    if not record:
+        return
+    finished_time = datetime.now() if finished_time is None else finished_time
+    elapsed_seconds = (finished_time - record['started_at']).total_seconds()
+    fields = {
+        'finished_at': finished_time,
+        'tesseract_result': tesseract_result,
+        'processing_time': elapsed_seconds,
+    }
+    return celery_tesseract_collection.update_one({'check_id': check_id}, {'$set': fields})
+
+
+def get_celery_tesseract_task(celery_tesseract_task_id):
+    """Return the tesseract task record with the given celery task id, or ``None``."""
+    query = {'celery_tesseract_task_id': celery_tesseract_task_id}
+    return celery_tesseract_collection.find_one(query)
+
+
+def get_celery_tesseract_task_by_check(check_id):
+    """Return the tesseract task record stored for ``check_id``, or ``None``."""
+    query = {'check_id': check_id}
+    return celery_tesseract_collection.find_one(query)
diff --git a/app/db/db_types.py b/app/db/db_types.py
index 049e5bdc..53d3a07f 100644
--- a/app/db/db_types.py
+++ b/app/db/db_types.py
@@ -150,3 +150,31 @@ def none_to_false(x):
is_ended = none_to_true(self.is_ended) # None for old checks => True, True->True, False->False
is_failed = none_to_false(self.is_failed) # None for old checks => False, True->True, False->False
return {'is_ended': is_ended, 'is_failed': is_failed}
+
+class Image(PackableWithId):
+    """Database model of one image extracted from a checked document."""
+
+    def __init__(self, dictionary=None):
+        super().__init__(dictionary)
+        fields = dictionary or {}
+        self.check_id = fields.get('check_id')      # check the image is bound to
+        self.caption = fields.get('caption', '')    # caption of the image
+        self.image_data = fields.get('image_data')  # image file as binary data
+        self.image_size = fields.get('image_size')  # image size in centimetres
+        self.text = fields.get('text')              # text recognised on the image, if any
+        self.page = fields.get('page')              # page the image was found on, if known
+
+    def pack(self):
+        """Return the dictionary form of the image; ``check_id`` is stored as a string."""
+        package = super().pack()
+        package.update({
+            'check_id': str(self.check_id),
+            'caption': self.caption,
+            'image_data': self.image_data,
+            'image_size': self.image_size,
+            'text': self.text,
+            'page': self.page,
+        })
+        return package
+
+class ParsedText(PackableWithId):
+    """Database model holding the parsed chapters of one uploaded file."""
+
+    def __init__(self, dictionary=None):
+        super().__init__(dictionary)
+        fields = dictionary or {}
+        # Name of the file the chapters were parsed from.
+        self.filename = fields.get('filename', '')
+        # Chapter dictionaries for this file.
+        self.parsed_chapters = fields.get('parsed_chapters', [])
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index dea098c2..89b1af43 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -22,6 +22,8 @@
]
BASE_REPORT_CRITERION = [
["simple_check"],
+ ["image_text_check"],
+ ['image_quality_check'],
["banned_words_in_literature"],
["page_counter"],
["image_share_check"],
diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
index a8b11c0e..c18a5c12 100644
--- a/app/main/checks/report_checks/__init__.py
+++ b/app/main/checks/report_checks/__init__.py
@@ -25,6 +25,8 @@
from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
from .template_name import ReportTemplateNameCheck
from .empty_task_page_check import EmptyTaskPageCheck
+from .image_text_check import ImageTextCheck
+from .image_quality_check import ImageQualityCheck
from .water_in_the_text_check import WaterInTheTextCheck
from .sw_section_banned_words import SWSectionBannedWordsCheck
from .sw_section_lit_reference import SWSectionLiteratureReferenceCheck
diff --git a/app/main/checks/report_checks/image_quality_check.py b/app/main/checks/report_checks/image_quality_check.py
new file mode 100644
index 00000000..d069fe94
--- /dev/null
+++ b/app/main/checks/report_checks/image_quality_check.py
@@ -0,0 +1,54 @@
+from ..base_check import BaseReportCriterion, answer
+import cv2
+import numpy as np
+
+class ImageQualityCheck(BaseReportCriterion):
+    """Report criterion that flags images which look blurred or nearly uniform.
+
+    Two scores are computed on the grayscale image: the variance of its
+    Laplacian and the entropy of its 256-bin intensity histogram. An image
+    fails when either score is below the configured minimum.
+    """
+    label = "Проверка качества изображений"
+    description = ''
+    id = 'image_quality_check'
+
+    # NOTE(review): the default min_laplacian and min_entropy still need tuning.
+    def __init__(self, file_info, min_laplacian=100, min_entropy=1):
+        super().__init__(file_info)
+        self.images = self.file.images
+        self.min_laplacian = min_laplacian
+        self.min_entropy = min_entropy
+        self.laplacian_score = None
+        self.entropy_score = None
+
+    def check(self):
+        """Score every image of the file and return the criterion answer."""
+        # NOTE(review): the line breaks inside the messages below are written as "\n" because the
+        # reviewed text showed the literals split across lines; confirm whether markup such as <br> was intended.
+        if not self.images:
+            return answer(True, 'Изображения не найдены!')
+        deny_list = []
+        for img in self.images:
+            img_cv = None
+            if img.image_data:
+                # Missing or empty bytes are skipped here and reported as unprocessable below.
+                image_array = np.frombuffer(img.image_data, dtype=np.uint8)
+                img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+
+            # find_params() clears both scores first, so values from the
+            # previous image can never be reused for this one.
+            laplacian_score, entropy_score = self.find_params(img_cv)
+            if laplacian_score is None or entropy_score is None:
+                deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.\n")
+                continue
+
+            if laplacian_score < self.min_laplacian:
+                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {laplacian_score:.2f} (минимум {self.min_laplacian:.2f}).\n")
+
+            if entropy_score < self.min_entropy:
+                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {entropy_score:.2f} (минимум {self.min_entropy:.2f}).\n")
+        if deny_list:
+            return answer(False, f'Изображения нечитаемы!\nПопробуйте улучшить качество изображений, возможно они слишком размыты или зашумлены.\n{"".join(deny_list)}')
+        return answer(True, 'Изображения корректны!')
+
+    def find_params(self, image):
+        """Compute and store the Laplacian-variance and entropy scores of ``image``.
+
+        Returns ``(laplacian_score, entropy_score)``. Both are ``None``, and the
+        stored attributes are cleared, when ``image`` is ``None`` or empty.
+        """
+        self.laplacian_score = None
+        self.entropy_score = None
+        if image is None or image.size == 0:
+            return None, None
+        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()
+        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
+        hist = hist / hist.sum()
+        # The small epsilon keeps log2 finite for empty histogram bins.
+        self.entropy_score = -np.sum(hist * np.log2(hist + 1e-10))
+        return self.laplacian_score, self.entropy_score
\ No newline at end of file
diff --git a/app/main/checks/report_checks/image_text_check.py b/app/main/checks/report_checks/image_text_check.py
new file mode 100644
index 00000000..0c5add85
--- /dev/null
+++ b/app/main/checks/report_checks/image_text_check.py
@@ -0,0 +1,28 @@
+from ..base_check import BaseReportCriterion, answer
+
+
+class ImageTextCheck(BaseReportCriterion):
+    """Report criterion that schedules OCR-based checking of the text on images.
+
+    ``check`` only dispatches the ``tesseract_recognize`` celery task and
+    answers immediately; the task writes the real verdict later.
+    """
+    label = "Проверка текста, считанного с изображений"
+    description = ''
+    id = 'image_text_check'
+
+    # NOTE(review): symbols_set, max_symbols_percentage and max_text_density still need tuning.
+    # The default is a tuple so that no mutable object is shared between instances.
+    def __init__(self, file_info, symbols_set=('%', '1'), max_symbols_percentage=0, max_text_density=4):
+        super().__init__(file_info)
+        self.images = self.file.images
+        self.symbols_set = list(symbols_set)
+        self.max_symbols_percentage = max_symbols_percentage
+        self.max_text_density = max_text_density
+
+    def check(self):
+        """Dispatch the OCR task for the images of this file and return a provisional answer."""
+        from uuid import uuid4
+        from app.tesseract_tasks import tesseract_recognize, callback_task
+        from db.db_methods import add_celery_tesseract_task
+        if not self.images:
+            return answer(True, 'Изображения не найдены!')
+        check_id = self.images[0].check_id
+        # The bookkeeping record is created before the task is published. The
+        # worker marks the task finished by looking this record up and skips the
+        # update when it is missing, so a fast worker could otherwise lose its result.
+        task_id = str(uuid4())
+        add_celery_tesseract_task(task_id, check_id)
+        tesseract_recognize.apply_async(
+            args=[check_id, self.symbols_set, self.max_symbols_percentage, self.max_text_density],
+            task_id=task_id,
+            link=callback_task.s(check_id),
+            link_error=callback_task.s(check_id)
+        )
+        return answer(True, 'Изображения проверяются!')
diff --git a/app/main/parser.py b/app/main/parser.py
index 593b8cfd..dcb33b31 100644
--- a/app/main/parser.py
+++ b/app/main/parser.py
@@ -8,10 +8,11 @@
from main.reports.md_uploader import MdUploader
from utils import convert_to
+
logger = logging.getLogger('root_logger')
+def parse(filepath, pdf_filepath, check_id):
-def parse(filepath, pdf_filepath):
tmp_filepath = filepath.lower()
try:
if tmp_filepath.endswith(('.odp', '.ppt', '.pptx')):
@@ -19,7 +20,12 @@ def parse(filepath, pdf_filepath):
if tmp_filepath.endswith(('.odp', '.ppt')):
logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
new_filepath = convert_to(filepath, target_format='pptx')
- file_object = PresentationPPTX(new_filepath)
+
+ presentation = PresentationPPTX(new_filepath)
+ presentation.extract_images_with_captions(check_id)
+ file_object = presentation
+
+
elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
new_filepath = filepath
if tmp_filepath.endswith(('.doc', '.odt')):
@@ -29,6 +35,7 @@ def parse(filepath, pdf_filepath):
docx = DocxUploader()
docx.upload(new_filepath, pdf_filepath)
docx.parse()
+ docx.extract_images_with_captions(check_id)
file_object = docx
elif tmp_filepath.endswith('.md' ):
@@ -54,4 +61,4 @@ def save_to_temp_file(file):
temp_file.write(file.read())
temp_file.close()
file.seek(0)
- return temp_file.name
+ return temp_file.name
\ No newline at end of file
diff --git a/app/main/presentations/pptx/presentation_pptx.py b/app/main/presentations/pptx/presentation_pptx.py
index dd909f8c..a8b8581f 100644
--- a/app/main/presentations/pptx/presentation_pptx.py
+++ b/app/main/presentations/pptx/presentation_pptx.py
@@ -1,4 +1,7 @@
+from io import BytesIO
+
from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE
from .slide_pptx import SlidePPTX
from ..presentation_basic import PresentationBasic
@@ -17,3 +20,39 @@ def add_slides(self):
def __str__(self):
return super().__str__()
+
+    def extract_images_with_captions(self, check_id):
+        """Store every picture of the presentation together with a caption.
+
+        For each slide that contains picture shapes, the first non-empty text
+        frame on that slide is used as the caption of all its pictures. Slides
+        without any non-empty text are skipped, so nothing is stored for them.
+        Each picture is saved with its size as ``(width_cm, height_cm)``, or
+        ``None`` when the shape does not report a size.
+        """
+        from app.db.db_methods import save_image_to_db
+
+        emu_per_cm = 360000  # shape.width / shape.height are expressed in EMU
+
+        for slide in self.slides:
+            shapes = list(slide.slide.shapes)  # slide.slide is the underlying python-pptx slide
+            pictures = [shape for shape in shapes if shape.shape_type == MSO_SHAPE_TYPE.PICTURE]
+            if not pictures:
+                continue
+
+            # The first non-empty text frame is presumed to be the caption.
+            caption_text = None
+            for shape in shapes:
+                if not shape.has_text_frame:
+                    continue
+                text = shape.text.strip()
+                if text:
+                    caption_text = text
+                    break
+            if caption_text is None:
+                continue
+
+            # Every picture is saved; earlier only the last one on a slide survived.
+            for picture in pictures:
+                image_size = None
+                if picture.width is not None and picture.height is not None:
+                    image_size = (picture.width / emu_per_cm, picture.height / emu_per_cm)
+                save_image_to_db(check_id, picture.image.blob, caption_text, image_size)
diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py
index d0653fae..8a6a7303 100644
--- a/app/main/reports/document_uploader.py
+++ b/app/main/reports/document_uploader.py
@@ -12,6 +12,7 @@ def __init__(self):
self.literature_page = 0
self.first_lines = []
self.page_count = 0
+ self.images = []
@abstractmethod
def upload(self):
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
index ac30dee4..1c52295c 100644
--- a/app/main/reports/docx_uploader/docx_uploader.py
+++ b/app/main/reports/docx_uploader/docx_uploader.py
@@ -12,6 +12,7 @@
from ..document_uploader import DocumentUploader
+
class DocxUploader(DocumentUploader):
def __init__(self):
super().__init__()
@@ -242,6 +243,52 @@ def show_chapters(self, work_type):
chapters_str += " " + header["text"] + "
"
return chapters_str
+    def extract_images_with_captions(self, check_id):
+        """Store every embedded image of the document with its caption and size.
+
+        Does nothing when ``self.images`` is already populated. Otherwise each
+        paragraph is scanned for embedded images. The caption is the first
+        following paragraph that uses the caption style and contains
+        'Рисунок'; the search stops at the next paragraph that holds a graphic,
+        and the caption falls back to "picture without caption". Sizes are
+        stored as ``(width_cm, height_cm)``; both are ``None`` when the run has
+        no extent element. Finally ``self.images`` is loaded from the database.
+        """
+        # NOTE(review): the nesting of the final reload was reconstructed from a whitespace-collapsed view -
+        # confirm that it should run only after an extraction.
+        from app.db.db_methods import save_image_to_db, get_images
+
+        if self.images:
+            return
+
+        emu_to_cm = 360000
+        image_style = "ВКР_Подпись для рисунков".lower()
+        blip_namespaces = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
+        extent_namespaces = {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}
+        embed_attribute = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'
+
+        # Read the paragraph list once instead of on every access inside the loops.
+        paragraphs = self.file.paragraphs
+        for i, paragraph in enumerate(paragraphs):
+            # Collect every image of the paragraph; earlier only the last one was kept.
+            found_images = []
+            for run in paragraph.runs:
+                if "graphic" not in run._element.xml:
+                    continue
+                extent = run._element.find('.//wp:extent', namespaces=extent_namespaces)
+                width_cm = height_cm = None
+                if extent is not None:
+                    width_cm = int(extent.get('cx')) / emu_to_cm
+                    height_cm = int(extent.get('cy')) / emu_to_cm
+                for image_stream in run._element.findall('.//a:blip', namespaces=blip_namespaces):
+                    embed_id = image_stream.get(embed_attribute)
+                    if embed_id:
+                        image_part = self.file.part.related_parts[embed_id]
+                        found_images.append((image_part.blob, (width_cm, height_cm)))
+            if not found_images:
+                continue
+
+            caption = "picture without caption"
+            for next_paragraph in paragraphs[i + 1:]:
+                style_name = next_paragraph.style.name.lower()
+                next_text = next_paragraph.text.strip()
+                if any("graphic" in r._element.xml for r in next_paragraph.runs):
+                    break
+                if next_text and style_name == image_style and 'Рисунок' in next_text:
+                    caption = next_text
+                    break
+
+            for image_data, image_size in found_images:
+                save_image_to_db(check_id, image_data, caption, image_size)
+
+        self.images = get_images(check_id)
+
+
def main(args):
file = args.file
diff --git a/app/main/reports/pasre_file/parse_file.py b/app/main/reports/pasre_file/parse_file.py
new file mode 100644
index 00000000..d6f272f6
--- /dev/null
+++ b/app/main/reports/pasre_file/parse_file.py
@@ -0,0 +1,36 @@
+import re
+from db import db_methods
+
+def parse_headers_and_pages_and_images(chapters, docx):
+    """Fill in chapter start pages and attach images to pages and chapters.
+
+    Parameters:
+        chapters: list of chapter dictionaries with the keys ``header``,
+            ``start_page``, ``text`` and ``images``; it is updated in place.
+        docx: parsed document exposing ``pdf_file.get_text_on_page()`` and ``images``.
+
+    For every page except those containing 'СОДЕРЖАНИЕ', a chapter whose header
+    occurs in the page text gets that page as ``start_page``, and an image
+    whose caption occurs there gets the page stored in the database. After
+    that, each chapter receives the ids of the images whose caption occurs in
+    the chapter text. Returns ``chapters``.
+    """
+    # An empty caption is a substring of every text, so such images would be
+    # attached to every page and every chapter; they are left out instead.
+    captioned_images = [image for image in docx.images if image.caption]
+    for page, text in docx.pdf_file.get_text_on_page().items():
+        text = re.sub(r"(-\n)", "", text)   # re-join words hyphenated at a line end
+        text = re.sub(r"\s\n", " ", text)   # whitespace followed by a line break becomes one space
+        if "СОДЕРЖАНИЕ" in text:
+            continue
+        for chapter in chapters:
+            if chapter["header"] in text:
+                chapter["start_page"] = page
+        for image in captioned_images:
+            if image.caption in text:
+                db_methods.add_image_page(image._id, page)
+    for chapter in chapters:
+        chapter["images"].extend(
+            image._id for image in captioned_images if image.caption in chapter["text"]
+        )
+    return chapters
+
+
+def parse_chapters(docx):
+    """Build the chapter dictionaries for the heading chapters of ``docx``.
+
+    Only chapters that have children and whose style contains "heading" are
+    kept. The header of an appendix ('ПРИЛОЖЕНИЕ') is cut at the first dot, and
+    ``text`` is the concatenation of the texts of all children.
+    """
+    parsed = []
+    for chapter in docx.chapters:
+        title = chapter["styled_text"]["text"]
+        if "ПРИЛОЖЕНИЕ" in title:
+            title = title.split(".")[0]
+        children = chapter["child"]
+        if children == [] or "heading" not in chapter["style"]:
+            continue
+        body = "".join(child["styled_text"]["text"] for child in children)
+        parsed.append({"header": title, "start_page": 0, "text": body, "images": []})
+    return parsed
\ No newline at end of file
diff --git a/app/server.py b/app/server.py
index 4eef1e9e..a3798178 100644
--- a/app/server.py
+++ b/app/server.py
@@ -179,4 +179,4 @@ def __call__(self, environ, start_response):
logger.info("Сервер запущен по адресу http://" + str(ip) + ':' + str(port) + " в " +
("отладочном" if DEBUG else "рабочем") + " режиме")
utils.create_consumers(app.config['LTI_CONSUMERS'])
- app.run(debug=DEBUG, host=ip, port=8080, use_reloader=True)
+ app.run(debug=DEBUG, host=ip, port=8080, use_reloader=True)
\ No newline at end of file
diff --git a/app/tasks.py b/app/tasks.py
index c7ba47df..e4510ce4 100644
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -6,12 +6,14 @@
from celery.signals import worker_ready
from passback_grades import run_passback
+from main.reports.pasre_file import parse_file
from db import db_methods
-from db.db_types import Check
+from db.db_types import Check, ParsedText
from main.checker import check
from main.parser import parse
from main.check_packs import BASE_PACKS
from root_logger import get_root_logger
+from tesseract_tasks import update_tesseract_criteria_result
config = ConfigParser()
config.read('app/config.ini')
@@ -52,9 +54,18 @@ def create_task(self, check_info):
original_filepath = join(FILES_FOLDER, f"{check_id}.{check_obj.filename.rsplit('.', 1)[-1]}")
pdf_filepath = join(FILES_FOLDER, f"{check_id}.pdf")
try:
- updated_check = check(parse(original_filepath, pdf_filepath), check_obj)
- updated_check.is_ended = True
+ parsed_file_object = parse(original_filepath, pdf_filepath, check_id)
+ parsed_file_object.make_chapters(check_obj.file_type['report_type'])
+ parsed_file_object.make_headers(check_obj.file_type['report_type'])
+ chapters = parse_file.parse_chapters(parsed_file_object)
+
+ updated_check = check(parsed_file_object, check_obj)
updated_check.is_failed = False
+ parsed_text = ParsedText(dict(filename=check_info['filename']))
+ parsed_text.parsed_chapters = parse_file.parse_headers_and_pages_and_images(chapters, parsed_file_object)
+ db_methods.add_parsed_text(check_id, parsed_text)
+ if db_methods.get_celery_tesseract_task_status_by_check(check_id):
+ update_tesseract_criteria_result(updated_check)
db_methods.update_check(updated_check) # save to db
db_methods.mark_celery_task_as_finished(self.request.id)
diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py
new file mode 100644
index 00000000..e6175243
--- /dev/null
+++ b/app/tesseract_tasks.py
@@ -0,0 +1,149 @@
+import os
+import time
+from celery import Celery
+from celery.exceptions import SoftTimeLimitExceeded, MaxRetriesExceededError
+import pytesseract
+import cv2
+import numpy as np
+from root_logger import get_root_logger
+from db import db_methods
+import re
+from bson import ObjectId
+from main.check_packs.pack_config import BASE_REPORT_CRITERION
+
+# Delay, in seconds, passed as ``countdown`` when tesseract_recognize retries.
+TASK_RETRY_COUNTDOWN = 30
+# Soft time limit of callback_task.
+SOFT_TIME_LIMIT_FOR_CALLBACK = 30
+# max_retries of tesseract_recognize.
+MAX_RETRIES = 1
+# Soft time limit of tesseract_recognize.
+TASK_SOFT_TIME_LIMIT = 120
+
+logger = get_root_logger('tesseract_tasks')
+
+# Celery application of this module; broker and result backend come from the
+# environment and fall back to the redis service.
+celery = Celery(__name__)
+celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://redis:6379")
+celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://redis:6379")
+
+celery.conf.timezone = 'Europe/Moscow'
+
+# Keyword arguments expanded into pytesseract.image_to_string():
+# Russian + English language data and Tesseract page segmentation mode 6.
+TESSERACT_CONFIG = {
+    'lang': 'rus+eng',
+    'config': '--psm 6',
+}
+
+@celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True, max_retries=MAX_RETRIES, soft_time_limit=TASK_SOFT_TIME_LIMIT)
+def tesseract_recognize(self, check_id, symbols_set, max_symbols_percentage, max_text_density):
+    """Recognise the text on every stored image of a check and evaluate it.
+
+    Images are loaded with ``db_methods.get_images``. An image that cannot be
+    decoded raises ``ValueError``. An image without stored text is passed to
+    pytesseract and the whitespace-normalised result is saved. The remaining
+    three parameters are forwarded to ``update_ImageTextCheck``.
+
+    On a soft time limit or any other error the task retries after
+    ``TASK_RETRY_COUNTDOWN`` seconds; once retries are exhausted a failing
+    result with score 0 is recorded through ``add_tesseract_result``.
+    """
+    # NOTE(review): block nesting below was reconstructed from a whitespace-collapsed view - confirm
+    # in particular where add_image_text and update_ImageTextCheck sit relative to the loop and `if images`.
+    try:
+        images = db_methods.get_images(check_id)
+        if images:
+            for image in images:
+                image_array = np.frombuffer(image.image_data, dtype=np.uint8)
+                img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
+                if img_cv is None:
+                    raise ValueError(f"Не удалось декодировать изображение с подписью '{image.caption}' из двоичных данных")
+                text = image.text
+                if not text:
+                    # OCR is only run for images that have no stored text yet.
+                    text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
+                    if text.strip():
+                        logger.info(f"Текст успешно распознан для изображения с подписью '{image.caption}'")
+                    else:
+                        logger.info(f"Текст для изображения с подписью '{image.caption}' пустой.")
+                    try:
+                        # Runs of whitespace are collapsed to single spaces before saving.
+                        db_methods.add_image_text(image._id, (re.sub(r'\s+', ' ', text)).strip())
+                    except Exception as e:
+                        raise ValueError(f"Ошибка при сохранении текста для изображения с подписью '{image.caption}': {e}")
+            try:
+                update_ImageTextCheck(check_id, symbols_set, max_symbols_percentage, max_text_density)
+            except Exception as e:
+                raise ValueError(f"Ошибка во время проверки текста: {e}")
+    except SoftTimeLimitExceeded:
+        logger.warning(f"Превышен мягкий лимит времени для check_id: {check_id}. Задача будет перезапущена.")
+        try:
+            self.retry(countdown=TASK_RETRY_COUNTDOWN)
+        except MaxRetriesExceededError:
+            logger.error(f"Достигнут лимит повторных попыток для check_id: {check_id}")
+            add_tesseract_result(check_id, [[f"Превышен лимит времени и попыток"], 0])
+    except Exception as e:
+        logger.error(f"Ошибка при распознавании текста для check_id: {check_id}: {e}", exc_info=True)
+        try:
+            self.retry(countdown=TASK_RETRY_COUNTDOWN)
+        except MaxRetriesExceededError:
+            logger.error(f"Достигнут лимит повторных попыток для check_id: {check_id}")
+            add_tesseract_result(check_id,[[f"Ошибка при распознавании текста: {e}"], 0])
+
+
+@celery.task(name="callback_task", queue='callback-queue', soft_time_limit=SOFT_TIME_LIMIT_FOR_CALLBACK)
+def callback_task(result, check_id):
+    """Follow-up to tesseract_recognize that finalises a check if nobody else did.
+
+    After a ten second pause the check is loaded. When the main celery task of
+    the check is finished but the check is not marked as ended, the tesseract
+    verdict is merged into the check and the check is saved. Errors are logged
+    and never propagated.
+    """
+    try:
+        time.sleep(10)
+        check_oid = ObjectId(check_id)
+        check = db_methods.get_check(check_oid)
+        main_task_finished = db_methods.get_celery_task_status_by_check(check_oid)
+        if not main_task_finished:
+            logger.info(f"Задачи create_task и tesseract_recognize для check_id: {check_id} обрабатываются корректно. Состояние гонки исключено.")
+            return
+        if check.is_ended:
+            logger.info(f"Проверка успешно завершена для check_id: {check_id}")
+            return
+        update_tesseract_criteria_result(check)
+        db_methods.update_check(check)
+        logger.info(f"Проверка успешно обновлена для check_id: {check_id}")
+    except SoftTimeLimitExceeded:
+        logger.warning(f"Превышен мягкий лимит времени для callback_task с check_id: {check_id}.")
+    except Exception as error:
+        logger.error(f"Ошибка в callback_task для check_id: {check_id}: {error}")
+
+
+def update_ImageTextCheck(check_id, symbols_set, max_symbols_percentage, max_text_density):
+    """Evaluate the recognised text of every image of a check and record the verdict.
+
+    Parameters:
+        check_id: id of the check whose images are evaluated.
+        symbols_set: characters counted as wrongly recognised symbols.
+        max_symbols_percentage: highest allowed share of such characters, in percent.
+        max_text_density: highest allowed number of non-space characters per unit of image area.
+
+    Images without text are ignored. The result, ``[[message], score]`` with
+    score 1 on success and 0 otherwise, is passed to ``add_tesseract_result``.
+    """
+    # NOTE(review): the line breaks inside the messages below are written as "\n" because the
+    # reviewed text showed the literals split across lines; confirm whether markup such as <br> was intended.
+    deny_list = []
+    for image in db_methods.get_images(check_id):
+        if not image.text:
+            continue
+        # The size is absent for images saved without one and holds None
+        # components when the document gave no extent; the density check
+        # needs both numbers, so it is skipped in those cases.
+        size = image.image_size
+        width, height = size if size else (None, None)
+        if width is not None and height is not None:
+            text_density = calculate_text_density(image.text, width * height)
+            if text_density > max_text_density:
+                deny_list.append(
+                    f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
+                    f"{text_density:.2f} (максимум {max_text_density:.2f}). Это может означать, что текст нечитаем.\n"
+                )
+        symbols_count = count_symbols_in_text(image.text, symbols_set)
+        text_length = len(image.text)  # non-zero: images with empty text were skipped above
+        symbols_percentage = (symbols_count / text_length) * 100
+        if symbols_percentage > max_symbols_percentage:
+            deny_list.append(
+                f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
+                f"{symbols_percentage:.2f}% (максимум {max_symbols_percentage:.2f}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.\n"
+            )
+    if deny_list:
+        result = [[f'Проблемы с текстом на изображениях!\n{"".join(deny_list)}'], 0]
+    else:
+        result = [['Текст на изображениях корректен!'], 1]
+    add_tesseract_result(check_id, result)
+
+
+def add_tesseract_result(check_id, result):
+    """Record the tesseract result and, if the main task is finished, merge it into the check.
+
+    The check is loaded first, then the tesseract task of ``check_id`` is
+    marked finished with ``result``. Only when the main celery task of the
+    check is already finished is the verdict written into the check and saved.
+    """
+    # NOTE(review): get_check receives ObjectId(check_id) while the status lookup receives check_id as given - confirm both match the stored types.
+    check = db_methods.get_check(ObjectId(check_id))
+    db_methods.mark_celery_tesseract_task_as_finished_by_check(check_id, result)
+    if not db_methods.get_celery_task_status_by_check(check_id):
+        return
+    update_tesseract_criteria_result(check)
+    db_methods.update_check(check)
+
+
+def update_tesseract_criteria_result(check):
+    """Copy the stored tesseract verdict into the 'image_text_check' criterion of ``check``.
+
+    The first enabled criterion with that id receives the verdict and score,
+    the total score of the check is reduced by the missing part of the
+    criterion score divided by ``len(BASE_REPORT_CRITERION)`` (never below 0),
+    and the check is marked as ended. Nothing changes when the criterion is
+    not enabled.
+    """
+    tesseract_task = db_methods.get_celery_tesseract_task_by_check(str(check._id))
+    target = next((item for item in check.enabled_checks if item["id"] == 'image_text_check'), None)
+    if target is None:
+        return
+    target["verdict"] = tesseract_task['tesseract_result'][0]
+    target["score"] = tesseract_task['tesseract_result'][1]
+    penalty = (1 - tesseract_task['tesseract_result'][1]) / len(BASE_REPORT_CRITERION)
+    check.score = max(0, round(check.score - penalty, 3))
+    check.is_ended = True
+
+
+def count_symbols_in_text(text, symbols_set):
+    """Count the characters of ``text`` that are members of ``symbols_set``."""
+    matches = 0
+    for char in text:
+        if char in symbols_set:
+            matches += 1
+    return matches
+
+
+def calculate_text_density(text, image_area):
+    """Return the number of non-whitespace characters of ``text`` per unit of ``image_area`` (0 for a zero area)."""
+    visible_characters = sum(len(chunk) for chunk in text.split())
+    return 0 if image_area == 0 else visible_characters / image_area
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 310b0b8b..5c1cf6d3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -74,6 +74,23 @@ services:
volumes:
- ../slides_checker_mongo_data:/data/db
cpuset: ${CONTAINER_CPU:-0-1}
+
+  # Celery worker for the OCR tasks defined in app/tesseract_tasks.py.
+  tesseract_worker:
+    image: document_insight_system_image
+    restart: always
+    # Runs the worker under the node name tesseract@worker; it consumes both tesseract-queue and callback-queue.
+    command: celery --app=app.tesseract_tasks.celery worker -n tesseract@worker -Q tesseract-queue,callback-queue --loglevel=info
+    environment:
+      # Both variables are read by app/tesseract_tasks.py when it configures Celery.
+      - CELERY_BROKER_URL=${REDIS_URL}
+      - CELERY_RESULT_BACKEND=${REDIS_URL}
+    depends_on:
+      - redis
+      - mongodb
+    volumes:
+      - presentation_files:/usr/src/project/files/
+      # Host timezone files, mounted read-only.
+      - "/etc/timezone:/etc/timezone:ro"
+      - "/etc/localtime:/etc/localtime:ro"
+    cpuset: ${CONTAINER_CPU:-0-1}
+    mem_limit: ${WORKER_MEMORY:-1G}
volumes:
flower_data:
diff --git a/requirements.txt b/requirements.txt
index 8710f80b..9228afdf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,3 +35,5 @@ filetype==1.2.0
language-tool-python==2.8.1
markdown==3.4.4
md2pdf==1.0.1
+# Headless OpenCV build: the code only decodes and filters images (imdecode,
+# cvtColor, Laplacian), so the GUI libraries pulled in by opencv-python are not needed.
+opencv-python-headless==4.5.5.64
+pytesseract==0.3.10
\ No newline at end of file