Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ RUN npm install && npm install webpack
ADD ./assets ./assets
RUN npm run build

FROM dvivanov/dis-base:v0.4
FROM dvivanov/dis-base:v0.5

LABEL project='dis'
LABEL version='0.4'
LABEL version='0.5'

WORKDIR /usr/src/project

Expand Down
9 changes: 7 additions & 2 deletions Dockerfile_base
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
FROM python:3.10-slim-bullseye

LABEL project='dis'
LABEL version='0.4-base'
LABEL version='0.5-base'

ENV LANG en_US.UTF-8
ENV TZ=Europe/Moscow

RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

RUN apt update && apt install -y libreoffice-writer libreoffice-impress default-jre
RUN apt-get update && apt-get install -y \
libreoffice-writer \
libreoffice-impress \
default-jre \
tesseract-ocr \
tesseract-ocr-rus

ADD requirements.txt .
RUN python3 -m pip install -r requirements.txt --no-cache-dir
93 changes: 92 additions & 1 deletion app/db/db_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pymongo import MongoClient
from utils import convert_to

from .db_types import User, Presentation, Check, Consumers, Logs
from .db_types import User, Presentation, Check, Consumers, Logs, Image

# Module-level MongoDB connection and database handle used by the helpers in this file.
client = MongoClient("mongodb://mongodb:27017")
db = client['pres-parser-db']
Expand All @@ -18,14 +18,62 @@
# Handles to the MongoDB collections used by the helper functions below.
checks_collection = db['checks']
consumers_collection = db['consumers']
criteria_pack_collection = db['criteria_pack']
parsed_texts_collection = db['parsed_texts']
# 'logs' is created as a capped collection (size=5242880) when the truth test
# on db['logs'] is false; otherwise the existing handle is reused.
# NOTE(review): this relies on truth-testing a pymongo Collection object —
# recent PyMongo releases raise NotImplementedError for bool(collection);
# confirm against the pinned driver version.
logs_collection = db.create_collection(
'logs', capped=True, size=5242880) if not db['logs'] else db['logs']
celery_check_collection = db['celery_check'] # collection for mapping celery_task to check
celery_tesseract_collection = db['celery_tesseract']
images_collection = db['images'] # collection for storing images


def get_client():
    """Return the module-level ``client`` object."""
    return client

def get_image(image_id):
    """Fetch a single stored image by its document id.

    Args:
        image_id: value matched against the ``_id`` field of the
            ``images`` collection.

    Returns:
        An ``Image`` built from the stored document, or ``None`` when no
        document has that id.
    """
    # find() returns a cursor (never None), so the previous None check could
    # not fire and Image() was handed a cursor instead of a document.
    # find_one() yields the document itself, or None when nothing matches.
    document = images_collection.find_one({'_id': image_id})
    if document is None:
        return None
    return Image(document)

def get_images(check_id):
    """Return every stored image that belongs to a check.

    Args:
        check_id: id of the check; it is converted with ``str()`` before
            being matched against the ``check_id`` field.

    Returns:
        A list of ``Image`` objects, empty when the check has no images.
    """
    # find() always returns a cursor, so the former "is not None" test was
    # always true and its "return None" branch was unreachable.
    cursor = images_collection.find({'check_id': str(check_id)})
    return [Image(document) for document in cursor]

def save_image_to_db(check_id, image_data, caption, image_size, text=None, page=None):
    """Insert a new image record and return the id of the inserted document.

    The arguments are wrapped in an ``Image`` and its packed form is what
    gets written to the ``images`` collection.
    """
    fields = {
        'check_id': check_id,
        'image_data': image_data,
        'caption': caption,
        'image_size': image_size,
        'text': text,
        'page': page,
    }
    document = Image(fields).pack()
    return images_collection.insert_one(document).inserted_id

def update_image(image):
    """Replace the stored document whose ``_id`` equals ``image._id``.

    Returns the truth value of what ``find_one_and_replace`` handed back.
    """
    previous = images_collection.find_one_and_replace({'_id': image._id}, image.pack())
    return bool(previous)

def add_image_text(image_id, new_text):
    """Set the ``text`` field of one image; return True if a document matched."""
    selector = {'_id': image_id}
    change = {'$set': {'text': new_text}}
    outcome = images_collection.update_one(selector, change)
    return outcome.matched_count > 0

def add_image_page(image_id, page):
    """Set the ``page`` field of one image; return True if a document matched."""
    selector = {'_id': image_id}
    change = {'$set': {'page': page}}
    outcome = images_collection.update_one(selector, change)
    return outcome.matched_count > 0

# Returns user if user was created and None if already exists
def add_user(username, password_hash='', is_LTI=False):
Expand Down Expand Up @@ -145,6 +193,12 @@ def add_check(file_id, check):
def update_check(check):
    """Replace the stored document whose ``_id`` equals ``check._id``.

    Returns the truth value of what ``find_one_and_replace`` handed back.
    """
    previous = checks_collection.find_one_and_replace({'_id': check._id}, check.pack())
    return bool(previous)

def add_parsed_text(check_id, parsed_text):
    """Upsert a parsed text keyed by filename and attach its id to a file record.

    The id of the upserted (or already existing) document is pushed onto the
    ``parsed_texts`` array of the ``files_info`` document whose ``_id`` is
    ``check_id``, and then returned.
    """
    outcome = parsed_texts_collection.update_one(
        {'filename': parsed_text.filename},
        {'$set': parsed_text.pack()},
        upsert=True,
    )
    parsed_texts_id = outcome.upserted_id
    if not parsed_texts_id:
        # Nothing was inserted, so look the existing document up by filename.
        existing = parsed_texts_collection.find_one({'filename': parsed_text.filename})
        parsed_texts_id = existing['_id']
    files_info_collection.update_one({'_id': check_id}, {"$push": {'parsed_texts': parsed_texts_id}})
    return parsed_texts_id

def write_pdf(filename, filepath):
converted_filepath = convert_to(filepath, target_format='pdf')
Expand Down Expand Up @@ -443,3 +497,40 @@ def get_celery_task(celery_task_id):

def get_celery_task_by_check(check_id):
    """Return the ``celery_check`` document for ``check_id`` (``find_one`` result)."""
    query = {'check_id': check_id}
    return celery_check_collection.find_one(query)


def get_celery_task_status_by_check(check_id):
    """Return True when the task record for ``check_id`` has a ``finished_at`` key."""
    task = get_celery_task_by_check(check_id)
    return bool(task) and 'finished_at' in task


def add_celery_tesseract_task(celery_tesseract_task_id, check_id):
    """Record a started tesseract task for a check and return the new document id.

    ``started_at`` is stamped with ``datetime.now()`` at insertion time.
    """
    record = {
        'celery_tesseract_task_id': celery_tesseract_task_id,
        'check_id': check_id,
        'started_at': datetime.now(),
    }
    return celery_tesseract_collection.insert_one(record).inserted_id


def get_celery_tesseract_task_status_by_check(check_id):
    """Return True when the tesseract task record for ``check_id`` has ``finished_at``."""
    task = get_celery_tesseract_task_by_check(check_id)
    return bool(task) and 'finished_at' in task


def mark_celery_tesseract_task_as_finished_by_check(check_id, tesseract_result, finished_time=None):
    """Store the result and timing of a finished tesseract task.

    Does nothing (returns None) when no task record exists for ``check_id``.
    ``finished_time`` defaults to ``datetime.now()``; ``processing_time`` is the
    number of seconds between the record's ``started_at`` and ``finished_time``.
    Returns the ``update_one`` result otherwise.
    """
    task = get_celery_tesseract_task_by_check(check_id)
    if not task:
        return None
    if finished_time is None:
        finished_time = datetime.now()
    elapsed = (finished_time - task['started_at']).total_seconds()
    changes = {
        'finished_at': finished_time,
        'tesseract_result': tesseract_result,
        'processing_time': elapsed,
    }
    return celery_tesseract_collection.update_one({'check_id': check_id}, {'$set': changes})


def get_celery_tesseract_task(celery_tesseract_task_id):
    """Return the tesseract task document with the given celery task id."""
    query = {'celery_tesseract_task_id': celery_tesseract_task_id}
    return celery_tesseract_collection.find_one(query)


def get_celery_tesseract_task_by_check(check_id):
    """Return the tesseract task document recorded for ``check_id``."""
    query = {'check_id': check_id}
    return celery_tesseract_collection.find_one(query)
28 changes: 28 additions & 0 deletions app/db/db_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,31 @@ def none_to_false(x):
is_ended = none_to_true(self.is_ended) # None for old checks => True, True->True, False->False
is_failed = none_to_false(self.is_failed) # None for old checks => False, True->True, False->False
return {'is_ended': is_ended, 'is_failed': is_failed}

class Image(PackableWithId):
    """Image record tied to a check, built from and packed back into a dict."""

    def __init__(self, dictionary=None):
        super().__init__(dictionary)
        source = dictionary if dictionary else {}
        self.check_id = source.get('check_id')      # link to the owning check
        self.caption = source.get('caption', '')    # caption of the image
        self.image_data = source.get('image_data')  # image file as binary data
        self.image_size = source.get('image_size')  # image size in centimetres
        self.text = source.get('text', None)
        self.page = source.get('page', None)

    def pack(self):
        """Return the parent's packed dict extended with the image fields.

        ``check_id`` is always stored as a string.
        """
        package = super().pack()
        package.update({
            'check_id': str(self.check_id),
            'caption': self.caption,
            'image_data': self.image_data,
            'image_size': self.image_size,
            'text': self.text,
            'page': self.page,
        })
        return package

class ParsedText(PackableWithId):
    """Parsed text of a file: its filename plus the list of parsed chapters."""

    def __init__(self, dictionary=None):
        super().__init__(dictionary)
        source = dictionary if dictionary else {}
        self.filename = source.get('filename', '')
        self.parsed_chapters = source.get('parsed_chapters', [])
2 changes: 2 additions & 0 deletions app/main/check_packs/pack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
]
BASE_REPORT_CRITERION = [
["simple_check"],
["image_text_check"],
['image_quality_check'],
["banned_words_in_literature"],
["page_counter"],
["image_share_check"],
Expand Down
2 changes: 2 additions & 0 deletions app/main/checks/report_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
from .template_name import ReportTemplateNameCheck
from .empty_task_page_check import EmptyTaskPageCheck
from .image_text_check import ImageTextCheck
from .image_quality_check import ImageQualityCheck
from .water_in_the_text_check import WaterInTheTextCheck
from .sw_section_banned_words import SWSectionBannedWordsCheck
from .sw_section_lit_reference import SWSectionLiteratureReferenceCheck
Expand Down
54 changes: 54 additions & 0 deletions app/main/checks/report_checks/image_quality_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from ..base_check import BaseReportCriterion, answer
import cv2
import numpy as np

class ImageQualityCheck(BaseReportCriterion):
    """Report criterion that flags images with low sharpness or low entropy.

    Each image is decoded with OpenCV and scored by the variance of its
    Laplacian and by the entropy of its grayscale histogram. An image fails
    when either score is below the configured minimum.
    """

    label = "Проверка качества изображений"
    description = ''
    id = 'image_quality_check'

    # TODO: min_laplacian and min_entropy still have to be tuned.
    def __init__(self, file_info, min_laplacian=100, min_entropy=1):
        super().__init__(file_info)
        self.images = self.file.images
        self.min_laplacian = min_laplacian
        self.min_entropy = min_entropy
        # Scores of the most recently analysed image (see find_params).
        self.laplacian_score = None
        self.entropy_score = None

    def check(self):
        """Score every image and return an ``answer`` listing the failures."""
        if not self.images:
            return answer(True, 'Изображения не найдены!')

        deny_list = []
        for img in self.images:
            unreadable = f"Изображение с подписью '{img.caption}' не может быть обработано.<br>"

            # Missing or empty binary data cannot be decoded; report the image
            # instead of letting np.frombuffer / cv2.imdecode raise.
            if not img.image_data:
                deny_list.append(unreadable)
                continue

            image_array = np.frombuffer(img.image_data, dtype=np.uint8)
            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
            if img_cv is None:
                deny_list.append(unreadable)
                continue

            self.find_params(img_cv)
            if self.laplacian_score is None or self.entropy_score is None:
                deny_list.append(unreadable)
                continue

            if self.laplacian_score < self.min_laplacian:
                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score:.2f} (минимум {self.min_laplacian:.2f}).<br>")

            if self.entropy_score < self.min_entropy:
                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score:.2f} (минимум {self.min_entropy:.2f}).<br>")

        if deny_list:
            return answer(False, f'Изображения нечитаемы! <br>Попробуйте улучшить качество изображений, возможно они слишком размыты или зашумлены.<br>{"".join(deny_list)}')
        return answer(True, 'Изображения корректны!')

    def find_params(self, image):
        """Compute and store the Laplacian variance and entropy of ``image``.

        Args:
            image: decoded image array as produced by ``cv2.imdecode`` with
                ``cv2.IMREAD_COLOR``.

        Returns:
            ``(laplacian_score, entropy_score)``; both are ``None`` when the
            image is missing or empty. The same values are stored on
            ``self.laplacian_score`` and ``self.entropy_score``.
        """
        # Reset first so that an image which cannot be scored never inherits
        # the scores computed for the previous image.
        self.laplacian_score = None
        self.entropy_score = None
        if image is None or image.size == 0:
            return None, None

        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()

        hist, _ = np.histogram(gray_image.flatten(), bins=256, range=[0, 256])
        hist = hist / hist.sum()
        # The small epsilon keeps log2 finite for empty histogram bins.
        self.entropy_score = -np.sum(hist * np.log2(hist + 1e-10))
        return self.laplacian_score, self.entropy_score
28 changes: 28 additions & 0 deletions app/main/checks/report_checks/image_text_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from ..base_check import BaseReportCriterion, answer


class ImageTextCheck(BaseReportCriterion):
    """Report criterion that schedules OCR-based checking of image text.

    The check itself does not wait for recognition: it enqueues the
    ``tesseract_recognize`` task, records it, and reports that the images are
    being processed.
    """

    label = "Проверка текста, считанного с изображений"
    description = ''
    id = 'image_text_check'

    # TODO: tune symbols_set, max_symbols_percentage and max_text_density.
    def __init__(self, file_info, symbols_set=('%', '1'), max_symbols_percentage=0, max_text_density=4):
        super().__init__(file_info)
        self.images = self.file.images
        # The default is immutable and the value is copied into a fresh list,
        # so instances never share (or mutate) one default list object.
        self.symbols_set = list(symbols_set)
        self.max_symbols_percentage = max_symbols_percentage
        self.max_text_density = max_text_density

    def check(self):
        """Enqueue text recognition for the check's images and return an ``answer``."""
        # Imported at call time, as in the original code
        # (presumably to avoid import cycles — confirm).
        from app.tesseract_tasks import tesseract_recognize, callback_task
        from db.db_methods import add_celery_tesseract_task

        if not self.images:
            return answer(True, 'Изображения не найдены!')

        # The check id is read from the first image and used for the whole task.
        check_id = self.images[0].check_id
        tesseract_task = tesseract_recognize.apply_async(
            args=[check_id, self.symbols_set, self.max_symbols_percentage, self.max_text_density],
            link=callback_task.s(check_id),
            link_error=callback_task.s(check_id),
        )
        add_celery_tesseract_task(tesseract_task.id, check_id)
        return answer(True, 'Изображения проверяются!')
13 changes: 10 additions & 3 deletions app/main/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,24 @@
from main.reports.md_uploader import MdUploader
from utils import convert_to


logger = logging.getLogger('root_logger')

def parse(filepath, pdf_filepath, check_id):

def parse(filepath, pdf_filepath):
tmp_filepath = filepath.lower()
try:
if tmp_filepath.endswith(('.odp', '.ppt', '.pptx')):
new_filepath = filepath
if tmp_filepath.endswith(('.odp', '.ppt')):
logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
new_filepath = convert_to(filepath, target_format='pptx')
file_object = PresentationPPTX(new_filepath)

presentation = PresentationPPTX(new_filepath)
presentation.extract_images_with_captions(check_id)
file_object = presentation


elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
new_filepath = filepath
if tmp_filepath.endswith(('.doc', '.odt')):
Expand All @@ -29,6 +35,7 @@ def parse(filepath, pdf_filepath):
docx = DocxUploader()
docx.upload(new_filepath, pdf_filepath)
docx.parse()
docx.extract_images_with_captions(check_id)
file_object = docx

elif tmp_filepath.endswith('.md' ):
Expand All @@ -54,4 +61,4 @@ def save_to_temp_file(file):
temp_file.write(file.read())
temp_file.close()
file.seek(0)
return temp_file.name
return temp_file.name
39 changes: 39 additions & 0 deletions app/main/presentations/pptx/presentation_pptx.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from io import BytesIO

from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE

from .slide_pptx import SlidePPTX
from ..presentation_basic import PresentationBasic
Expand All @@ -17,3 +20,39 @@ def add_slides(self):

def __str__(self):
    """Return the parent class's string representation unchanged."""
    return super().__str__()

def extract_images_with_captions(self, check_id):
    """Save every picture of the presentation to the database with a caption.

    For each slide, the first non-empty text frame found on the slide is
    used as the caption for that slide's pictures (same heuristic as
    before). Slides without pictures, or without any non-empty text, store
    nothing.

    Args:
        check_id: id of the check the stored images are attached to.
    """
    from app.db.db_methods import save_image_to_db

    for slide in self.slides:
        # slide.slide gives access to the slide object that holds the shapes.
        shapes = list(slide.slide.shapes)
        pictures = [shape for shape in shapes if shape.shape_type == MSO_SHAPE_TYPE.PICTURE]
        if not pictures:
            continue

        # First non-empty text on the slide is presumed to be the caption.
        caption_text = next(
            (shape.text.strip() for shape in shapes
             if shape.has_text_frame and shape.text.strip()),
            None,
        )
        if caption_text is None:
            continue

        # Previously only the last picture of a slide survived the loop, and
        # save_image_to_db() was called without its required image_size
        # argument (TypeError). Every picture is now saved with its size.
        for picture in pictures:
            width, height = picture.width, picture.height
            # NOTE(review): image_size is stored as (width_cm, height_cm);
            # confirm this matches the format used by other image producers.
            if width is not None and height is not None:
                image_size = (width.cm, height.cm)
            else:
                image_size = None
            # image.blob already holds the raw bytes; no BytesIO copy needed.
            save_image_to_db(check_id, picture.image.blob, caption_text, image_size)
1 change: 1 addition & 0 deletions app/main/reports/document_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def __init__(self):
self.literature_page = 0
self.first_lines = []
self.page_count = 0
self.images = []

@abstractmethod
def upload(self):
Expand Down
Loading
Loading