diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 17bf229..6ce2add 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,9 +19,9 @@ repos:
         entry: poetry lock --check
         pass_filenames: false
         language: system
-      # - id: system
-      #   name: MyPy
-      #   entry: poetry run mypy docling_ibm_models
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
\ No newline at end of file
+      - id: system
+        name: MyPy
+        entry: poetry run mypy docling_ibm_models
+        pass_filenames: false
+        language: system
+        files: '\.py$'
\ No newline at end of file
diff --git a/docling_ibm_models/__init__.py b/docling_ibm_models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docling_ibm_models/code_formula_model/__init__.py b/docling_ibm_models/code_formula_model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docling_ibm_models/code_formula_model/code_formula_predictor.py b/docling_ibm_models/code_formula_model/code_formula_predictor.py
index 5804558..b66e634 100644
--- a/docling_ibm_models/code_formula_model/code_formula_predictor.py
+++ b/docling_ibm_models/code_formula_model/code_formula_predictor.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: MIT
 #
 import logging
-from typing import List, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
@@ -132,7 +132,7 @@ def predict(
         self,
         images: List[Union[Image.Image, np.ndarray]],
         labels: List[str],
-        temperature: float = 0.1,
+        temperature: Optional[float] = 0.1,
     ) -> List[str]:
         """
         Predicts the textual representation of input images (code or LaTeX).
@@ -143,7 +143,7 @@
             List of images to be processed, provided as PIL Image objects or numpy arrays.
         labels : List[str]
             List of labels indicating the type of each image ('code' or 'formula').
-        temperature : float, optional
+        temperature : Optional[float]
             Sampling temperature for generation, by default set to 0.1.
 
         Returns
@@ -159,7 +159,11 @@ def predict(
         Exception
             In case the temperature is an invalid number.
         """
-        if (type(temperature) != float and type(temperature) != int) or temperature < 0:
+        if (
+            temperature is None
+            or not (isinstance(temperature, float) or isinstance(temperature, int))
+            or temperature < 0
+        ):
             raise Exception("Temperature must be a number greater or equal to 0.")
 
         do_sample = True
@@ -181,11 +185,10 @@ def predict(
             else:
                 raise TypeError("Not supported input image format")
             images_tmp.append(image)
-        images = images_tmp
 
-        images_tensor = torch.stack([self._image_processor(img) for img in images]).to(
-            self._device
-        )
+        images_tensor = torch.stack(
+            [self._image_processor(img) for img in images_tmp]
+        ).to(self._device)
 
         prompts = [self._get_prompt(label) for label in labels]
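For reference, the rewritten guard means `predict` now fails fast on `None` as well as on negative or non-numeric temperatures, and the tensor stacking no longer reassigns the `images` parameter (which is what tripped mypy). A minimal usage sketch; the constructor arguments shown (`artifacts_path`, `device`) are assumptions, not taken from this diff:

```python
# Sketch only: constructor arguments are assumed, not shown in this patch.
from PIL import Image

from docling_ibm_models.code_formula_model.code_formula_predictor import (
    CodeFormulaPredictor,
)

predictor = CodeFormulaPredictor(artifacts_path="./artifacts", device="cpu")
image = Image.new(mode="RGB", size=(100, 100), color=(255, 255, 255))

predictor.predict([image], ["code"], temperature=0.1)  # valid: non-negative float
predictor.predict([image], ["code"], temperature=0)    # valid: ints are accepted too

# None, negative, and non-numeric values all hit the new guard.
for bad in (None, -1.0, "hot"):
    try:
        predictor.predict([image], ["code"], temperature=bad)
    except Exception as err:
        print(f"temperature={bad!r} rejected: {err}")
```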
""" - if (type(temperature) != float and type(temperature) != int) or temperature < 0: + if ( + temperature is None + or not (isinstance(temperature, float) or isinstance(temperature, int)) + or temperature < 0 + ): raise Exception("Temperature must be a number greater or equal to 0.") do_sample = True @@ -181,11 +185,10 @@ def predict( else: raise TypeError("Not supported input image format") images_tmp.append(image) - images = images_tmp - images_tensor = torch.stack([self._image_processor(img) for img in images]).to( - self._device - ) + images_tensor = torch.stack( + [self._image_processor(img) for img in images_tmp] + ).to(self._device) prompts = [self._get_prompt(label) for label in labels] diff --git a/docling_ibm_models/code_formula_model/models/__init__.py b/docling_ibm_models/code_formula_model/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_ibm_models/code_formula_model/models/sam_opt.py b/docling_ibm_models/code_formula_model/models/sam_opt.py index c682f45..8bd9d8d 100644 --- a/docling_ibm_models/code_formula_model/models/sam_opt.py +++ b/docling_ibm_models/code_formula_model/models/sam_opt.py @@ -67,14 +67,14 @@ def embed_tokens(self, x): def forward( self, - input_ids: torch.LongTensor = None, + input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - images: torch.FloatTensor = None, + images: Optional[torch.FloatTensor] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: @@ -86,6 +86,7 @@ def forward( if input_ids.shape[1] != 1 or self.training: with torch.set_grad_enabled(self.training): + assert vision_tower is not None image_features = vision_tower(images) image_features = image_features.flatten(2).permute(0, 2, 1) image_features = self.mm_projector(image_features) @@ -94,9 +95,9 @@ def forward( for cur_input_ids, cur_input_embeds, cur_image_features in zip( input_ids, inputs_embeds, image_features ): - image_start_token_position = torch.where( - cur_input_ids == im_start_token - )[0].item() + image_start_token_position = int( + torch.where(cur_input_ids == im_start_token)[0].item() + ) # cast to int for mypy cur_image_features = cur_image_features.to( device=cur_input_embeds.device @@ -115,7 +116,7 @@ def forward( new_input_embeds.append(cur_input_embeds) - inputs_embeds = torch.stack(new_input_embeds, dim=0) + inputs_embeds = torch.stack(new_input_embeds, dim=0) # type: ignore return super(SamOPTModel, self).forward( input_ids=None, diff --git a/docling_ibm_models/document_figure_classifier_model/__init__.py b/docling_ibm_models/document_figure_classifier_model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py b/docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py index ea51a72..a6db7a7 100644 --- a/docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py +++ b/docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py @@ -147,24 +147,23 @@ def predict( The predictions for each image are sorted in descending order of confidence. 
""" - processed_images = [] + rgb_images = [] for image in images: if isinstance(image, Image.Image): - processed_images.append(image.convert("RGB")) + rgb_images.append(image.convert("RGB")) elif isinstance(image, np.ndarray): - processed_images.append(Image.fromarray(image).convert("RGB")) + rgb_images.append(Image.fromarray(image).convert("RGB")) else: raise TypeError( "Supported input formats are PIL.Image.Image or numpy.ndarray." ) - images = processed_images # (batch_size, 3, 224, 224) - images = [self._image_processor(image) for image in images] - images = torch.stack(images).to(self._device) + processed_images = [self._image_processor(image) for image in rgb_images] + torch_images = torch.stack(processed_images).to(self._device) with torch.no_grad(): - logits = self._model(images).logits # (batch_size, num_classes) + logits = self._model(torch_images).logits # (batch_size, num_classes) probs_batch = logits.softmax(dim=1) # (batch_size, num_classes) probs_batch = probs_batch.cpu().numpy().tolist() diff --git a/docling_ibm_models/layoutmodel/__init__.py b/docling_ibm_models/layoutmodel/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_ibm_models/py.typed b/docling_ibm_models/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py b/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py index 488f34c..887d368 100644 --- a/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py +++ b/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py @@ -36,7 +36,7 @@ def forward(self, x): class TMTransformerDecoder(nn.TransformerDecoder): - def forward( + def forward( # type: ignore self, tgt: Tensor, memory: Optional[Tensor] = None, @@ -69,11 +69,11 @@ def forward( else: out_cache = torch.stack(tag_cache, dim=0) - return output, out_cache + return output, out_cache # type: ignore class TMTransformerDecoderLayer(nn.TransformerDecoderLayer): - def forward( + def forward( # type: ignore self, tgt: Tensor, memory: Optional[Tensor] = None, diff --git a/docling_ibm_models/tableformer/otsl.py b/docling_ibm_models/tableformer/otsl.py index 447cbf4..85ed697 100644 --- a/docling_ibm_models/tableformer/otsl.py +++ b/docling_ibm_models/tableformer/otsl.py @@ -11,7 +11,7 @@ LOG_LEVEL = logging.INFO # LOG_LEVEL = logging.DEBUG logger = s.get_custom_logger("consolidate", LOG_LEVEL) -png_files = {} # Evaluation files +# png_files = {} # Evaluation files total_pics = 0 diff --git a/docling_ibm_models/tableformer/utils/mem_monitor.py b/docling_ibm_models/tableformer/utils/mem_monitor.py index be47c9c..c263b20 100644 --- a/docling_ibm_models/tableformer/utils/mem_monitor.py +++ b/docling_ibm_models/tableformer/utils/mem_monitor.py @@ -5,6 +5,7 @@ import os import platform import re +from typing import Dict, Union class MemMonitor: @@ -112,7 +113,7 @@ def __init__(self, enable=True): regex_str = r"({}:)(\s+)(\d*)(.*)".format(mem_field) self._status_regex[mem_field] = re.compile(regex_str) - def get_memory_full(self) -> dict: + def get_memory_full(self) -> Union[Dict, int]: r""" - Parse /proc/status to get all memory info. - The method returns a dict with the fields self._status_fields @@ -140,7 +141,7 @@ def get_memory_full(self) -> dict: return memory - def get_memory(self) -> dict: + def get_memory(self) -> Union[Dict, int]: r""" - Parse /proc/statm to get the most important memory fields - This is a fast implementation. 
diff --git a/docling_ibm_models/tableformer/utils/torch_utils.py b/docling_ibm_models/tableformer/utils/torch_utils.py
deleted file mode 100644
index 09bcde2..0000000
--- a/docling_ibm_models/tableformer/utils/torch_utils.py
+++ /dev/null
@@ -1,216 +0,0 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-import torch
-
-
-def model_info(model, verbose=False):
-    # Plots a line-by-line description of a PyTorch model
-    n_p = sum(x.numel() for x in model.parameters())  # number parameters
-    n_g = sum(
-        x.numel() for x in model.parameters() if x.requires_grad
-    )  # number gradients
-    if verbose:
-        print(
-            "%5s %40s %9s %12s %20s %10s %10s"
-            % ("layer", "name", "gradient", "parameters", "shape", "mu", "sigma")
-        )
-        for i, (name, p) in enumerate(model.named_parameters()):
-            name = name.replace("module_list.", "")
-            print(
-                "%5g %40s %9s %12g %20s %10.3g %10.3g"
-                % (
-                    i,
-                    name,
-                    p.requires_grad,
-                    p.numel(),
-                    list(p.shape),
-                    p.mean(),
-                    p.std(),
-                )
-            )
-
-    try:  # FLOPS
-        from thop import profile
-
-        macs, _ = profile(model, inputs=(torch.zeros(1, 3, 480, 640),), verbose=False)
-        fs = ", %.1f GFLOPS" % (macs / 1e9 * 2)
-    except Exception:
-        fs = ""
-
-    print(
-        "Model Summary: %g layers, %g parameters, %g gradients%s"
-        % (len(list(model.parameters())), n_p, n_g, fs)
-    )
-
-
-# def init_seeds(seed=0):
-#     torch.manual_seed(seed)
-#
-#     # Reduce randomness (may be slower on Tesla GPUs)
-#     # https://pytorch.org/docs/stable/notes/randomness.html
-#     if seed == 0:
-#         cudnn.deterministic = False
-#         cudnn.benchmark = True
-#
-#
-# def select_device(device='', apex=False, batch_size=None):
-#     # device = 'cpu' or '0' or '0,1,2,3'
-#     cpu_request = device.lower() == 'cpu'
-#     if device and not cpu_request:  # if device requested other than 'cpu'
-#         os.environ['CUDA_VISIBLE_DEVICES'] = device  # set environment variable
-#         # check availablity
-#         assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device
-#
-#     cuda = False if cpu_request else torch.cuda.is_available()
-#     if cuda:
-#         c = 1024 ** 2  # bytes to MB
-#         ng = torch.cuda.device_count()
-#         if ng > 1 and batch_size:  # check that batch_size is compatible with device_count
-#             assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % \
-#                 (batch_size, ng)
-#         x = [torch.cuda.get_device_properties(i) for i in range(ng)]
-#         # apex for mixed precision https://github.com/NVIDIA/apex
-#         s = 'Using CUDA ' + ('Apex ' if apex else '')
-#         for i in range(0, ng):
-#             if i == 1:
-#                 s = ' ' * len(s)
-#             print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
-#                   (s, i, x[i].name, x[i].total_memory / c))
-#     else:
-#         print('Using CPU')
-#
-#     print('')  # skip a line
-#     return torch.device('cuda:0' if cuda else 'cpu')
-#
-#
-# def time_synchronized():
-#     torch.cuda.synchronize() if torch.cuda.is_available() else None
-#     return time.time()
-#
-#
-# def initialize_weights(model):
-#     for m in model.modules():
-#         t = type(m)
-#         if t is nn.Conv2d:
-#             pass  # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-#         elif t is nn.BatchNorm2d:
-#             m.eps = 1e-4
-#             m.momentum = 0.03
-#         elif t in [nn.LeakyReLU, nn.ReLU, nn.ReLU6]:
-#             m.inplace = True
-#
-#
-# def find_modules(model, mclass=nn.Conv2d):
-#     # finds layer indices matching module class 'mclass'
-#     return [i for i, m in enumerate(model.module_list) if isinstance(m, mclass)]
-#
-#
-# def fuse_conv_and_bn(conv, bn):
-#     # https://tehnokv.com/posts/fusing-batchnorm-and-conv/
-#     with torch.no_grad():
-#         # init
-#         fusedconv = torch.nn.Conv2d(conv.in_channels,
-#                                     conv.out_channels,
-#                                     kernel_size=conv.kernel_size,
-#                                     stride=conv.stride,
-#                                     padding=conv.padding,
-#                                     bias=True)
-#
-#         # prepare filters
-#         w_conv = conv.weight.clone().view(conv.out_channels, -1)
-#         w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
-#         fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size()))
-#
-#         # prepare spatial bias
-#         if conv.bias is not None:
-#             b_conv = conv.bias
-#         else:
-#             b_conv = torch.zeros(conv.weight.size(0))
-#         b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
-#         fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
-#
-#         return fusedconv
-#
-#
-# def load_classifier(name='resnet101', n=2):
-#     # Loads a pretrained model reshaped to n-class output
-#     import pretrainedmodels  # https://github.com/Cadene/pretrained-models.pytorch#torchvision
-#     model = pretrainedmodels.__dict__[name](num_classes=1000, pretrained='imagenet')
-#
-#     # Display model properties
-#     for x in ['model.input_size', 'model.input_space', 'model.input_range', 'model.mean',
-#               'model.std']:
-#         print(x + ' =', eval(x))
-#
-#     # Reshape output to n classes
-#     filters = model.last_linear.weight.shape[1]
-#     model.last_linear.bias = torch.nn.Parameter(torch.zeros(n))
-#     model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters))
-#     model.last_linear.out_features = n
-#     return model
-#
-#
-# def scale_img(img, ratio=1.0, same_shape=True):  # img(16,3,256,416), r=ratio
-#     # scales img(bs,3,y,x) by ratio
-#     h, w = img.shape[2:]
-#     s = (int(h * ratio), int(w * ratio))  # new size
-#     img = F.interpolate(img, size=s, mode='bilinear', align_corners=False)  # resize
-#     if not same_shape:  # pad/crop img
-#         gs = 64  # (pixels) grid size
-#         h, w = [math.ceil(x * ratio / gs) * gs for x in (h, w)]
-#     return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447)  # value = imagenet mean
-#
-#
-# class ModelEMA:
-#     """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
-#     Keep a moving average of everything in the model state_dict (parameters and buffers).
-#     This is intended to allow functionality like
-#     https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
-#     A smoothed version of the weights is necessary for some training schemes to perform well.
-#     E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use
-#     RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA
-#     smoothing of weights to match results. Pay attention to the decay constant you are using
-#     relative to your update count per epoch.
-#     To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
-#     disable validation of the EMA weights. Validation will have to be done manually in a separate
-#     process, or after the training stops converging.
-#     This class is sensitive where it is initialized in the sequence of model init,
-#     GPU assignment and distributed training wrappers.
-#     I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and
-#     single-GPU.
-#     """
-#
-#     def __init__(self, model, decay=0.9999, device=''):
-#         # make a copy of the model for accumulating moving average of weights
-#         self.ema = deepcopy(model)
-#         self.ema.eval()
-#         self.updates = 0  # number of EMA updates
-#         # decay exponential ramp (to help early epochs)
-#         self.decay = lambda x: decay * (1 - math.exp(-x / 2000))
-#         self.device = device  # perform ema on different device from model if set
-#         if device:
-#             self.ema.to(device=device)
-#         for p in self.ema.parameters():
-#             p.requires_grad_(False)
-#
-#     def update(self, model):
-#         self.updates += 1
-#         d = self.decay(self.updates)
-#         with torch.no_grad():
-#             if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel):
-#                 msd, esd = model.module.state_dict(), self.ema.module.state_dict()
-#             else:
-#                 msd, esd = model.state_dict(), self.ema.state_dict()
-#
-#             for k, v in esd.items():
-#                 if v.dtype.is_floating_point:
-#                     v *= d
-#                     v += (1. - d) * msd[k].detach()
-#
-#     def update_attr(self, model):
-#         # Assign attributes (which may change during training)
-#         for k in model.__dict__.keys():
-#             if not k.startswith('_'):
-#                 setattr(self.ema, k, getattr(model, k))
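torch_utils.py was dead weight: one unused helper (`model_info`) plus roughly 170 lines of commented-out utilities, so deleting it is simpler than annotating it for mypy. If the live part is ever wanted again, it reduces to a few lines against plain torch; a minimal sketch, not a restoration of the deleted API:

```python
import torch.nn as nn


def count_parameters(model: nn.Module) -> tuple[int, int]:
    """Return (total, trainable) parameter counts, as model_info reported."""
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable


total, trainable = count_parameters(nn.Linear(16, 4))
print(f"{total} parameters, {trainable} gradients")  # 68 parameters, 68 gradients
```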
-# """ -# -# def __init__(self, model, decay=0.9999, device=''): -# # make a copy of the model for accumulating moving average of weights -# self.ema = deepcopy(model) -# self.ema.eval() -# self.updates = 0 # number of EMA updates -# # decay exponential ramp (to help early epochs) -# self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) -# self.device = device # perform ema on different device from model if set -# if device: -# self.ema.to(device=device) -# for p in self.ema.parameters(): -# p.requires_grad_(False) -# -# def update(self, model): -# self.updates += 1 -# d = self.decay(self.updates) -# with torch.no_grad(): -# if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel): -# msd, esd = model.module.state_dict(), self.ema.module.state_dict() -# else: -# msd, esd = model.state_dict(), self.ema.state_dict() -# -# for k, v in esd.items(): -# if v.dtype.is_floating_point: -# v *= d -# v += (1. - d) * msd[k].detach() -# -# def update_attr(self, model): -# # Assign attributes (which may change during training) -# for k in model.__dict__.keys(): -# if not k.startswith('_'): -# setattr(self.ema, k, getattr(model, k)) diff --git a/pyproject.toml b/pyproject.toml index 5c4da77..7d51cc1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,14 +106,14 @@ parser_angular_minor_types = "feat" parser_angular_patch_types = "fix,perf" -# [tool.mypy] -# pretty = true -# no_implicit_optional = true -# python_version = "3.10" -# -# [[tool.mypy.overrides]] -# module = [ -# "torchvision.*", -# "transformers.*" -# ] -# ignore_missing_imports = true \ No newline at end of file +[tool.mypy] +pretty = true +no_implicit_optional = true +python_version = "3.10" + +[[tool.mypy.overrides]] +module = [ + "torchvision.*", + "transformers.*" +] +ignore_missing_imports = true \ No newline at end of file diff --git a/tests/test_code_formula_predictor.py b/tests/test_code_formula_predictor.py index 52614f4..0a62084 100644 --- a/tests/test_code_formula_predictor.py +++ b/tests/test_code_formula_predictor.py @@ -90,6 +90,16 @@ def test_code_formula_predictor(init: dict): is_exception = True assert is_exception + # wrong value for temperature + is_exception = False + try: + dummy_image = Image.new(mode="RGB", size=(100, 100), color=(255, 255, 255)) + for _ in code_formula_predictor.predict([dummy_image], ["label"], None): + pass + except Exception: + is_exception = True + assert is_exception + # mistmatched number of images and labels is_exception = False try: @@ -112,7 +122,7 @@ def test_code_formula_predictor(init: dict): output = code_formula_predictor.predict([img], [label], temperature) output = output[0] - + assert output == gt # Load images as numpy arrays