diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 17bf229..6ce2add 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,9 +19,9 @@ repos:
         entry: poetry lock --check
         pass_filenames: false
         language: system
-      # - id: system
-      #   name: MyPy
-      #   entry: poetry run mypy docling_ibm_models
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
\ No newline at end of file
+      - id: system
+        name: MyPy
+        entry: poetry run mypy docling_ibm_models
+        pass_filenames: false
+        language: system
+        files: '\.py$'
\ No newline at end of file
diff --git a/docling_ibm_models/__init__.py b/docling_ibm_models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docling_ibm_models/code_formula_model/__init__.py b/docling_ibm_models/code_formula_model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docling_ibm_models/code_formula_model/code_formula_predictor.py b/docling_ibm_models/code_formula_model/code_formula_predictor.py
index 5804558..b66e634 100644
--- a/docling_ibm_models/code_formula_model/code_formula_predictor.py
+++ b/docling_ibm_models/code_formula_model/code_formula_predictor.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: MIT
 #
 import logging
-from typing import List, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
@@ -132,7 +132,7 @@ def predict(
         self,
         images: List[Union[Image.Image, np.ndarray]],
         labels: List[str],
-        temperature: float = 0.1,
+        temperature: Optional[float] = 0.1,
     ) -> List[str]:
         """
         Predicts the textual representation of input images (code or LaTeX).
@@ -143,7 +143,7 @@
             List of images to be processed, provided as PIL Image objects or numpy arrays.
         labels : List[str]
             List of labels indicating the type of each image ('code' or 'formula').
-        temperature : float, optional
+        temperature : Optional[float]
             Sampling temperature for generation, by default set to 0.1.
 
         Returns
@@ -159,7 +159,11 @@ def predict(
         Exception
             In case the temperature is an invalid number.
         """
-        if (type(temperature) != float and type(temperature) != int) or temperature < 0:
+        if (
+            temperature is None
+            or not (isinstance(temperature, float) or isinstance(temperature, int))
+            or temperature < 0
+        ):
             raise Exception("Temperature must be a number greater or equal to 0.")
 
         do_sample = True
@@ -181,11 +185,10 @@ def predict(
             else:
                 raise TypeError("Not supported input image format")
             images_tmp.append(image)
-        images = images_tmp
 
-        images_tensor = torch.stack([self._image_processor(img) for img in images]).to(
-            self._device
-        )
+        images_tensor = torch.stack(
+            [self._image_processor(img) for img in images_tmp]
+        ).to(self._device)
 
         prompts = [self._get_prompt(label) for label in labels]
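For reference, the rewritten guard means `predict` now fails fast on `None` as well as on negative or non-numeric temperatures, and the tensor stacking no longer reassigns the `images` parameter (which is what tripped mypy). A minimal usage sketch; the constructor arguments shown (`artifacts_path`, `device`) are assumptions, not taken from this diff:

```python
# Sketch only: constructor arguments are assumed, not shown in this patch.
from PIL import Image

from docling_ibm_models.code_formula_model.code_formula_predictor import (
    CodeFormulaPredictor,
)

predictor = CodeFormulaPredictor(artifacts_path="./artifacts", device="cpu")
image = Image.new(mode="RGB", size=(100, 100), color=(255, 255, 255))

predictor.predict([image], ["code"], temperature=0.1)  # valid: non-negative float
predictor.predict([image], ["code"], temperature=0)    # valid: ints are accepted too

# None, negative, and non-numeric values all hit the new guard.
for bad in (None, -1.0, "hot"):
    try:
        predictor.predict([image], ["code"], temperature=bad)
    except Exception as err:
        print(f"temperature={bad!r} rejected: {err}")
```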
""" - if (type(temperature) != float and type(temperature) != int) or temperature < 0: + if ( + temperature is None + or not (isinstance(temperature, float) or isinstance(temperature, int)) + or temperature < 0 + ): raise Exception("Temperature must be a number greater or equal to 0.") do_sample = True @@ -181,11 +185,10 @@ def predict( else: raise TypeError("Not supported input image format") images_tmp.append(image) - images = images_tmp - images_tensor = torch.stack([self._image_processor(img) for img in images]).to( - self._device - ) + images_tensor = torch.stack( + [self._image_processor(img) for img in images_tmp] + ).to(self._device) prompts = [self._get_prompt(label) for label in labels] diff --git a/docling_ibm_models/code_formula_model/models/__init__.py b/docling_ibm_models/code_formula_model/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_ibm_models/code_formula_model/models/sam_opt.py b/docling_ibm_models/code_formula_model/models/sam_opt.py index c682f45..8bd9d8d 100644 --- a/docling_ibm_models/code_formula_model/models/sam_opt.py +++ b/docling_ibm_models/code_formula_model/models/sam_opt.py @@ -67,14 +67,14 @@ def embed_tokens(self, x): def forward( self, - input_ids: torch.LongTensor = None, + input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - images: torch.FloatTensor = None, + images: Optional[torch.FloatTensor] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: @@ -86,6 +86,7 @@ def forward( if input_ids.shape[1] != 1 or self.training: with torch.set_grad_enabled(self.training): + assert vision_tower is not None image_features = vision_tower(images) image_features = image_features.flatten(2).permute(0, 2, 1) image_features = self.mm_projector(image_features) @@ -94,9 +95,9 @@ def forward( for cur_input_ids, cur_input_embeds, cur_image_features in zip( input_ids, inputs_embeds, image_features ): - image_start_token_position = torch.where( - cur_input_ids == im_start_token - )[0].item() + image_start_token_position = int( + torch.where(cur_input_ids == im_start_token)[0].item() + ) # cast to int for mypy cur_image_features = cur_image_features.to( device=cur_input_embeds.device @@ -115,7 +116,7 @@ def forward( new_input_embeds.append(cur_input_embeds) - inputs_embeds = torch.stack(new_input_embeds, dim=0) + inputs_embeds = torch.stack(new_input_embeds, dim=0) # type: ignore return super(SamOPTModel, self).forward( input_ids=None, diff --git a/docling_ibm_models/document_figure_classifier_model/__init__.py b/docling_ibm_models/document_figure_classifier_model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py b/docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py index ea51a72..a6db7a7 100644 --- a/docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py +++ b/docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py @@ -147,24 +147,23 @@ def predict( The predictions for each image are sorted in descending order of confidence. 
""" - processed_images = [] + rgb_images = [] for image in images: if isinstance(image, Image.Image): - processed_images.append(image.convert("RGB")) + rgb_images.append(image.convert("RGB")) elif isinstance(image, np.ndarray): - processed_images.append(Image.fromarray(image).convert("RGB")) + rgb_images.append(Image.fromarray(image).convert("RGB")) else: raise TypeError( "Supported input formats are PIL.Image.Image or numpy.ndarray." ) - images = processed_images # (batch_size, 3, 224, 224) - images = [self._image_processor(image) for image in images] - images = torch.stack(images).to(self._device) + processed_images = [self._image_processor(image) for image in rgb_images] + torch_images = torch.stack(processed_images).to(self._device) with torch.no_grad(): - logits = self._model(images).logits # (batch_size, num_classes) + logits = self._model(torch_images).logits # (batch_size, num_classes) probs_batch = logits.softmax(dim=1) # (batch_size, num_classes) probs_batch = probs_batch.cpu().numpy().tolist() diff --git a/docling_ibm_models/layoutmodel/__init__.py b/docling_ibm_models/layoutmodel/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_ibm_models/py.typed b/docling_ibm_models/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py b/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py index 488f34c..887d368 100644 --- a/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py +++ b/docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py @@ -36,7 +36,7 @@ def forward(self, x): class TMTransformerDecoder(nn.TransformerDecoder): - def forward( + def forward( # type: ignore self, tgt: Tensor, memory: Optional[Tensor] = None, @@ -69,11 +69,11 @@ def forward( else: out_cache = torch.stack(tag_cache, dim=0) - return output, out_cache + return output, out_cache # type: ignore class TMTransformerDecoderLayer(nn.TransformerDecoderLayer): - def forward( + def forward( # type: ignore self, tgt: Tensor, memory: Optional[Tensor] = None, diff --git a/docling_ibm_models/tableformer/otsl.py b/docling_ibm_models/tableformer/otsl.py index 447cbf4..85ed697 100644 --- a/docling_ibm_models/tableformer/otsl.py +++ b/docling_ibm_models/tableformer/otsl.py @@ -11,7 +11,7 @@ LOG_LEVEL = logging.INFO # LOG_LEVEL = logging.DEBUG logger = s.get_custom_logger("consolidate", LOG_LEVEL) -png_files = {} # Evaluation files +# png_files = {} # Evaluation files total_pics = 0 diff --git a/docling_ibm_models/tableformer/utils/mem_monitor.py b/docling_ibm_models/tableformer/utils/mem_monitor.py index be47c9c..c263b20 100644 --- a/docling_ibm_models/tableformer/utils/mem_monitor.py +++ b/docling_ibm_models/tableformer/utils/mem_monitor.py @@ -5,6 +5,7 @@ import os import platform import re +from typing import Dict, Union class MemMonitor: @@ -112,7 +113,7 @@ def __init__(self, enable=True): regex_str = r"({}:)(\s+)(\d*)(.*)".format(mem_field) self._status_regex[mem_field] = re.compile(regex_str) - def get_memory_full(self) -> dict: + def get_memory_full(self) -> Union[Dict, int]: r""" - Parse /proc/status to get all memory info. - The method returns a dict with the fields self._status_fields @@ -140,7 +141,7 @@ def get_memory_full(self) -> dict: return memory - def get_memory(self) -> dict: + def get_memory(self) -> Union[Dict, int]: r""" - Parse /proc/statm to get the most important memory fields - This is a fast implementation. 
diff --git a/docling_ibm_models/tableformer/utils/torch_utils.py b/docling_ibm_models/tableformer/utils/torch_utils.py
deleted file mode 100644
index 09bcde2..0000000
--- a/docling_ibm_models/tableformer/utils/torch_utils.py
+++ /dev/null
@@ -1,216 +0,0 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-import torch
-
-
-def model_info(model, verbose=False):
-    # Plots a line-by-line description of a PyTorch model
-    n_p = sum(x.numel() for x in model.parameters())  # number parameters
-    n_g = sum(
-        x.numel() for x in model.parameters() if x.requires_grad
-    )  # number gradients
-    if verbose:
-        print(
-            "%5s %40s %9s %12s %20s %10s %10s"
-            % ("layer", "name", "gradient", "parameters", "shape", "mu", "sigma")
-        )
-        for i, (name, p) in enumerate(model.named_parameters()):
-            name = name.replace("module_list.", "")
-            print(
-                "%5g %40s %9s %12g %20s %10.3g %10.3g"
-                % (
-                    i,
-                    name,
-                    p.requires_grad,
-                    p.numel(),
-                    list(p.shape),
-                    p.mean(),
-                    p.std(),
-                )
-            )
-
-    try:  # FLOPS
-        from thop import profile
-
-        macs, _ = profile(model, inputs=(torch.zeros(1, 3, 480, 640),), verbose=False)
-        fs = ", %.1f GFLOPS" % (macs / 1e9 * 2)
-    except Exception:
-        fs = ""
-
-    print(
-        "Model Summary: %g layers, %g parameters, %g gradients%s"
-        % (len(list(model.parameters())), n_p, n_g, fs)
-    )
-
-
-# def init_seeds(seed=0):
-#     torch.manual_seed(seed)
-#
-#     # Reduce randomness (may be slower on Tesla GPUs)
-#     # https://pytorch.org/docs/stable/notes/randomness.html
-#     if seed == 0:
-#         cudnn.deterministic = False
-#         cudnn.benchmark = True
-#
-#
-# def select_device(device='', apex=False, batch_size=None):
-#     # device = 'cpu' or '0' or '0,1,2,3'
-#     cpu_request = device.lower() == 'cpu'
-#     if device and not cpu_request:  # if device requested other than 'cpu'
-#         os.environ['CUDA_VISIBLE_DEVICES'] = device  # set environment variable
-#         # check availablity
-#         assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device
-#
-#     cuda = False if cpu_request else torch.cuda.is_available()
-#     if cuda:
-#         c = 1024 ** 2  # bytes to MB
-#         ng = torch.cuda.device_count()
-#         if ng > 1 and batch_size:  # check that batch_size is compatible with device_count
-#             assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % \
-#                 (batch_size, ng)
-#         x = [torch.cuda.get_device_properties(i) for i in range(ng)]
-#         # apex for mixed precision https://github.com/NVIDIA/apex
-#         s = 'Using CUDA ' + ('Apex ' if apex else '')
-#         for i in range(0, ng):
-#             if i == 1:
-#                 s = ' ' * len(s)
-#             print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
-#                   (s, i, x[i].name, x[i].total_memory / c))
-#     else:
-#         print('Using CPU')
-#
-#     print('')  # skip a line
-#     return torch.device('cuda:0' if cuda else 'cpu')
-#
-#
-# def time_synchronized():
-#     torch.cuda.synchronize() if torch.cuda.is_available() else None
-#     return time.time()
-#
-#
-# def initialize_weights(model):
-#     for m in model.modules():
-#         t = type(m)
-#         if t is nn.Conv2d:
-#             pass  # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-#         elif t is nn.BatchNorm2d:
-#             m.eps = 1e-4
-#             m.momentum = 0.03
-#         elif t in [nn.LeakyReLU, nn.ReLU, nn.ReLU6]:
-#             m.inplace = True
-#
-#
-# def find_modules(model, mclass=nn.Conv2d):
-#     # finds layer indices matching module class 'mclass'
-#     return [i for i, m in enumerate(model.module_list) if isinstance(m, mclass)]
-#
-#
-# def fuse_conv_and_bn(conv, bn):
-#     # https://tehnokv.com/posts/fusing-batchnorm-and-conv/
-#     with torch.no_grad():
-#         # init
-#         fusedconv = torch.nn.Conv2d(conv.in_channels,
-#                                     conv.out_channels,
-#                                     kernel_size=conv.kernel_size,
-#                                     stride=conv.stride,
-#                                     padding=conv.padding,
-#                                     bias=True)
-#
-#         # prepare filters
-#         w_conv = conv.weight.clone().view(conv.out_channels, -1)
-#         w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
-#         fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size()))
-#
-#         # prepare spatial bias
-#         if conv.bias is not None:
-#             b_conv = conv.bias
-#         else:
-#             b_conv = torch.zeros(conv.weight.size(0))
-#         b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
-#         fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
-#
-#         return fusedconv
-#
-#
-# def load_classifier(name='resnet101', n=2):
-#     # Loads a pretrained model reshaped to n-class output
-#     import pretrainedmodels  # https://github.com/Cadene/pretrained-models.pytorch#torchvision
-#     model = pretrainedmodels.__dict__[name](num_classes=1000, pretrained='imagenet')
-#
-#     # Display model properties
-#     for x in ['model.input_size', 'model.input_space', 'model.input_range', 'model.mean',
-#               'model.std']:
-#         print(x + ' =', eval(x))
-#
-#     # Reshape output to n classes
-#     filters = model.last_linear.weight.shape[1]
-#     model.last_linear.bias = torch.nn.Parameter(torch.zeros(n))
-#     model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters))
-#     model.last_linear.out_features = n
-#     return model
-#
-#
-# def scale_img(img, ratio=1.0, same_shape=True):  # img(16,3,256,416), r=ratio
-#     # scales img(bs,3,y,x) by ratio
-#     h, w = img.shape[2:]
-#     s = (int(h * ratio), int(w * ratio))  # new size
-#     img = F.interpolate(img, size=s, mode='bilinear', align_corners=False)  # resize
-#     if not same_shape:  # pad/crop img
-#         gs = 64  # (pixels) grid size
-#         h, w = [math.ceil(x * ratio / gs) * gs for x in (h, w)]
-#     return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447)  # value = imagenet mean
-#
-#
-# class ModelEMA:
-#     """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
-#     Keep a moving average of everything in the model state_dict (parameters and buffers).
-#     This is intended to allow functionality like
-#     https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
-#     A smoothed version of the weights is necessary for some training schemes to perform well.
-#     E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use
-#     RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA
-#     smoothing of weights to match results. Pay attention to the decay constant you are using
-#     relative to your update count per epoch.
-#     To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
-#     disable validation of the EMA weights. Validation will have to be done manually in a separate
-#     process, or after the training stops converging.
-#     This class is sensitive where it is initialized in the sequence of model init,
-#     GPU assignment and distributed training wrappers.
-#     I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and
-#     single-GPU.
-#     """
-#
-#     def __init__(self, model, decay=0.9999, device=''):
-#         # make a copy of the model for accumulating moving average of weights
-#         self.ema = deepcopy(model)
-#         self.ema.eval()
-#         self.updates = 0  # number of EMA updates
-#         # decay exponential ramp (to help early epochs)
-#         self.decay = lambda x: decay * (1 - math.exp(-x / 2000))
-#         self.device = device  # perform ema on different device from model if set
-#         if device:
-#             self.ema.to(device=device)
-#         for p in self.ema.parameters():
-#             p.requires_grad_(False)
-#
-#     def update(self, model):
-#         self.updates += 1
-#         d = self.decay(self.updates)
-#         with torch.no_grad():
-#             if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel):
-#                 msd, esd = model.module.state_dict(), self.ema.module.state_dict()
-#             else:
-#                 msd, esd = model.state_dict(), self.ema.state_dict()
-#
-#             for k, v in esd.items():
-#                 if v.dtype.is_floating_point:
-#                     v *= d
-#                     v += (1. - d) * msd[k].detach()
-#
-#     def update_attr(self, model):
-#         # Assign attributes (which may change during training)
-#         for k in model.__dict__.keys():
-#             if not k.startswith('_'):
-#                 setattr(self.ema, k, getattr(model, k))
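torch_utils.py was dead weight: one unused helper (`model_info`) plus roughly 170 lines of commented-out utilities, so deleting it is simpler than annotating it for mypy. If the live part is ever wanted again, it reduces to a few lines against plain torch; a minimal sketch, not a restoration of the deleted API:

```python
import torch.nn as nn


def count_parameters(model: nn.Module) -> tuple[int, int]:
    """Return (total, trainable) parameter counts, as model_info reported."""
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable


total, trainable = count_parameters(nn.Linear(16, 4))
print(f"{total} parameters, {trainable} gradients")  # 68 parameters, 68 gradients
```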
-# """ -# -# def __init__(self, model, decay=0.9999, device=''): -# # make a copy of the model for accumulating moving average of weights -# self.ema = deepcopy(model) -# self.ema.eval() -# self.updates = 0 # number of EMA updates -# # decay exponential ramp (to help early epochs) -# self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) -# self.device = device # perform ema on different device from model if set -# if device: -# self.ema.to(device=device) -# for p in self.ema.parameters(): -# p.requires_grad_(False) -# -# def update(self, model): -# self.updates += 1 -# d = self.decay(self.updates) -# with torch.no_grad(): -# if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel): -# msd, esd = model.module.state_dict(), self.ema.module.state_dict() -# else: -# msd, esd = model.state_dict(), self.ema.state_dict() -# -# for k, v in esd.items(): -# if v.dtype.is_floating_point: -# v *= d -# v += (1. - d) * msd[k].detach() -# -# def update_attr(self, model): -# # Assign attributes (which may change during training) -# for k in model.__dict__.keys(): -# if not k.startswith('_'): -# setattr(self.ema, k, getattr(model, k)) diff --git a/pyproject.toml b/pyproject.toml index 5c4da77..7d51cc1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,14 +106,14 @@ parser_angular_minor_types = "feat" parser_angular_patch_types = "fix,perf" -# [tool.mypy] -# pretty = true -# no_implicit_optional = true -# python_version = "3.10" -# -# [[tool.mypy.overrides]] -# module = [ -# "torchvision.*", -# "transformers.*" -# ] -# ignore_missing_imports = true \ No newline at end of file +[tool.mypy] +pretty = true +no_implicit_optional = true +python_version = "3.10" + +[[tool.mypy.overrides]] +module = [ + "torchvision.*", + "transformers.*" +] +ignore_missing_imports = true \ No newline at end of file diff --git a/tests/test_code_formula_predictor.py b/tests/test_code_formula_predictor.py index 52614f4..0a62084 100644 --- a/tests/test_code_formula_predictor.py +++ b/tests/test_code_formula_predictor.py @@ -90,6 +90,16 @@ def test_code_formula_predictor(init: dict): is_exception = True assert is_exception + # wrong value for temperature + is_exception = False + try: + dummy_image = Image.new(mode="RGB", size=(100, 100), color=(255, 255, 255)) + for _ in code_formula_predictor.predict([dummy_image], ["label"], None): + pass + except Exception: + is_exception = True + assert is_exception + # mistmatched number of images and labels is_exception = False try: @@ -112,7 +122,7 @@ def test_code_formula_predictor(init: dict): output = code_formula_predictor.predict([img], [label], temperature) output = output[0] - + assert output == gt # Load images as numpy arrays