diff --git a/recognized.txt b/recognized.txt new file mode 100644 index 0000000..9997a89 --- /dev/null +++ b/recognized.txt @@ -0,0 +1,33 @@ +Relaxin +Prolactin +Androgens +Progesterone +Estrogens +Erythropoietin +Renin +Calcitriol +Somatostatin +Insulin +Glucagon +Noradrenaline +Adrenaline +Neuropeptide +Glucocorticoids +Somatostatin +Androgens +Histamine +Gastrin +Thymopoietin +PTH +Calcitonin +Vasopressin +Oxytocin +MSH +FSH +TSH +Vasopressin +Somatostatin +Dopamine +TRH +CRH +Melatborn diff --git a/tess.ipynb b/tess.ipynb new file mode 100644 index 0000000..3a33d5a --- /dev/null +++ b/tess.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import cv2\n", + "import pytesseract" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " ...,\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255]],\n", + "\n", + " [[255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " ...,\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255]],\n", + "\n", + " [[255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " ...,\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255]],\n", + "\n", + " ...,\n", + "\n", + " [[255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " ...,\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255]],\n", + "\n", + " [[255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " ...,\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255]],\n", + "\n", + " [[255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " ...,\n", + " [255, 255, 255],\n", + " [255, 255, 255],\n", + " [255, 255, 255]]], dtype=uint8)" + ] + }, + 
"execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "img=cv2.imread(\"sample.jpeg\")\n", + "img" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Relaxin', 'Prolactin', 'Androgens', 'Progesterone', 'Estrogens', 'Erythropoietin', 'Renin', 'Calcitriol', 'Somatostatin', 'Insulin', 'Glucagon', 'Noradrenaline', 'Adrenaline', 'Neuropeptide', 'Glucocorticoids', 'Somatostatin', 'Androgens', 'Histamine', 'Gastrin', 'Thymopoietin', 'PTH', 'Calcitonin', 'Vasopressin', 'Oxytocin', 'MSH', 'FSH', 'TSH', 'Vasopressin', 'Somatostatin', 'Dopamine', 'TRH', 'CRH', 'Melatborn']\n" + ] + } + ], + "source": [ + "import cv2\n", + "import pytesseract\n", + "import re\n", + "\n", + "# Function to filter out invalid or incomplete hormone names\n", + "def filter_hormones(hormone_list):\n", + " valid_hormones = []\n", + " for hormone in hormone_list:\n", + " # Remove any text that is too short (e.g., less than 3 characters) or has numbers or special characters\n", + " if len(hormone) > 2 and hormone.isalpha():\n", + " valid_hormones.append(hormone.strip())\n", + " return valid_hormones\n", + "\n", + "# Assuming you already have your image processed to get contours\n", + "\n", + "# Dilation and contour extraction (unchanged)\n", + "dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)\n", + "contours, hierarchy = 
cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)\n", + "\n", + "# Creating a copy of image\n", + "im2 = img.copy()\n", + "\n", + "# A text file is created and flushed\n", + "file = open(\"recognized.txt\", \"w+\")\n", + "file.write(\"\")\n", + "file.close()\n", + "\n", + "# List to store recognized hormones\n", + "hormones = []\n", + "\n", + "# Looping through the identified contours\n", + "for cnt in contours:\n", + " x, y, w, h = cv2.boundingRect(cnt)\n", + " \n", + " # Drawing a rectangle on copied image\n", + " rect = cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)\n", + " \n", + " # Cropping the text block for giving input to OCR\n", + " cropped = im2[y:y + h, x:x + w]\n", + "\n", + " # Open the file to append the extracted text\n", + " file = open(\"recognized.txt\", \"a\")\n", + " \n", + " # Apply OCR on the cropped image\n", + " text = pytesseract.image_to_string(cropped)\n", + " \n", + " # Clean up the text (replace newlines with spaces, strip leading/trailing spaces)\n", + " cleaned_text = text.replace(\"\\n\", \" \").strip()\n", + " \n", + " # Split the cleaned text based on commas and newlines (both)\n", + " hormone_list = re.split(r'[,\\n]', cleaned_text)\n", + " \n", + " # Clean up any empty strings or excessive spaces\n", + " hormone_list = [hormone.strip() for hormone in hormone_list if hormone.strip()]\n", + " \n", + " # Filter valid hormones\n", + " valid_hormones = filter_hormones(hormone_list)\n", + " \n", + " # If there are any valid hormone names, append them to the hormones list\n", + " if valid_hormones:\n", + " hormones.extend(valid_hormones)\n", + " file.write(\" \".join(valid_hormones) + \"\\n\")\n", + " \n", + " # Close the file after writing\n", + " file.close()\n", + "\n", + "# Print the recognized hormones\n", + "print(hormones)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
"""Extract hormone names from an image with OpenCV and Tesseract OCR.

The script binarizes ``sample.jpeg``, dilates the result so that neighbouring
glyphs fuse into one contour per text block, runs Tesseract on every block,
and writes the purely alphabetic names it finds to ``recognized.txt`` (one
line per text block that produced at least one name).
"""
import re

IMAGE_PATH = "sample.jpeg"
OUTPUT_PATH = "recognized.txt"

# OCR text is split into candidate names on commas and line breaks.
_SEPARATORS = re.compile(r"[,\n]")

# Anything shorter than this is treated as OCR noise.
_MIN_NAME_LENGTH = 3


def filter_hormones(hormone_list):
    """Return the entries of *hormone_list* that look like hormone names.

    Each entry is stripped of surrounding whitespace first; it is kept only
    if the remainder has at least three characters and consists solely of
    letters (no digits, punctuation, or inner spaces).

    Args:
        hormone_list: Iterable of candidate strings.

    Returns:
        A list of the accepted, stripped names in their original order.
    """
    # Strip *before* validating: ``str.isalpha`` is False for padded text,
    # so stripping afterwards (as the code used to) could never take effect.
    stripped = (candidate.strip() for candidate in hormone_list)
    return [
        name
        for name in stripped
        if len(name) >= _MIN_NAME_LENGTH and name.isalpha()
    ]


def extract_hormones(text):
    """Split raw OCR *text* on commas/newlines and return the valid names.

    The split is applied to the raw text. Replacing newlines with spaces
    beforehand would glue the lines of a multi-line block into one candidate
    containing spaces, which ``filter_hormones`` rejects as non-alphabetic.
    """
    return filter_hormones(_SEPARATORS.split(text))


def recognize_hormones(image_path=IMAGE_PATH, output_path=OUTPUT_PATH):
    """Run OCR over every text block of an image and record the names found.

    Args:
        image_path: Path of the image to read.
        output_path: Text file to (re)create; one line is written per text
            block that yielded at least one valid name, names separated by
            a single space.

    Returns:
        A flat list of all recognized names, in contour order.

    Raises:
        FileNotFoundError: If OpenCV cannot read *image_path*.
    """
    # Imported here so the pure-text helpers above stay importable (and
    # testable) on machines without OpenCV / Tesseract installed.
    import cv2
    import pytesseract

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Otsu picks the threshold automatically; the inverted binary image makes
    # the text the non-zero foreground that dilation and contours operate on.
    _, thresh = cv2.threshold(
        gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV
    )

    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))
    dilation = cv2.dilate(thresh, rect_kernel, iterations=1)

    contours, _ = cv2.findContours(
        dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
    )

    hormones = []
    # Mode "w" truncates any previous results; the context manager closes the
    # file even if OCR raises part-way through.
    with open(output_path, "w", encoding="utf-8") as out:
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)

            # Crop from the untouched image so that no drawn annotation
            # (e.g. bounding-box borders) leaks into the OCR input.
            cropped = img[y:y + h, x:x + w]

            valid_hormones = extract_hormones(
                pytesseract.image_to_string(cropped)
            )
            if valid_hormones:
                hormones.extend(valid_hormones)
                out.write(" ".join(valid_hormones) + "\n")

    return hormones


if __name__ == "__main__":
    print(recognize_hormones())