RandomwalkDev · ajitha26 · Oct 20, 2024 · Oct 20, 2024
diff --git a/recognized.txt b/recognized.txt
@@ -0,0 +1,33 @@
+Relaxin
+Prolactin
+Androgens
+Progesterone
+Estrogens
+Erythropoietin
+Renin
+Calcitriol
+Somatostatin
+Insulin
+Glucagon
+Noradrenaline
+Adrenaline
+Neuropeptide
+Glucocorticoids
+Somatostatin
+Androgens
+Histamine
+Gastrin
+Thymopoietin
+PTH
+Calcitonin
+Vasopressin
+Oxytocin
+MSH
+FSH
+TSH
+Vasopressin
+Somatostatin
+Dopamine
+TRH
+CRH
+Melatborn
diff --git a/tess.ipynb b/tess.ipynb
@@ -0,0 +1,222 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cv2\n",
+    "import pytesseract"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[[255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        ...,\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255]],\n",
+       "\n",
+       "       [[255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        ...,\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255]],\n",
+       "\n",
+       "       [[255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        ...,\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255]],\n",
+       "\n",
+       "       ...,\n",
+       "\n",
+       "       [[255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        ...,\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255]],\n",
+       "\n",
+       "       [[255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        ...,\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255]],\n",
+       "\n",
+       "       [[255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        ...,\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255],\n",
+       "        [255, 255, 255]]], dtype=uint8)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "img=cv2.imread(\"sample.jpeg\")\n",
+    "img"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Relaxin', 'Prolactin', 'Androgens', 'Progesterone', 'Estrogens', 'Erythropoietin', 'Renin', 'Calcitriol', 'Somatostatin', 'Insulin', 'Glucagon', 'Noradrenaline', 'Adrenaline', 'Neuropeptide', 'Glucocorticoids', 'Somatostatin', 'Androgens', 'Histamine', 'Gastrin', 'Thymopoietin', 'PTH', 'Calcitonin', 'Vasopressin', 'Oxytocin', 'MSH', 'FSH', 'TSH', 'Vasopressin', 'Somatostatin', 'Dopamine', 'TRH', 'CRH', 'Melatborn']\n"
+     ]
+    }
+   ],
+   "source": [
+    "import cv2\n",
+    "import pytesseract\n",
+    "import re\n",
+    "\n",
+    "# Function to filter out invalid or incomplete hormone names\n",
+    "def filter_hormones(hormone_list):\n",
+    "    valid_hormones = []\n",
+    "    for hormone in hormone_list:\n",
+    "        # Remove any text that is too short (e.g., less than 3 characters) or has numbers or special characters\n",
+    "        if len(hormone) > 2 and hormone.isalpha():\n",
+    "            valid_hormones.append(hormone.strip())\n",
+    "    return valid_hormones\n",
+    "\n",
+    "# Relies on img, thresh1 and rect_kernel defined in the cells above\n",
+    "\n",
+    "# Dilation and contour extraction\n",
+    "dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)\n",
+    "contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)\n",
+    "\n",
+    "# Creating a copy of image\n",
+    "im2 = img.copy()\n",
+    "\n",
+    "# A text file is created and flushed\n",
+    "file = open(\"recognized.txt\", \"w+\")\n",
+    "file.write(\"\")\n",
+    "file.close()\n",
+    "\n",
+    "# List to store recognized hormones\n",
+    "hormones = []\n",
+    "\n",
+    "# Looping through the identified contours\n",
+    "for cnt in contours:\n",
+    "    x, y, w, h = cv2.boundingRect(cnt)\n",
+    "    \n",
+    "    # Drawing a rectangle on copied image\n",
+    "    rect = cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)\n",
+    "    \n",
+    "    # Cropping the text block for giving input to OCR\n",
+    "    cropped = im2[y:y + h, x:x + w]\n",
+    "\n",
+    "    # Open the file to append the extracted text\n",
+    "    file = open(\"recognized.txt\", \"a\")\n",
+    "    \n",
+    "    # Apply OCR on the cropped image\n",
+    "    text = pytesseract.image_to_string(cropped)\n",
+    "    \n",
+    "    # Clean up the text (replace newlines with spaces, strip leading/trailing spaces)\n",
+    "    cleaned_text = text.replace(\"\\n\", \" \").strip()\n",
+    "    \n",
+    "    # Split on commas (newlines were already replaced by spaces above, so only commas can match)\n",
+    "    hormone_list = re.split(r'[,\\n]', cleaned_text)\n",
+    "    \n",
+    "    # Clean up any empty strings or excessive spaces\n",
+    "    hormone_list = [hormone.strip() for hormone in hormone_list if hormone.strip()]\n",
+    "    \n",
+    "    # Filter valid hormones\n",
+    "    valid_hormones = filter_hormones(hormone_list)\n",
+    "    \n",
+    "    # If there are any valid hormone names, append them to the hormones list\n",
+    "    if valid_hormones:\n",
+    "        hormones.extend(valid_hormones)\n",
+    "        file.write(\" \".join(valid_hormones) + \"\\n\")\n",
+    "    \n",
+    "    # Close the file after writing\n",
+    "    file.close()\n",
+    "\n",
+    "# Print the recognized hormones\n",
+    "print(hormones)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/test.py b/test.py
@@ -0,0 +1,80 @@
+import cv2
+import pytesseract
+import re
+
+# Read the input image. cv2.imread returns None (it does not raise) when
+# the file is missing or cannot be decoded, so fail early with a clear error.
+img = cv2.imread("sample.jpeg")
+if img is None:
+    raise FileNotFoundError("could not read image: sample.jpeg")
+
+# Grayscale, then Otsu thresholding (threshold picked automatically) with
+# inverted output, giving the binary image used for region detection.
+gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+_, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
+
+# Dilate with an 18x18 rectangular kernel so nearby foreground pixels merge.
+rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))
+dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)
+
+# Find the external contours of the dilated image
+contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+
+# Create a copy of the original image to draw rectangles on
+im2 = img.copy()
+
+# Function to filter valid hormone names
+def filter_hormones(hormone_list):
+    """Return the entries of *hormone_list* that look like a single word.
+
+    Each entry is stripped first; it is kept only if the stripped text has
+    at least three characters and is purely alphabetic (``str.isalpha``).
+    """
+    return [h for h in map(str.strip, hormone_list) if len(h) > 2 and h.isalpha()]
+
+
+def _split_candidates(text):
+    """Split raw OCR *text* into stripped, non-empty candidate names.
+
+    Splitting on commas and newlines happens before any other clean-up, so
+    a region holding several lines yields one candidate per line. Pieces
+    that are empty after stripping are dropped.
+    """
+    pieces = (piece.strip() for piece in re.split(r"[,\n]", text))
+    return [piece for piece in pieces if piece]
+
+
+# List to store recognized hormone names
+hormones = []
+
+# Opening in "w" mode truncates any previous results (or creates the file),
+# so no separate clearing step is needed; the context manager closes the
+# handle even if OCR raises part-way through.
+with open("recognized.txt", "w") as file:
+    # Loop through all the contours found
+    for cnt in contours:
+        x, y, w, h = cv2.boundingRect(cnt)
+
+        # Draw the box on the annotated copy only
+        cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)
+
+        # Crop the region of interest from the untouched image: cropping
+        # im2 would feed the green box outlines drawn above into the OCR.
+        cropped = img[y:y + h, x:x + w]
+
+        # Use Tesseract to extract text from the cropped image
+        text = pytesseract.image_to_string(cropped)
+
+        # Keep only the candidates that pass the hormone-name filter
+        valid_hormones = filter_hormones(_split_candidates(text))
+
+        # Nothing usable in this region: write no line for it
+        if not valid_hormones:
+            continue
+
+        # Record the names and write one line per text region
+        hormones.extend(valid_hormones)
+        file.write(" ".join(valid_hormones) + "\n")
+
+# Print the recognized hormones
+print(hormones)