Skip to content

Project #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions recognized.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Relaxin
Prolactin
Androgens
Progesterone
Estrogens
Erythropoietin
Renin
Calcitriol
Somatostatin
Insulin
Glucagon
Noradrenaline
Adrenaline
Neuropeptide
Glucocorticoids
Somatostatin
Androgens
Histamine
Gastrin
Thymopoietin
PTH
Calcitonin
Vasopressin
Oxytocin
MSH
FSH
TSH
Vasopressin
Somatostatin
Dopamine
TRH
CRH
Melatborn
222 changes: 222 additions & 0 deletions tess.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import pytesseract"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[[255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" ...,\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255]],\n",
"\n",
" [[255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" ...,\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255]],\n",
"\n",
" [[255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" ...,\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255]],\n",
"\n",
" ...,\n",
"\n",
" [[255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" ...,\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255]],\n",
"\n",
" [[255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" ...,\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255]],\n",
"\n",
" [[255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" ...,\n",
" [255, 255, 255],\n",
" [255, 255, 255],\n",
" [255, 255, 255]]], dtype=uint8)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"img=cv2.imread(\"sample.jpeg\")\n",
"img"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Relaxin', 'Prolactin', 'Androgens', 'Progesterone', 'Estrogens', 'Erythropoietin', 'Renin', 'Calcitriol', 'Somatostatin', 'Insulin', 'Glucagon', 'Noradrenaline', 'Adrenaline', 'Neuropeptide', 'Glucocorticoids', 'Somatostatin', 'Androgens', 'Histamine', 'Gastrin', 'Thymopoietin', 'PTH', 'Calcitonin', 'Vasopressin', 'Oxytocin', 'MSH', 'FSH', 'TSH', 'Vasopressin', 'Somatostatin', 'Dopamine', 'TRH', 'CRH', 'Melatborn']\n"
]
}
],
"source": [
"import cv2\n",
"import pytesseract\n",
"import re\n",
"\n",
"# Function to filter out invalid or incomplete hormone names\n",
"def filter_hormones(hormone_list):\n",
" valid_hormones = []\n",
" for hormone in hormone_list:\n",
" # Remove any text that is too short (e.g., less than 3 characters) or has numbers or special characters\n",
" if len(hormone) > 2 and hormone.isalpha():\n",
" valid_hormones.append(hormone.strip())\n",
" return valid_hormones\n",
"\n",
"# Assuming you already have your image processed to get contours\n",
"\n",
"# Dilation and contour extraction (unchanged)\n",
"dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)\n",
"contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)\n",
"\n",
"# Creating a copy of image\n",
"im2 = img.copy()\n",
"\n",
"# A text file is created and flushed\n",
"file = open(\"recognized.txt\", \"w+\")\n",
"file.write(\"\")\n",
"file.close()\n",
"\n",
"# List to store recognized hormones\n",
"hormones = []\n",
"\n",
"# Looping through the identified contours\n",
"for cnt in contours:\n",
" x, y, w, h = cv2.boundingRect(cnt)\n",
" \n",
" # Drawing a rectangle on copied image\n",
" rect = cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)\n",
" \n",
" # Cropping the text block for giving input to OCR\n",
" cropped = im2[y:y + h, x:x + w]\n",
"\n",
" # Open the file to append the extracted text\n",
" file = open(\"recognized.txt\", \"a\")\n",
" \n",
" # Apply OCR on the cropped image\n",
" text = pytesseract.image_to_string(cropped)\n",
" \n",
" # Clean up the text (replace newlines with spaces, strip leading/trailing spaces)\n",
" cleaned_text = text.replace(\"\\n\", \" \").strip()\n",
" \n",
" # Split the cleaned text based on commas and newlines (both)\n",
" hormone_list = re.split(r'[,\\n]', cleaned_text)\n",
" \n",
" # Clean up any empty strings or excessive spaces\n",
" hormone_list = [hormone.strip() for hormone in hormone_list if hormone.strip()]\n",
" \n",
" # Filter valid hormones\n",
" valid_hormones = filter_hormones(hormone_list)\n",
" \n",
" # If there are any valid hormone names, append them to the hormones list\n",
" if valid_hormones:\n",
" hormones.extend(valid_hormones)\n",
" file.write(\" \".join(valid_hormones) + \"\\n\")\n",
" \n",
" # Close the file after writing\n",
" file.close()\n",
"\n",
"# Print the recognized hormones\n",
"print(hormones)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
80 changes: 80 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import cv2
import pytesseract
import re

# Path of the image whose text blocks are to be recognized
IMAGE_PATH = "sample.jpeg"

# Read the input image
img = cv2.imread(IMAGE_PATH)

# cv2.imread reports a missing or undecodable file by returning None rather
# than raising, which would otherwise surface later as an opaque cvtColor error.
if img is None:
    raise FileNotFoundError(f"Could not read image file: {IMAGE_PATH}")

# Convert the image to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Apply OTSU thresholding to get an inverted binary image (the threshold
# value passed as 0 is ignored because OTSU picks it automatically)
_, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)

# Define a rectangular structuring element for dilation
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))

# Dilate the binary image so nearby foreground pixels merge into blobs
dilation = cv2.dilate(thresh1, rect_kernel, iterations=1)

# Find the outermost contours of the dilated blobs
contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

# Create a copy of the original image to draw rectangles on
im2 = img.copy()

# Function to filter valid hormone names
def filter_hormones(hormone_list):
    """Return the entries of *hormone_list* that look like hormone names.

    Each candidate is stripped of surrounding whitespace and kept only if the
    stripped text is longer than two characters and consists solely of
    letters (no digits, spaces, or punctuation).

    Args:
        hormone_list: Iterable of candidate strings.

    Returns:
        A new list with the accepted names, stripped, in their input order.
    """
    valid_hormones = []
    for hormone in hormone_list:
        # Strip before validating: str.isalpha() is False for any string that
        # contains whitespace, so padding would otherwise reject a valid name.
        name = hormone.strip()
        if len(name) > 2 and name.isalpha():
            valid_hormones.append(name)
    return valid_hormones

# List to store recognized hormone names
hormones = []

# Open the output file once in write mode: this truncates any previous
# contents and guarantees the handle is closed even if OCR raises midway.
with open("recognized.txt", "w", encoding="utf-8") as file:
    # Loop through all the contours found
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)

        # Draw a rectangle around each contour on the annotated copy
        cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # Crop the region of interest from the untouched image so the
        # rectangles drawn on im2 are not fed to the OCR engine
        cropped = img[y:y + h, x:x + w]

        # Use Tesseract to extract text from the cropped image
        text = pytesseract.image_to_string(cropped)

        # Split on commas and newlines BEFORE any newline normalisation, so
        # that names on separate lines of one crop become separate entries;
        # drop fragments that are empty after stripping.
        hormone_list = [
            part.strip() for part in re.split(r"[,\n]", text) if part.strip()
        ]

        # Filter valid hormone names from the extracted text
        valid_hormones = filter_hormones(hormone_list)

        # Record the names from this crop: one output line per contour
        if valid_hormones:
            hormones.extend(valid_hormones)
            file.write(" ".join(valid_hormones) + "\n")

# Print the recognized hormones
print(hormones)