# -*- coding: utf-8 -*-
"""OCR + LLM pipeline: extract gland -> hormone relationships from an image.

Originally generated by Colab:
    https://colab.research.google.com/drive/17AyuY5Ocj8Whzke_L_m3nrYmWlQmF11S

Submitted By: Anshu Saini (IIITDM Kancheepuram, BTech CSE-AI)
Email: anshucodes4u@gmail.com / cs22b2051@iiitdm.ac.in

Setup (these were notebook magics in the original Colab file and are NOT
valid Python statements, so they must be run in a shell, not left inline):
    pip install opencv-python pytesseract langchain_groq
    sudo apt install tesseract-ocr libtesseract-dev
"""

import json
import os

import cv2
import pytesseract
from pytesseract import Output
from langchain_groq import ChatGroq

# Point pytesseract at the tesseract binary (default Linux install path).
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'


def preprocessing(path):
    """Read the image at *path*, binarize it, and run Tesseract OCR.

    Returns the raw pytesseract data dict (word-level text, boxes,
    confidences). Raises FileNotFoundError if the image cannot be read.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread silently returns None on a missing/unreadable file;
        # fail loudly here instead of crashing inside cvtColor.
        raise FileNotFoundError(f"Could not read image: {path}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Inverse binary threshold: dark text on a light page becomes white on black.
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    custom_config = r'--oem 3 --psm 6'  # LSTM engine, assume a uniform block of text
    return pytesseract.image_to_data(thresh, output_type=Output.DICT,
                                     config=custom_config)


def extract_text(text):
    """Ask the LLM to map glands to the hormones they secrete, given *text*.

    Returns a dict parsed from the model's JSON answer, or the raw response
    string if the answer is not valid JSON. Raises RuntimeError when the
    GROQ_API_KEY environment variable is not set.
    """
    # SECURITY: the original committed a live API key in source. Never do
    # that — read the key from the environment instead.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("Set the GROQ_API_KEY environment variable first.")
    llm = ChatGroq(
        temperature=0,  # deterministic output helps the JSON stay parseable
        groq_api_key=api_key,
        model_name="llama-3.1-70b-versatile",  # NOTE(review): confirm this model is still served by Groq
    )
    prompt = f"""
    Given the following text, extract the relationship between glands and the hormones they secrete.
    Organize the output in proper JSON format with glands as keys and lists of hormones as values.

    The output should look like this:
    {{
        "Gland Name 1": ["Hormone A", "Hormone B"],
        "Gland Name 2": ["Hormone C", "Hormone D"]
    }}
    give only the output of script
    Text: {text}
    """
    response_content = llm.invoke(prompt).content

    # The model may wrap or malform the JSON, so parsing can fail;
    # fall back to returning the raw response in that case.
    try:
        formatted_response = json.loads(response_content)
        print("OUTPUT:-")
        print(json.dumps(formatted_response, indent=4))
        return formatted_response
    except json.JSONDecodeError:
        print("OUTPUT:-")
        print(response_content)
        return response_content


def extract(path):
    """End-to-end pipeline: OCR the image at *path*, then extract gland/hormone pairs."""
    ocr_data = preprocessing(path)
    # Join the recognized words into plain text instead of dumping the raw
    # pytesseract dict (boxes, confidences, block numbers) into the prompt.
    ocr_text = " ".join(word for word in ocr_data["text"] if word.strip())
    return extract_text(ocr_text)


if __name__ == "__main__":
    path = "sample.jpeg"
    output_dict = extract(path)
+ + +4.)**JSON Output:** The relationships between glands and hormones are organized and output in a structured JSON format. + +**Technologies** + +1.)**OpenCV:** Used for reading and preprocessing the input image. +2.)**Tesseract:** Optical Character Recognition (OCR) engine for extracting text from the image. +3.)**ChatGroq:** A powerful LLM-based API used for parsing the extracted text and identifying relevant gland-hormone relationships. +4.)**Python 3.x:** The language used for development. + +**Installation Prerequisites** + +1.)Python 3.x +2.)OpenCV +3.)Tesseract OCR +4.)ChatGroq API access + +**Install Dependencies** + +1.) Clone the repository:- +use the commands:- +git clone https://github.com/username/ai-ml-task.git + +cd ai-ml-task + +2.) Install required Python libraries:- +pip install opencv-python pytesseract langchain_groq + +3.) Install Tesseract:- +For Windows, download it from the official [Tesseract Website](https://github.com/tesseract-ocr/tesseract) + +4.) Add the Tesseract executable path to your Python script (Windows only): +pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' + +**Usage** + +1.)Place the input image (sample.jpeg) in the root directory. +2.)Modify the path variable in the script to point to your image. +3.)Run the script. +4.)The output will be displayed in JSON format. + +**Example Output** + +{ + "Hypothalamus": ["TRH", "CRH", "GHRH", "Dopamine"], + "Pineal gland": ["Melatonin"] +} diff --git a/demonstration.mp4 b/demonstration.mp4 new file mode 100644 index 0000000..f7b13fe Binary files /dev/null and b/demonstration.mp4 differ