diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..5d368cf13 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +__pycache__/* +.idea +.idea/* +node_modules +*.docx +*.DS_Store +*.iml +*.log +*.csv +*.pyc +*/subtitles.json \ No newline at end of file diff --git a/ChromeExtension/img/CS410_Fall2023_CourseProject_TeamCAHJ.png b/ChromeExtension/img/CS410_Fall2023_CourseProject_TeamCAHJ.png new file mode 100644 index 000000000..e8408f738 Binary files /dev/null and b/ChromeExtension/img/CS410_Fall2023_CourseProject_TeamCAHJ.png differ diff --git a/ChromeExtension/index.html b/ChromeExtension/index.html new file mode 100644 index 000000000..3d5af1116 --- /dev/null +++ b/ChromeExtension/index.html @@ -0,0 +1,52 @@ + + + + + + + + Search Coursera Lectures + + +
+
+ +
+
+ + + +
+ +
+ + + + + + + diff --git a/ChromeExtension/js/search.js b/ChromeExtension/js/search.js new file mode 100644 index 000000000..094c07cd4 --- /dev/null +++ b/ChromeExtension/js/search.js @@ -0,0 +1,181 @@ +const search_btn = document.getElementById("submit-button"); +const result_container = document.querySelector('#result-container') + +search_btn.addEventListener('click', function () { + if (result_container.childElementCount > 0) { + // console.log("Has child(ren)") + remove_all_children(result_container) + } + + search_api() +}); + +async function search_wild() { + // console.log("Inside search_wild..") + //import {Client} from '@elastic' + + const ES_URL = "https://search-cs410-project-hw5dhpc4jsg3m74vnbalajt754.aos.us-east-1.on.aws" + const ES_USER = "elastic" + const ES_PASSWORD = "replace me" + + const client = new Client({ + node: ES_URL, + auth: { + username: ES_USER, + password: ES_PASSWORD + } + }) + + + const query_str = document.getElementById("searchbox").textContent + // console.log("query_str ", query_str) + const result = await client.search({ + index: 'subtitles', + size: 1, + from: 0, + query: { + "query_string": { + "query": query_str, + "default_field": "search_for" + } + } + }) + const timestam_obj = result.hits.hits[0]._source + return timestam_obj; +} + + +async function search_api() { + + // console.log("Inside search_api..") + + var headers = new Headers(); + headers.append("Content-Type", "application/json"); + headers.append("Authorization", "Basic ZWxhc3RpYzpwY2lXY2xwTE5kWHVpY1VoWFY4YmhnazI="); + + const query_txt = document.getElementById("searchbox").value + // console.log("query_txt ", query_txt) + const query_payload = { + size: 5, + from: 0, + query: { + "query_string": { + "query": query_txt + } + } + } + // console.log("query_payload ", query_payload) + var requestOptions = { + method: 'POST', + headers: headers, + body: JSON.stringify(query_payload) + }; + + const response = await fetch("https://ac55987c83844faa90726d4e5efe92b9.us-central1.gcp.cloud.es.io/subtitles/_search", requestOptions) + const record = await response.json() + // console.log("record ", record) + if(record.hits.total.value > 0) { + const result_num = Math.min(record.hits.total.value, 5) + // console.log("Maximum number of result: ", result_num) + for (let i = 0; i < result_num; i++) { + const result = record.hits.hits[i]._source + // console.log(result) + const result_dict = {} + const response_str = ''+ result.week + '
<br>' +
+                ' Title :: ' + result.lecture_title + '<br>' +
+                ' timestamp :: ' + result.time + '<br>' +
+                ' Subtitles : ' + result.text +
+                '<br>'
+            console.log("Response :: ", response_str)
+            result_dict["week"] = result.week
+            result_dict["lecture_title"] = result.lecture_title
+            result_dict["url"] = result.url
+            result_dict["time"] = result.time
+            result_dict["subtitles"] = result.text
+            result_dict["course_name"] = result.course_name
+            set_result_format(result_dict)
+        }
+    } else {
+        const result_div = document.createElement('div')
+        result_div.innerHTML = "We could not find a related topic"
+        result_container.appendChild(result_div)
+    }
+
+}
+
+function set_result_format(result_dict) {
+
+    // Initialize HTML components
+    const result_item = document.createElement('div')
+    const result_second_row = document.createElement('div')
+    const result_url = document.createElement('a')
+    const result_week = document.createElement('h4')
+    const result_time = document.createElement('h4')
+    const result_lecture_title = document.createElement('h4')
+    const result_subtitles = document.createElement('p')
+
+    // Set up class/id for some components
+    result_item.classList.add("result__item")
+    result_second_row.classList.add("result__second--row")
+    result_time.classList.add("timestamp")
+    result_url.classList.add("lecture__url")
+
+    // Set the content of components
+    result_url.href = result_dict["url"]
+    result_week.innerHTML = result_dict["week"]
+    const time_reformat = format_time(result_dict["time"])
+    result_time.innerHTML = time_reformat
+    result_lecture_title.innerHTML = result_dict["lecture_title"]
+    result_subtitles.innerHTML = result_dict["subtitles"]
+
+    // Organize HTML component structure
+    result_item.appendChild(result_url)
+    result_item.appendChild(result_week)
+    result_item.appendChild(result_second_row)
+    result_second_row.appendChild(result_time)
+    result_second_row.appendChild(result_lecture_title)
+    result_item.appendChild(result_subtitles)
+
+    result_container.appendChild(result_item)
+}
+
+function format_time(time) {
+    // Timestamps arrive as "mm:ss" or "hh:mm:ss", so read the parts from the left
+    let parts = time.split(':').map(part => parseInt(part, 10));
+    let hours = parts.length > 2 ? parts[0] : 0;
+    let minutes = parts.length > 2 ? parts[1] : parts[0];
+    let seconds = parts.length > 2 ? parts[2] : parts[1];
+
+    // Make sure each part has two digits
+    hours = hours.toString().padStart(2, '0');
+    minutes = minutes.toString().padStart(2, '0');
+    seconds = seconds.toString().padStart(2, '0');
+
+    return `${hours}:${minutes}:${seconds}`;
+}
+
+function remove_all_children(element) {
+    while (element.firstChild) {
+        element.removeChild(element.firstChild);
+    }
+}
+
+document.addEventListener('DOMContentLoaded', function () {
+    const parent = document.querySelector('.result__container');
+
+    parent.addEventListener('click', function (event) {
+        // Check if the clicked element or one of its ancestors has the class 'result__item'
+        let container = event.target.classList.contains('result__item')
+            ? 
event.target + : event.target.closest('.result__item'); + + if (container) { + // Extract the URL from the child anchor tag + let url = container.querySelector('.lecture__url').getAttribute('href'); + + // Open the URL + if (url) { + chrome.tabs.create({ url: url }); + } + } + }); +}); diff --git a/ChromeExtension/manifest.json b/ChromeExtension/manifest.json new file mode 100644 index 000000000..34e65f23c --- /dev/null +++ b/ChromeExtension/manifest.json @@ -0,0 +1,16 @@ +{ + "name": "CS410_Fall2023_CourseProject_TeamCAHJ", + "description": "Base Level Extension", + "version": "1.0", + "permissions": [ + "storage", + "tabs" + ], + "host_permissions": ["http://*/*", "https://*/*"], + "manifest_version": 3, + "action": { + "default_popup": "index.html", + "default_icon": "img/CS410_Fall2023_CourseProject_TeamCAHJ.png", + "default_title": "CS410_Fall2023_CourseProject_TeamCAHJ" + } +} \ No newline at end of file diff --git a/ChromeExtension/style.css b/ChromeExtension/style.css new file mode 100644 index 000000000..bdb3ba9f2 --- /dev/null +++ b/ChromeExtension/style.css @@ -0,0 +1,132 @@ +@import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap'); + +* { + box-sizing: border-box; + background-color: transparent; +} + +body { + font-family: 'Roboto', sans-serif; + align-items: center; + justify-content:center; + height: 100%; + overflow: hidden; + margin: 0px; +} + +.extension__container{ + display: flex; + flex-direction: column; + outline: 1px solid black; + height: 600px; + width: 450px; + margin: 0px; +} + +.header__course { + display: flex; + align-items: center; + background: white; + border-bottom: 1px solid rgb(225, 225, 225); + box-shadow: 4px 4px 8px 0 rgba(0, 0, 0, 0.2), 6px 6px 10px 0 rgba(0, 0, 0, 0.19); + height: 60px; + margin: 0; + padding: 10px; +} + + +#course-options { + border: none; + background-color: transparent; + font-size: 1.5rem; + font-weight: bold; + color: rgb(55, 55, 55); + flex-grow: 1; + word-wrap: break-word; + overflow: hidden; + max-height: 1.5em; +} + +.result__container { + flex-grow: 1; + background: rgb(245,245,245); + overflow-y: auto; + margin: 0; + padding: 15px; +} + +.result__container .result__item:hover { + cursor: pointer; +} + +.result__item { + display: flex; + flex-direction: column; + background: white; + box-shadow: 0 2px 4px 0 rgba(0, 0, 0, 0.1), 0 3px 10px 0 rgba(0, 0, 0, 0.1); + border-radius: 8px; + margin-bottom: 15px; + padding: 10px; +} + +.result__item h4 { + line-height: 1rem; + margin: 4px; + word-wrap: break-word; + overflow: hidden; + max-height: 1.5em; +} + +.result__second--row { + display: flex; + flex-direction: row; +} + +.timestamp { + color: rgb(47, 151, 242); +} + +.result__item p { + margin: 4px; + word-wrap: break-word; + line-height: 1em; + max-height: 3em; + overflow: hidden; + position: relative; +} + +/* .result__item p::after { + content: '...'; + position: absolute; + bottom: 0; + right: 0; +} */ + +.footer__input { + display: flex; + align-items: center; + height: 60px; + background: white; + box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.1), 0 6px 20px 0 rgba(0, 0, 0, 0.1); + border-top: 1px solid rgb(225, 225, 225); + margin: 0; + padding: 10px; +} + +#searchbox{ + flex-grow: 1; + margin-right: 10px; + background-color: white; + border: 2px solid grey; + border-radius: 5px; + height: 30px; +} + +#submit-button { + color: white; + background-color: rgb(96, 176, 246); + border: none; + height: 30px; + border-radius: 3px; +} + diff --git a/CourseraTranscriptScraper/CourseraScraper.py 
b/CourseraTranscriptScraper/CourseraScraper.py new file mode 100644 index 000000000..78c592c88 --- /dev/null +++ b/CourseraTranscriptScraper/CourseraScraper.py @@ -0,0 +1,168 @@ +import re +from bs4 import BeautifulSoup +from selenium import webdriver +from selenium.webdriver.chrome.service import Service as ChromeService +from selenium.webdriver.common.by import By +from webdriver_manager.chrome import ChromeDriverManager +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import TimeoutException + + +class CourseraScraper: + def __init__(self, course_url: str, username: str, password: str) -> None: + self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install())) + self.url = course_url + self.username = username + self.password = password + self.course_transcript_for_json = {} + # Login to Coursera to allow scraper to parse pages + CourseraScraperLogin(self.driver, self.username, self.password).login() + self.driver.get(self.url) + + def run_scraper(self): + # Parse course to get list of urls for each week to scrape + course_transcripts = [] + + course_parser = CourseraCourseParser(self.driver) + self.course_name = course_parser.course_name + + # Parse each week url to get list of lecture URLs to scrape + for week_url in course_parser.week_urls: + week_str = "Week" + week_url.rsplit("/", 2)[-1] + week_parser = CourseraWeekParser(self.driver, week_url) + lecture_urls = week_parser.lecture_urls + + week_transcripts = [] + + for lecture_url in lecture_urls: + lecture_title = lecture_url.rsplit("/", 2)[-1] + lecture_subtitles = week_parser.get_lecture_subtitles(lecture_url) + week_transcripts.append({lecture_title: lecture_subtitles}) + + course_transcripts.append({week_str: week_transcripts}) + + self.course_transcript_for_json[self.course_name] = course_transcripts + + +class CourseraScraperLogin: + def __init__(self, driver: webdriver.Chrome, email: str, password: str) -> None: + self.driver = driver + self.url = "https://www.coursera.org" + self.login_email = email + self.login_password = password + + def login(self) -> None: + login_url = self.url + "/?authMode=login" + self.driver.get(login_url) + self.driver.find_element("id", "email").send_keys(self.login_email) + self.driver.find_element("id", "password").send_keys(self.login_password) + self.driver.find_element("xpath", "//button[@type='submit']").click() + input("Finalize CAPTCHA and then press Enter in the shell") + + +class CourseraCourseParser: + def __init__(self, driver: webdriver.Chrome) -> None: + self.driver = driver + self.course_name = self.parse_course_name() + self.get_week_urls() + + def parse_course_name(self) -> str: + title_xpath = "//*[@class='cds-108 cds-Typography-base css-e7lgfl cds-110']" + title_elements = self.driver.find_elements(By.XPATH, title_xpath) + title = title_elements[0].text + return title + + def get_week_urls(self) -> None: + """Initialize the URLs for each week of the course""" + self.landing_page = self.driver.current_url + # Coursera defaults to saving the user's last accessed week, so need to get the true landing + # page once it's been navigated to + self.landing_page = self.landing_page.split("week")[0] + + week_url_list = [] + if "https://www.coursera.org/learn/" in self.landing_page: + self.driver.get(self.landing_page) + week_list_xpath_pattern = "//*[@class='cds-108 css-1mxkpit cds-110']" + # Need to make sure the element loads on the page before it can be 
scraped + try: + _ = WebDriverWait(self.driver, 2).until( + EC.presence_of_element_located((By.XPATH, week_list_xpath_pattern)) + ) + except TimeoutException: + print("Loading took too much time!") + # Get all elements from the sidebare containing links to the course's week lectures + week_elements = self.driver.find_elements(By.XPATH, week_list_xpath_pattern) + + for week_number in range(1, len(week_elements) + 1): + week_url_list.append(self.landing_page + f"week/{week_number}") + else: + self.get_week_urls() + + self.week_urls = week_url_list + + +class CourseraWeekParser: + def __init__(self, driver: webdriver.Chrome, week_url: str) -> None: + self.driver = driver + self.week_url = week_url + self.get_lecture_urls() + + def get_lecture_urls(self): + lecture_urls = [] + soup = self.get_page_soup(self.week_url) + elements = soup.find_all("div", attrs={"data-test": "WeekSingleItemDisplay-lecture"}) + + for element in elements: + a_tag = element.find("a") + if a_tag and "href" in a_tag.attrs: + href_value = a_tag["href"] + lecture_urls.append("https://www.coursera.org" + href_value) + else: + print("href attribute not found") + self.lecture_urls = lecture_urls + + def get_lecture_subtitles(self, lecture_url): + soup = self.get_page_soup(lecture_url) + subtitles = [] + + # Find all div elements contain subtitles + pattern = re.compile(r"\bcss-1shylkf\b") + elements = soup.find_all("div", class_=pattern) + if len(elements) == 0: + print("No value retrieved") + else: + print("Retrieved") + + for element in elements: + # Extract the timestamp + button = element.find("button", class_="timestamp") + timestamp = button.contents[-1].strip() + + # Extract all phrase elements and concatenate the text of all subtitles + phrases = element.find_all("div", class_="phrases") + text_content = " ".join(phrase.get_text().strip() for phrase in phrases) + + # Append the subtitles to the list as a dictionary + subtitles.append({"time": timestamp, "text": text_content, "url": lecture_url}) + + # Process the subtitles + return subtitles + + def get_page_soup(self, url: str) -> BeautifulSoup: + # Take driver to specified URL + self.driver.get(url) + # Need to make sure the element loads on the page before it can be scraped + try: + transcript_xpath = "//*[@class='phrases']" + _ = WebDriverWait(self.driver, 2).until( + EC.presence_of_element_located((By.XPATH, transcript_xpath)) + ) + except TimeoutException: + print("Loading took too much time!") + + # get the page source and parse the HTML content into a BeautifulSoup object + parge_source = self.driver.page_source + soup = BeautifulSoup(parge_source, "html.parser") + + return soup diff --git a/CourseraTranscriptScraper/ElasticSearchJSONWriter.py b/CourseraTranscriptScraper/ElasticSearchJSONWriter.py new file mode 100644 index 000000000..b00948ec4 --- /dev/null +++ b/CourseraTranscriptScraper/ElasticSearchJSONWriter.py @@ -0,0 +1,50 @@ +import json +import os +from elasticsearch import Elasticsearch + + +class ElasticSearchJSONWriter: + """ + Class to take a JSON script and write it to ElasticSearch, so it can be used in the Coursera + search extension. + The current implementation uses the project team's ElasticSearch instance, but this can be + changed by modifying the 'ES_URL' default value in the class __init__() method below. 
+ """ + + def __init__(self, json_path: str = "./subtitles.json"): + self.url = os.environ.get( + "ES_URL", "https://ac55987c83844faa90726d4e5efe92b9.us-central1.gcp.cloud.es.io" + ) + self.user = os.environ.get("ES_USER", "elastic") + self.password = os.environ.get("ES_PASSWORD", "pciWclpLNdXuicUhXV8bhgk2") + self.json_path = json_path + self.subtitles_json = self.load_json() + + def load_json(self) -> json: + """Load JSON file from saved scraped results in preparation to be pusehd to ElasticSearch""" + try: + with open(self.json_path) as f: + subtitles_doc = f.read() + subtitles_json = json.loads(subtitles_doc) + # Should always work unless the file doesn't exist, in which case the user should be warned + except FileNotFoundError: + print(f"{self.json_path} was not found") + + return subtitles_json + + def index_subtitles(self, course_name: str) -> None: + for weeks in self.subtitles_json[course_name]: + week_val = list(weeks.keys())[0] + for week in weeks.values(): + for lecture_titles in week: + for lecture_title in lecture_titles: + for subtitles in lecture_titles[lecture_title]: + subtitles["lecture_title"] = lecture_title + subtitles["week"] = week_val + subtitles['course_name'] = course_name + self.write_to_elasticsearch(subtitles) + + def write_to_elasticsearch(self, doc) -> None: + es = Elasticsearch(self.url, http_auth=(self.user, self.password)) + resp = es.index(index="subtitles", document=doc) + print(resp["result"]) diff --git a/CourseraTranscriptScraper/__pycache__/CourseraScraper.cpython-312.pyc b/CourseraTranscriptScraper/__pycache__/CourseraScraper.cpython-312.pyc new file mode 100644 index 000000000..763ac10c4 Binary files /dev/null and b/CourseraTranscriptScraper/__pycache__/CourseraScraper.cpython-312.pyc differ diff --git a/CourseraTranscriptScraper/chat_coursera.py b/CourseraTranscriptScraper/chat_coursera.py new file mode 100644 index 000000000..8b74cf121 --- /dev/null +++ b/CourseraTranscriptScraper/chat_coursera.py @@ -0,0 +1,46 @@ +#! 
/usr/bin/env python3 + +import openai +import os +from langchain.document_loaders import JSONLoader +from langchain.text_splitter import ( + MarkdownHeaderTextSplitter, + RecursiveCharacterTextSplitter, +) +from langchain import Query + + +from dotenv import load_dotenv, find_dotenv +_ = load_dotenv(find_dotenv()) # read local .env file +openai.api_key = os.environ[""] + +loader = JSONLoader( + file_path='./chat_subtitles.json', + jq_schema='.introduction-to-text-mining-and-analytics[].content', + text_content=False) + +docs = loader.load() +trans_docs = r_splitter.split_documents(docs) + +# print(trans_docs) + +from langchain.embeddings.openai import OpenAIEmbeddings +import pinecone +from langchain.retrievers.self_query.base import SelfQueryRetriever + + +from langchain.chat_models import ChatOpenAI +from langchain.chains import RetrievalQA + +llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0) + +# qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever) +while True: + question = input() + docs = retriever.get_relevant_documents(question) + for d in docs: + print(d.metadata) + # print(len(docs)) + # print(docs) + # result = qa_chain({"query": question}) + # print(result["result"]) \ No newline at end of file diff --git a/CourseraTranscriptScraper/chat_subtitles.json b/CourseraTranscriptScraper/chat_subtitles.json new file mode 100644 index 000000000..3e596ee1a --- /dev/null +++ b/CourseraTranscriptScraper/chat_subtitles.json @@ -0,0 +1,124 @@ +{ + "introduction-to-text-mining-and-analytics": [ + { + "time": "0:00", + "text": "[SOUND] Hello. Welcome to the course Text Mining and Analytics. My name is ChengXiang Zhai. I have a nickname, Cheng. I am a professor of the Department of Computer Science at the University of Illinois at Urbana-Champaign. This course is a part of a data mining specialization offered by the University of Illinois at Urbana-Champaign. In addition to this course, there are four other courses offered by", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "0:39", + "text": "Professor Jiawei Han, Professor John Hart and me, followed by a capstone project course that all of us will teach together.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "0:51", + "text": "This course is particularly related to another course in the specialization, mainly text retrieval and search engines in that both courses are about text data.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "1:07", + "text": "In contrast, pattern discovery and cluster analysis are about algorithms more applicable to all kinds of data in general. 
The visualization course is also relatively general in that the techniques can be applied to all kinds of data.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "1:28", + "text": "This course addresses a pressing need for harnessing big text data.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "1:35", + "text": "Text data has been growing dramatically recently, mostly because of the advance of technologies deployed on the web that would enable people to quickly generate text data.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "1:50", + "text": "So, I listed some of the examples on this slide", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "1:57", + "text": "that can show a variety of text data that are available today. For example, if you think about the data on the internet, on the web,", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "2:07", + "text": "everyday we are seeing many web pages being created.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "2:13", + "text": "Blogs are another kind of new text data that are being generated quickly by people. Anyone can write a blog article on the web. New articles of course have always been a main kind of text data that being generated everyday.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "2:31", + "text": "Emails are yet another kind of text data. And literature is also representing a large portion of text data. It's also especially very important because of the high quality in the data. That is, we encode our knowledge about the word using text data represented by all the literature articles. It's a vast amount of knowledge of", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "3:08", + "text": "all the text and data in these literature articles.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "3:14", + "text": "Twitter is another representative text data representing social media. Of course there are forums as well.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "3:24", + "text": "People are generating tweets very quickly indeed as we are speaking perhaps many people have already written many tweets. So, as you can see there are all kinds of text data that are being generated very quickly.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "3:38", + "text": "Now these text data present some challenges for people.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "3:43", + "text": "It's very hard for anyone to digest all the text data quickly. 
In particular, it's impossible for scientists to read all of the for example or for anyone to read all the tweets.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "4:01", + "text": "So there's a need for tools to help people digest text data more efficiently.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "4:09", + "text": "There is also another interesting opportunity provided by such big text data, and that is it's possible to leverage the amount of text data to discover interesting patterns to turn text data into actionable knowledge that can be useful for decision making.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "4:27", + "text": "So for example, product managers may be interested in knowing the feedback of customers about their products, knowing how well their products are being received as compared with the products of competitors. This can be a good opportunity for leveraging text data as we have seen a lot of reviews of product on the web. So if we can develop a master text mining techniques to tap into such a [INAUDIBLE] to extract the knowledge and opinions of people about these products, then we can help these product managers to gain business intelligence or to essentially feedback from their customers.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "5:18", + "text": "In scientific research, for example, scientists are interested in knowing the trends of research topics, knowing", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "5:29", + "text": "about what related fields have discovered. This problem is especially important in biology research as well. Different communities tend to use different terminologies, yet they're starting very similar problems. So how can we integrate the knowledge that is covered in different communities to help study a particular problem? It's very important, and it can speed up scientific discovery.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "5:57", + "text": "So there are many such examples where we can leverage the text data to discover useable knowledge to optimize our decision.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "6:06", + "text": "The main techniques for harnessing big text data are text retrieval and text mining. So these are two very much related technologies.Yet, they have somewhat different purposes. These two kinds of techniques are covered in the tool in this specialization. So, text retrieval on search engines covers text retrieval, and this is necessary to turn big text data into a much smaller but more relevant text data, which are often the data that we need to handle a particular problem or to optimize a particular decision. This course covers text mining which is a second step in this pipeline that can be used to further process the small amount of relevant data to extract the knowledge or to help people digest the text data easily. 
So the two courses are clearly related, in fact, some of the techniques are shared by both text retrieval and text mining. If you have already taken the text retrieval course, then you might see some of the content being repeated in this text mining course, although we'll be talking about the techniques from a very different perspective. If you have not taken the text retrieval course, it's also fine because this course is self-contained and you can certainly understand all of the materials without a problem. Of course, you might find it beneficial to take both courses and that will give you a very complete set of skills to handle big text data.", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + }, + { + "time": "8:02", + "text": "[MUSIC]", + "url": "https://www.coursera.org/learn/text-mining/lecture/Osat9/introduction-to-text-mining-and-analytics" + } + ] +} \ No newline at end of file diff --git a/CourseraTranscriptScraper/requirements.txt b/CourseraTranscriptScraper/requirements.txt new file mode 100644 index 000000000..eb2c29c85 --- /dev/null +++ b/CourseraTranscriptScraper/requirements.txt @@ -0,0 +1,8 @@ +beautifulsoup4==4.12.2 +elasticsearch==8.11.0 +Requests==2.31.0 +selenium==4.9.0 +webdriver_manager==4.0.1 +jq==1.6.0 +langchain==0.0.348 +openai==1.3.7 diff --git a/CourseraTranscriptScraper/scrape_coursera_course.py b/CourseraTranscriptScraper/scrape_coursera_course.py new file mode 100644 index 000000000..6945460e7 --- /dev/null +++ b/CourseraTranscriptScraper/scrape_coursera_course.py @@ -0,0 +1,47 @@ +import argparse +import json +from CourseraScraper import CourseraScraper +from ElasticSearchJSONWriter import ElasticSearchJSONWriter + + +def scrape_course_pipeline( + course_url: str, username: str, password: str, output_path: str, elastic_search_push: bool +) -> None: + # Scrape a Coursera course's transcripts into a JSON file + scraper = CourseraScraper(course_url, username, password) + scraper.run_scraper() + course_name = scraper.course_name + + # Writing a JSON file + with open(output_path, "w") as json_file: + json.dump(scraper.course_transcript_for_json, json_file, indent=4) + if elastic_search_push: + writer = ElasticSearchJSONWriter(args.output_path) + writer.index_subtitles(course_name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-c", + "--course_url", + required=True, + type=str, + help="URL to the landing page of the course you want to scrape. 
\ + Ex: https://www.coursera.org/learn/cs-410/home/", + ) + parser.add_argument("-u", "--username", required=True, type=str, help="Coursera Username") + parser.add_argument("-p", "--password", required=True, type=str, help="Coursera Password") + parser.add_argument("-e", "--elastic_search_push", action="store_true") + parser.add_argument( + "-o", + "--output_path", + type=str, + default="./subtitles.json", + help="Path to write JSON file containing scraped transcripts to", + ) + args = parser.parse_args() + + scrape_course_pipeline( + args.course_url, args.username, args.password, args.output_path, args.elastic_search_push + ) diff --git a/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProgressReport.pdf b/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProgressReport.pdf new file mode 100644 index 000000000..06f2c215f Binary files /dev/null and b/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProgressReport.pdf differ diff --git a/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProposal.pdf b/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProposal.pdf new file mode 100644 index 000000000..2dada5861 Binary files /dev/null and b/Documentation/CS410_Deliverables/TeamCAHJ_ProjectProposal.pdf differ diff --git a/Documentation/README_images/Chrome Developer Mode.png b/Documentation/README_images/Chrome Developer Mode.png new file mode 100644 index 000000000..e027bac3c Binary files /dev/null and b/Documentation/README_images/Chrome Developer Mode.png differ diff --git a/Documentation/README_images/Chrome Extension Directory.png b/Documentation/README_images/Chrome Extension Directory.png new file mode 100644 index 000000000..131143d61 Binary files /dev/null and b/Documentation/README_images/Chrome Extension Directory.png differ diff --git a/Documentation/README_images/Chrome Load Unpacked.png b/Documentation/README_images/Chrome Load Unpacked.png new file mode 100644 index 000000000..0c5beaa14 Binary files /dev/null and b/Documentation/README_images/Chrome Load Unpacked.png differ diff --git a/Documentation/README_images/CourseraScraper_LoginCaptcha.png b/Documentation/README_images/CourseraScraper_LoginCaptcha.png new file mode 100644 index 000000000..656af0542 Binary files /dev/null and b/Documentation/README_images/CourseraScraper_LoginCaptcha.png differ diff --git a/Documentation/README_images/CourseraScraper_LoginPostCaptcha.png b/Documentation/README_images/CourseraScraper_LoginPostCaptcha.png new file mode 100644 index 000000000..9611e853c Binary files /dev/null and b/Documentation/README_images/CourseraScraper_LoginPostCaptcha.png differ diff --git a/Documentation/README_images/CourseraScraper_SuccessfulScrapes.png b/Documentation/README_images/CourseraScraper_SuccessfulScrapes.png new file mode 100644 index 000000000..31e27c79c Binary files /dev/null and b/Documentation/README_images/CourseraScraper_SuccessfulScrapes.png differ diff --git a/README.md b/README.md index a7b40d2cc..732c5dfbc 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,62 @@ -# CourseProject +# CS410 CourseProject (Team CAHJ) - Coursera Search with ChatGPT Extension -Please fork this repository and paste the github link of your fork on Microsoft CMT. Detailed instructions are on Coursera under Week 1: Course Project Overview/Week 9 Activities. +## Project Overview + + + +## Requirements +This project is fairly straightforward with regards to requirements on the user's machine, but there are a few baselines that are required to be hit: +- The project requires Google Chrome to work. 
+- The project requires ChromeDriver, maintained by Chromium, to be installed in the root directory of the project in order to enable scraping (see Step 2 under Installation Instructions, below).
+- The project requires a working installation of Python to scrape new course content. The file `CourseraTranscriptScraper/requirements.txt` includes the packages necessary for the script to run. If you plan to scrape new course content into the project ElasticSearch index, please ensure your Python environment satisfies these requirements.
+- As the extension is not deployed to the Google Chrome Web Store, it requires a local copy of the codebase on the user's computer (see Step 1 under Installation Instructions, below).
+
+
+## Installation Instructions
+Installing the extension is quite simple; all you need to do is download the code from GitHub and then activate the extension in Chrome.
+A step-by-step guide is below:
+
+1. Pull the code from GitHub to `desiredDirectory` using your shell:
+   ```
+   cd desiredDirectory
+   git clone https://github.com/christianopperman/CS410_Fall2023_CourseProject_TeamCAHJ.git
+   ```
+2. Install the appropriate ChromeDriver for your computer's environment from [this link](https://googlechromelabs.github.io/chrome-for-testing/#stable), unzip it, and move the `Google Chrome for Testing` application to the `CS410_Fall2023_CourseProject_TeamCAHJ` directory created in Step 1, above.
+3. Open Google Chrome.
+4. Go to the Extensions page on Google Chrome by following [this link](chrome://extensions).
+5. Activate Developer Mode by toggling the switch in the upper right corner labeled `Developer mode`.
+![Screenshot of Developer Mode toggle](/project/CS410_Fall2023_CourseProject_TeamCAHJ/Documentation/README_images/Chrome%20Developer%20Mode.png)
+6. Load the extension from the codebase pulled to your computer in Step 1 by clicking the `Load unpacked` button in the top left corner:
+![Screenshot of load unpacked button](/project/CS410_Fall2023_CourseProject_TeamCAHJ/Documentation/README_images/Chrome%20Load%20Unpacked.png)
+7. Select the `desiredDirectory/CS410_Fall2023_CourseProject_TeamCAHJ/ChromeExtension` directory in the popup and click `Select`.
+![Screenshot of extension directory selection](/project/CS410_Fall2023_CourseProject_TeamCAHJ/Documentation/README_images/Chrome%20Extension%20Directory.png)
+8. The extension should now be available to you in your Google Chrome Extensions list.
+
+## Usage Instructions
+
+### Coursera Transcript Scraper
+As mentioned in [Requirements](#requirements) above, in order to scrape your own Coursera course transcripts into the extension, you will need a working version of Python that satisfies the required packages outlined in the `CourseraTranscriptScraper/requirements.txt` file.
+Once you have that, scraping a new course into ElasticSearch is very easy:
+1. Navigate to `desiredDirectory/CS410_Fall2023_CourseProject_TeamCAHJ/CourseraTranscriptScraper` in your shell.
+2. Call the course scraper script with the following command line arguments:
+```
+python scrape_coursera_course.py -c "course_url" -u "coursera_username" -p "coursera_password" [-e]
+```
+* Required Arguments
+  * -c : The link to the landing page of the Coursera course you'd like to scrape
+  * -u : The username to your Coursera account which has access to the course you'd like to scrape
+  * -p : The password to your Coursera account which has access to the course you'd like to scrape
+
+* Optional Arguments:
+  * -e : A boolean flag. If included, the script will automatically push the scraped course transcriptions to ElasticSearch after saving them to disk. If not included, the transcriptions will be saved to disk but not pushed to ElasticSearch.
+  * -o : The output path to write the transcriptions to, if you would like to save the transcriptions to a specific filename.
+
+3. Once you run the above command, a window will pop up and automatically log you into Coursera. It is likely that you will be required to complete a CAPTCHA.
+4. Once you complete the CAPTCHA, return to your shell and press Enter, as prompted.
+![Screenshot of running the Coursera course scraper from the command line](/project/CS410_Fall2023_CourseProject_TeamCAHJ/Documentation/README_images/CourseraScraper_LoginPostCaptcha.png)
+5. The script will begin scraping, as evidenced by the pop-up window navigating between video pages in the course and the `Retrieved` messages in the shell window.
+![Screenshot of running the Coursera course scraper from the command line](/project/CS410_Fall2023_CourseProject_TeamCAHJ/Documentation/README_images/CourseraScraper_SuccessfulScrapes.png)
+6. The script will write any scraped transcriptions to the filepath specified by the `-o` command line argument, if present, and to `subtitles.json` if not.
+7. If the `-e` flag was passed to the script, the script will automatically push the scraped course's transcriptions to ElasticSearch.
+
+### Chrome Extension
diff --git a/package.json b/package.json
new file mode 100644
index 000000000..43e9854c8
--- /dev/null
+++ b/package.json
@@ -0,0 +1,8 @@
+{
+  "name": "CS410_Fall2023_CourseProject_TeamCAHJ",
+  "version": "1.0.0",
+  "dependencies": {
+    "@elastic/elasticsearch": "^8.10.0",
+    "elasticsearch": "^16.7.3"
+  }
+}
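To sanity-check that a scraped course actually reached the `subtitles` index (for example, after running the scraper with `-e`), you can issue the same `query_string` search that `ChromeExtension/js/search.js` sends from the popup. The snippet below is a minimal sketch using the Python `elasticsearch` client already listed in `CourseraTranscriptScraper/requirements.txt`; the host URL, credentials, and query text are placeholders standing in for whatever instance you configured via `ES_URL`/`ES_USER`/`ES_PASSWORD`, not values defined in this repository.

```python
# Minimal verification sketch -- host and credentials are placeholders;
# substitute the ElasticSearch instance configured via ES_URL/ES_USER/ES_PASSWORD.
from elasticsearch import Elasticsearch

es = Elasticsearch(
    "https://your-elasticsearch-host.example.com",
    basic_auth=("elastic", "your-password"),
)

# Same shape as the payload built in search_api(): a query_string search
# against the "subtitles" index, returning the top 5 hits.
resp = es.search(
    index="subtitles",
    size=5,
    query={"query_string": {"query": "text mining"}},
)

for hit in resp["hits"]["hits"]:
    doc = hit["_source"]
    # Fields written by ElasticSearchJSONWriter.index_subtitles()
    print(doc["week"], doc["time"], doc["lecture_title"], doc["url"])
```

If this returns hits, the extension popup should surface the same lectures for that query.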