diff --git a/api.py b/api.py
new file mode 100644
index 000000000..55fe2af76
--- /dev/null
+++ b/api.py
@@ -0,0 +1,52 @@
+from flask import Flask, jsonify
+import psycopg2
+
+app = Flask(__name__)
+
+# Database connection settings (replace the placeholders with real credentials)
+DB_CONFIG = {
+    "dbname": "your_db",
+    "user": "your_user",
+    "password": "your_password",
+    "host": "your_host",
+    "port": "your_port"
+}
+
+def connect_db():
+    """Open a connection and cursor; return (None, None) on failure."""
+    try:
+        conn = psycopg2.connect(**DB_CONFIG)
+        cursor = conn.cursor()
+        return conn, cursor
+    except Exception as e:
+        print(f"Database connection error: {e}")
+        return None, None
+
+@app.route('/conversation-summary', methods=['GET'])
+def conversation_summary():
+    """Return the number of conversations per user."""
+    conn, cursor = connect_db()
+    if conn is None:
+        return jsonify({"error": "Database connection failed"}), 500
+
+    cursor.execute("SELECT user_id, COUNT(*) FROM conversations GROUP BY user_id")
+    data = cursor.fetchall()
+    cursor.close()
+    conn.close()
+    return jsonify({"summary": data})
+
+@app.route('/data-stats', methods=['GET'])
+def data_stats():
+    """Return the total number of stored conversations."""
+    conn, cursor = connect_db()
+    if conn is None:
+        return jsonify({"error": "Database connection failed"}), 500
+
+    cursor.execute("SELECT COUNT(*) FROM conversations")
+    count = cursor.fetchone()
+    cursor.close()
+    conn.close()
+    return jsonify({"total_conversations": count[0]})
+
+if __name__ == '__main__':
+    app.run(debug=True)
diff --git a/data_pipeline.py b/data_pipeline.py
new file mode 100644
index 000000000..a33a1e21e
--- /dev/null
+++ b/data_pipeline.py
@@ -0,0 +1,72 @@
+# Task 1: Data Pipeline Setup (data_pipeline.py)
+import psycopg2
+import pandas as pd
+import requests
+
+# Database connection settings (replace the placeholders with real credentials)
+DB_CONFIG = {
+    "dbname": "your_db",
+    "user": "your_user",
+    "password": "your_password",
+    "host": "your_host",
+    "port": "your_port"
+}
+
+def connect_db():
+    try:
+        conn = psycopg2.connect(**DB_CONFIG)
+        cursor = conn.cursor()
+        return conn, cursor
+    except Exception as e:
+        print(f"Database connection error: {e}")
+        return None, None
+
+# Create the conversations table if it does not already exist
+def setup_database():
+    conn, cursor = connect_db()
+    if conn is None:
+        return
+
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS conversations (
+            id SERIAL PRIMARY KEY,
+            user_id VARCHAR(50),
+            timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            query TEXT,
+            generated_response TEXT
+        );
+    ''')
+    conn.commit()
+    cursor.close()
+    conn.close()
+    print("Database setup completed.")
+
+# ETL pipeline: fetch the dataset, then load each record into the table
+def etl_pipeline():
+    url = "https://example.com/amazon_reviews.json"  # Replace with actual dataset link
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        data = response.json()
+        df = pd.DataFrame(data)
+    except Exception as e:
+        print(f"Error fetching data: {e}")
+        return
+
+    conn, cursor = connect_db()
+    if conn is None:
+        return
+
+    for _, row in df.iterrows():
+        cursor.execute("""
+            INSERT INTO conversations (user_id, query, generated_response)
+            VALUES (%s, %s, %s)
+        """, (row['user_id'], row['review_text'], row['response_text']))
+    conn.commit()
+    cursor.close()
+    conn.close()
+    print("ETL process completed.")
+
+if __name__ == "__main__":
+    setup_database()
+    etl_pipeline()
diff --git a/web_crawler.py b/web_crawler.py
new file mode 100644
index 000000000..512aaa844
--- /dev/null
+++ b/web_crawler.py
@@ -0,0 +1,19 @@
+import requests
+from bs4 import BeautifulSoup
+
+def crawl_wikipedia():
+    url = "https://en.wikipedia.org/wiki/Natural_language_processing"
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        text = soup.get_text()
+        return text[:1000]  # Return only the first 1000 characters for preview
+    except Exception as e:
+        print(f"Error fetching Wikipedia data: {e}")
+        return None
+
+if __name__ == "__main__":
+    data = crawl_wikipedia()
+    if data:
+        print("Extracted Wikipedia Data:", data)