-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpilot.py
More file actions
180 lines (146 loc) · 6.57 KB
/
pilot.py
File metadata and controls
180 lines (146 loc) · 6.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# pilot.py - Utility functions for the Persona Interview Simulator
# ----------------------------------------
# This module now contains utility functions that can be imported by app.py
# The main execution logic has been moved to app.py for better user experience
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_cohere import CohereEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion
import os
import json
import textwrap
import re
def load_and_clean_transcript(pdf_path="interview.pdf"):
    """Load an interview transcript PDF and strip speaker labels.

    Args:
        pdf_path: Path to the interview PDF (default "interview.pdf").

    Returns:
        The full transcript text with "researcher:" / "interviewee:"
        speaker labels removed (case-insensitive).
    """
    # PyPDFLoader returns one Document per page; join every page so a
    # multi-page transcript is not silently truncated to page one.
    pages = PyPDFLoader(pdf_path).load()
    text = "\n".join(page.page_content for page in pages)
    # Strip speaker labels regardless of capitalization ("researcher:",
    # "Researcher:", "Interviewee:", "interviewee:", ...).
    return re.sub(r"(?i)(researcher|interviewee):", "", text)
def extract_persona_metadata(clean_text, openrouter_key):
    """Extract persona metadata (name, bio, speaking style) from interview text.

    Sends a truncated copy of the transcript to an LLM via OpenRouter and
    parses the strict-JSON reply.

    Args:
        clean_text: Cleaned interview transcript.
        openrouter_key: OpenRouter API key (exported as OPENAI_API_KEY for
            the OpenAI-compatible client).

    Returns:
        Dict with non-empty "name", "bio", and "style" values; safe defaults
        are substituted when extraction fails or a field comes back null.
    """
    os.environ["OPENAI_API_KEY"] = openrouter_key
    extract_llm = ChatOpenAI(
        model_name="gpt-4o-mini",
        base_url="https://openrouter.ai/api/v1"
    )
    EXTRACT_PROMPT = """
You are a data extractor. Read the interview text delimited by <doc>.
Return strict JSON with keys:
name – full name or null
bio – 1-sentence bio (job/age/location if stated) or null
style – 2-3 adjectives describing speaking style or null
<doc>{document}</doc>
"""
    # Truncate the transcript so the extraction prompt stays within limits.
    meta_json = extract_llm.invoke(
        EXTRACT_PROMPT.format(document=textwrap.shorten(clean_text, 12000))
    ).content
    # Models often wrap JSON in a markdown code fence; unwrap it if present.
    json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', meta_json, re.DOTALL)
    if json_match:
        meta_json = json_match.group(1)
    try:
        profile = json.loads(meta_json)
    except json.JSONDecodeError as e:
        print(f"JSON parsing failed: {e}")
        profile = {}
    # The model may return valid JSON that is not an object (e.g. a list);
    # the original code would crash on it, so normalize to a dict.
    if not isinstance(profile, dict):
        profile = {}
    # The prompt allows explicit nulls, which json.loads turns into None.
    # setdefault() only fills *missing* keys, so explicit nulls would leak
    # through and later render as the literal string "None" in the persona
    # prompt. Overwrite missing, None, or empty values instead.
    defaults = {
        "name": "Unknown Speaker",
        "bio": "Interviewee in the provided transcript.",
        "style": "neutral",
    }
    for key, fallback in defaults.items():
        if not profile.get(key):
            profile[key] = fallback
    return profile
def create_text_chunks(clean_text, chunk_size=800, chunk_overlap=100):
    """Break the cleaned transcript into overlapping character chunks.

    Args:
        clean_text: Full transcript string.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        A list of langchain Document chunks ready for embedding.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).create_documents([clean_text])
def setup_cohere_embeddings(cohere_api_key):
    """Build the Cohere embeddings client used to vectorize chunks.

    Args:
        cohere_api_key: Cohere API key; must be non-empty.

    Returns:
        Tuple of (embeddings object, embedding dimension).

    Raises:
        ValueError: If no API key is provided.
    """
    if not cohere_api_key:
        raise ValueError("COHERE_API_KEY is required")
    # embed-english-v3.0 always produces 1024-dimensional vectors.
    dimension = 1024
    embedder = CohereEmbeddings(
        model="embed-english-v3.0",
        cohere_api_key=cohere_api_key,
    )
    return embedder, dimension
def setup_pinecone_vectorstore(chunks, embeds, dimension, pinecone_api_key, index_name="persona-pilot"):
    """Create (or reuse) a Pinecone serverless index and load the chunks into it.

    Args:
        chunks: Document chunks to embed and upsert.
        embeds: Embeddings object used to vectorize the chunks.
        dimension: Expected embedding dimension for the index.
        pinecone_api_key: Pinecone API key.
        index_name: Name of the index (default "persona-pilot").

    Returns:
        A PineconeVectorStore backed by the index.

    Raises:
        RuntimeError: If any step of the Pinecone setup fails; the original
            exception is chained as __cause__.
    """
    pc = Pinecone(api_key=pinecone_api_key)
    try:
        existing_indexes = pc.list_indexes()
        index_exists = index_name in [idx.name for idx in existing_indexes]
        if index_exists:
            # A dimension mismatch (e.g. after switching embedding models)
            # makes the index unusable, so drop it and recreate below.
            index_description = pc.describe_index(index_name)
            if index_description.dimension != dimension:
                pc.delete_index(index_name)
                index_exists = False
        if not index_exists:
            pc.create_index(
                name=index_name,
                dimension=dimension,
                metric="cosine",
                spec=ServerlessSpec(
                    cloud=CloudProvider.AWS,
                    region=AwsRegion.US_EAST_1
                )
            )
        # Embed the chunks and upsert them into the index.
        return PineconeVectorStore.from_documents(
            documents=chunks,
            embedding=embeds,
            index_name=index_name
        )
    except Exception as e:
        # Re-raise as RuntimeError (still an Exception subclass, so existing
        # callers keep working) and chain the cause so the original traceback
        # is not discarded, unlike the previous bare `raise Exception(...)`.
        raise RuntimeError(f"Pinecone setup failed: {e}") from e
def create_qa_chain(vdb, profile, openrouter_key):
    """Assemble the persona-driven conversational retrieval chain.

    Args:
        vdb: Vector store holding the interview chunks.
        profile: Persona dict with "name", "bio", and "style" keys.
        openrouter_key: Accepted for interface symmetry; the OpenAI-compatible
            client reads the key from the environment set earlier.

    Returns:
        A ConversationalRetrievalChain that answers in the persona's voice.
    """
    # Pull the three most relevant transcript excerpts per question.
    doc_retriever = vdb.as_retriever(search_kwargs={"k": 3})
    # Persona preamble injected ahead of the retrieved context.
    persona_preamble = f"""
You are {profile['name']}, responding in first person with your own voice and style.
Your perspective, thoughts, and tone should reflect your lived experience, personality, and communication style. Stay consistent with how you speak — whether that's careful and reflective, direct and sarcastic, optimistic, skeptical, etc. Refer to past statements when relevant, and lean into your persona's worldview.
Persona Bio: {profile['bio']}
Style: {profile['style']}
Use the excerpts below as your main source of truth. If the answer is covered, quote or paraphrase directly.
If it’s partially covered, reason it out in your own words based on what you said before.
If it’s not mentioned at all, respond using your own reasoning as {profile['name']} — but stay in character.
If you don’t know or feel uncertain, say so honestly.
Important: Speak naturally, like you're talking to another person — use pauses, hesitations, strong opinions, or even contradictions, just as someone like you would.
"""
    qa_prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=persona_preamble + """
Relevant excerpts from the interview:
{context}
Chat History:
{chat_history}
Human: {question}
Assistant:"""
    )
    # Buffer the full conversation; "answer" is the chain's output key.
    chat_memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key="answer"
    )
    chat_llm = ChatOpenAI(
        model_name="gpt-4o-mini",
        base_url="https://openrouter.ai/api/v1"
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        retriever=doc_retriever,
        memory=chat_memory,
        combine_docs_chain_kwargs={"prompt": qa_prompt}
    )
# Legacy entry point: running this module directly only prints usage guidance,
# since the interactive app now lives in app.py.
if __name__ == "__main__":
    for message in (
        "✅ Persona Q&A utilities ready for import.",
        "This module is now used by app.py for the Streamlit interface.",
        "Run 'streamlit run app.py' to start the application.",
    ):
        print(message)