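"""Generate a ragas evaluation test set from the PDFs in ``resources/`` and answer
it with the project's QA system (``qa_system.generate_answer``).

Generated questions are cached in ``testset.json``; the answered set is written to
``testset_with_answers.json`` and ``testset_with_answers.csv``.

Note: the imports below follow the ragas 0.1.x ``TestsetGenerator`` API
(``ragas.testset.generator`` / ``ragas.testset.evolutions``); later ragas releases
reorganised these modules, so pin the dependency to a compatible version.
"""
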
import os
import json
import asyncio
from dotenv import load_dotenv
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential
from qa_system import generate_answer  # Import the QA system

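# Test-set generation makes many OpenAI calls; transient rate-limit or connection
# errors are retried with randomised exponential backoff (tenacity), up to 5 attempts.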
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(5))
def generate_testset(generator, documents, test_size, distributions):
    return generator.generate_with_langchain_docs(
        documents, 
        test_size=test_size, 
        distributions=distributions
    )

async def main():
    # Load environment variables from .env file
    load_dotenv()

    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")      

    if not OPENAI_API_KEY:
        raise ValueError("OpenAI API key not found in environment variables.")

    # Initialize OpenAI API key
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    openai.api_key = OPENAI_API_KEY  # Explicitly set the API key

    # Check if testset.json exists
    if os.path.exists("testset.json"):
        print("Loading existing testset from testset.json")
        with open("testset.json", "r") as f:
            testset_dict = json.load(f)
        df = pd.DataFrame(testset_dict)
    else:
        print("Generating new testset")
        # Load PDF documents
        documents = []
        pdf_dir = "resources"
        for filename in os.listdir(pdf_dir):
            if filename.lower().endswith(".pdf"):
                loader = PyPDFLoader(os.path.join(pdf_dir, filename))
                docs = loader.load()
                # Tag every page with its source filename (ragas uses this metadata)
                for doc in docs:
                    doc.metadata['filename'] = filename
                documents.extend(docs)
        
        # Initialize OpenAI models (using gpt-3.5-turbo for both to reduce costs)
        generator_llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
        critic_llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
        embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

        # Initialize the Testset Generator
        generator = TestsetGenerator.from_langchain(
            generator_llm,
            critic_llm,
            embeddings
        )

        try:
            # Generate testset with retry logic
            testset = generate_testset(
                generator,
                documents, 
                test_size=3,
                distributions={simple: 0.6, reasoning: 0.2, multi_context: 0.2}
            )
            
            # Convert testset to pandas DataFrame
            df = testset.to_pandas()
            
            # Save testset to JSON file
            testset_dict = df.to_dict(orient='records')
            with open("testset.json", "w") as f:
                json.dump(testset_dict, f, indent=2)
            print(f"New testset saved to testset.json")

        except Exception as e:
            print(f"An unexpected error occurred while generating testset: {e}")
            return

    try:
        # Generate new answers for each question using the existing QA system
        print("Generating new answers for all questions")
        df['answer'] = df['question'].apply(generate_answer)
        
        # Update JSON file with new answers
        testset_dict = df.to_dict(orient='records')
        with open("testset_with_answers.json", "w") as f:
            json.dump(testset_dict, f, indent=2)
        print(f"Testset with new answers saved to testset_with_answers.json")

        # Save testset with new answers to CSV file
        df.to_csv("testset_with_answers.csv", index=False)
        print(f"Testset with new answers saved to testset_with_answers.csv")

    except openai.APIConnectionError as e:
        print(f"Failed to connect to OpenAI API after multiple attempts: {e}")
    except RuntimeError as e:
        print(f"Runtime error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    asyncio.run(main())