import asyncio
import json
import multiprocessing
import os
import time

import aiohttp
import torch
from minivectordb.embedding_model import EmbeddingModel
from minivectordb.vector_database import VectorDatabase
from text_util_en_pt.cleaner import structurize_text, detect_language, Language
from webtextcrawler.webtextcrawler import extract_text_from_url
from duckduckgo_search import DDGS
import gradio as gr

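# Limit PyTorch to two CPU threads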
torch.set_num_threads(2)

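# The OpenRouter API key comes from the environment; embeddings use the quantized ONNX model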
openrouter_key = os.environ.get("OPENROUTER_KEY")
model = EmbeddingModel(use_quantized_onnx_model=True)

def fetch_links(query, max_results=5):
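    """Search DuckDuckGo and return up to max_results result URLs."""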
    with DDGS() as ddgs:
        return [r['href'] for r in ddgs.text(query, max_results=max_results)]

def fetch_texts(links):
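    """Extract page text from each link in parallel and join the non-empty results."""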
    with multiprocessing.Pool(5) as pool:
        texts = pool.map(extract_text_from_url, links)
    return '\n'.join([t for t in texts if t])

def index_and_search(query, text):
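    """Index the crawled text sentence by sentence, then retrieve the sentences most similar to the query."""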
    start = time.time()
    query_embedding = model.extract_embeddings(query)

    # Indexing
    vector_db = VectorDatabase()
    sentences = [s['sentence'] for s in structurize_text(text)]

    for idx, sentence in enumerate(sentences):
        sentence_embedding = model.extract_embeddings(sentence)
        vector_db.store_embedding(idx + 1, sentence_embedding, {'sentence': sentence})
    
    embedding_time = time.time() - start

    # Retrieval
    start = time.time()
    search_results = vector_db.find_most_similar(query_embedding, k=12)
    retrieval_time = time.time() - start
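    # the third element of the result holds the metadata dicts stored during indexing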
    return '\n'.join([s['sentence'] for s in search_results[2]]), embedding_time, retrieval_time

def retrieval_pipeline(query):
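    """Run the full pipeline (web search -> crawl -> index -> retrieve) and return the context plus timings."""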
    start = time.time()
    links = fetch_links(query)
    websearch_time = time.time() - start

    start = time.time()
    text = fetch_texts(links)
    webcrawl_time = time.time() - start

    context, embedding_time, retrieval_time = index_and_search(query, text)

    return context, websearch_time, webcrawl_time, embedding_time, retrieval_time, links

async def predict(message, history):
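    """Gradio chat handler: build web context, stream the LLM answer, then append a timing report."""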
    context, websearch_time, webcrawl_time, embedding_time, retrieval_time, links = retrieval_pipeline(message)

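    # Build the prompt in the user's language (Brazilian Portuguese or English)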
    if detect_language(message) == Language.ptbr:
        prompt = f"Contexto:\n\n{context}\n\nBaseado no contexto, responda: {message}"
    else:
        prompt = f"Context:\n\n{context}\n\nBased on the context, answer: {message}"

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = { "Content-Type": "application/json",
                "Authorization": f"Bearer {openrouter_key}" }
    body = { "stream": True,
             "models": [
                    "mistralai/mistral-7b-instruct:free", 
                    "nousresearch/nous-capybara-7b:free",
                    "huggingfaceh4/zephyr-7b-beta:free"
             ],
             "route": "fallback",
             "max_tokens": 768,
             "messages": [
                 {"role": "user", "content": prompt}
             ] }

    full_response = ""
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=body) as response:
            buffer = ""  # A buffer to hold incomplete lines of data
            async for chunk in response.content.iter_any():
                buffer += chunk.decode()
                while "\n" in buffer:  # Process as long as there are complete lines in the buffer
                    line, buffer = buffer.split("\n", 1)

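                    # SSE frames are prefixed with "data: "; the stream ends with "data: [DONE]"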
                    if line.startswith("data: "):
                        event_data = line[len("data: "):]
                        if event_data != '[DONE]':
                            try:
                                current_text = json.loads(event_data)['choices'][0]['delta']['content']
                                full_response += current_text
                                yield full_response
                                await asyncio.sleep(0.01)
                            except Exception:
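                                # Some models stream a top-level "text" field instead of delta.content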
                                try:
                                    current_text = json.loads(event_data)['choices'][0]['text']
                                    full_response += current_text
                                    yield full_response
                                    await asyncio.sleep(0.01)
                                except Exception:
                                    pass
    
    final_metadata_block = ""

    final_metadata_block += f"Links visited:\n"
    for link in links:
        final_metadata_block += f"{link}\n"
    final_metadata_block += f"\nWeb search time: {websearch_time:.4f} seconds\n"
    final_metadata_block += f"\nText extraction: {webcrawl_time:.4f} seconds\n"
    final_metadata_block += f"\nEmbedding time: {embedding_time:.4f} seconds\n"
    final_metadata_block += f"\nRetrieval from VectorDB time: {retrieval_time:.4f} seconds"

    yield f"{full_response}\n\n{final_metadata_block}"

gr.ChatInterface(
    predict,
    title="Web Search with LLM",
    description="Ask any question, and I will try to answer it using web search",
    retry_btn=None,
    undo_btn=None,
    examples=[
        'When did the first human land on the moon?',
        'Liquid vs solid vs gas?',
        'What is the capital of France?',
        'Why does Brazil have a high tax rate?'
    ]
).launch()