import spacy
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from langsmith import traceable


def get_nlp_model():
    """Load the spaCy en_core_web_md model, downloading it first if needed."""
    if not spacy.util.is_package("en_core_web_md"):
        print("Downloading en_core_web_md model...")
        spacy.cli.download("en_core_web_md")
        print("Model downloaded successfully!")
    nlp = spacy.load("en_core_web_md")
    return nlp


def recursive_split_documents(contents, max_chunk_size=1000, overlap=100):
    """Convert raw page contents into LangChain Documents, then split them
    into overlapping chunks suitable for embedding and retrieval."""
    from langchain_core.documents.base import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    documents = []
    for content in contents:
        try:
            page_content = content['page_content']
            if page_content:
                metadata = {'title': content['title'], 'source': content['link']}
                doc = Document(page_content=page_content, metadata=metadata)
                documents.append(doc)
        except Exception as e:
            print(f"Error processing content for {content.get('link', 'unknown')}: {e}")

    # Split documents recursively, keeping `overlap` characters of context
    # between consecutive chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_chunk_size, chunk_overlap=overlap)
    split_documents = text_splitter.split_documents(documents)

    # Flatten the split documents into the plain dicts used by semantic_search().
    chunks = []
    for doc in split_documents:
        chunk = {
            'text': doc.page_content,
            'metadata': {
                'title': doc.metadata.get('title', ''),
                'source': doc.metadata.get('source', ''),
            },
        }
        chunks.append(chunk)
    return chunks
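
# A chunk produced above looks like this (illustrative values only):
# {'text': 'Retrieval-augmented generation combines web search with LLMs.',
#  'metadata': {'title': 'Example page', 'source': 'https://example.com'}}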


def semantic_search(query, chunks, nlp, similarity_threshold=0.5, top_n=10):
    """Rank chunks by cosine similarity to the query using spaCy vectors.

    Vectors are computed once per chunk and cached on the chunk dict, so
    repeated searches over the same chunks skip the embedding step.
    """
    if not chunks:
        return []

    # Precompute the query vector and its norm (epsilon avoids division by zero).
    query_vector = nlp(query).vector
    query_norm = np.linalg.norm(query_vector) + 1e-8

    # Compute chunk vectors in batches if they are not already cached.
    if 'vector' not in chunks[0]:
        texts = [chunk['text'] for chunk in chunks]
        batch_size = 1000  # Adjust based on available memory.
        # Disable every pipeline component not needed for vectors.
        with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'tok2vec']):
            docs = nlp.pipe(texts, batch_size=batch_size)
            for chunk, doc in zip(chunks, docs):
                chunk['vector'] = doc.vector

    # Vectorized cosine similarity between the query and every chunk.
    chunk_vectors = np.array([chunk['vector'] for chunk in chunks])
    chunk_norms = np.linalg.norm(chunk_vectors, axis=1) + 1e-8
    similarities = np.dot(chunk_vectors, query_vector) / (chunk_norms * query_norm)

    # Keep chunks above the threshold, best matches first.
    relevant_chunks = [
        (chunk, sim) for chunk, sim in zip(chunks, similarities) if sim > similarity_threshold
    ]
    relevant_chunks.sort(key=lambda x: x[1], reverse=True)
    return relevant_chunks[:top_n]


# Perform semantic search using spaCy, embedding chunks in parallel threads.
def semantic_search_parallel(query, chunks, nlp, similarity_threshold=0.5, top_n=10):
    """Thread-pool variant of semantic_search(): embeds each chunk in a
    worker thread instead of batching through nlp.pipe(). Named separately
    so it does not shadow the batched implementation above."""
    # Precompute the query vector and its norm (epsilon avoids division by zero).
    with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'tok2vec']):
        query_vector = nlp(query).vector
    query_norm = np.linalg.norm(query_vector) + 1e-8

    texts = [chunk['text'] for chunk in chunks]

    def compute_vector(text):
        # Each worker disables the pipes it does not need before embedding.
        with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'tok2vec']):
            return nlp(text).vector

    # Embed all chunk texts in parallel.
    with ThreadPoolExecutor() as executor:
        chunk_vectors = np.array(list(executor.map(compute_vector, texts)))
    chunk_norms = np.linalg.norm(chunk_vectors, axis=1) + 1e-8

    # Vectorized cosine similarity between the query and every chunk.
    similarities = np.dot(chunk_vectors, query_vector) / (chunk_norms * query_norm)

    # Keep chunks above the threshold, best matches first.
    relevant_chunks = [
        (chunk, sim) for chunk, sim in zip(chunks, similarities) if sim > similarity_threshold
    ]
    relevant_chunks.sort(key=lambda x: x[1], reverse=True)
    return relevant_chunks[:top_n]


@traceable(run_type="llm", name="nlp_rag")
def query_rag(chat_llm, query, relevant_results):
    """Format the retrieved chunks as XML-style sources and ask the LLM to
    draft an answer grounded in them."""
    import web_rag as wr

    formatted_chunks = ""
    for chunk, similarity in relevant_results:
        formatted_chunk = f"""
        <source>
        <url>{chunk['metadata']['source']}</url>
        <title>{chunk['metadata']['title']}</title>
        <text>{chunk['text']}</text>
        </source>
        """
        formatted_chunks += formatted_chunk

    prompt = wr.get_rag_prompt_template().format(query=query, context=formatted_chunks)
    draft = chat_llm.invoke(prompt).content
    return draft
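

# --- Usage sketch (illustrative, not part of the module API) ---
# A minimal end-to-end example of how these functions fit together, using
# made-up page data. The query_rag() step is left commented out because it
# requires the web_rag module plus a configured LangChain chat model.
if __name__ == "__main__":
    nlp = get_nlp_model()

    sample_contents = [  # hypothetical scraped pages
        {
            'title': 'Example page',
            'link': 'https://example.com',
            'page_content': 'Retrieval-augmented generation combines web search with LLMs.',
        },
    ]
    chunks = recursive_split_documents(sample_contents)
    results = semantic_search("What is retrieval-augmented generation?", chunks, nlp, similarity_threshold=0.3)
    for chunk, score in results:
        print(f"{score:.3f}  {chunk['metadata']['source']}")

    # With a chat model configured (assumption, e.g. a LangChain ChatOpenAI
    # instance), the retrieved chunks can be turned into a drafted answer:
    # draft = query_rag(chat_llm, "What is retrieval-augmented generation?", results)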