import { ChatOpenAI } from '@langchain/openai';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import {
  ChatPromptTemplate,
  MessagesPlaceholder,
  PromptTemplate,
} from '@langchain/core/prompts';
import {
  RunnableLambda,
  RunnableMap,
  RunnableSequence,
} from '@langchain/core/runnables';
import { BaseMessage } from '@langchain/core/messages';
import { StringOutputParser } from '@langchain/core/output_parsers';
import LineListOutputParser from '../lib/outputParsers/listLineOutputParser';
import LineOutputParser from '../lib/outputParsers/lineOutputParser';
import { getDocumentsFromLinks } from '../utils/documents';
import { Document } from 'langchain/document';
import { searchSearxng } from '../lib/searxng';
import path from 'path';
import fs from 'fs';
import computeSimilarity from '../utils/computeSimilarity';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import { StreamEvent } from '@langchain/core/tracers/log_stream';
import { IterableReadableStream } from '@langchain/core/utils/stream';

export interface MetaSearchAgentType {
  searchAndAnswer: (
    message: string,
    history: BaseMessage[],
    llm: BaseChatModel,
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
    fileIds: string[],
  ) => Promise<eventEmitter>;
}

interface Config {
  searchWeb: boolean;
  rerank: boolean;
  summarizer: boolean;
  rerankThreshold: number;
  queryGeneratorPrompt: string;
  responsePrompt: string;
  activeEngines: string[];
}

type BasicChainInput = {
  chat_history: BaseMessage[];
  query: string;
};
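
/**
 * Orchestrates the search-and-answer flow: rewrites the user query, retrieves
 * documents from the web (SearXNG) and/or uploaded files, reranks them by
 * embedding similarity, and streams the final answer from the LLM.
 */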
class MetaSearchAgent implements MetaSearchAgentType {
  private config: Config;
  private strParser = new StringOutputParser();

  constructor(config: Config) {
    this.config = config;
  }
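
  /**
   * Builds the retriever chain: the LLM rewrites the conversation into a
   * standalone query (and optional links), then the chain either summarizes
   * the linked pages or searches SearXNG for relevant documents.
   */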
  private async createSearchRetrieverChain(llm: BaseChatModel) {
    // Force temperature 0 so query rewriting stays deterministic.
    (llm as unknown as ChatOpenAI).temperature = 0;

    return RunnableSequence.from([
      PromptTemplate.fromTemplate(this.config.queryGeneratorPrompt),
      llm,
      this.strParser,
      RunnableLambda.from(async (input: string) => {
        const linksOutputParser = new LineListOutputParser({
          key: 'links',
        });

        const questionOutputParser = new LineOutputParser({
          key: 'question',
        });

        const links = await linksOutputParser.parse(input);
        let question = this.config.summarizer
          ? await questionOutputParser.parse(input)
          : input;

        // The query generator answers 'not_needed' when no search is required.
        if (question === 'not_needed') {
          return { query: '', docs: [] };
        }

        if (links.length > 0) {
          if (question.length === 0) {
            question = 'summarize';
          }

          const docs: Document[] = [];

          const linkDocs = await getDocumentsFromLinks({ links });

          // Group the fetched chunks by source URL, capping each group at 10 chunks.
          const docGroups: Document[] = [];

          linkDocs.forEach((doc) => {
            const docIndex = docGroups.findIndex(
              (d) =>
                d.metadata.url === doc.metadata.url &&
                d.metadata.totalDocs < 10,
            );

            if (docIndex === -1) {
              // First chunk for this URL (or the previous group is full).
              docGroups.push({
                ...doc,
                metadata: {
                  ...doc.metadata,
                  totalDocs: 1,
                },
              });
            } else {
              // Append the chunk to the existing group for this URL.
              docGroups[docIndex].pageContent =
                docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
              docGroups[docIndex].metadata.totalDocs += 1;
            }
          });
          // Summarize each URL group with the LLM so the answering chain gets
          // condensed, query-focused context instead of raw page text.
          await Promise.all(
            docGroups.map(async (doc) => {
              const res = await llm.invoke(`
            You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the
            text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query.
            If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary.

            - **Journalistic tone**: The summary should sound professional and journalistic, not too casual or vague.
            - **Thorough and detailed**: Ensure that every key point from the text is captured and that the summary directly answers the query.
            - **Not too lengthy, but detailed**: The summary should be informative but not excessively long. Focus on providing detailed information in a concise format.

            The text will be shared inside the \`text\` XML tag, and the query inside the \`query\` XML tag.

            <example>
            1. \`<text>
            Docker is a set of platform-as-a-service products that use OS-level virtualization to deliver software in packages called containers.
            It was first released in 2013 and is developed by Docker, Inc. Docker is designed to make it easier to create, deploy, and run applications
            by using containers.
            </text>

            <query>
            What is Docker and how does it work?
            </query>

            Response:
            Docker is a revolutionary platform-as-a-service product developed by Docker, Inc., that uses container technology to make application
            deployment more efficient. It allows developers to package their software with all necessary dependencies, making it easier to run in
            any environment. Released in 2013, Docker has transformed the way applications are built, deployed, and managed.
            \`
            2. \`<text>
            The theory of relativity, or simply relativity, encompasses two interrelated theories of Albert Einstein: special relativity and general
            relativity. However, the word "relativity" is sometimes used in reference to Galilean invariance. The term "theory of relativity" was based
            on the expression "relative theory" used by Max Planck in 1906. The theory of relativity usually encompasses two interrelated theories by
            Albert Einstein: special relativity and general relativity. Special relativity applies to all physical phenomena in the absence of gravity.
            General relativity explains the law of gravitation and its relation to other forces of nature. It applies to the cosmological and astrophysical
            realm, including astronomy.
            </text>

            <query>
            summarize
            </query>

            Response:
            The theory of relativity, developed by Albert Einstein, encompasses two main theories: special relativity and general relativity. Special
            relativity applies to all physical phenomena in the absence of gravity, while general relativity explains the law of gravitation and its
            relation to other forces of nature. The theory of relativity is based on the concept of "relative theory," as introduced by Max Planck in
            1906. It is a fundamental theory in physics that has revolutionized our understanding of the universe.
            \`
            </example>

            Everything below is the actual data you will be working with. Good luck!

            <query>
            ${question}
            </query>

            <text>
            ${doc.pageContent}
            </text>

            Make sure to answer the query in the summary.
          `);

              const document = new Document({
                pageContent: res.content as string,
                metadata: {
                  title: doc.metadata.title,
                  url: doc.metadata.url,
                },
              });

              docs.push(document);
            }),
          );
          return { query: question, docs: docs };
        } else {
          const res = await searchSearxng(question, {
            language: 'en',
            engines: this.config.activeEngines,
          });

          const documents = res.results.map(
            (result) =>
              new Document({
                pageContent:
                  result.content ||
                  (this.config.activeEngines.includes('youtube')
                    ? result.title
                    : '') /* TODO: Implement transcript grabbing using youtubei (source: https://www.npmjs.com/package/youtubei) */,
                metadata: {
                  title: result.title,
                  url: result.url,
                  ...(result.img_src && { img_src: result.img_src }),
                },
              }),
          );

          return { query: question, docs: documents };
        }
      }),
    ]);
  }
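
  /**
   * Builds the answering chain: gathers context (web and/or file documents),
   * reranks it, formats it into the response prompt, and streams the answer.
   */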
  private async createAnsweringChain(
    llm: BaseChatModel,
    fileIds: string[],
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
  ) {
    return RunnableSequence.from([
      RunnableMap.from({
        query: (input: BasicChainInput) => input.query,
        chat_history: (input: BasicChainInput) => input.chat_history,
        date: () => new Date().toISOString(),
        context: RunnableLambda.from(async (input: BasicChainInput) => {
          const processedHistory = formatChatHistoryAsString(
            input.chat_history,
          );

          let docs: Document[] | null = null;
          let query = input.query;

          if (this.config.searchWeb) {
            const searchRetrieverChain =
              await this.createSearchRetrieverChain(llm);

            const searchRetrieverResult = await searchRetrieverChain.invoke({
              chat_history: processedHistory,
              query,
            });

            query = searchRetrieverResult.query;
            docs = searchRetrieverResult.docs;
          }

          const sortedDocs = await this.rerankDocs(
            query,
            docs ?? [],
            fileIds,
            embeddings,
            optimizationMode,
          );

          return sortedDocs;
        })
          .withConfig({
            runName: 'FinalSourceRetriever',
          })
          .pipe(this.processDocs),
      }),
      ChatPromptTemplate.fromMessages([
        ['system', this.config.responsePrompt],
        new MessagesPlaceholder('chat_history'),
        ['user', '{query}'],
      ]),
      llm,
      this.strParser,
    ]).withConfig({
      runName: 'FinalResponseGenerator',
    });
  }
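
  /**
   * Selects the context documents for the answer: merges web results with
   * uploaded-file chunks and, depending on the optimization mode, reranks
   * them by embedding similarity against the query, keeping at most 15.
   */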
  private async rerankDocs(
    query: string,
    docs: Document[],
    fileIds: string[],
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
  ) {
    if (docs.length === 0 && fileIds.length === 0) {
      return docs;
    }

    // Load the pre-extracted content and embeddings for each uploaded file.
    const filesData = fileIds
      .map((file) => {
        const filePath = path.join(process.cwd(), 'uploads', file);

        const contentPath = filePath + '-extracted.json';
        const embeddingsPath = filePath + '-embeddings.json';

        const content = JSON.parse(fs.readFileSync(contentPath, 'utf8'));
        const fileEmbeddings = JSON.parse(
          fs.readFileSync(embeddingsPath, 'utf8'),
        );

        const fileSimilaritySearchObject = content.contents.map(
          (c: string, i: number) => {
            return {
              fileName: content.title,
              content: c,
              embeddings: fileEmbeddings.embeddings[i],
            };
          },
        );

        return fileSimilaritySearchObject;
      })
      .flat();

    if (query.toLocaleLowerCase() === 'summarize') {
      return docs.slice(0, 15);
    }
    const docsWithContent = docs.filter(
      (doc) => doc.pageContent && doc.pageContent.length > 0,
    );

    if (optimizationMode === 'speed' || this.config.rerank === false) {
      if (filesData.length > 0) {
        const queryEmbedding = await embeddings.embedQuery(query);

        const fileDocs = filesData.map((fileData) => {
          return new Document({
            pageContent: fileData.content,
            metadata: {
              title: fileData.fileName,
              url: `File`,
            },
          });
        });

        // In speed mode only the file chunks are ranked against the query;
        // web results are passed through as-is.
        const similarity = filesData.map((fileData, i) => {
          const sim = computeSimilarity(queryEmbedding, fileData.embeddings);

          return {
            index: i,
            similarity: sim,
          };
        });

        let sortedDocs = similarity
          .filter(
            (sim) => sim.similarity > (this.config.rerankThreshold ?? 0.3),
          )
          .sort((a, b) => b.similarity - a.similarity)
          .slice(0, 15)
          .map((sim) => fileDocs[sim.index]);

        // Leave room for web results when both sources are present.
        sortedDocs =
          docsWithContent.length > 0 ? sortedDocs.slice(0, 8) : sortedDocs;

        return [
          ...sortedDocs,
          ...docsWithContent.slice(0, 15 - sortedDocs.length),
        ];
      } else {
        return docsWithContent.slice(0, 15);
      }
    } else if (optimizationMode === 'balanced') {
      const [docEmbeddings, queryEmbedding] = await Promise.all([
        embeddings.embedDocuments(
          docsWithContent.map((doc) => doc.pageContent),
        ),
        embeddings.embedQuery(query),
      ]);

      // Fold the uploaded-file chunks (and their stored embeddings) into the
      // same candidate pool as the web results before ranking.
      docsWithContent.push(
        ...filesData.map((fileData) => {
          return new Document({
            pageContent: fileData.content,
            metadata: {
              title: fileData.fileName,
              url: `File`,
            },
          });
        }),
      );

      docEmbeddings.push(...filesData.map((fileData) => fileData.embeddings));

      const similarity = docEmbeddings.map((docEmbedding, i) => {
        const sim = computeSimilarity(queryEmbedding, docEmbedding);

        return {
          index: i,
          similarity: sim,
        };
      });

      const sortedDocs = similarity
        .filter((sim) => sim.similarity > (this.config.rerankThreshold ?? 0.3))
        .sort((a, b) => b.similarity - a.similarity)
        .slice(0, 15)
        .map((sim) => docsWithContent[sim.index]);

      return sortedDocs;
    }
    // Note: there is no 'quality' branch yet, so that mode currently falls
    // through and this method returns undefined.
  }
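
  /**
   * Flattens the ranked documents into a numbered plain-text context block
   * that is injected into the response prompt.
   */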
  private processDocs(docs: Document[]) {
    return docs
      .map(
        (_, index) =>
          `${index + 1}. ${docs[index].metadata.title} ${docs[index].pageContent}`,
      )
      .join('\n');
  }
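
  /**
   * Forwards LangChain stream events to the emitter as JSON 'data' events:
   * the retrieved sources once FinalSourceRetriever finishes, response chunks
   * while FinalResponseGenerator streams, and 'end' when generation is done.
   */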
  private async handleStream(
    stream: IterableReadableStream<StreamEvent>,
    emitter: eventEmitter,
  ) {
    for await (const event of stream) {
      if (
        event.event === 'on_chain_end' &&
        event.name === 'FinalSourceRetriever'
      ) {
        emitter.emit(
          'data',
          JSON.stringify({ type: 'sources', data: event.data.output }),
        );
      }
      if (
        event.event === 'on_chain_stream' &&
        event.name === 'FinalResponseGenerator'
      ) {
        emitter.emit(
          'data',
          JSON.stringify({ type: 'response', data: event.data.chunk }),
        );
      }
      if (
        event.event === 'on_chain_end' &&
        event.name === 'FinalResponseGenerator'
      ) {
        emitter.emit('end');
      }
    }
  }
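
  /**
   * Entry point: builds the answering chain, starts streaming events for the
   * given message and history, and returns an EventEmitter the caller can
   * subscribe to for 'data' and 'end' events.
   */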
  async searchAndAnswer(
    message: string,
    history: BaseMessage[],
    llm: BaseChatModel,
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
    fileIds: string[],
  ) {
    const emitter = new eventEmitter();

    const answeringChain = await this.createAnsweringChain(
      llm,
      fileIds,
      embeddings,
      optimizationMode,
    );

    const stream = answeringChain.streamEvents(
      {
        chat_history: history,
        query: message,
      },
      {
        version: 'v1',
      },
    );

    // Not awaited: handleStream forwards events to the emitter as they
    // arrive while the emitter is returned to the caller immediately.
    this.handleStream(stream, emitter);

    return emitter;
  }
}

export default MetaSearchAgent;
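
// Minimal usage sketch (illustrative only; the prompt strings and the llm and
// embeddings instances below are assumptions, not exports of this module):
//
//   const agent = new MetaSearchAgent({
//     searchWeb: true,
//     rerank: true,
//     summarizer: true,
//     rerankThreshold: 0.3,
//     queryGeneratorPrompt: myQueryGeneratorPrompt,
//     responsePrompt: myResponsePrompt,
//     activeEngines: [],
//   });
//
//   const emitter = await agent.searchAndAnswer(
//     'What is Docker?',
//     [],          // chat history as BaseMessage[]
//     llm,         // any BaseChatModel
//     embeddings,  // any Embeddings implementation
//     'balanced',
//     [],          // uploaded file ids
//   );
//
//   emitter.on('data', (data) => {
//     const parsed = JSON.parse(data);
//     // parsed.type is 'sources' or 'response'
//   });
//   emitter.on('end', () => console.log('done'));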