Spaces:
Running
Running
import express from 'express'; | |
import logger from '../utils/logger'; | |
import multer from 'multer'; | |
import path from 'path'; | |
import crypto from 'crypto'; | |
import fs from 'fs'; | |
import { Embeddings } from '@langchain/core/embeddings'; | |
import { getAvailableEmbeddingModelProviders } from '../lib/providers'; | |
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'; | |
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'; | |
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'; | |
import { Document } from 'langchain/document'; | |
// Router that the upload route in this module is registered on.
const router = express.Router();

// Options for the shared text splitter: chunks of at most 500 units with a
// 100-unit overlap between neighbouring chunks (units are whatever the
// splitter's length function counts — presumably characters; confirm against
// the installed @langchain/textsplitters version).
const splitterOptions = {
  chunkSize: 500,
  chunkOverlap: 100,
};

// Single splitter instance reused for every uploaded document.
const splitter = new RecursiveCharacterTextSplitter(splitterOptions);
/** File extensions (lower-case, without the leading dot) accepted for upload. */
const SUPPORTED_UPLOAD_EXTENSIONS = ['pdf', 'docx', 'txt'];

/** Directory that receives uploaded files, resolved against the process working directory. */
const uploadDirectory = path.join(process.cwd(), './uploads');

/**
 * Disk storage configuration for multer.
 *
 * - `destination` stores every file in `<cwd>/uploads`, creating the directory
 *   on demand so a fresh checkout does not fail with ENOENT.
 * - `filename` rejects unsupported extensions and otherwise stores the file
 *   under a random 32-hex-character name that keeps the (lower-cased)
 *   extension of the original file.
 */
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    // When `destination` is a function, multer does not create the directory
    // itself, so make sure it exists before handing the path back.
    fs.mkdir(uploadDirectory, { recursive: true }, (err) => {
      cb(err, uploadDirectory);
    });
  },
  filename: (req, file, cb) => {
    // Take the text after the last dot; a name without any dot has no
    // extension at all (previously the whole name was treated as one).
    const lastDot = file.originalname.lastIndexOf('.');
    const fileExtension =
      lastDot >= 0 ? file.originalname.slice(lastDot + 1).toLowerCase() : '';

    // Compare case-insensitively so that e.g. "REPORT.PDF" is accepted.
    if (!SUPPORTED_UPLOAD_EXTENSIONS.includes(fileExtension)) {
      return cb(new Error('File type is not supported'), '');
    }

    cb(null, `${crypto.randomBytes(16).toString('hex')}.${fileExtension}`);
  },
});

/** Multer instance that persists uploads through the disk storage above. */
const upload = multer({ storage });
/**
 * Maps the MIME types this route understands to the loader kind used for them.
 * A Map (rather than a plain object) keeps client-supplied MIME strings from
 * ever matching inherited object properties.
 */
const UPLOAD_KIND_BY_MIME_TYPE = new Map<string, string>([
  ['application/pdf', 'pdf'],
  [
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'docx',
  ],
  ['text/plain', 'txt'],
]);

/**
 * Reads one uploaded file into LangChain documents.
 *
 * The loader is chosen from the reported MIME type; when that MIME type is not
 * recognised, the extension of the stored file name is used instead, so a
 * supported file sent with a generic MIME type is still parsed rather than
 * silently producing no content.
 *
 * @param file - The multer file descriptor of the stored upload.
 * @returns The loaded documents, or an empty array when neither the MIME type
 *   nor the extension identifies a supported format.
 */
const loadUploadedDocuments = async (
  file: Express.Multer.File,
): Promise<Document[]> => {
  const kind =
    UPLOAD_KIND_BY_MIME_TYPE.get(file.mimetype) ??
    path.extname(file.filename).slice(1).toLowerCase();

  switch (kind) {
    case 'pdf':
      return new PDFLoader(file.path).load();
    case 'docx':
      return new DocxLoader(file.path).load();
    case 'txt': {
      const text = await fs.promises.readFile(file.path, 'utf-8');
      return [
        new Document({
          pageContent: text,
          metadata: {
            title: file.originalname,
          },
        }),
      ];
    }
    default:
      // Unknown format: keep the historical behaviour of yielding no content.
      return [];
  }
};

/**
 * Splits one uploaded file into chunks, embeds them, and writes two sidecar
 * JSON files next to the upload: `<id>-extracted.json` (title + chunk texts)
 * and `<id>-embeddings.json` (title + embedding vectors).
 *
 * @param file - The multer file descriptor of the stored upload.
 * @param embeddingsModel - Model used to embed the chunk texts.
 */
const processUploadedFile = async (
  file: Express.Multer.File,
  embeddingsModel: Embeddings,
): Promise<void> => {
  const docs = await loadUploadedDocuments(file);
  const splitted = await splitter.splitDocuments(docs);
  // Computed once and reused for both the extracted file and the embedding call.
  const contents = splitted.map((doc) => doc.pageContent);

  // The extracted text is written before embedding, as before, so it exists
  // even if the embedding call fails.
  const pathToSave = file.path.replace(/\.\w+$/, '-extracted.json');
  await fs.promises.writeFile(
    pathToSave,
    JSON.stringify({
      title: file.originalname,
      contents,
    }),
  );

  const embeddings = await embeddingsModel.embedDocuments(contents);

  const pathToSaveEmbeddings = file.path.replace(/\.\w+$/, '-embeddings.json');
  await fs.promises.writeFile(
    pathToSaveEmbeddings,
    JSON.stringify({
      title: file.originalname,
      embeddings,
    }),
  );
};

/**
 * POST / — accepts a multipart upload of one or more `files` plus the text
 * fields `embedding_model` and `embedding_model_provider`.
 *
 * Responses:
 * - 400 when the model/provider fields are missing, the model cannot be
 *   resolved, or no files were uploaded.
 * - 200 with `{ files: [{ fileName, fileExtension, fileId }] }` once every
 *   file has been chunked and embedded.
 * - 500 with a generic message when processing throws; the error is logged.
 */
router.post(
  '/',
  upload.fields([
    { name: 'files' },
    { name: 'embedding_model', maxCount: 1 },
    { name: 'embedding_model_provider', maxCount: 1 },
  ]),
  async (req, res) => {
    try {
      // `req.body` can be absent when the request was not multipart.
      const { embedding_model, embedding_model_provider } = req.body ?? {};

      if (
        typeof embedding_model !== 'string' ||
        typeof embedding_model_provider !== 'string' ||
        !embedding_model ||
        !embedding_model_provider
      ) {
        res
          .status(400)
          .json({ message: 'Missing embedding model or provider' });
        return;
      }

      const embeddingModels = await getAvailableEmbeddingModelProviders();

      // Both keys are plain strings; the resolved entry's `model` is the
      // actual Embeddings instance.
      const embeddingsModel = embeddingModels[embedding_model_provider]?.[
        embedding_model
      ]?.model as Embeddings | undefined;

      if (!embeddingsModel) {
        res.status(400).json({ message: 'Invalid embedding model selected' });
        return;
      }

      // With `upload.fields`, multer exposes files as a field-name -> File[]
      // dictionary; it is undefined when the request carried no multipart data.
      const uploaded = req.files;
      const files =
        uploaded && !Array.isArray(uploaded) ? uploaded['files'] : undefined;

      if (!files || files.length === 0) {
        res.status(400).json({ message: 'No files uploaded' });
        return;
      }

      await Promise.all(
        files.map((file) => processUploadedFile(file, embeddingsModel)),
      );

      res.status(200).json({
        files: files.map((file) => {
          return {
            fileName: file.originalname,
            fileExtension: file.filename.split('.').pop(),
            fileId: file.filename.replace(/\.\w+$/, ''),
          };
        }),
      });
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      logger.error(`Error in uploading file results: ${reason}`);
      res.status(500).json({ message: 'An error has occurred.' });
    }
  },
);

export default router;