// perplexican/src/routes/uploads.ts
// 0Scottzilla0's picture
// Upload folder using huggingface_hub
// a80ecb8 verified
import express from 'express';
import logger from '../utils/logger';
import multer from 'multer';
import path from 'path';
import crypto from 'crypto';
import fs from 'fs';
import { Embeddings } from '@langchain/core/embeddings';
import { getAvailableEmbeddingModelProviders } from '../lib/providers';
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
import { Document } from 'langchain/document';
/** Router that exposes the file-upload endpoint defined in this module. */
const router = express.Router();

/** Maximum number of characters placed in a single text chunk. */
const CHUNK_SIZE = 500;

/** Number of characters shared between two neighbouring chunks. */
const CHUNK_OVERLAP = 100;

/** Splits extracted documents into overlapping chunks before they are embedded. */
const splitter = new RecursiveCharacterTextSplitter({
  chunkOverlap: CHUNK_OVERLAP,
  chunkSize: CHUNK_SIZE,
});
/** File extensions (lower-case, without the dot) accepted by the upload route. */
const ALLOWED_UPLOAD_EXTENSIONS = ['pdf', 'docx', 'txt'];

/**
 * Disk storage for uploaded files.
 *
 * Files are written to `<cwd>/uploads` under a random 16-byte hex name that
 * keeps the (lower-cased) original extension. Files whose extension is not in
 * `ALLOWED_UPLOAD_EXTENSIONS` are rejected with an error.
 */
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    const uploadDir = path.join(process.cwd(), './uploads');
    // multer only creates the directory itself when `destination` is a string;
    // with a callback we have to make sure it exists, otherwise the very first
    // upload on a fresh checkout fails with ENOENT.
    fs.mkdir(uploadDir, { recursive: true }, (err) => {
      cb(err, uploadDir);
    });
  },
  filename: (req, file, cb) => {
    // Compare case-insensitively so that e.g. "REPORT.PDF" is accepted; the
    // stored name always carries the lower-case extension.
    const fileExtension = (
      file.originalname.split('.').pop() ?? ''
    ).toLowerCase();
    if (!ALLOWED_UPLOAD_EXTENSIONS.includes(fileExtension)) {
      return cb(new Error('File type is not supported'), '');
    }
    cb(null, `${crypto.randomBytes(16).toString('hex')}.${fileExtension}`);
  },
});

/** Multer instance that persists incoming multipart files via `storage`. */
const upload = multer({ storage });
/** Document formats this route can extract text from. */
type UploadFormat = 'pdf' | 'docx' | 'txt';

/** Supported formats keyed by the MIME type reported for the upload. */
const FORMAT_BY_MIME_TYPE = new Map<string, UploadFormat>([
  ['application/pdf', 'pdf'],
  [
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'docx',
  ],
  ['text/plain', 'txt'],
]);

/** Supported formats keyed by the stored file's extension (lower-case, no dot). */
const FORMAT_BY_EXTENSION = new Map<string, UploadFormat>([
  ['pdf', 'pdf'],
  ['docx', 'docx'],
  ['txt', 'txt'],
]);

/**
 * Determines how an uploaded file should be parsed.
 *
 * The reported MIME type wins when it is recognised; otherwise the extension of
 * the stored file name is used, so an upload sent with a generic MIME type
 * (for example `application/octet-stream`) is still parsed instead of being
 * silently treated as empty.
 */
const resolveUploadFormat = (
  file: Express.Multer.File,
): UploadFormat | undefined =>
  FORMAT_BY_MIME_TYPE.get(file.mimetype) ??
  FORMAT_BY_EXTENSION.get(path.extname(file.filename).slice(1).toLowerCase());

/**
 * Loads the uploaded file from disk as a list of documents.
 *
 * @throws Error when the file's format cannot be determined.
 */
const loadDocuments = async (
  file: Express.Multer.File,
): Promise<Document[]> => {
  switch (resolveUploadFormat(file)) {
    case 'pdf':
      return new PDFLoader(file.path).load();
    case 'docx':
      return new DocxLoader(file.path).load();
    case 'txt': {
      const text = await fs.promises.readFile(file.path, 'utf-8');
      return [
        new Document({
          pageContent: text,
          metadata: {
            title: file.originalname,
          },
        }),
      ];
    }
    default:
      // Fail loudly rather than writing empty extraction/embedding files.
      throw new Error(
        `Unsupported upload type for ${file.originalname}: ${file.mimetype}`,
      );
  }
};

/**
 * Extracts, chunks and embeds one uploaded file.
 *
 * Writes two JSON files next to the upload, named after it with the extension
 * replaced: `<id>-extracted.json` (title + chunk texts) and
 * `<id>-embeddings.json` (title + one embedding per chunk).
 */
const processUpload = async (
  file: Express.Multer.File,
  embeddingsModel: Embeddings,
): Promise<void> => {
  const docs = await loadDocuments(file);
  const chunks = (await splitter.splitDocuments(docs)).map(
    (doc) => doc.pageContent,
  );

  // Strip the extension once; appending the suffix (instead of replacing the
  // extension in place) can never resolve to the uploaded file's own path.
  const basePath = file.path.replace(/\.\w+$/, '');

  await fs.promises.writeFile(
    `${basePath}-extracted.json`,
    JSON.stringify({
      title: file.originalname,
      contents: chunks,
    }),
  );

  const embeddings = await embeddingsModel.embedDocuments(chunks);

  await fs.promises.writeFile(
    `${basePath}-embeddings.json`,
    JSON.stringify({
      title: file.originalname,
      embeddings: embeddings,
    }),
  );
};

/**
 * POST / — accepts a multipart upload of one or more `files` plus the
 * `embedding_model` and `embedding_model_provider` fields.
 *
 * Responses:
 * - 200 `{ files: [{ fileName, fileExtension, fileId }] }` on success.
 * - 400 when the model/provider fields are missing, the selected model is not
 *   available, or no files were uploaded.
 * - 500 when processing fails; the error message is logged.
 */
router.post(
  '/',
  upload.fields([
    { name: 'files' },
    { name: 'embedding_model', maxCount: 1 },
    { name: 'embedding_model_provider', maxCount: 1 },
  ]),
  async (req, res) => {
    try {
      // `req.body` may be absent when the request was not parsed at all
      // (e.g. a non-multipart request); treat that as missing fields, not a 500.
      const { embedding_model, embedding_model_provider } = req.body ?? {};

      if (!embedding_model || !embedding_model_provider) {
        res
          .status(400)
          .json({ message: 'Missing embedding model or provider' });
        return;
      }

      const embeddingModels = await getAvailableEmbeddingModelProviders();

      const embeddingsModel = embeddingModels[embedding_model_provider]?.[
        embedding_model
      ]?.model as Embeddings | undefined;

      if (!embeddingsModel) {
        res.status(400).json({ message: 'Invalid embedding model selected' });
        return;
      }

      // With `upload.fields`, files arrive as an object keyed by field name;
      // `req.files` itself is undefined when nothing multipart was parsed.
      const uploaded = req.files;
      const files =
        uploaded && !Array.isArray(uploaded) ? uploaded['files'] : undefined;

      if (!files || files.length === 0) {
        res.status(400).json({ message: 'No files uploaded' });
        return;
      }

      await Promise.all(
        files.map((file) => processUpload(file, embeddingsModel)),
      );

      res.status(200).json({
        files: files.map((file) => {
          return {
            fileName: file.originalname,
            fileExtension: file.filename.split('.').pop(),
            fileId: file.filename.replace(/\.\w+$/, ''),
          };
        }),
      });
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      logger.error(`Error in uploading file results: ${reason}`);
      res.status(500).json({ message: 'An error has occurred.' });
    }
  },
);
export default router;