Spaces:
Running
Running
File size: 2,028 Bytes
564e576 719022a 564e576 fd7f926 564e576 719022a 564e576 aa0485a 564e576 719022a 564e576 719022a 564e576 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import type { BackendTool } from ".";
import { callSpace, getIpToken } from "./utils";
import { downloadFile } from "$lib/server/files/downloadFile";
/**
 * Positional arguments passed to the document-parser Space:
 * the document contents followed by the name of the file.
 */
type PdfParserInput = [pdf: Blob, filename: string];

/**
 * Positional results received from the document-parser Space:
 * the extracted markdown followed by a metadata record.
 */
type PdfParserOutput = [markdown: string, metadata: Record<string, unknown>];
/**
 * Backend tool that sends a file attached to a conversation message to the
 * `huggingchat/document-parser` Space and returns its content as markdown.
 */
const documentParser: BackendTool = {
	name: "document_parser",
	displayName: "Document Parser",
	description: "Use this tool to parse any document and get its content in markdown format.",
	mimeTypes: ["application/*", "text/*"],
	parameterDefinitions: {
		fileMessageIndex: {
			description: "Index of the message containing the document file to parse",
			type: "number",
			required: true,
		},
		fileIndex: {
			description: "Index of the document file to parse",
			type: "number",
			required: true,
		},
	},
	/**
	 * Parses one file attached to a message of the conversation.
	 *
	 * @param fileMessageIndex - index into `messages` of the message holding the file
	 * @param fileIndex - index of the file within that message's `files`
	 * @returns a single output mapping the file name to its (possibly truncated)
	 *   markdown, with `display` set to `false`
	 * @throws Error if the selected message has no files, or if `fileIndex` is not
	 *   an integer within the bounds of the message's file list
	 */
	async *call({ fileMessageIndex, fileIndex }, { conv, messages, ip, username }) {
		// Upper bound on returned characters.
		// TODO: quick fix for avoiding context limit. eventually should use the tokenizer
		const maxMarkdownLength = 30_000;
		const truncationNotice = "\n\n... (truncated)";

		// The indices are coerced defensively; work on local copies rather than
		// reassigning the parameters.
		const messageIdx = Number(fileMessageIndex);
		const fileIdx = Number(fileIndex);

		// A missing message (including a NaN / out-of-range index) yields no files.
		const files = messages[messageIdx]?.files ?? [];
		if (files.length === 0) throw Error("User did not provide a pdf to parse");

		// Reject NaN, fractional and negative indices as well as too-large ones;
		// otherwise `files[fileIdx]` is undefined and fails later with an opaque TypeError.
		if (!Number.isInteger(fileIdx) || fileIdx < 0 || fileIdx >= files.length) {
			throw Error("Model provided an invalid file index");
		}
		const file = files[fileIdx];

		// The downloaded file exposes a mime type and a base64 payload; fetching the
		// equivalent data URL is used here to turn it into a Blob.
		const downloaded = await downloadFile(file.value, conv._id);
		const dataUrlResponse = await fetch(`data:${downloaded.mime};base64,${downloaded.value}`);
		const fileBlob = await dataUrlResponse.blob();

		const ipToken = await getIpToken(ip, username);

		const outputs = await callSpace<PdfParserInput, PdfParserOutput>(
			"huggingchat/document-parser",
			"predict",
			[fileBlob, file.name],
			ipToken
		);

		let documentMarkdown = outputs[0];
		if (documentMarkdown.length > maxMarkdownLength) {
			documentMarkdown = documentMarkdown.slice(0, maxMarkdownLength) + truncationNotice;
		}

		return {
			outputs: [{ [file.name]: documentMarkdown }],
			display: false,
		};
	},
};
export default documentParser;
|