Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,051 Bytes
dda982a 9caa2c7 dda982a 9caa2c7 dda982a 5b7f920 dda982a 9caa2c7 dda982a 9b25e42 dda982a 9caa2c7 5b7f920 dda982a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import os
import json
import tempfile
import base64
from PIL import Image
import io
from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
# The Gemini client is an optional dependency: remember whether it could be
# imported instead of failing the import of this module.
try:
    from google import genai
except ImportError:
    GEMINI_AVAILABLE = False
else:
    GEMINI_AVAILABLE = True

# Read the Gemini API key from the environment once, at import time.
api_key = os.environ.get("GOOGLE_API_KEY")

# Warn early (without raising) when no key is configured.
if not api_key:
    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")
class GeminiFlashParser(DocumentParser):
    """Parser that uses Google's Gemini Flash 2.0 to convert documents to markdown.

    The input file is read as raw bytes and sent, together with a fixed
    conversion prompt, to the ``gemini-2.0-flash`` model. The text of the
    model's response is returned as the markdown result.
    """

    # Model identifier passed to ``client.models.generate_content``.
    _MODEL_NAME = "gemini-2.0-flash"

    # Instruction sent alongside the document bytes.
    _PROMPT = (
        "\n"
        "Convert this document to markdown format.\n"
        "Preserve the structure, headings, lists, tables, and formatting as much as possible.\n"
        "For images, include a brief description in markdown image syntax.\n"
        "Return only the markdown content, no other text.\n"
    )

    # Sampling / length settings forwarded as the ``config`` argument.
    _GENERATION_CONFIG = {
        "temperature": 0.1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
    }

    # Extension (lower-case, with leading dot) -> MIME type sent to the API.
    _MIME_TYPES = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".doc": "application/msword",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".ppt": "application/vnd.ms-powerpoint",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        ".txt": "text/plain",
        ".md": "text/markdown",
        ".html": "text/html",
        ".htm": "text/html",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
        ".tiff": "image/tiff",
        ".tif": "image/tiff",
    }

    # MIME type used when the extension is not in ``_MIME_TYPES``.
    _DEFAULT_MIME_TYPE = "application/octet-stream"

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "Gemini Flash"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR method descriptors for this parser (only ``"none"``)."""
        return [
            {
                "id": "none",
                "name": "None",
                "default_params": {},
            }
        ]

    @classmethod
    def get_description(cls) -> str:
        """Return a one-line description of this parser."""
        return "Gemini Flash 2.0 parser for converting documents and images to markdown"

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Gemini Flash 2.0.

        Args:
            file_path: Path of the document or image to convert.
            ocr_method: Not used by this parser.
            **kwargs: Not used by this parser.

        Returns:
            The markdown text produced by the model. If reading the file or
            calling the API fails, a markdown error report starting with
            ``# Error`` is returned instead of raising.

        Raises:
            ImportError: If the google-genai client could not be imported.
            ValueError: If no Gemini API key is available.
        """
        if not GEMINI_AVAILABLE:
            raise ImportError(
                "The Google Gemini API client is not installed. "
                "Please install it with 'pip install google-genai'."
            )

        # Prefer the key captured at import time, but fall back to the current
        # environment so a key exported after this module was imported is
        # still picked up.
        resolved_key = api_key or os.getenv("GOOGLE_API_KEY")
        if not resolved_key:
            raise ValueError(
                "GOOGLE_API_KEY environment variable is not set. "
                "Please set it to your Gemini API key."
            )

        try:
            file_path = Path(file_path)
            file_content = file_path.read_bytes()
            mime_type = self._get_mime_type(file_path.suffix)

            client = genai.Client(api_key=resolved_key)
            response = client.models.generate_content(
                model=self._MODEL_NAME,
                contents=[
                    self._PROMPT,
                    genai.types.Part.from_bytes(
                        data=file_content,
                        mime_type=mime_type,
                    ),
                ],
                # Pass a copy so the shared class-level dict can never be
                # mutated by the client.
                config=dict(self._GENERATION_CONFIG),
            )

            markdown_text = response.text
            # ``response.text`` is not guaranteed to hold a string; route a
            # missing text through the error report below rather than
            # returning None from a method typed ``-> str``.
            if markdown_text is None:
                raise ValueError("Gemini returned a response with no text content")
            return markdown_text
        except Exception as e:
            # Deliberate best-effort: report the failure as markdown so the
            # caller always receives a string.
            error_message = f"Error parsing document with Gemini Flash: {str(e)}"
            print(error_message)
            return f"# Error\n\n{error_message}\n\nPlease check your API key and try again."

    def _get_mime_type(self, file_extension: str) -> str:
        """Get the MIME type for a file extension.

        Args:
            file_extension: Extension including the leading dot (e.g. ``".pdf"``).
                Matching is case-insensitive.

        Returns:
            The mapped MIME type, or ``"application/octet-stream"`` when the
            extension is not recognised.
        """
        return self._MIME_TYPES.get(file_extension.lower(), self._DEFAULT_MIME_TYPE)
# Make the parser discoverable through the registry, but only when the
# google-genai client was importable; otherwise just report why it was skipped.
if not GEMINI_AVAILABLE:
    print("Gemini Flash parser not registered: google-genai package not installed")
else:
    ParserRegistry.register(GeminiFlashParser)