# Spaces: Running on Zero
from pathlib import Path | |
from typing import Dict, List, Optional, Any, Union | |
import os | |
import json | |
import tempfile | |
import base64 | |
from PIL import Image | |
import io | |
from src.parsers.parser_interface import DocumentParser | |
from src.parsers.parser_registry import ParserRegistry | |
# The Google Gemini client (google-genai) is optional: remember whether the
# import worked instead of letting a missing package break this module.
try:
    from google import genai
except ImportError:
    GEMINI_AVAILABLE = False
else:
    GEMINI_AVAILABLE = True

# Read the Gemini API key from the environment once, at import time.
api_key = os.environ.get("GOOGLE_API_KEY")

# A missing or empty key is only reported here; nothing is raised at import.
if not api_key:
    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")
class GeminiFlashParser(DocumentParser):
    """Parser that uses Google's Gemini Flash 2.0 to convert documents to markdown.

    The file is read as raw bytes, tagged with a MIME type derived from its
    extension, and sent together with a fixed conversion prompt to the Gemini
    model. The model's text reply is returned as the markdown result.
    """

    # Model identifier sent with every generation request.
    _MODEL_NAME = "gemini-2.0-flash"

    # Instruction sent alongside the document bytes.
    _PROMPT = (
        "\n"
        "Convert this document to markdown format.\n"
        "Preserve the structure, headings, lists, tables, and formatting as much as possible.\n"
        "For images, include a brief description in markdown image syntax.\n"
        "Return only the markdown content, no other text.\n"
    )

    # Extension (lower-case, with leading dot) -> MIME type. Built once at
    # class creation instead of on every _get_mime_type() call.
    _MIME_TYPES: Dict[str, str] = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".doc": "application/msword",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".ppt": "application/vnd.ms-powerpoint",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        ".txt": "text/plain",
        ".md": "text/markdown",
        ".html": "text/html",
        ".htm": "text/html",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
        ".tiff": "image/tiff",
        ".tif": "image/tiff",
    }

    # The three descriptor methods take ``cls``; without @classmethod they
    # could not be called on the class itself (TypeError: missing ``cls``).
    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "Gemini Flash"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR method descriptors offered by this parser.

        Only a single "none" entry with empty default parameters is offered.
        """
        return [
            {
                "id": "none",
                "name": "None",
                "default_params": {}
            }
        ]

    @classmethod
    def get_description(cls) -> str:
        """Return a one-line description of this parser."""
        return "Gemini Flash 2.0 parser for converting documents and images to markdown"

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Gemini Flash 2.0.

        Args:
            file_path: Path of the document or image to convert.
            ocr_method: Accepted for interface compatibility; not used here.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The markdown text produced by the model. If anything fails while
            reading the file or calling the API, a markdown-formatted error
            report is returned instead of raising.

        Raises:
            ImportError: If the google-genai package could not be imported.
            ValueError: If no Gemini API key is available.
        """
        if not GEMINI_AVAILABLE:
            raise ImportError(
                "The Google Gemini API client is not installed. "
                "Please install it with 'pip install google-genai'."
            )

        # Prefer the key captured at import time, but fall back to the current
        # environment so a key exported after this module was imported works.
        key = api_key or os.getenv("GOOGLE_API_KEY")
        if not key:
            raise ValueError(
                "GOOGLE_API_KEY environment variable is not set. "
                "Please set it to your Gemini API key."
            )

        try:
            # The extension (case-insensitive) decides the MIME type sent
            # with the raw file bytes.
            file_path = Path(file_path)
            file_content = file_path.read_bytes()
            mime_type = self._get_mime_type(file_path.suffix.lower())

            client = genai.Client(api_key=key)
            response = client.models.generate_content(
                model=self._MODEL_NAME,
                contents=[
                    self._PROMPT,
                    genai.types.Part.from_bytes(
                        data=file_content,
                        mime_type=mime_type
                    )
                ],
                config={
                    "temperature": 0.1,
                    "top_p": 0.95,
                    "top_k": 40,
                    "max_output_tokens": 8192,
                }
            )

            markdown_text = response.text
            # The SDK can yield no text at all (e.g. a response without text
            # parts); report that as an error rather than returning None from
            # a function annotated to return str.
            if markdown_text is None:
                raise ValueError("Gemini returned a response without any text content")
            return markdown_text
        except Exception as e:
            # Deliberate best-effort boundary: callers receive a markdown
            # error report instead of an exception.
            error_message = f"Error parsing document with Gemini Flash: {str(e)}"
            print(error_message)
            return f"# Error\n\n{error_message}\n\nPlease check your API key and try again."

    def _get_mime_type(self, file_extension: str) -> str:
        """Get the MIME type for a file extension.

        Args:
            file_extension: Lower-case extension including the leading dot,
                e.g. ".pdf".

        Returns:
            The mapped MIME type, or "application/octet-stream" when the
            extension is not in the table.
        """
        return self._MIME_TYPES.get(file_extension, "application/octet-stream")
# Add the parser to the registry only when the google-genai client was
# importable; otherwise print why registration was skipped.
if not GEMINI_AVAILABLE:
    print("Gemini Flash parser not registered: google-genai package not installed")
else:
    ParserRegistry.register(GeminiFlashParser)