Spaces:

Ansemin101
/

Markit_v2

Running on Zero

App Files Files Community

Markit_v2 / src /parsers /gemini_flash_parser.py

AnseMin

Approach #2 -- converting LaTeX output from GOT OCR to Markdown

5b7f920 about 1 month ago

raw

history blame contribute delete

5.05 kB

	from pathlib import Path
	from typing import Dict, List, Optional, Any, Union
	import os
	import json
	import tempfile
	import base64
	from PIL import Image
	import io

	from src.parsers.parser_interface import DocumentParser
	from src.parsers.parser_registry import ParserRegistry

	# The Gemini client is an optional dependency: record whether the import
	# worked in a flag instead of letting a missing package break this module.
	try:
	    from google import genai
	except ImportError:
	    GEMINI_AVAILABLE = False
	else:
	    GEMINI_AVAILABLE = True

	# Read the API key from the environment once, when the module is imported.
	api_key = os.environ.get("GOOGLE_API_KEY")

	# A missing or empty key only produces a warning here.
	if api_key is None or api_key == "":
	    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")

	class GeminiFlashParser(DocumentParser):
	    """Parser that uses Google's Gemini Flash 2.0 to convert documents to markdown.

	    The input file is read from disk and sent inline, together with a
	    conversion prompt, to the ``gemini-2.0-flash`` model. The text of the
	    model's reply is returned as the parse result.
	    """

	    # Extension -> MIME type table, built once rather than on every lookup.
	    _MIME_TYPES = {
	        ".pdf": "application/pdf",
	        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	        ".doc": "application/msword",
	        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
	        ".ppt": "application/vnd.ms-powerpoint",
	        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	        ".xls": "application/vnd.ms-excel",
	        ".txt": "text/plain",
	        ".md": "text/markdown",
	        ".html": "text/html",
	        ".htm": "text/html",
	        ".jpg": "image/jpeg",
	        ".jpeg": "image/jpeg",
	        ".png": "image/png",
	        ".gif": "image/gif",
	        ".bmp": "image/bmp",
	        ".tiff": "image/tiff",
	        ".tif": "image/tiff",
	    }

	    # MIME type reported for any extension missing from ``_MIME_TYPES``.
	    _DEFAULT_MIME_TYPE = "application/octet-stream"

	    @classmethod
	    def get_name(cls) -> str:
	        """Return the display name of this parser."""
	        return "Gemini Flash"

	    @classmethod
	    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
	        """Return the OCR method descriptors; the only entry has id "none"."""
	        return [
	            {
	                "id": "none",
	                "name": "None",
	                "default_params": {},
	            }
	        ]

	    @classmethod
	    def get_description(cls) -> str:
	        """Return a one-line description of this parser."""
	        return "Gemini Flash 2.0 parser for converting documents and images to markdown"

	    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
	        """Parse a document using Gemini Flash 2.0.

	        Args:
	            file_path: Path of the document or image to convert.
	            ocr_method: Not used by this parser.
	            **kwargs: Not used by this parser.

	        Returns:
	            The markdown text produced by the model. If reading the file or
	            calling the API fails, or the model returns no text, a
	            markdown-formatted error report is returned instead.

	        Raises:
	            ImportError: If the ``google-genai`` client could not be imported.
	            ValueError: If no API key is available.
	        """
	        if not GEMINI_AVAILABLE:
	            raise ImportError(
	                "The Google Gemini API client is not installed. "
	                "Please install it with 'pip install google-genai'."
	            )

	        # Prefer the key loaded at import time; if it was empty then, look at
	        # the environment again so a key exported later is still honoured.
	        key = api_key or os.getenv("GOOGLE_API_KEY")
	        if not key:
	            raise ValueError(
	                "GOOGLE_API_KEY environment variable is not set. "
	                "Please set it to your Gemini API key."
	            )

	        try:
	            path = Path(file_path)

	            # The raw bytes are sent inline, tagged with a MIME type derived
	            # from the file extension.
	            file_content = path.read_bytes()
	            mime_type = self._get_mime_type(path.suffix)

	            client = genai.Client(api_key=key)

	            # Prompt text is kept verbatim.
	            prompt = """
	Convert this document to markdown format.
	Preserve the structure, headings, lists, tables, and formatting as much as possible.
	For images, include a brief description in markdown image syntax.
	Return only the markdown content, no other text.
	"""

	            response = client.models.generate_content(
	                model="gemini-2.0-flash",
	                contents=[
	                    prompt,
	                    genai.types.Part.from_bytes(
	                        data=file_content,
	                        mime_type=mime_type,
	                    ),
	                ],
	                config={
	                    "temperature": 0.1,
	                    "top_p": 0.95,
	                    "top_k": 40,
	                    "max_output_tokens": 8192,
	                },
	            )

	            # ``response.text`` may be None when the reply has no text parts;
	            # report that as an error instead of returning None from a
	            # method declared to return ``str``.
	            markdown_text = response.text
	            if markdown_text is None:
	                raise ValueError("Gemini returned no text content for this document.")

	            return markdown_text

	        except Exception as e:
	            # Any failure above is reported to the caller as markdown text
	            # rather than raised.
	            error_message = f"Error parsing document with Gemini Flash: {str(e)}"
	            print(error_message)
	            return f"# Error\n\n{error_message}\n\nPlease check your API key and try again."

	    def _get_mime_type(self, file_extension: str) -> str:
	        """Get the MIME type for a file extension.

	        Args:
	            file_extension: Extension including the leading dot (e.g. ".pdf");
	                matched case-insensitively.

	        Returns:
	            The MIME type from the table, or "application/octet-stream" when
	            the extension is not listed.
	        """
	        return self._MIME_TYPES.get(file_extension.lower(), self._DEFAULT_MIME_TYPE)


	# Only expose the parser through the registry when the Gemini client
	# imported successfully; otherwise just report why it is missing.
	if not GEMINI_AVAILABLE:
	    print("Gemini Flash parser not registered: google-genai package not installed")
	else:
	    ParserRegistry.register(GeminiFlashParser)