File size: 5,051 Bytes
dda982a
 
 
 
 
 
 
 
 
 
 
 
 
 
9caa2c7
dda982a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9caa2c7
 
dda982a
 
 
 
 
 
5b7f920
dda982a
 
 
9caa2c7
 
dda982a
 
9b25e42
 
 
 
dda982a
9caa2c7
5b7f920
dda982a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import os
import json
import tempfile
import base64
from PIL import Image
import io

from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry

# Optional dependency: the Google Gemini API client (google-genai).
# Assume it is present, then clear the flag if the import fails so the rest
# of the module can check GEMINI_AVAILABLE instead of failing at import time.
GEMINI_AVAILABLE = True
try:
    from google import genai
except ImportError:
    GEMINI_AVAILABLE = False

# Read the Gemini API key from the environment once, at import time.
api_key = os.environ.get("GOOGLE_API_KEY")

# Warn early (without failing the import) when the key is missing or empty.
if api_key is None or api_key == "":
    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")

class GeminiFlashParser(DocumentParser):
    """Parser that uses Google's Gemini Flash 2.0 to convert documents to markdown.

    The input file is read as raw bytes and sent, together with a fixed
    conversion prompt, to the ``gemini-2.0-flash`` model in a single
    ``generate_content`` call. The model's text reply is returned as-is.
    """

    # Extension (lower-case, including the leading dot) -> MIME type attached
    # to the uploaded bytes. Defined once on the class instead of being
    # rebuilt on every lookup.
    _MIME_TYPES: Dict[str, str] = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".doc": "application/msword",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".ppt": "application/vnd.ms-powerpoint",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        ".txt": "text/plain",
        ".md": "text/markdown",
        ".html": "text/html",
        ".htm": "text/html",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
        ".tiff": "image/tiff",
        ".tif": "image/tiff",
    }

    # MIME type reported for any extension missing from the table above.
    _DEFAULT_MIME_TYPE = "application/octet-stream"

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "Gemini Flash"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR method descriptors for this parser.

        Only a single "none" entry with empty default parameters is offered.
        """
        return [
            {
                "id": "none",
                "name": "None",
                "default_params": {}
            }
        ]

    @classmethod
    def get_description(cls) -> str:
        """Return a one-line description of this parser."""
        return "Gemini Flash 2.0 parser for converting documents and images to markdown"

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Gemini Flash 2.0.

        Args:
            file_path: Path of the document or image to convert.
            ocr_method: Not used by this parser.
            **kwargs: Not used by this parser.

        Returns:
            The markdown text produced by the model. If reading the file or
            calling the API fails, or the model returns no text, a markdown
            document starting with "# Error" that describes the failure is
            returned instead of raising.

        Raises:
            ImportError: If the google-genai client could not be imported.
            ValueError: If no API key is available.
        """
        if not GEMINI_AVAILABLE:
            raise ImportError(
                "The Google Gemini API client is not installed. "
                "Please install it with 'pip install google-genai'."
            )

        # Prefer the key captured at import time, but fall back to the current
        # environment so a key exported after this module was imported
        # (e.g. by a later dotenv load) is still picked up.
        key = api_key or os.getenv("GOOGLE_API_KEY")
        if not key:
            raise ValueError(
                "GOOGLE_API_KEY environment variable is not set. "
                "Please set it to your Gemini API key."
            )

        try:
            # Normalise the path and pick the MIME type from its extension.
            file_path = Path(file_path)
            mime_type = self._get_mime_type(file_path.suffix.lower())

            # The whole file is sent inline as bytes.
            file_content = file_path.read_bytes()

            client = genai.Client(api_key=key)

            # Set up the prompt
            prompt = """
            Convert this document to markdown format. 
            Preserve the structure, headings, lists, tables, and formatting as much as possible.
            For images, include a brief description in markdown image syntax.
            Return only the markdown content, no other text.
            """

            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    prompt,
                    genai.types.Part.from_bytes(
                        data=file_content,
                        mime_type=mime_type
                    )
                ],
                config={
                    "temperature": 0.1,
                    "top_p": 0.95,
                    "top_k": 40,
                    "max_output_tokens": 8192,
                }
            )

            markdown_text = response.text
            # The SDK exposes ``text`` as optional; without this guard the
            # method would return None despite its ``-> str`` annotation.
            if markdown_text is None:
                raise ValueError(
                    "Gemini returned no text content "
                    "(the response may be empty or blocked)."
                )

            return markdown_text

        except Exception as e:
            # Best-effort contract: report the failure as markdown rather
            # than propagating it to the caller.
            error_message = f"Error parsing document with Gemini Flash: {e}"
            print(error_message)
            return f"# Error\n\n{error_message}\n\nPlease check your API key and try again."

    def _get_mime_type(self, file_extension: str) -> str:
        """Get the MIME type for a file extension.

        Args:
            file_extension: Extension including the leading dot. The lookup
                is case-sensitive and the table holds lower-case keys, so
                callers should lower-case it first (``parse`` does).

        Returns:
            The mapped MIME type, or "application/octet-stream" when the
            extension is not in the table.
        """
        return self._MIME_TYPES.get(file_extension, self._DEFAULT_MIME_TYPE)


# Make the parser discoverable through the registry, but only when the
# google-genai client imported successfully; otherwise say why it was skipped.
if not GEMINI_AVAILABLE:
    print("Gemini Flash parser not registered: google-genai package not installed")
else:
    ParserRegistry.register(GeminiFlashParser)