File size: 5,073 Bytes
2dc4c21
 
 
 
 
 
 
 
 
 
 
 
 
 
06351e4
2dc4c21
 
 
 
06351e4
 
 
 
 
 
2dc4c21
 
06351e4
 
2dc4c21
 
 
06351e4
2dc4c21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06351e4
2dc4c21
 
 
 
 
 
 
06351e4
 
 
 
 
 
 
 
 
2dc4c21
06351e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2dc4c21
 
06351e4
 
 
2dc4c21
 
06351e4
2dc4c21
 
06351e4
 
 
 
 
 
2dc4c21
06351e4
2dc4c21
 
 
 
 
 
 
 
 
 
 
06351e4
2dc4c21
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import os
import json
import tempfile
import base64
from PIL import Image
import io

from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry

# Import the Google Gemini API client
try:
    import google.generativeai as genai
    GEMINI_AVAILABLE = True
except ImportError:
    GEMINI_AVAILABLE = False

# Load API key from environment variable
api_key = os.getenv("GOOGLE_API_KEY")

# Check if API key is available and print a message if not
if not api_key:
    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")

class GeminiFlashParser(DocumentParser):
    """Parser that uses Google's Gemini Flash 2.0 to convert documents to markdown."""

    @classmethod
    def get_name(cls) -> str:
        return "Gemini Flash"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        return [
            {
                "id": "none",
                "name": "None",
                "default_params": {}
            }
        ]
    
    @classmethod
    def get_description(cls) -> str:
        return "Gemini Flash 2.0 parser for converting documents and images to markdown"
    
    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Gemini Flash 2.0."""
        if not GEMINI_AVAILABLE:
            raise ImportError(
                "The Google Gemini API client is not installed. "
                "Please install it with 'pip install google-genai'."
            )
        
        # Use the globally loaded API key
        if not api_key:
            raise ValueError(
                "GOOGLE_API_KEY environment variable is not set. "
                "Please set it to your Gemini API key."
            )
        
        try:
            # Configure the Gemini API with the API key
            genai.configure(api_key=api_key)
            
            # Determine file type based on extension
            file_path = Path(file_path)
            file_extension = file_path.suffix.lower()
            
            # Read the file content
            file_content = file_path.read_bytes()
            
            # Determine MIME type based on file extension
            mime_type = self._get_mime_type(file_extension)
            
            # Create a multipart content with the file
            model = genai.GenerativeModel('gemini-2.0-flash')
            
            # Set up the prompt
            prompt = """
            Convert this document to markdown format. 
            Preserve the structure, headings, lists, tables, and formatting as much as possible.
            For images, include a brief description in markdown image syntax.
            """
            
            # Generate the response
            response = model.generate_content(
                contents=[
                    prompt,
                    {
                        "mime_type": mime_type,
                        "data": file_content
                    }
                ],
                generation_config={
                    "temperature": 0.2,
                    "top_p": 0.95,
                    "top_k": 40,
                    "max_output_tokens": 8192,
                }
            )
            
            # Extract the markdown text from the response
            markdown_text = response.text
            
            return markdown_text
            
        except Exception as e:
            error_message = f"Error parsing document with Gemini Flash: {str(e)}"
            print(error_message)
            return f"# Error\n\n{error_message}\n\nPlease check your API key and try again."
    
    def _get_mime_type(self, file_extension: str) -> str:
        """Get the MIME type for a file extension."""
        mime_types = {
            ".pdf": "application/pdf",
            ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ".doc": "application/msword",
            ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            ".ppt": "application/vnd.ms-powerpoint",
            ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ".xls": "application/vnd.ms-excel",
            ".txt": "text/plain",
            ".md": "text/markdown",
            ".html": "text/html",
            ".htm": "text/html",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".bmp": "image/bmp",
            ".tiff": "image/tiff",
            ".tif": "image/tiff",
        }
        
        return mime_types.get(file_extension, "application/octet-stream")


# Register the parser with the registry
if GEMINI_AVAILABLE:
    ParserRegistry.register(GeminiFlashParser)
else:
    print("Gemini Flash parser not registered: google-genai package not installed")