# github-actions[bot]
# Sync from https://github.com/ryanlinjui/menu-text-detection
# 8af6af2
import json
import numpy as np
from PIL import Image
from google import genai
from google.genai import types
from .base import LLMBase
# Function declaration (tool schema) passed to Gemini, loaded once at import
# time. The path is relative to the current working directory.
# A context manager guarantees the file handle is closed, and the explicit
# UTF-8 encoding keeps parsing independent of the platform's default locale.
with open("tools/schema_gemini.json", "r", encoding="utf-8") as _schema_file:
    FUNCTION_CALL = json.load(_schema_file)
class GeminiAPI(LLMBase):
    """Gemini backend that sends an image and returns forced function-call arguments."""

    @classmethod
    def call(cls, image: np.ndarray, model: str, token: str) -> dict:
        """Send *image* to a Gemini model and return the function-call arguments.

        The request declares ``FUNCTION_CALL`` as the only tool and sets the
        function-calling mode to ``"ANY"`` restricted to that function's name,
        so the model is asked to answer with a call to it.

        Args:
            image: Image as a NumPy array; converted with ``Image.fromarray``.
            model: Name of the Gemini model to query.
            token: API key used to create the ``genai.Client``.

        Returns:
            The arguments of the first function call found in the first
            candidate, or an empty dict when the response carries no
            candidates, no content parts, or no function call.

        Note:
            Errors raised by the client itself (network, authentication, ...)
            are not caught here and propagate to the caller.
        """
        client = genai.Client(api_key=token)  # Initialize the client with the API key
        encode_img = Image.fromarray(image)  # Convert the image for the API

        config = types.GenerateContentConfig(
            tools=[types.Tool(function_declarations=[FUNCTION_CALL])],
            tool_config={
                "function_calling_config": {
                    "mode": "ANY",
                    "allowed_function_names": [FUNCTION_CALL["name"]],
                }
            },
        )

        response = client.models.generate_content(
            model=model,
            contents=[encode_img],
            config=config,
        )

        # A response may come back without candidates, content, or parts
        # (e.g. when it is blocked); treat each of those as "no result"
        # instead of crashing on an index or attribute access.
        candidates = response.candidates or []
        if not candidates:
            return {}

        content = candidates[0].content
        parts = (content.parts if content is not None else None) or []

        # The function call is not guaranteed to be the first part, so scan
        # every part of the first candidate.
        for part in parts:
            function_call = part.function_call
            if function_call:
                # ``args`` may be unset; keep the declared ``dict`` return type.
                return function_call.args or {}

        return {}