import logging

import easyocr
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Set up logging for debugging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Initializing EasyOCR...")
# Initialize the EasyOCR reader for English, on CPU.
reader = easyocr.Reader(['en'], gpu=False)
logger.info("EasyOCR initialized.")

logger.info("Loading nutrition extraction model...")
# Load the tokenizer and token-classification model from the Hugging Face Hub.
# No device is specified, so inference runs on CPU.
tokenizer = AutoTokenizer.from_pretrained("openfoodfacts/nutrition-extractor")
model = AutoModelForTokenClassification.from_pretrained("openfoodfacts/nutrition-extractor")
logger.info("Model loaded successfully.")


def ocr_extract(image: Image.Image):
    """
    Uses EasyOCR to extract text tokens and their bounding boxes from an image.

    Returns a list of tokens and corresponding boxes in
    [left, top, width, height] format. Bounding box coordinates are cast to int.
    """
    # Convert the PIL image to a numpy array for EasyOCR.
    np_image = np.array(image)
    results = reader.readtext(np_image)

    tokens = []
    boxes = []
    for bbox, text, confidence in results:
        if not text.strip():
            continue
        tokens.append(text)
        # Convert the bounding box (a list of 4 corner points) to
        # [left, top, width, height].
        xs = [point[0] for point in bbox]
        ys = [point[1] for point in bbox]
        left = int(min(xs))
        top = int(min(ys))
        width = int(max(xs) - left)
        height = int(max(ys) - top)
        boxes.append([left, top, width, height])

    logger.info(f"OCR extracted {len(tokens)} tokens.")
    return tokens, boxes


def predict(image: Image.Image):
    """
    Runs OCR with EasyOCR to extract tokens and bounding boxes, then uses the
    nutrition extraction model to classify tokens and aggregate nutritional values.
    """
    tokens, boxes = ocr_extract(image)
    if len(tokens) == 0:
        logger.error("No text detected in the image.")
        return {"error": "No text detected in the image."}

    # Prepare inputs: pass the tokens and boxes to the tokenizer.
    encoding = tokenizer(tokens, boxes=boxes, return_tensors="pt",
                         truncation=True, padding=True)
    try:
        with torch.no_grad():
            outputs = model(**encoding)
    except Exception as e:
        logger.error(f"Error during model inference: {e}")
        return {"error": f"Model inference error: {e}"}

    # Get the predicted label id for each sub-token.
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

    # Align sub-token predictions back to the input words: each word takes the
    # prediction of its first sub-token (requires a fast tokenizer).
    word_ids = encoding.word_ids(batch_index=0)
    word_predictions = {}
    for idx, word_id in enumerate(word_ids):
        if word_id is not None and word_id not in word_predictions:
            word_predictions[word_id] = predictions[idx]

    extracted_data = {}
    for word_id, pred in word_predictions.items():
        token = tokens[word_id]
        label = model.config.id2label.get(pred, "O").lower()
        if label == "o":
            continue
        # Extract a numeric value from the token; skip tokens with no usable number.
        num_str = "".join(filter(lambda c: c.isdigit() or c == '.', token))
        try:
            value = float(num_str)
            extracted_data[label] = extracted_data.get(label, 0) + value
        except ValueError:
            continue

    if not extracted_data:
        logger.warning("No nutritional information extracted.")
        return {"error": "No nutritional information extracted."}

    logger.info(f"Extracted data: {extracted_data}")
    return extracted_data


# Create a Gradio interface that exposes the API.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="pil"),
    outputs="json",
    title="Nutrition Extractor API with EasyOCR",
    description=(
        "Upload an image of a nutrition table to extract nutritional values. "
        "The pipeline uses EasyOCR to extract tokens and bounding boxes, then "
        "processes them with the openfoodfacts/nutrition-extractor model."
    ),
)

if __name__ == "__main__":
    demo.launch(share=True)
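
# ---------------------------------------------------------------------------
# Usage sketch (not executed by the app): one way to query the launched API
# from a separate process with the gradio_client package. This is a minimal
# example under the assumption that a recent gradio_client is installed and
# that the app is reachable at Gradio's default local URL; the path
# "nutrition_label.jpg" is a hypothetical example image.
#
#     from gradio_client import Client, handle_file
#
#     client = Client("http://127.0.0.1:7860/")
#     result = client.predict(handle_file("nutrition_label.jpg"),
#                             api_name="/predict")
#     print(result)  # dict of extracted values, or {"error": ...}
# ---------------------------------------------------------------------------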