File size: 2,420 Bytes
d58981a a0ead68 1872809 c379e84 a74ff63 c379e84 eebad8b c379e84 19d685b a74ff63 cf7f7d8 c379e84 53f6c3a cf7f7d8 53f6c3a 27d29ad da48084 c379e84 27d29ad 53f6c3a c379e84 eebad8b c379e84 eebad8b c379e84 eebad8b c379e84 eebad8b c379e84 eebad8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import os
# Install the Tesseract OCR system package when the module is imported.
# ``-y`` answers apt's confirmation prompt; without it the install aborts
# when no terminal is attached to answer the question.
_tesseract_install_status = os.system("apt-get install -y tesseract-ocr")
if _tesseract_install_status != 0:
    # Surface the failure early instead of discovering it on the first OCR call.
    print(
        "[warn] apt-get install tesseract-ocr exited with status "
        f"{_tesseract_install_status}"
    )
from functools import lru_cache
from io import BytesIO
from typing import Optional, Annotated

import pytesseract
from fastapi import FastAPI, File, Request, UploadFile, Body, Depends, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.security.api_key import APIKeyHeader
from nltk.tokenize import sent_tokenize
from PIL import Image
from transformers import MarianMTModel, MarianTokenizer
# Application instance that the route decorators below register on.
app = FastAPI()

# Shared secret that clients must present; ``None`` when the environment
# variable is not set.
API_KEY = os.getenv("API_KEY")

# Header scheme that reads the key from the ``api_key`` request header.
# ``auto_error=False`` leaves the missing-header case to ``get_api_key``.
api_key_header = APIKeyHeader(auto_error=False, name="api_key")
def get_api_key(api_key: Optional[str] = Depends(api_key_header)):
    """Validate the key sent in the ``api_key`` request header.

    Intended to be used as a FastAPI dependency by the route handlers.

    Args:
        api_key: Value produced by ``api_key_header``; ``None`` when the
            header was not sent.

    Returns:
        The supplied key, after it has been checked against ``API_KEY``.

    Raises:
        HTTPException: 401 when the header is missing, when no ``API_KEY``
            is configured on the server, or when the two values differ.
    """
    # Function-scope import keeps this block self-contained.
    import secrets

    # Fail closed: with no header or no configured secret nothing can match.
    if api_key is None or API_KEY is None:
        raise HTTPException(status_code=401, detail="Unauthorized access")

    # Constant-time comparison so response timing does not reveal how much of
    # the key was correct.  Compare bytes because ``compare_digest`` rejects
    # ``str`` values containing non-ASCII characters.
    if not secrets.compare_digest(api_key.encode("utf-8"), API_KEY.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Unauthorized access")

    return api_key
@app.post("/api/ocr", response_model=dict)
async def ocr(
    api_key: str = Depends(get_api_key),
    image: UploadFile = File(...),
):
    """Run English OCR on an uploaded image and return the recognised text.

    Args:
        api_key: Resolved by ``get_api_key``; present only to enforce
            authentication on this route.
        image: The uploaded image file.

    Returns:
        ``{"ImageText": <text recognised by pytesseract>}``.

    Raises:
        HTTPException: 500 carrying the underlying error message when the
            upload cannot be read, opened as an image, or processed.
    """
    try:
        content = await image.read()
        # Use a separate name rather than rebinding the ``image`` parameter.
        picture = Image.open(BytesIO(content))
        text = pytesseract.image_to_string(picture, lang="eng")
    except Exception as e:
        # FastAPI does not interpret a ``(body, status)`` tuple as a status
        # response; raising HTTPException is what actually produces a 500.
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"ImageText": text}
@app.post("/api/translate", response_model=dict)
async def translate(
    api_key: str = Depends(get_api_key),
    text: str = Body(...),
    src: str = "en",
    trg: str = "zh",
):
    """Translate ``text`` one sentence at a time.

    Args:
        api_key: Resolved by ``get_api_key``; present only to enforce
            authentication on this route.
        text: Request body containing the text to translate.
        src: Source language code used to select the model (default ``"en"``).
        trg: Target language code used to select the model (default ``"zh"``).

    Returns:
        ``{"translated_text": ...}`` where every translated sentence is
        followed by a newline character.
    """
    # No key check is needed here: the ``get_api_key`` dependency raises 401
    # before this handler runs unless the key matches.
    tokenizer, model = get_model(src, trg)

    translated_sentences = []
    for sentence in sent_tokenize(text):
        encoded = tokenizer(sentence, return_tensors="pt")
        # ``generate`` returns a batch; a single sentence gives one sequence.
        output_ids = model.generate(**encoded)[0]
        translated_sentences.append(
            tokenizer.decode(output_ids, skip_special_tokens=True)
        )

    # Each sentence keeps its trailing newline, including the last one.
    translated_text = "".join(f"{line}\n" for line in translated_sentences)
    return jsonable_encoder({"translated_text": translated_text})
# Bounded cache: the language codes come from the caller, so an unbounded
# cache could grow without limit.  Calls that raise are not cached.
@lru_cache(maxsize=4)
def get_model(src: str, trg: str):
    """Return the tokenizer and model for the ``src`` -> ``trg`` language pair.

    Results are memoised so repeated requests for the same pair reuse the
    already-loaded objects instead of loading them again.

    Args:
        src: Source language code, inserted into the model name.
        trg: Target language code, inserted into the model name.

    Returns:
        A ``(tokenizer, model)`` tuple loaded from
        ``Helsinki-NLP/opus-mt-{src}-{trg}``.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model
|