FatimaGr committed on
Commit
4210dc2
·
verified ·
1 Parent(s): 5bbdec3
Files changed (1) hide show
  1. app.py +151 -2
app.py CHANGED
@@ -1,9 +1,158 @@
1
- from fastapi import FastAPI
 
2
  from fastapi.staticfiles import StaticFiles
3
- from fastapi.responses import RedirectResponse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  app = FastAPI()
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  # Servir les fichiers statiques (HTML, CSS, JS)
8
  app.mount("/static", StaticFiles(directory="static", html=True), name="static")
9
 
 
1
+ from fastapi import FastAPI, File, UploadFile, Form
2
+ from fastapi.responses import JSONResponse, RedirectResponse
3
  from fastapi.staticfiles import StaticFiles
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer, MarianMTModel, MarianTokenizer
6
+ import shutil
7
+ #
8
+ import os
9
+ import logging
10
+ from PyPDF2 import PdfReader
11
+ import docx
12
+ from PIL import Image
13
+ import openpyxl # 📌 Pour lire les fichiers Excel (.xlsx)
14
+ from pptx import Presentation
15
+ import fitz # PyMuPDF
16
+ import io
17
+ from docx import Document
18
+ import matplotlib.pyplot as plt
19
+ import seaborn as sns
20
+ import torch
21
+ import re
22
+ import pandas as pd
23
+ from transformers import AutoTokenizer, AutoModelForCausalLM
24
+ from fastapi.responses import FileResponse
25
+ import os
26
+ from fastapi.middleware.cors import CORSMiddleware
27
+ import matplotlib
28
+ matplotlib.use('Agg')
29
+
30
+ import re
31
+ import torch
32
+ import pandas as pd
33
+ import matplotlib.pyplot as plt
34
+ import seaborn as sns
35
+ from transformers import AutoTokenizer, AutoModelForCausalLM
36
+ from fastapi import FastAPI, File, UploadFile, Form
37
+ from fastapi.responses import FileResponse
38
+ import os
39
+ from fastapi.middleware.cors import CORSMiddleware
40
+ from fastapi import FastAPI, File, UploadFile, Form
41
+ from fastapi.responses import JSONResponse, RedirectResponse
42
+ from fastapi.staticfiles import StaticFiles
43
+ from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer
44
+ import shutil
45
+ import os
46
+ import logging
47
+ from fastapi.middleware.cors import CORSMiddleware
48
+ from PyPDF2 import PdfReader
49
+ import docx
50
+ from PIL import Image # Pour ouvrir les images avant analyse
51
+ from transformers import MarianMTModel, MarianTokenizer
52
+ import os
53
+ import fitz
54
+ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
55
+
56
+ import logging
57
+ import openpyxl
58
+
59
+
60
+ # Logging configuration: emit INFO-level messages and above.
61
+ logging.basicConfig(level=logging.INFO)
62
 
63
  app = FastAPI()
64
 
65
+ # CORS configuration: every origin, method and header is allowed.
66
+ app.add_middleware(
67
+ CORSMiddleware,
68
+ allow_origins=["*"],
69
+ allow_credentials=True,  # NOTE(review): FastAPI's CORS docs say "*" origins/methods/headers cannot be combined with allow_credentials=True -- confirm the intended policy
70
+ allow_methods=["*"],
71
+ allow_headers=["*"],
72
+ )
73
+
74
+ UPLOAD_DIR = "uploads"
75
+ os.makedirs(UPLOAD_DIR, exist_ok=True)  # created at import time; exist_ok avoids an error when it is already there
76
+
77
+ # Translation -----------------------------------------------------------------------------------------------------------
78
+
79
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
80
+ model_name = "facebook/m2m100_418M"
81
+ tokenizer = AutoTokenizer.from_pretrained(model_name)  # loaded once at import; reused by the /translate/ handler
82
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)  # loaded once at import; reused by the /translate/ handler
83
+
84
+
85
+ # Text-extraction helpers, one per supported document format
86
def extract_text_from_pdf(file):
    """Return the plain text of an uploaded PDF.

    Args:
        file: Upload object exposing a binary file-like ``.file`` attribute
            (the ``/translate/`` endpoint passes its ``UploadFile``).

    Returns:
        The text of every page joined with newlines, with leading and
        trailing whitespace stripped.
    """
    pdf_bytes = file.file.read()
    # Use the document as a context manager so PyMuPDF releases the
    # underlying document even if text extraction raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc).strip()
89
+
90
def extract_text_from_docx(file):
    """Return the paragraph text of an uploaded .docx document.

    Args:
        file: Upload object exposing a binary file-like ``.file`` attribute.

    Returns:
        The text of each paragraph joined with newlines, with leading and
        trailing whitespace stripped.
    """
    raw_bytes = file.file.read()
    document = Document(io.BytesIO(raw_bytes))
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    combined = "\n".join(paragraph_texts)
    return combined.strip()
93
+
94
def extract_text_from_pptx(file):
    """Return the text of an uploaded .pptx presentation.

    Args:
        file: Upload object exposing a binary file-like ``.file`` attribute.

    Returns:
        The ``text`` of every shape that has one, in slide order, joined
        with newlines and stripped of leading and trailing whitespace.
    """
    presentation = Presentation(io.BytesIO(file.file.read()))
    shape_texts = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            # Not every shape carries text; skip the ones that do not.
            if hasattr(shape, "text"):
                shape_texts.append(shape.text)
    combined = "\n".join(shape_texts)
    return combined.strip()
97
+
98
def extract_text_from_excel(file):
    """Return the cell contents of an uploaded .xlsx workbook as text.

    Args:
        file: Upload object exposing a binary file-like ``.file`` attribute.

    Returns:
        One line per non-empty cell, visiting worksheets in workbook order
        and rows top to bottom, joined with newlines and stripped of
        leading and trailing whitespace.
    """
    workbook = openpyxl.load_workbook(io.BytesIO(file.file.read()), data_only=True)
    values = [
        str(cell)
        for sheet in workbook.worksheets
        for row in sheet.iter_rows(values_only=True)
        for cell in row
        # Skip only genuinely empty cells. A bare truthiness test would also
        # drop legitimate values such as 0, 0.0 or False.
        if cell is not None and cell != ""
    ]
    return "\n".join(values).strip()
102
+
103
@app.post("/translate/")
async def translate_document(file: UploadFile = File(...), target_lang: str = Form(...)):
    """Translate the text of an uploaded document.

    The text is extracted from a .pdf, .docx, .pptx or .xlsx upload and
    translated with the module-level M2M100 ``model`` / ``tokenizer``.
    The source language is fixed to French.

    Args:
        file: The uploaded document; its format is chosen from the filename
            suffix (case-insensitive).
        target_lang: Target language code as understood by the tokenizer.

    Returns:
        ``{"translated_text": ...}`` on success, otherwise a ``JSONResponse``
        with an ``"error"`` message: 400 for an unsupported format, an empty
        document or an unsupported target language, 500 for a missing model
        or any unexpected failure.
    """
    try:
        logging.info(f"📥 Fichier reçu : {file.filename}")
        logging.info(f"🌍 Langue cible reçue : {target_lang}")

        if model is None or tokenizer is None:
            return JSONResponse(status_code=500, content={"error": "Modèle de traduction non chargé"})

        # The filename can be absent, and users upload ".PDF" as well as
        # ".pdf", so normalise before matching the suffix.
        filename = (file.filename or "").lower()
        extractors = {
            ".pdf": extract_text_from_pdf,
            ".docx": extract_text_from_docx,
            ".pptx": extract_text_from_pptx,
            ".xlsx": extract_text_from_excel,
        }
        extractor = next(
            (extract for suffix, extract in extractors.items() if filename.endswith(suffix)),
            None,
        )
        if extractor is None:
            return JSONResponse(status_code=400, content={"error": "Format non supporté"})

        text = extractor(file)
        logging.info(f"📜 Texte extrait : {text[:50]}...")

        if not text:
            return JSONResponse(status_code=400, content={"error": "Aucun texte trouvé dans le document"})

        # The tokenizer signals an unknown language code with KeyError rather
        # than returning None, so translate that into the 400 response below
        # instead of letting it surface as a generic 500.
        try:
            target_lang_id = tokenizer.get_lang_id(target_lang)
        except KeyError:
            target_lang_id = None

        if target_lang_id is None:
            return JSONResponse(
                status_code=400,
                content={"error": f"Langue cible '{target_lang}' non supportée. Langues disponibles : {list(tokenizer.lang_code_to_id.keys())}"}
            )

        # Source language is hard-coded to French.
        tokenizer.src_lang = "fr"
        # NOTE(review): the whole document is encoded as a single sequence with
        # truncation=True, so text beyond the model's maximum input length is
        # presumably not translated -- confirm whether chunking is needed.
        encoded_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        logging.info(f"🔍 ID de la langue cible : {target_lang_id}")

        # NOTE(review): generate() is a synchronous call inside an async
        # handler; it occupies the event loop while it runs.
        generated_tokens = model.generate(**encoded_text, forced_bos_token_id=target_lang_id)

        translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

        logging.info(f"✅ Traduction réussie : {translated_text[:50]}...")
        return {"translated_text": translated_text}

    except Exception as e:
        # Top-level boundary for the endpoint: record the full traceback and
        # return a generic error to the client.
        logging.exception(f"❌ Erreur lors de la traduction : {e}")
        return JSONResponse(status_code=500, content={"error": "Échec de la traduction"})
155
+
156
  # Serve static files (HTML, CSS, JS) from the "static" directory under the /static path
157
  app.mount("/static", StaticFiles(directory="static", html=True), name="static")
158