app.py
ADDED
@@ -0,0 +1,199 @@
# === Imports ===

from pypdf import PdfReader, PdfWriter
import gradio as gr
import fitz
from PIL import Image
import pandas as pd
import cv2
import numpy as np
import os
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
import difflib
from pdf2image import convert_from_path

# Load TrOCR
processor = TrOCRProcessor.from_pretrained('kazars24/trocr-base-handwritten-ru')
model = VisionEncoderDecoderModel.from_pretrained('kazars24/trocr-base-handwritten-ru')
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# === 1. Find and group line contours ===
def group_lines(contours, img_size, y_tolerance=10, is_horizontal=True):
    # Group contours that lie close together along the relevant axis into a single line
    line_groups = []
    used = [False] * len(contours)

    for i in range(len(contours)):
        if used[i]:
            continue

        group = [contours[i]]
        used[i] = True
        x, y, w, h = cv2.boundingRect(contours[i])

        for j in range(i + 1, len(contours)):
            if used[j]:
                continue
            x2, y2, w2, h2 = cv2.boundingRect(contours[j])
            if is_horizontal:
                if abs(y2 - y) < y_tolerance:
                    group.append(contours[j])
                    used[j] = True
            else:
                if abs(x2 - x) < y_tolerance:
                    group.append(contours[j])
                    used[j] = True

        line_groups.append(group)

    return line_groups

# === 2. Detect table lines and compute cell coordinates ===
def detect_table_lines_and_cells(img, min_cell_size=15):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Horizontal lines
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    detect_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
    horizontal_contours = cv2.findContours(detect_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    horizontal_contours = horizontal_contours[0] if len(horizontal_contours) == 2 else horizontal_contours[1]

    horizontal_line_groups = group_lines(horizontal_contours, img.shape[0], is_horizontal=True)
    horizontal_line_groups.sort(key=lambda g: np.mean([cv2.boundingRect(c)[1] for c in g]))

    horizontal_coords = [int(np.mean([cv2.boundingRect(c)[1] + cv2.boundingRect(c)[3] / 2 for c in group])) for group in horizontal_line_groups[3:]]  # starting from the 4th horizontal line

    # Vertical lines
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    detect_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
    vertical_contours = cv2.findContours(detect_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    vertical_contours = vertical_contours[0] if len(vertical_contours) == 2 else vertical_contours[1]

    vertical_line_groups = group_lines(vertical_contours, img.shape[1], is_horizontal=False)
    vertical_coords = [int(np.mean([cv2.boundingRect(c)[0] + cv2.boundingRect(c)[2] / 2 for c in group])) for group in vertical_line_groups]

    # Find cells
    cells = []
    horizontal_coords = sorted(horizontal_coords)
    vertical_coords = sorted(vertical_coords)

    for row_idx in range(len(horizontal_coords) - 1):
        y1, y2 = horizontal_coords[row_idx], horizontal_coords[row_idx + 1]
        for col_idx in range(len(vertical_coords) - 1):
            x1, x2 = vertical_coords[col_idx], vertical_coords[col_idx + 1]
            w = x2 - x1
            h = y2 - y1
            if w > min_cell_size and h > min_cell_size:
                cells.append({'row': row_idx, 'col': col_idx, 'box': (x1, y1, w, h)})

    return cells

# === 3. Recognize text in a cell ===
def recognize_text(image, max_length=10):
    if image is None:
        return "Не удалось загрузить изображение"
    try:
        inputs = processor(images=image, return_tensors="pt").to(device)
        generated_ids = model.generate(
            **inputs,
            max_length=max_length,
            early_stopping=True,
            num_beams=1,
            use_cache=True
        )
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        print(f"Ошибка распознавания: {e}")
        return "Ошибка распознавания"

# === 4. Crop cells and run OCR over the table ===
def crop_and_recognize_cells(image, cells):
    allowed_words = ["труба", "врезка", "зкл", "отвод", "арм", "переход", "тройник", "заглушка", "зад-ка", "т-т", "комп-р"]

    recognized = {}
    pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    for cell in cells:
        x, y, w, h = cell['box']
        cropped = pil_image.crop((x, y, x + w, y + h))

        text = recognize_text(cropped, max_length=10).strip().lower()

        # Replace all commas with dots
        text = text.replace(',', '.')

        if any(char.isdigit() for char in text):
            # Contains digits: keep as is (commas already replaced)
            final_text = text
        else:
            # Text consists of letters only
            if len(text) <= 2:
                # 1 or 2 characters: treat as noise and replace with an empty string
                final_text = ""
            else:
                # Find the closest word from the allowed vocabulary
                matches = difflib.get_close_matches(text, allowed_words, n=1, cutoff=0.5)
                final_text = matches[0] if matches else text

        recognized[(cell['row'], cell['col'])] = final_text

    return recognized

# === 5. Full table-processing pipeline ===
def process_pdf_table(pdf_path, output_excel='results.xlsx'):
    images = convert_from_path(pdf_path, dpi=300)  # Convert PDF pages to images (only the first page is used)
    if not images:
        print("Ошибка: PDF пустой или не удалось сконвертировать.")
        return

    image = cv2.cvtColor(np.array(images[0]), cv2.COLOR_RGB2BGR)  # Convert PIL -> OpenCV (RGB -> BGR)

    cells = detect_table_lines_and_cells(image, min_cell_size=15)
    print(f"Найдено ячеек: {len(cells)}")

    recognized = crop_and_recognize_cells(image, cells)

    # Assemble the results into a DataFrame
    data = {}
    for (row, col), text in recognized.items():
        data.setdefault(row, {})[col] = text

    max_cols = max((max(cols.keys()) for cols in data.values()), default=0) + 1
    rows = []

    for row_idx in range(max(data.keys(), default=-1) + 1):  # default=-1 guards against an empty result
        row = []
        for col_idx in range(max_cols):
            row.append(data.get(row_idx, {}).get(col_idx, ""))
        rows.append(row)

    df = pd.DataFrame(rows)
    df.to_excel(output_excel, index=False, header=False)
    print(f"Результат сохранён в {output_excel}")
    return output_excel

# === Gradio app ===
def gradio_process(pdf_file, progress=gr.Progress()):
    progress(0, desc="Чтение PDF...")

    # Take the file name without its extension and switch it to .xlsx
    base_name = os.path.splitext(os.path.basename(pdf_file.name))[0]
    output_excel = f"{base_name}.xlsx"

    progress(0.3, desc="Поиск ячеек таблицы...")
    result_file = process_pdf_table(pdf_file.name, output_excel=output_excel)
    progress(1.0, desc="Готово! Таблица сохранена.")
    return result_file

app = gr.Interface(
    fn=gradio_process,
    inputs=gr.File(label="Загрузите PDF файл таблицы"),
    outputs=gr.File(label="Скачайте Excel с распознанными ячейками"),
    title="📄 PDF → Excel распознавание таблиц",
    description="Загрузите PDF-файл с таблицей. Программа найдет ячейки, распознает текст и сохранит результат в Excel-файл.",
    allow_flagging="never"
)

app.launch(share=True)
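For quick local testing, here is a minimal sketch of driving the pipeline without the Gradio UI. It assumes the functions above are available in the current Python session, that poppler is installed for pdf2image, and that the input file exists; the file names "sample_table.pdf" and "sample_table.xlsx" are illustrative and not part of app.py.

# Standalone usage sketch (illustrative; not part of app.py).
# "sample_table.pdf" and "sample_table.xlsx" are hypothetical file names.
excel_path = process_pdf_table("sample_table.pdf", output_excel="sample_table.xlsx")
print(f"Saved to: {excel_path}")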