Spaces:

mabil
/

norus-tool

Sleeping

App Files Files Community

mabil commited on Mar 26

Commit

8c42b67

0 Parent(s):

🚀 Deploy NORUS tool completo su Hugging Face senza binary files

Browse files

Files changed (12) hide show

.gitattributes +1 -0
.gitignore +6 -0
Dockerfile +13 -0
README.md +71 -0
app.py +133 -0
models/similarity_model.py +32 -0
requirements.txt +12 -0
start_local.sh +8 -0
static/css/style.css +200 -0
static/js/script.js +119 -0
templates/NORUS.html +173 -0
templates/app.py +134 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ static/images/norus.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

+__pycache__/
+*.pyc
+.DS_Store
+venv/
+uploads/
+static/images/norus.png

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

+FROM python:3.9
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+CMD ["python", "app.py"]

README.md ADDED Viewed

	@@ -0,0 +1,71 @@

+# NORUS Tool 🧠📄
+**NORUS** (Novelty and Originality Recognition Utility System) è uno strumento basato su intelligenza artificiale che consente l'analisi semantica di articoli scientifici in formato PDF, confrontandoli con articoli locali o pubblicati su PubMed. Il tool restituisce misure di **similarità semantica**, **token overlap** e un indice composito chiamato **OUI (Originality & Uniqueness Index)**.
+## 🚀 Funzionalità principali
+- ✅ Caricamento PDF da analizzare
+- 📂 Confronto con PDF locali o articoli da PubMed
+- 🤖 Estrazione di embedding semantici tramite SciBERT
+- 📊 Calcolo di:
+  - Similarità semantica (cosine similarity)
+  - Sovrapposizione testuale (token overlap)
+  - Indice OUI (originalità e novità)
+- 📈 Visualizzazione interattiva dei risultati via Chart.js
+## 🧪 OUI - Originality & Uniqueness Index
+```math
+OUI = 1 - (\alpha \times \text{semantic\_similarity} + \beta \times \text{token\_overlap})
+```
+- α = 0.7 → penalizza la somiglianza semantica
+- β = 0.3 → penalizza la ripetizione letterale
+- L'OUI misura **quanto un documento è originale**, sia nel contenuto che nella forma.
+## 🧱 Architettura
+- `Flask` come backend web
+- `pdfplumber` per l'estrazione del testo dai PDF
+- `nltk` per preprocessing linguistico
+- `sentence-transformers` con modello `allenai/scibert_scivocab_uncased`
+- `requests` per l'interfaccia con PubMed
+## 📂 Struttura del progetto
+```
+.
+├── app.py
+├── Dockerfile
+├── requirements.txt
+├── static/
+├── templates/
+├── uploads/
+├── README.md
+```
+## ▶️ Esecuzione locale
+Per eseguire localmente:
+1. Assicurati di avere Python 3.9+
+2. Installa le dipendenze:
+```bash
+pip install -r requirements.txt
+```
+3. Avvia l'app:
+```bash
+python app.py
+```
+Apri il browser su `http://localhost:7860`
+## 📡 Deploy su Hugging Face Spaces
+Puoi caricare questo progetto come Space Docker-based su Hugging Face. Il `Dockerfile` è già configurato.
+---
+Developed by Marina Bilotta – Computational Chemistry & AI Research

app.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import os
+import requests
+import pdfplumber
+import numpy as np
+from flask import Flask, render_template, request, redirect, url_for, flash
+from werkzeug.utils import secure_filename
+from sentence_transformers import SentenceTransformer, util
+import nltk
+from nltk.stem import WordNetLemmatizer, PorterStemmer
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+nltk.download("punkt")
+nltk.download("wordnet")
+nltk.download("stopwords")
+lemmatizer = WordNetLemmatizer()
+stemmer = PorterStemmer()
+stop_words = set(stopwords.words("english"))
+app = Flask(__name__)
+app.config["UPLOAD_FOLDER"] = "uploads"
+os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
+model = SentenceTransformer("allenai/scibert_scivocab_uncased")
+def extract_pdf_text(pdf_path):
+    text = ""
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                text += page.extract_text() or " "
+    except Exception as e:
+        print(f"Errore estrazione testo: {e}")
+    return text.lower().strip()
+def preprocess_text(text):
+    text = text.lower()
+    words = word_tokenize(text)
+    words = [stemmer.stem(lemmatizer.lemmatize(w)) for w in words if w.isalnum() and w not in stop_words]
+    return " ".join(words)
+def calculate_token_overlap(text1, text2):
+    tokens1 = set(text1.split())
+    tokens2 = set(text2.split())
+    overlap = len(tokens1 & tokens2)
+    return round((overlap / max(len(tokens1), 1)) * 100, 2)
+def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
+    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
+    return round(max(0, min(oui * 100, 100)), 2)
+def validate_document(pdf_path, comparison_sources, method="local", titles=None):
+    pdf_text = extract_pdf_text(pdf_path)
+    results = []
+    for i, doc in enumerate(comparison_sources):
+        doc_text = extract_pdf_text(doc) if method == "local" else doc
+        similarity = util.pytorch_cos_sim(
+            model.encode(pdf_text, convert_to_tensor=True),
+            model.encode(doc_text, convert_to_tensor=True)
+        ).item() * 100
+        token_overlap = calculate_token_overlap(pdf_text, doc_text)
+        oui = calculate_oui(similarity, token_overlap)
+        title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
+        results.append({"title": title, "similarity": round(similarity, 2), "token_overlap": round(token_overlap, 2), "oui": round(oui, 2)})
+    return results
+def fetch_pubmed_details(article_id):
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
+    try:
+        response = requests.get(base_url, params=params)
+        response.raise_for_status()
+        import xml.etree.ElementTree as ET
+        root = ET.fromstring(response.text)
+        title = root.find(".//ArticleTitle").text if root.find(".//ArticleTitle") is not None else "No Title"
+        abstract = root.find(".//AbstractText").text if root.find(".//AbstractText") is not None else "No Abstract"
+        keywords = root.findall(".//Keyword")
+        keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else "No Keywords"
+        print(f"\n🔍 ARTICOLO RECUPERATO\n📖 Titolo: {title}\n📝 Abstract: {abstract[:500]}...\n🔑 Keywords: {keyword_text}\n")
+        return title, f"{abstract} {keyword_text}"
+    except requests.exceptions.RequestException as e:
+        print(f"Errore recupero abstract: {e}")
+        return "No Title", "No Abstract"
+def fetch_pubmed(query, year_start, year_end, max_results=10):
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+    params = {"db": "pubmed", "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])", "retmax": max_results, "retmode": "json"}
+    try:
+        response = requests.get(base_url, params=params)
+        response.raise_for_status()
+        return response.json().get("esearchresult", {}).get("idlist", [])
+    except requests.exceptions.RequestException as e:
+        print(f"Errore recupero articoli PubMed: {e}")
+        return []
+@app.route("/")
+def index():
+    return render_template("NORUS.html")
+@app.route("/validate", methods=["POST"])
+def validate():
+    pdf_file = request.files.get("pdf_file")
+    analysis_type = request.form.get("analysis_type")
+    local_dir = request.form.get("local_directory", "").strip()
+    query = request.form.get("query", "").strip()
+    if not pdf_file:
+        flash("Carica un file PDF valido.", "error")
+        return redirect(url_for("index"))
+    filename = secure_filename(pdf_file.filename)
+    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
+    pdf_file.save(pdf_path)
+    results = []
+    if analysis_type == "local":
+        if not os.path.isdir(local_dir):
+            flash("Seleziona una directory valida.", "error")
+            return redirect(url_for("index"))
+        comparison_files = [os.path.join(local_dir, f) for f in os.listdir(local_dir) if f.endswith(".pdf")]
+        if not comparison_files:
+            flash("La directory non contiene PDF.", "error")
+            return redirect(url_for("index"))
+        results = validate_document(pdf_path, comparison_files, method="local")
+    elif analysis_type == "pubmed":
+        year_start = request.form.get("year_start", "2000")
+        year_end = request.form.get("year_end", "2025")
+        num_articles = int(request.form.get("num_articles", "10"))
+        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
+        pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
+        results = validate_document(pdf_path, [result[1] for result in pubmed_results], method="pubmed", titles=[result[0] for result in pubmed_results])
+    return render_template("NORUS.html", results=results)
+if __name__ == "__main__":
+    app.run(debug=True, host="0.0.0.0", port=7860)

models/similarity_model.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+def compute_similarity(text1, text2):
+    """
+    Calcola la similarità del coseno tra due testi usando TF-IDF.
+    Parametri:
+        text1 (str): Primo testo.
+        text2 (str): Secondo testo.
+    Ritorna:
+        float: Valore di similarità (0 a 1).
+    """
+    try:
+        # Verifica che i testi non siano vuoti
+        if not text1.strip() or not text2.strip():
+            raise ValueError("Uno o entrambi i testi sono vuoti.")
+        # Vettorizzazione con TF-IDF
+        vectorizer = TfidfVectorizer(stop_words='english')
+        tfidf_matrix = vectorizer.fit_transform([text1, text2])
+        # Calcolo della similarità del coseno
+        similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
+        return similarity_matrix[0][0]  # Ritorna il valore della similarità
+    except Exception as e:
+        print(f"Errore durante il calcolo della similarità: {e}")
+        return None

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+flask
+pdfplumber
+nltk
+sentence-transformers
+scikit-learn
+pandas
+reportlab
+matplotlib
+requests
+keybert
+torch
+transformers

start_local.sh ADDED Viewed

	@@ -0,0 +1,8 @@

+#!/bin/bash
+echo "⚙️ Avvio dell'ambiente NORUS..."
+python3 -m venv venv
+source venv/bin/activate
+pip install --upgrade pip
+pip install -r requirements.txt
+echo "✅ Ambiente pronto. Avvio del server Flask..."
+python app.py

static/css/style.css ADDED Viewed

	@@ -0,0 +1,200 @@

+/* Generale */
+html, body {
+    height: 100%;
+    margin: 0;
+    padding: 0;
+    overflow-y: auto; /* Permette lo scrolling */
+}
+body {
+    font-family: Arial, sans-serif;
+    background-color: #f8f8f8;
+    color: #333;
+    display: flex;
+    flex-direction: column;
+    min-height: 100vh;
+}
+/* Header */
+header {
+    background-color: rgba(42, 77, 111, 0.8); /* Aggiunge trasparenza */
+    color: #fff;
+    padding: 20px;
+    text-align: center;
+    position: relative;
+}
+header h1 {
+    margin-bottom: 10px;
+    font-size: 2em;
+}
+header p {
+    font-size: 1.2em;
+    margin-top: 0;
+}
+/* Logo */
+#logo {
+    max-width: 200px;
+    cursor: pointer;
+    transition: transform 0.3s ease;
+}
+#logo:hover {
+    transform: scale(1.2);
+}
+/* Container principale */
+.container {
+    flex: 1;
+    width: 100%;
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    padding: 20px;
+}
+/* Form */
+form {
+    margin: 20px auto;
+    text-align: center;
+    width: 80%;
+    max-width: 800px;
+    padding: 25px;
+    background-color: #fff;
+    border-radius: 10px;
+    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
+    overflow-y: auto; /* Permette scrolling interno se necessario */
+    max-height: 90vh; /* Impedisce che il form esca dallo schermo */
+}
+/* Etichette e campi input */
+label {
+    display: block;
+    font-size: 1.1em;
+    margin-bottom: 8px;
+    font-weight: bold;
+}
+input, select {
+    width: 100%;
+    padding: 10px;
+    margin-bottom: 15px;
+    border: 1px solid #ccc;
+    border-radius: 5px;
+    font-size: 1.1em;
+}
+/* Pulsanti */
+button {
+    width: 100%;
+    background-color: #2a4d6f;
+    color: #fff;
+    padding: 12px 20px;
+    border: none;
+    border-radius: 5px;
+    cursor: pointer;
+    font-size: 1.2em;
+    transition: background-color 0.3s;
+    margin-top: 10px;
+}
+button:hover {
+    background-color: #1a3d56;
+}
+/* Sezione Risultati */
+.results {
+    padding: 25px;
+    background-color: #fff;
+    margin: 30px auto;
+    border-radius: 10px;
+    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
+    max-width: 1000px;
+    overflow-y: auto;
+    max-height: 80vh;
+}
+/* Tabelle */
+table {
+    width: 100%;
+    border-collapse: collapse;
+    margin-top: 25px;
+}
+th {
+    background-color: #2a4d6f;
+    color: #fff;
+    padding: 12px;
+    text-align: left;
+}
+td {
+    padding: 12px;
+    border-bottom: 1px solid #ddd;
+    background-color: #f8f9fa;
+}
+table tr:hover {
+    background-color: #f1f1f1;
+}
+table th, table td {
+    font-size: 1.1em;
+}
+/* Contenitore del grafico */
+#chart-container {
+    width: 90%;
+    max-width: 1000px;
+    height: 600px;
+    margin: 40px auto;
+    padding-bottom: 20px;
+}
+canvas {
+    width: 100% !important;
+    height: 100% !important;
+    display: block;
+}
+#logo {
+    display: block; /* Assicura che il logo sia trattato come un blocco */
+    margin: 0 auto; /* Lo centra orizzontalmente */
+    max-width: 200px;
+    height: auto;
+    cursor: pointer;
+    transition: transform 0.3s ease;
+    position: relative; /* Evita sovrapposizioni strane */
+    z-index: 10; /* Assicura che sia sopra altri elementi */
+}
+/* Barra di caricamento */
+#progress-container {
+    width: 100%;
+    background-color: #f3f3f3;
+    border-radius: 25px;
+    overflow: hidden;
+    margin: 20px 0;
+}
+#progress-bar {
+    height: 20px;
+    width: 0;
+    background-color: #4caf50;
+    text-align: center;
+    line-height: 20px;
+    color: white;
+}
+/* Footer */
+footer {
+    background-color: #2a4d6f;
+    color: #fff;
+    text-align: center;
+    padding: 15px;
+    width: 100%;
+    font-size: 1.1em;
+    margin-top: auto; /* Il footer si posiziona in fondo alla pagina */
+}

static/js/script.js ADDED Viewed

	@@ -0,0 +1,119 @@

+document.addEventListener("DOMContentLoaded", function () {
+    // Interazione con il logo
+    const logoLink = document.getElementById("logo-link");
+    if (logoLink) {
+        logoLink.addEventListener("click", function () {
+            const logo = document.getElementById("logo");
+            logo.style.transform = "scale(1.5)";
+            setTimeout(() => {
+                logo.style.transform = "scale(1)";
+            }, 500);
+        });
+    }
+    // Barra di avanzamento durante l'analisi
+    function startProgress() {
+        const progressBar = document.getElementById("progress-bar");
+        const progressContainer = document.getElementById("progress-container");
+        if (progressBar && progressContainer) {
+            progressContainer.style.display = "block";
+            let width = 0;
+            const interval = setInterval(() => {
+                if (width >= 100) {
+                    clearInterval(interval);
+                } else {
+                    width++;
+                    progressBar.style.width = width + "%";
+                    progressBar.textContent = width + "%";
+                }
+            }, 100);
+        }
+    }
+    // Inizializza il form per avviare la progress bar
+    const analysisForm = document.getElementById("analysisForm");
+    if (analysisForm) {
+        analysisForm.addEventListener("submit", function () {
+            startProgress();
+        });
+    }
+    // Configurazione e gestione dei grafici di Similarità, Token Overlap e OUI
+    const chartCanvas = document.getElementById("similarityChart");
+    if (chartCanvas) {
+        const ctx = chartCanvas.getContext("2d");
+        new Chart(ctx, {
+            type: "bar",
+            data: {
+                labels: {{ results | map(attribute='title') | list | safe }},
+                datasets: [
+                    {
+                        label: "Semantic Similarity (%)",
+                        data: {{ results | map(attribute='similarity') | list | safe }},
+                        backgroundColor: "rgba(54, 162, 235, 0.7)",
+                        borderColor: "rgba(54, 162, 235, 1)",
+                        borderWidth: 1
+                    },
+                    {
+                        label: "Token Overlap (%)",
+                        data: {{ results | map(attribute='token_overlap') | list | safe }},
+                        backgroundColor: "rgba(255, 159, 64, 0.7)",
+                        borderColor: "rgba(255, 159, 64, 1)",
+                        borderWidth: 1
+                    },
+                    {
+                        label: "OUI (%)",
+                        data: {{ results | map(attribute='oui') | list | safe }},
+                        backgroundColor: "rgba(153, 102, 255, 0.7)",
+                        borderColor: "rgba(153, 102, 255, 1)",
+                        borderWidth: 1
+                    }
+                ]
+            },
+            options: {
+                responsive: true,
+                maintainAspectRatio: false,
+                plugins: {
+                    legend: { position: "top" },
+                    tooltip: { mode: "index", intersect: false }
+                },
+                scales: {
+                    y: { beginAtZero: true },
+                    x: {
+                        ticks: { autoSkip: false, maxRotation: 45, minRotation: 45 }
+                    }
+                }
+            }
+        });
+    }
+    // Gestione del caricamento del file
+    const fileInput = document.getElementById("pdf_file");
+    if (fileInput) {
+        fileInput.addEventListener("change", function () {
+            const fileLabel = document.querySelector('label[for="pdf_file"]');
+            if (fileInput.files.length > 0 && fileLabel) {
+                fileLabel.textContent = `File selected: ${fileInput.files[0].name}`;
+            }
+        });
+    }
+    // Miglioramento dell'usabilità dei messaggi di errore
+    const flashMessages = document.querySelectorAll(".error");
+    if (flashMessages.length > 0) {
+        setTimeout(() => {
+            flashMessages.forEach(message => message.remove());
+        }, 5000);
+    }
+    // Permettere la scelta tra analisi locale e PubMed
+    const analysisType = document.getElementById("analysis_type");
+    if (analysisType) {
+        analysisType.addEventListener("change", function () {
+            document.getElementById("pubmed-options").style.display =
+                this.value === "pubmed" ? "block" : "none";
+            document.getElementById("local-options").style.display =
+                this.value === "local" ? "block" : "none";
+        });
+    }
+});

templates/NORUS.html ADDED Viewed

	@@ -0,0 +1,173 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>NORUS Tool</title>
+    <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
+    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
+    <script src="{{ url_for('static', filename='js/script.js') }}"></script>
+</head>
+<body>
+    <header>
+        <div style="text-align: center; margin-top: 20px;">
+            <a href="#" id="logo-link">
+                <img src="{{ url_for('static', filename='images/NORUS.PNG') }}" alt="NORUS Logo" id="logo" style="max-width: 200px; height: auto;">
+            </a>
+        </div>
+        <h1>NORUS Tool</h1>
+        <p>Analyze your PDF and discover originality and similarity</p>
+    </header>
+    <main>
+        <form action="/validate" method="POST" enctype="multipart/form-data" onsubmit="startProgress()">
+            <label for="analysis_type">Choose Analysis Type:</label>
+            <select name="analysis_type" id="analysis_type" required>
+                <option value="local">Local Directory</option>
+                <option value="pubmed">PubMed Search</option>
+            </select>
+            <div id="pubmed-options" style="display: none;">
+                <label for="query">PubMed Query:</label>
+                <input type="text" name="query" id="query">
+                <label for="year_start">Start Year:</label>
+                <input type="number" name="year_start" id="year_start" min="1900" max="2025" value="2000">
+                <label for="year_end">End Year:</label>
+                <input type="number" name="year_end" id="year_end" min="1900" max="2025" value="2025">
+                <label for="num_articles">Number of Articles:</label>
+                <input type="number" name="num_articles" id="num_articles" min="1" value="10">
+            </div>
+            <div id="local-options" style="display: none;">
+                <label for="local_directory">Select Local Directory:</label>
+                <input type="text" name="local_directory" id="local_directory" placeholder="Enter directory path">
+            </div>
+            <label for="pdf_file">Upload PDF:</label>
+            <input type="file" name="pdf_file" id="pdf_file" required>
+            <button type="submit">Analyze</button>
+        </form>
+        <div id="progress-container" style="display: none;">
+            <div id="progress-bar">0%</div>
+        </div>
+        {% if results %}
+        <section>
+            <h2>Analysis Results</h2>
+            <table>
+                <thead>
+                    <tr>
+                        <th>Title</th>
+                        <th>Semantic Similarity (%)</th>
+                        <th>Token Overlap (%)</th>
+                        <th>OUI (Originality & Uniqueness Index)</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    {# One table row per compared document. All three metrics are already percentages (0-100), so they are printed without further scaling. #}
+                    {% for result in results %}
+                    <tr>
+                        <td style="max-width: 400px; word-wrap: break-word;">{{ result.title }}</td>
+                        <td>{{ "%.2f"|format(result.similarity) }}</td>
+                        <td>{{ "%.2f"|format(result.token_overlap) }}</td>
+                        <td>{{ "%.2f"|format(result.oui) }}</td>
+                    </tr>
+                    {% endfor %}
+                </tbody>
+            </table>
+            <div id="chart-container" style="margin-top: 50px;">
+                <canvas id="similarityChart"></canvas>
+            </div>
+            <script>
+                // tojson emits valid, script-safe JavaScript literals; "| list | safe"
+                // printed the Python repr unescaped, which breaks on values such as
+                // None and lets a title containing "</script>" inject markup.
+                const labels = {{ results | map(attribute='title') | list | tojson }};
+                const semanticData = {{ results | map(attribute='similarity') | list | tojson }};
+                const tokenData = {{ results | map(attribute='token_overlap') | list | tojson }};
+                // OUI values are already percentages (0-100); do not rescale them.
+                const ouiData = {{ results | map(attribute='oui') | list | tojson }};
+                new Chart(document.getElementById('similarityChart'), {
+                    type: 'bar',
+                    data: {
+                        labels: labels,
+                        datasets: [
+                            {
+                                label: 'Semantic Similarity (%)',
+                                data: semanticData,
+                                backgroundColor: 'rgba(54, 162, 235, 0.7)',
+                                borderColor: 'rgba(54, 162, 235, 1)',
+                                borderWidth: 1
+                            },
+                            {
+                                label: 'Token Overlap (%)',
+                                data: tokenData,
+                                backgroundColor: 'rgba(255, 159, 64, 0.7)',
+                                borderColor: 'rgba(255, 159, 64, 1)',
+                                borderWidth: 1
+                            },
+                            {
+                                label: 'OUI (%)',
+                                data: ouiData,
+                                backgroundColor: 'rgba(153, 102, 255, 0.7)',
+                                borderColor: 'rgba(153, 102, 255, 1)',
+                                borderWidth: 1
+                            }
+                        ]
+                    },
+                    options: {
+                        responsive: true,
+                        maintainAspectRatio: false,
+                        plugins: {
+                            legend: {
+                                position: 'top',
+                            },
+                            tooltip: {
+                                mode: 'index',
+                                intersect: false
+                            }
+                        },
+                        scales: {
+                            y: { beginAtZero: true },
+                            x: {
+                                ticks: {
+                                    autoSkip: false,
+                                    maxRotation: 45,
+                                    minRotation: 45
+                                }
+                            }
+                        }
+                    }
+                });
+            </script>
+        </section>
+        {% endif %}
+    </main>
+    <footer>
+        <p>&copy; 2025 NORUS Tool. All rights reserved.</p>
+    </footer>
+    <script>
+        document.addEventListener("DOMContentLoaded", function() {
+            const analysisType = document.getElementById("analysis_type");
+            const pubmedOptions = document.getElementById("pubmed-options");
+            const localOptions = document.getElementById("local-options");
+            function toggleOptions() {
+                if (analysisType.value === "pubmed") {
+                    pubmedOptions.style.display = "block";
+                    localOptions.style.display = "none";
+                } else {
+                    pubmedOptions.style.display = "none";
+                    localOptions.style.display = "block";
+                }
+            }
+            analysisType.addEventListener("change", toggleOptions);
+            toggleOptions();
+        });
+    </script>
+</body>
+</html>

templates/app.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import os
+import requests
+import pdfplumber
+import numpy as np
+from flask import Flask, render_template, request, redirect, url_for, flash
+from werkzeug.utils import secure_filename
+from sentence_transformers import SentenceTransformer, util
+import nltk
+from nltk.stem import WordNetLemmatizer, PorterStemmer
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+nltk.download("punkt")
+nltk.download("wordnet")
+nltk.download("stopwords")
+lemmatizer = WordNetLemmatizer()
+stemmer = PorterStemmer()
+stop_words = set(stopwords.words("english"))
+app = Flask(__name__)
+app.config["UPLOAD_FOLDER"] = "uploads"
+os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
+model = SentenceTransformer("allenai/scibert_scivocab_uncased")
+def extract_pdf_text(pdf_path):
+    text = ""
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                text += page.extract_text() or " "
+    except Exception as e:
+        print(f"Errore estrazione testo: {e}")
+    return text.lower().strip()
+def preprocess_text(text):
+    text = text.lower()
+    words = word_tokenize(text)
+    words = [stemmer.stem(lemmatizer.lemmatize(w)) for w in words if w.isalnum() and w not in stop_words]
+    return " ".join(words)
+def calculate_token_overlap(text1, text2):
+    tokens1 = set(text1.split())
+    tokens2 = set(text2.split())
+    overlap = len(tokens1 & tokens2)
+    return round((overlap / max(len(tokens1), 1)) * 100, 2)
+def calculate_oui(similarity, token_overlap, alpha=0.7, beta=0.3):
+    oui = alpha * (1 - similarity / 100) + beta * (1 - token_overlap / 100)
+    return round(max(0, min(oui * 100, 100)), 2)
+def validate_document(pdf_path, comparison_sources, method="local", titles=None):
+    pdf_text = extract_pdf_text(pdf_path)
+    results = []
+    for i, doc in enumerate(comparison_sources):
+        doc_text = extract_pdf_text(doc) if method == "local" else doc
+        similarity = util.pytorch_cos_sim(
+            model.encode(pdf_text, convert_to_tensor=True),
+            model.encode(doc_text, convert_to_tensor=True)
+        ).item() * 100
+        token_overlap = calculate_token_overlap(pdf_text, doc_text)
+        oui = calculate_oui(similarity, token_overlap)
+        title = titles[i] if titles and i < len(titles) else os.path.basename(doc) if method == "local" else "Unknown Title"
+        results.append({"title": title, "similarity": round(similarity, 2), "token_overlap": round(token_overlap, 2), "oui": round(oui, 2)})
+    return results
+def fetch_pubmed_details(article_id):
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    params = {"db": "pubmed", "id": article_id, "retmode": "xml"}
+    try:
+        response = requests.get(base_url, params=params)
+        response.raise_for_status()
+        import xml.etree.ElementTree as ET
+        root = ET.fromstring(response.text)
+        title = root.find(".//ArticleTitle").text if root.find(".//ArticleTitle") is not None else "No Title"
+        abstract = root.find(".//AbstractText").text if root.find(".//AbstractText") is not None else "No Abstract"
+        keywords = root.findall(".//Keyword")
+        keyword_text = " ".join([kw.text for kw in keywords if kw.text]) if keywords else "No Keywords"
+        print(f"\n🔍 ARTICOLO RECUPERATO\n📖 Titolo: {title}\n📝 Abstract: {abstract[:500]}...\n🔑 Keywords: {keyword_text}\n")
+        return title, f"{abstract} {keyword_text}"
+    except requests.exceptions.RequestException as e:
+        print(f"Errore recupero abstract: {e}")
+        return "No Title", "No Abstract"
+def fetch_pubmed(query, year_start, year_end, max_results=10):
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+    params = {"db": "pubmed", "term": f"{query} AND ({year_start}[PDAT] : {year_end}[PDAT])", "retmax": max_results, "retmode": "json"}
+    try:
+        response = requests.get(base_url, params=params)
+        response.raise_for_status()
+        return response.json().get("esearchresult", {}).get("idlist", [])
+    except requests.exceptions.RequestException as e:
+        print(f"Errore recupero articoli PubMed: {e}")
+        return []
+@app.route("/")
+def index():
+    return render_template("NORUS.html")
+@app.route("/validate", methods=["POST"])
+def validate():
+    pdf_file = request.files.get("pdf_file")
+    analysis_type = request.form.get("analysis_type")
+    local_dir = request.form.get("local_directory", "").strip()
+    query = request.form.get("query", "").strip()
+    if not pdf_file:
+        flash("Carica un file PDF valido.", "error")
+        return redirect(url_for("index"))
+    filename = secure_filename(pdf_file.filename)
+    pdf_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
+    pdf_file.save(pdf_path)
+    results = []
+    if analysis_type == "local":
+        if not os.path.isdir(local_dir):
+            flash("Seleziona una directory valida.", "error")
+            return redirect(url_for("index"))
+        comparison_files = [os.path.join(local_dir, f) for f in os.listdir(local_dir) if f.endswith(".pdf")]
+        if not comparison_files:
+            flash("La directory non contiene PDF.", "error")
+            return redirect(url_for("index"))
+        results = validate_document(pdf_path, comparison_files, method="local")
+    elif analysis_type == "pubmed":
+        year_start = request.form.get("year_start", "2000")
+        year_end = request.form.get("year_end", "2025")
+        num_articles = int(request.form.get("num_articles", "10"))
+        pubmed_ids = fetch_pubmed(query, year_start, year_end, num_articles)
+        pubmed_results = [fetch_pubmed_details(article_id) for article_id in pubmed_ids]
+        results = validate_document(pdf_path, [result[1] for result in pubmed_results], method="pubmed", titles=[result[0] for result in pubmed_results])
+    return render_template("NORUS.html", results=results)
+if __name__ == "__main__":
+    app.run(debug=True, port=7860)