ApsidalSolid4 committed
Commit 0c3e4c0 · verified · 1 Parent(s): c2529af

Update app.py

Files changed (1):
  1. app.py +193 -202

app.py CHANGED
@@ -18,6 +18,10 @@ from openpyxl.utils import get_column_letter
  from io import BytesIO
  import base64
  import hashlib
+ import requests
+ import tempfile
+ from pathlib import Path
+ import mimetypes

  # Configure logging
  logging.basicConfig(level=logging.INFO)
@@ -32,6 +36,17 @@ CONFIDENCE_THRESHOLD = 0.65
  BATCH_SIZE = 8 # Reduced batch size for CPU
  MAX_WORKERS = 4 # Number of worker threads for processing

+ # IMPORTANT: Set PyTorch thread configuration at the module level
+ # before any parallel work starts
+ if not torch.cuda.is_available():
+     # Set thread configuration only once at the beginning
+     torch.set_num_threads(MAX_WORKERS)
+     try:
+         # Only set interop threads if it hasn't been set already
+         torch.set_num_interop_threads(MAX_WORKERS)
+     except RuntimeError as e:
+         logger.warning(f"Could not set interop threads: {str(e)}")
+
  # Get password hash from environment variable (more secure)
  ADMIN_PASSWORD_HASH = os.environ.get('ADMIN_PASSWORD_HASH')
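A minimal sketch (not from this commit) of the failure the new try/except absorbs: PyTorch allows torch.set_num_interop_threads to be called only once, before any inter-op parallel work has run, so a repeated call raises RuntimeError on typical builds.

    import torch

    torch.set_num_interop_threads(4)      # fine: nothing has run yet and it was not set before
    try:
        torch.set_num_interop_threads(4)  # a second call raises RuntimeError on typical builds
    except RuntimeError as e:
        print(f"Could not set interop threads: {e}")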
 
@@ -41,17 +56,6 @@ if not ADMIN_PASSWORD_HASH:
  # Excel file path for logs
  EXCEL_LOG_PATH = "/tmp/prediction_logs.xlsx"

-
- import requests
- import base64
- import os
- import tempfile
- from typing import Dict, List, Optional, Union, Tuple
- import mimetypes
- import logging
- import time
- from pathlib import Path
-
  # OCR API settings
  OCR_API_KEY = "9e11346f1288957" # This is a partial key - replace with the full one
  OCR_API_ENDPOINT = "https://api.ocr.space/parse/image"
@@ -205,172 +209,6 @@ class OCRProcessor:
          return mime_type


- # Function to be integrated with the main application
- def handle_file_upload_and_analyze(file_obj, mode: str, classifier) -> tuple:
-     """
-     Handle file upload, OCR processing, and text analysis
-
-     Args:
-         file_obj: Uploaded file object from Gradio
-         mode: Analysis mode (quick or detailed)
-         classifier: The TextClassifier instance
-
-     Returns:
-         Analysis results as a tuple (same format as original analyze_text function)
-     """
-     if file_obj is None:
-         return (
-             "No file uploaded",
-             "Please upload a file to analyze",
-             "No file uploaded for analysis"
-         )
-
-     # Create a temporary file
-     with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file_obj.name).suffix) as temp_file:
-         temp_file_path = temp_file.name
-         # Write uploaded file to the temporary file
-         temp_file.write(file_obj.read())
-
-     try:
-         # Process the file with OCR
-         ocr_processor = OCRProcessor()
-         ocr_result = ocr_processor.process_file(temp_file_path)
-
-         if not ocr_result["success"]:
-             return (
-                 "OCR Processing Error",
-                 ocr_result["error"],
-                 "Failed to extract text from the uploaded file"
-             )
-
-         # Get the extracted text
-         extracted_text = ocr_result["text"]
-
-         # If no text was extracted
-         if not extracted_text.strip():
-             return (
-                 "No text extracted",
-                 "The OCR process did not extract any text from the uploaded file.",
-                 "No text was found in the uploaded file"
-             )
-
-         # Call the original text analysis function with the extracted text
-         return analyze_text(extracted_text, mode, classifier)
-
-     finally:
-         # Clean up the temporary file
-         if os.path.exists(temp_file_path):
-             os.remove(temp_file_path)
-
-
- # Modified Gradio interface setup function to include file upload
- def setup_gradio_interface(classifier):
-     """
-     Set up Gradio interface with text input and file upload options
-
-     Args:
-         classifier: The TextClassifier instance
-
-     Returns:
-         Gradio Interface object
-     """
-     import gradio as gr
-
-     with gr.Blocks(title="AI Text Detector") as demo:
-         gr.Markdown("# AI Text Detector with Document Upload")
-         gr.Markdown("Analyze text to detect if it was written by a human or AI. You can paste text directly or upload images, PDFs, or Word documents.")
-
-         with gr.Tab("Text Input"):
-             text_input = gr.Textbox(
-                 lines=8,
-                 placeholder="Enter text to analyze...",
-                 label="Input Text"
-             )
-
-             mode_selection = gr.Radio(
-                 choices=["quick", "detailed"],
-                 value="quick",
-                 label="Analysis Mode",
-                 info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
-             )
-
-             text_submit_button = gr.Button("Analyze Text")
-
-             output_html = gr.HTML(label="Highlighted Analysis")
-             output_sentences = gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10)
-             output_result = gr.Textbox(label="Overall Result", lines=4)
-
-             text_submit_button.click(
-                 analyze_text,
-                 inputs=[text_input, mode_selection, classifier],
-                 outputs=[output_html, output_sentences, output_result]
-             )
-
-         with gr.Tab("File Upload"):
-             file_upload = gr.File(
-                 label="Upload Document",
-                 file_types=["image", "pdf", "doc", "docx"],
-                 type="file"
-             )
-
-             file_mode_selection = gr.Radio(
-                 choices=["quick", "detailed"],
-                 value="quick",
-                 label="Analysis Mode",
-                 info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
-             )
-
-             upload_submit_button = gr.Button("Process and Analyze")
-
-             file_output_html = gr.HTML(label="Highlighted Analysis")
-             file_output_sentences = gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10)
-             file_output_result = gr.Textbox(label="Overall Result", lines=4)
-
-             upload_submit_button.click(
-                 handle_file_upload_and_analyze,
-                 inputs=[file_upload, file_mode_selection, classifier],
-                 outputs=[file_output_html, file_output_sentences, file_output_result]
-             )
-
-         gr.Markdown("""
-         ### File Upload Limitations
-         - Maximum file size: 1MB
-         - PDF files: Maximum 3 pages (OCR.space API limitation)
-         - Supported formats: Images (PNG, JPG, GIF), PDF, Word documents (DOCX, DOC)
-         """)
-
-     return demo
-
-
- # This function is a replacement for the original main app setup
- def setup_app_with_ocr():
-     """
-     Setup the application with OCR capabilities
-     """
-     # Initialize the classifier (use existing code)
-     classifier = TextClassifier()
-
-     # Create the Gradio interface with file upload functionality
-     demo = setup_gradio_interface(classifier)
-
-     # Get the FastAPI app from Gradio
-     app = demo.app
-
-     # Add CORS middleware (same as original code)
-     from fastapi.middleware.cors import CORSMiddleware
-     app.add_middleware(
-         CORSMiddleware,
-         allow_origins=["*"], # For development
-         allow_credentials=True,
-         allow_methods=["GET", "POST", "OPTIONS"],
-         allow_headers=["*"],
-     )
-
-     # Return the demo for launching
-     return demo
-
-
  def is_admin_password(input_text: str) -> bool:
      """
      Check if the input text matches the admin password using secure hash comparison.
@@ -382,6 +220,7 @@ def is_admin_password(input_text: str) -> bool:
      # Compare hashes (constant-time comparison to prevent timing attacks)
      return input_hash == ADMIN_PASSWORD_HASH

+
  class TextWindowProcessor:
      def __init__(self):
          try:
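For reference, ADMIN_PASSWORD_HASH is read from the environment earlier in the file. A hypothetical way to generate a value for it, assuming app.py hashes the input with a SHA-256 hexdigest (the actual hashing code sits above this hunk and is not shown in the diff, so verify the scheme before relying on this):

    import hashlib

    # Hypothetical helper, assuming a SHA-256 hexdigest scheme.
    password = "choose-a-strong-admin-password"  # placeholder value
    print(hashlib.sha256(password.encode("utf-8")).hexdigest())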
@@ -433,13 +272,10 @@ class TextWindowProcessor:

          return windows, window_sentence_indices

+
  class TextClassifier:
      def __init__(self):
-         # Set thread configuration before any model loading or parallel work
-         if not torch.cuda.is_available():
-             torch.set_num_threads(MAX_WORKERS)
-             torch.set_num_interop_threads(MAX_WORKERS)
-
+         # FIXED: Removed the thread configuration here, as it's now at the module level
          self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
          self.model_name = MODEL_NAME
          self.tokenizer = None
@@ -583,7 +419,7 @@ class TextClassifier:
          for window_idx, indices in enumerate(batch_indices):
              center_idx = len(indices) // 2
              center_weight = 0.7 # Higher weight for center sentence
-             edge_weight = 0.3 / (len(indices) - 1) # Distribute remaining weight
+             edge_weight = 0.3 / (len(indices) - 1) if len(indices) > 1 else 0 # Distribute remaining weight

              for pos, sent_idx in enumerate(indices):
                  # Apply higher weight to center sentence
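A small worked example (illustrative only) of what the added guard changes:

    indices = [3, 4, 5]   # window covering three sentences
    edge_weight = 0.3 / (len(indices) - 1) if len(indices) > 1 else 0
    # 0.15: the two non-center sentences share the remaining 0.3

    indices = [4]         # window covering a single sentence
    edge_weight = 0.3 / (len(indices) - 1) if len(indices) > 1 else 0
    # 0: the old expression would raise ZeroDivisionError here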
@@ -606,10 +442,10 @@ class TextClassifier:

              # Apply minimal smoothing at prediction boundaries
              if i > 0 and i < len(sentences) - 1:
-                 prev_human = sentence_scores[i-1]['human_prob'] / sentence_appearances[i-1]
-                 prev_ai = sentence_scores[i-1]['ai_prob'] / sentence_appearances[i-1]
-                 next_human = sentence_scores[i+1]['human_prob'] / sentence_appearances[i+1]
-                 next_ai = sentence_scores[i+1]['ai_prob'] / sentence_appearances[i+1]
+                 prev_human = sentence_scores[i-1]['human_prob'] / max(sentence_appearances[i-1], 1e-10)
+                 prev_ai = sentence_scores[i-1]['ai_prob'] / max(sentence_appearances[i-1], 1e-10)
+                 next_human = sentence_scores[i+1]['human_prob'] / max(sentence_appearances[i+1], 1e-10)
+                 next_ai = sentence_scores[i+1]['ai_prob'] / max(sentence_appearances[i+1], 1e-10)

                  # Check if we're at a prediction boundary
                  current_pred = 'human' if human_prob > ai_prob else 'ai'
@@ -684,6 +520,65 @@ class TextClassifier:
              'num_sentences': num_sentences
          }

+
+ # Function to handle file upload, OCR processing, and text analysis
+ def handle_file_upload_and_analyze(file_obj, mode: str, classifier) -> tuple:
+     """
+     Handle file upload, OCR processing, and text analysis
+
+     Args:
+         file_obj: Uploaded file object from Gradio
+         mode: Analysis mode (quick or detailed)
+         classifier: The TextClassifier instance
+
+     Returns:
+         Analysis results as a tuple (same format as original analyze_text function)
+     """
+     if file_obj is None:
+         return (
+             "No file uploaded",
+             "Please upload a file to analyze",
+             "No file uploaded for analysis"
+         )
+
+     # Create a temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file_obj.name).suffix) as temp_file:
+         temp_file_path = temp_file.name
+         # Write uploaded file to the temporary file
+         temp_file.write(file_obj.read())
+
+     try:
+         # Process the file with OCR
+         ocr_processor = OCRProcessor()
+         ocr_result = ocr_processor.process_file(temp_file_path)
+
+         if not ocr_result["success"]:
+             return (
+                 "OCR Processing Error",
+                 ocr_result["error"],
+                 "Failed to extract text from the uploaded file"
+             )
+
+         # Get the extracted text
+         extracted_text = ocr_result["text"]
+
+         # If no text was extracted
+         if not extracted_text.strip():
+             return (
+                 "No text extracted",
+                 "The OCR process did not extract any text from the uploaded file.",
+                 "No text was found in the uploaded file"
+             )
+
+         # Call the original text analysis function with the extracted text
+         return analyze_text(extracted_text, mode, classifier)
+
+     finally:
+         # Clean up the temporary file
+         if os.path.exists(temp_file_path):
+             os.remove(temp_file_path)
+
+
  def initialize_excel_log():
      """Initialize the Excel log file if it doesn't exist."""
      if not os.path.exists(EXCEL_LOG_PATH):
@@ -711,6 +606,7 @@ def initialize_excel_log():
          wb.save(EXCEL_LOG_PATH)
          logger.info(f"Initialized Excel log file at {EXCEL_LOG_PATH}")

+
  def log_prediction_data(input_text, word_count, prediction, confidence, execution_time, mode):
      """Log prediction data to an Excel file in the /tmp directory."""
      # Initialize the Excel file if it doesn't exist
@@ -753,6 +649,7 @@ def log_prediction_data(input_text, word_count, prediction, confidence, execution_time, mode):
          logger.error(f"Error logging prediction data to Excel: {str(e)}")
          return False

+
  def get_logs_as_base64():
      """Read the Excel logs file and return as base64 for downloading."""
      if not os.path.exists(EXCEL_LOG_PATH):
@@ -771,6 +668,7 @@ def get_logs_as_base64():
          logger.error(f"Error reading Excel logs: {str(e)}")
          return None

+
  def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
      """Analyze text using specified mode and return formatted results."""
      # Check if the input text matches the admin password using secure comparison
@@ -890,27 +788,120 @@ def analyze_text(text: str, mode: str, classifier: TextClassifier) -> tuple:
          overall_result
      )

- # Initialize the classifier globally
- classifier = TextClassifier()
-
- # Create Gradio interface
- demo = setup_app_with_ocr()
-
- # Get the FastAPI app from Gradio
- app = demo.app
-
- # Add CORS middleware
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"], # For development
-     allow_credentials=True,
-     allow_methods=["GET", "POST", "OPTIONS"],
-     allow_headers=["*"],
- )
-
- # Ensure CORS is applied before launching
+ # Modified Gradio interface setup function to include file upload
+ def setup_gradio_interface(classifier):
+     """
+     Set up Gradio interface with text input and file upload options
+
+     Args:
+         classifier: The TextClassifier instance
+
+     Returns:
+         Gradio Interface object
+     """
+     import gradio as gr
+
+     with gr.Blocks(title="AI Text Detector") as demo:
+         gr.Markdown("# AI Text Detector with Document Upload")
+         gr.Markdown("Analyze text to detect if it was written by a human or AI. You can paste text directly or upload images, PDFs, or Word documents.")
+
+         with gr.Tab("Text Input"):
+             text_input = gr.Textbox(
+                 lines=8,
+                 placeholder="Enter text to analyze...",
+                 label="Input Text"
+             )
+
+             mode_selection = gr.Radio(
+                 choices=["quick", "detailed"],
+                 value="quick",
+                 label="Analysis Mode",
+                 info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
+             )
+
+             text_submit_button = gr.Button("Analyze Text")
+
+             output_html = gr.HTML(label="Highlighted Analysis")
+             output_sentences = gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10)
+             output_result = gr.Textbox(label="Overall Result", lines=4)
+
+             text_submit_button.click(
+                 analyze_text,
+                 inputs=[text_input, mode_selection, classifier],
+                 outputs=[output_html, output_sentences, output_result]
+             )
+
+         with gr.Tab("File Upload"):
+             file_upload = gr.File(
+                 label="Upload Document",
+                 file_types=["image", "pdf", "doc", "docx"],
+                 type="file"
+             )
+
+             file_mode_selection = gr.Radio(
+                 choices=["quick", "detailed"],
+                 value="quick",
+                 label="Analysis Mode",
+                 info="Quick mode for faster analysis, Detailed mode for sentence-level analysis"
+             )
+
+             upload_submit_button = gr.Button("Process and Analyze")
+
+             file_output_html = gr.HTML(label="Highlighted Analysis")
+             file_output_sentences = gr.Textbox(label="Sentence-by-Sentence Analysis", lines=10)
+             file_output_result = gr.Textbox(label="Overall Result", lines=4)
+
+             upload_submit_button.click(
+                 handle_file_upload_and_analyze,
+                 inputs=[file_upload, file_mode_selection, classifier],
+                 outputs=[file_output_html, file_output_sentences, file_output_result]
+             )
+
+         gr.Markdown("""
+         ### File Upload Limitations
+         - Maximum file size: 1MB
+         - PDF files: Maximum 3 pages (OCR.space API limitation)
+         - Supported formats: Images (PNG, JPG, GIF), PDF, Word documents (DOCX, DOC)
+         """)
+
+     return demo
+
+
+ # This function is a replacement for the original main app setup
+ def setup_app_with_ocr():
+     """
+     Setup the application with OCR capabilities
+     """
+     # Initialize the classifier (uses the fixed class)
+     classifier = TextClassifier()
+
+     # Create the Gradio interface with file upload functionality
+     demo = setup_gradio_interface(classifier)
+
+     # Get the FastAPI app from Gradio
+     app = demo.app
+
+     # Add CORS middleware (same as original code)
+     from fastapi.middleware.cors import CORSMiddleware
+     app.add_middleware(
+         CORSMiddleware,
+         allow_origins=["*"], # For development
+         allow_credentials=True,
+         allow_methods=["GET", "POST", "OPTIONS"],
+         allow_headers=["*"],
+     )
+
+     # Return the demo for launching
+     return demo


+ # Initialize the application
  if __name__ == "__main__":
+     # Create the app with OCR functionality
+     demo = setup_app_with_ocr()
+
+     # Start the server
      demo.queue()
      demo.launch(
          server_name="0.0.0.0",