Spaces:

rwillats
/

hatespeech

Sleeping

App Files Files Community

rwillats committed 14 days ago

Commit

b833658

verified ·

1 Parent(s): c82d2ac

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

hate_speech_demo.py +46 -80

hate_speech_demo.py CHANGED Viewed

@@ -36,7 +36,7 @@ function openPolicyPopup() {{
 </script>
 """
-# Replace your process_retrieval_text function with this final version
 def process_retrieval_text(retrieval_text, user_input):
     """
     Process the retrieval text by identifying proper document boundaries
@@ -67,41 +67,16 @@ def process_retrieval_text(retrieval_text, user_input):
                 content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
                 content = content_match.group(1).strip() if content_match else "No content available"
-                # Try to parse the document title, section and subsection from content
-                doc_title = "Hate Speech Policy"  # Default
-                section_title = ""
-                subsection_title = ""
-                # Look for document title pattern
-                doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', content, re.DOTALL)
-                if doc_title_match:
-                    doc_title = doc_title_match.group(1).strip()
-                    # Remove this part from content
-                    content = content.replace(doc_title_match.group(0), "").strip()
-                # Look for section pattern
-                section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', content, re.DOTALL)
-                if section_match:
-                    section_title = section_match.group(1).strip()
-                    # Remove this part from content
-                    content = content.replace(section_match.group(0), "").strip()
-                # Look for subsection pattern
-                subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', content, re.DOTALL)
-                if subsection_match:
-                    subsection_title = subsection_match.group(1).strip()
-                    # Remove this part from content
-                    content = content.replace(subsection_match.group(0), "").strip()
                 # Format with clear section headers and better spacing
                 formatted_html = f"""
                 <div class='doc-section'>
                     <h3 class="doc-number">Evidence Section {i+1}</h3>
                     <div class="doc-section-info">
-                        <p><strong>Document Title:</strong> {doc_title}</p>
-                        <p><strong>Section:</strong> {section_title if section_title else doc_name}</p>
-                        <p><strong>Subsection:</strong> {subsection_title if subsection_title else "Page " + page}</p>
                     </div>
                     <div class="doc-content-container">
@@ -113,57 +88,37 @@ def process_retrieval_text(retrieval_text, user_input):
                 chunks.append(formatted_html)
     else:
         # Fallback to a simpler approach - split by double newlines
         chunks = []
         # Format each chunk with better section styling
-        for i, chunk in enumerate(retrieval_text.strip().split("\n\n")):
-            if not chunk.strip():
-                continue
-            # Try to extract document structure if possible
-            doc_title = "Hate Speech Policy"  # Default
-            section_title = ""
-            subsection_title = ""
-            content = chunk
-            # Look for document title pattern
-            doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', chunk, re.DOTALL)
-            if doc_title_match:
-                doc_title = doc_title_match.group(1).strip()
-                # Remove this part from content
-                content = content.replace(doc_title_match.group(0), "").strip()
-            # Look for section pattern
-            section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', chunk, re.DOTALL)
-            if section_match:
-                section_title = section_match.group(1).strip()
-                # Remove this part from content
-                content = content.replace(section_match.group(0), "").strip()
-            # Look for subsection pattern
-            subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', chunk, re.DOTALL)
-            if subsection_match:
-                subsection_title = subsection_match.group(1).strip()
-                # Remove this part from content
-                content = content.replace(subsection_match.group(0), "").strip()
-            formatted_html = f"""
-            <div class='doc-section'>
-                <h3 class="doc-title">Evidence Section {i+1}</h3>
-                <div class="doc-section-info">
-                    <p><strong>Document Title:</strong> {doc_title}</p>
-                    <p><strong>Section:</strong> {section_title if section_title else "Section " + str(i+1)}</p>
-                    <p><strong>Subsection:</strong> {subsection_title if subsection_title else "N/A"}</p>
-                </div>
-                <div class="doc-content-container">
-                    <h4>Content:</h4>
-                    <div class='doc-content'>{content}</div>
-                </div>
-            </div>
-            """
-            chunks.append(formatted_html)
     # Extract keywords from user input (longer than 3 chars)
     keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
@@ -210,14 +165,26 @@ def process_retrieval_text(retrieval_text, user_input):
     .doc-section-info {
         margin: 10px 0;
-        padding: 10px;
         background: #f5f5f5;
         border-radius: 4px;
     }
     .doc-section-info p {
         margin: 5px 0;
         font-size: 14px;
     }
     .doc-content-container {
@@ -227,7 +194,6 @@ def process_retrieval_text(retrieval_text, user_input):
     .doc-content-container h4 {
         margin-bottom: 8px;
         font-size: 16px;
-        color: #333;
     }
     .doc-content {

 </script>
 """
+# Replace your process_retrieval_text function with this updated version
 def process_retrieval_text(retrieval_text, user_input):
     """
     Process the retrieval text by identifying proper document boundaries
                 content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
                 content = content_match.group(1).strip() if content_match else "No content available"
                 # Format with clear section headers and better spacing
                 formatted_html = f"""
                 <div class='doc-section'>
                     <h3 class="doc-number">Evidence Section {i+1}</h3>
                     <div class="doc-section-info">
+                        <p><strong>Section Title:</strong> {doc_name}</p>
+                        <div class="subsection-info">
+                            <p><strong>Subsection:</strong> Page {page}</p>
+                        </div>
                     </div>
                     <div class="doc-content-container">
                 chunks.append(formatted_html)
     else:
         # Fallback to a simpler approach - split by double newlines
+        # but combine any small chunks that appear to be part of the same document
+        raw_chunks = retrieval_text.strip().split("\n\n")
         chunks = []
+        current_chunk = ""
+        for chunk in raw_chunks:
+            # If it's a short chunk without a clear document marker, or appears to be a continuation,
+            # append to previous chunk
+            if (len(chunk) < 50 and not re.search(r'doc|document|evidence', chunk.lower())) or \
+               not chunk.strip().startswith(("Doc", "Document", "Evidence", "Source", "Content")):
+                if current_chunk:
+                    current_chunk += "\n\n" + chunk
+                else:
+                    current_chunk = chunk
+            else:
+                # This looks like a new document chunk
+                if current_chunk:
+                    chunks.append(current_chunk)
+                current_chunk = chunk
+        # Add the last chunk if there is one
+        if current_chunk:
+            chunks.append(current_chunk)
         # Format each chunk with better section styling
+        chunks = [f"""
+        <div class='doc-section'>
+            <h3 class="doc-title">Evidence Section {i+1}</h3>
+            <div class='doc-content'>{chunk.strip()}</div>
+        </div>
+        """ for i, chunk in enumerate(chunks)]
     # Extract keywords from user input (longer than 3 chars)
     keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
     .doc-section-info {
         margin: 10px 0;
+        padding: 8px;
         background: #f5f5f5;
         border-radius: 4px;
     }
     .doc-section-info p {
         margin: 5px 0;
+        font-size: 16px;
+    }
+    .subsection-info {
+        margin-left: 15px;
+        padding-left: 10px;
+        border-left: 2px solid #ddd;
+        margin-top: 5px;
+    }
+    .subsection-info p {
         font-size: 14px;
+        color: #555;
     }
     .doc-content-container {
     .doc-content-container h4 {
         margin-bottom: 8px;
         font-size: 16px;
     }
     .doc-content {