Spaces:

rwillats
/

hatespeech

Sleeping

App Files Community

rwillats committed 18 days ago

Commit

c82d2ac

verified ·

1 Parent(s): fecb216

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

hate_speech_demo.py +80 -46

hate_speech_demo.py CHANGED Viewed

@@ -36,7 +36,7 @@ function openPolicyPopup() {{
 </script>
 """
-# Replace your process_retrieval_text function with this updated version
 def process_retrieval_text(retrieval_text, user_input):
     """
     Process the retrieval text by identifying proper document boundaries
@@ -67,16 +67,41 @@ def process_retrieval_text(retrieval_text, user_input):
                 content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
                 content = content_match.group(1).strip() if content_match else "No content available"
                 # Format with clear section headers and better spacing
                 formatted_html = f"""
                 <div class='doc-section'>
                     <h3 class="doc-number">Evidence Section {i+1}</h3>
                     <div class="doc-section-info">
-                        <p><strong>Section Title:</strong> {doc_name}</p>
-                        <div class="subsection-info">
-                            <p><strong>Subsection:</strong> Page {page}</p>
-                        </div>
                     </div>
                     <div class="doc-content-container">
@@ -88,37 +113,57 @@ def process_retrieval_text(retrieval_text, user_input):
                 chunks.append(formatted_html)
     else:
         # Fallback to a simpler approach - split by double newlines
-        # but combine any small chunks that appear to be part of the same document
-        raw_chunks = retrieval_text.strip().split("\n\n")
         chunks = []
-        current_chunk = ""
-        for chunk in raw_chunks:
-            # If it's a short chunk without a clear document marker, or appears to be a continuation,
-            # append to previous chunk
-            if (len(chunk) < 50 and not re.search(r'doc|document|evidence', chunk.lower())) or \
-               not chunk.strip().startswith(("Doc", "Document", "Evidence", "Source", "Content")):
-                if current_chunk:
-                    current_chunk += "\n\n" + chunk
-                else:
-                    current_chunk = chunk
-            else:
-                # This looks like a new document chunk
-                if current_chunk:
-                    chunks.append(current_chunk)
-                current_chunk = chunk
-        # Add the last chunk if there is one
-        if current_chunk:
-            chunks.append(current_chunk)
         # Format each chunk with better section styling
-        chunks = [f"""
-        <div class='doc-section'>
-            <h3 class="doc-title">Evidence Section {i+1}</h3>
-            <div class='doc-content'>{chunk.strip()}</div>
-        </div>
-        """ for i, chunk in enumerate(chunks)]
     # Extract keywords from user input (longer than 3 chars)
     keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
@@ -165,26 +210,14 @@ def process_retrieval_text(retrieval_text, user_input):
     .doc-section-info {
         margin: 10px 0;
-        padding: 8px;
         background: #f5f5f5;
         border-radius: 4px;
     }
     .doc-section-info p {
         margin: 5px 0;
-        font-size: 16px;
-    }
-    .subsection-info {
-        margin-left: 15px;
-        padding-left: 10px;
-        border-left: 2px solid #ddd;
-        margin-top: 5px;
-    }
-    .subsection-info p {
         font-size: 14px;
-        color: #555;
     }
     .doc-content-container {
@@ -194,6 +227,7 @@ def process_retrieval_text(retrieval_text, user_input):
     .doc-content-container h4 {
         margin-bottom: 8px;
         font-size: 16px;
     }
     .doc-content {

 </script>
 """
+# Replace your process_retrieval_text function with this final version
 def process_retrieval_text(retrieval_text, user_input):
     """
     Process the retrieval text by identifying proper document boundaries
                 content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
                 content = content_match.group(1).strip() if content_match else "No content available"
+                # Try to parse the document title, section and subsection from content
+                doc_title = "Hate Speech Policy"  # Default
+                section_title = ""
+                subsection_title = ""
+                # Look for document title pattern
+                doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', content, re.DOTALL)
+                if doc_title_match:
+                    doc_title = doc_title_match.group(1).strip()
+                    # Remove this part from content
+                    content = content.replace(doc_title_match.group(0), "").strip()
+                # Look for section pattern
+                section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', content, re.DOTALL)
+                if section_match:
+                    section_title = section_match.group(1).strip()
+                    # Remove this part from content
+                    content = content.replace(section_match.group(0), "").strip()
+                # Look for subsection pattern
+                subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', content, re.DOTALL)
+                if subsection_match:
+                    subsection_title = subsection_match.group(1).strip()
+                    # Remove this part from content
+                    content = content.replace(subsection_match.group(0), "").strip()
                 # Format with clear section headers and better spacing
                 formatted_html = f"""
                 <div class='doc-section'>
                     <h3 class="doc-number">Evidence Section {i+1}</h3>
                     <div class="doc-section-info">
+                        <p><strong>Document Title:</strong> {doc_title}</p>
+                        <p><strong>Section:</strong> {section_title if section_title else doc_name}</p>
+                        <p><strong>Subsection:</strong> {subsection_title if subsection_title else "Page " + page}</p>
                     </div>
                     <div class="doc-content-container">
                 chunks.append(formatted_html)
     else:
         # Fallback to a simpler approach - split by double newlines
         chunks = []
         # Format each chunk with better section styling
+        for i, chunk in enumerate(retrieval_text.strip().split("\n\n")):
+            if not chunk.strip():
+                continue
+            # Try to extract document structure if possible
+            doc_title = "Hate Speech Policy"  # Default
+            section_title = ""
+            subsection_title = ""
+            content = chunk
+            # Look for document title pattern
+            doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', chunk, re.DOTALL)
+            if doc_title_match:
+                doc_title = doc_title_match.group(1).strip()
+                # Remove this part from content
+                content = content.replace(doc_title_match.group(0), "").strip()
+            # Look for section pattern
+            section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', chunk, re.DOTALL)
+            if section_match:
+                section_title = section_match.group(1).strip()
+                # Remove this part from content
+                content = content.replace(section_match.group(0), "").strip()
+            # Look for subsection pattern
+            subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', chunk, re.DOTALL)
+            if subsection_match:
+                subsection_title = subsection_match.group(1).strip()
+                # Remove this part from content
+                content = content.replace(subsection_match.group(0), "").strip()
+            formatted_html = f"""
+            <div class='doc-section'>
+                <h3 class="doc-title">Evidence Section {i+1}</h3>
+                <div class="doc-section-info">
+                    <p><strong>Document Title:</strong> {doc_title}</p>
+                    <p><strong>Section:</strong> {section_title if section_title else "Section " + str(i+1)}</p>
+                    <p><strong>Subsection:</strong> {subsection_title if subsection_title else "N/A"}</p>
+                </div>
+                <div class="doc-content-container">
+                    <h4>Content:</h4>
+                    <div class='doc-content'>{content}</div>
+                </div>
+            </div>
+            """
+            chunks.append(formatted_html)
     # Extract keywords from user input (longer than 3 chars)
     keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
     .doc-section-info {
         margin: 10px 0;
+        padding: 10px;
         background: #f5f5f5;
         border-radius: 4px;
     }
     .doc-section-info p {
         margin: 5px 0;
         font-size: 14px;
     }
     .doc-content-container {
     .doc-content-container h4 {
         margin-bottom: 8px;
         font-size: 16px;
+        color: #333;
     }
     .doc-content {