Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files - hate_speech_demo.py +46 -80
hate_speech_demo.py
CHANGED
@@ -36,7 +36,7 @@ function openPolicyPopup() {{
|
|
36 |
</script>
|
37 |
"""
|
38 |
|
39 |
-
# Replace your process_retrieval_text function with this
|
40 |
def process_retrieval_text(retrieval_text, user_input):
|
41 |
"""
|
42 |
Process the retrieval text by identifying proper document boundaries
|
@@ -67,41 +67,16 @@ def process_retrieval_text(retrieval_text, user_input):
|
|
67 |
content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
|
68 |
content = content_match.group(1).strip() if content_match else "No content available"
|
69 |
|
70 |
-
# Try to parse the document title, section and subsection from content
|
71 |
-
doc_title = "Hate Speech Policy" # Default
|
72 |
-
section_title = ""
|
73 |
-
subsection_title = ""
|
74 |
-
|
75 |
-
# Look for document title pattern
|
76 |
-
doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', content, re.DOTALL)
|
77 |
-
if doc_title_match:
|
78 |
-
doc_title = doc_title_match.group(1).strip()
|
79 |
-
# Remove this part from content
|
80 |
-
content = content.replace(doc_title_match.group(0), "").strip()
|
81 |
-
|
82 |
-
# Look for section pattern
|
83 |
-
section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', content, re.DOTALL)
|
84 |
-
if section_match:
|
85 |
-
section_title = section_match.group(1).strip()
|
86 |
-
# Remove this part from content
|
87 |
-
content = content.replace(section_match.group(0), "").strip()
|
88 |
-
|
89 |
-
# Look for subsection pattern
|
90 |
-
subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', content, re.DOTALL)
|
91 |
-
if subsection_match:
|
92 |
-
subsection_title = subsection_match.group(1).strip()
|
93 |
-
# Remove this part from content
|
94 |
-
content = content.replace(subsection_match.group(0), "").strip()
|
95 |
-
|
96 |
# Format with clear section headers and better spacing
|
97 |
formatted_html = f"""
|
98 |
<div class='doc-section'>
|
99 |
<h3 class="doc-number">Evidence Section {i+1}</h3>
|
100 |
|
101 |
<div class="doc-section-info">
|
102 |
-
<p><strong>
|
103 |
-
<
|
104 |
-
|
|
|
105 |
</div>
|
106 |
|
107 |
<div class="doc-content-container">
|
@@ -113,57 +88,37 @@ def process_retrieval_text(retrieval_text, user_input):
|
|
113 |
chunks.append(formatted_html)
|
114 |
else:
|
115 |
# Fallback to a simpler approach - split by double newlines
|
|
|
|
|
116 |
chunks = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
# Format each chunk with better section styling
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
section_title = ""
|
126 |
-
subsection_title = ""
|
127 |
-
content = chunk
|
128 |
-
|
129 |
-
# Look for document title pattern
|
130 |
-
doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', chunk, re.DOTALL)
|
131 |
-
if doc_title_match:
|
132 |
-
doc_title = doc_title_match.group(1).strip()
|
133 |
-
# Remove this part from content
|
134 |
-
content = content.replace(doc_title_match.group(0), "").strip()
|
135 |
-
|
136 |
-
# Look for section pattern
|
137 |
-
section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', chunk, re.DOTALL)
|
138 |
-
if section_match:
|
139 |
-
section_title = section_match.group(1).strip()
|
140 |
-
# Remove this part from content
|
141 |
-
content = content.replace(section_match.group(0), "").strip()
|
142 |
-
|
143 |
-
# Look for subsection pattern
|
144 |
-
subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', chunk, re.DOTALL)
|
145 |
-
if subsection_match:
|
146 |
-
subsection_title = subsection_match.group(1).strip()
|
147 |
-
# Remove this part from content
|
148 |
-
content = content.replace(subsection_match.group(0), "").strip()
|
149 |
-
|
150 |
-
formatted_html = f"""
|
151 |
-
<div class='doc-section'>
|
152 |
-
<h3 class="doc-title">Evidence Section {i+1}</h3>
|
153 |
-
|
154 |
-
<div class="doc-section-info">
|
155 |
-
<p><strong>Document Title:</strong> {doc_title}</p>
|
156 |
-
<p><strong>Section:</strong> {section_title if section_title else "Section " + str(i+1)}</p>
|
157 |
-
<p><strong>Subsection:</strong> {subsection_title if subsection_title else "N/A"}</p>
|
158 |
-
</div>
|
159 |
-
|
160 |
-
<div class="doc-content-container">
|
161 |
-
<h4>Content:</h4>
|
162 |
-
<div class='doc-content'>{content}</div>
|
163 |
-
</div>
|
164 |
-
</div>
|
165 |
-
"""
|
166 |
-
chunks.append(formatted_html)
|
167 |
|
168 |
# Extract keywords from user input (longer than 3 chars)
|
169 |
keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
|
@@ -210,14 +165,26 @@ def process_retrieval_text(retrieval_text, user_input):
|
|
210 |
|
211 |
.doc-section-info {
|
212 |
margin: 10px 0;
|
213 |
-
padding:
|
214 |
background: #f5f5f5;
|
215 |
border-radius: 4px;
|
216 |
}
|
217 |
|
218 |
.doc-section-info p {
|
219 |
margin: 5px 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
font-size: 14px;
|
|
|
221 |
}
|
222 |
|
223 |
.doc-content-container {
|
@@ -227,7 +194,6 @@ def process_retrieval_text(retrieval_text, user_input):
|
|
227 |
.doc-content-container h4 {
|
228 |
margin-bottom: 8px;
|
229 |
font-size: 16px;
|
230 |
-
color: #333;
|
231 |
}
|
232 |
|
233 |
.doc-content {
|
|
|
36 |
</script>
|
37 |
"""
|
38 |
|
39 |
+
# Replace your process_retrieval_text function with this updated version
|
40 |
def process_retrieval_text(retrieval_text, user_input):
|
41 |
"""
|
42 |
Process the retrieval text by identifying proper document boundaries
|
|
|
67 |
content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
|
68 |
content = content_match.group(1).strip() if content_match else "No content available"
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
# Format with clear section headers and better spacing
|
71 |
formatted_html = f"""
|
72 |
<div class='doc-section'>
|
73 |
<h3 class="doc-number">Evidence Section {i+1}</h3>
|
74 |
|
75 |
<div class="doc-section-info">
|
76 |
+
<p><strong>Section Title:</strong> {doc_name}</p>
|
77 |
+
<div class="subsection-info">
|
78 |
+
<p><strong>Subsection:</strong> Page {page}</p>
|
79 |
+
</div>
|
80 |
</div>
|
81 |
|
82 |
<div class="doc-content-container">
|
|
|
88 |
chunks.append(formatted_html)
|
89 |
else:
|
90 |
# Fallback to a simpler approach - split by double newlines
|
91 |
+
# but combine any small chunks that appear to be part of the same document
|
92 |
+
raw_chunks = retrieval_text.strip().split("\n\n")
|
93 |
chunks = []
|
94 |
+
current_chunk = ""
|
95 |
+
|
96 |
+
for chunk in raw_chunks:
|
97 |
+
# If it's a short chunk without a clear document marker, or appears to be a continuation,
|
98 |
+
# append to previous chunk
|
99 |
+
if (len(chunk) < 50 and not re.search(r'doc|document|evidence', chunk.lower())) or \
|
100 |
+
not chunk.strip().startswith(("Doc", "Document", "Evidence", "Source", "Content")):
|
101 |
+
if current_chunk:
|
102 |
+
current_chunk += "\n\n" + chunk
|
103 |
+
else:
|
104 |
+
current_chunk = chunk
|
105 |
+
else:
|
106 |
+
# This looks like a new document chunk
|
107 |
+
if current_chunk:
|
108 |
+
chunks.append(current_chunk)
|
109 |
+
current_chunk = chunk
|
110 |
+
|
111 |
+
# Add the last chunk if there is one
|
112 |
+
if current_chunk:
|
113 |
+
chunks.append(current_chunk)
|
114 |
|
115 |
# Format each chunk with better section styling
|
116 |
+
chunks = [f"""
|
117 |
+
<div class='doc-section'>
|
118 |
+
<h3 class="doc-title">Evidence Section {i+1}</h3>
|
119 |
+
<div class='doc-content'>{chunk.strip()}</div>
|
120 |
+
</div>
|
121 |
+
""" for i, chunk in enumerate(chunks)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
123 |
# Extract keywords from user input (longer than 3 chars)
|
124 |
keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
|
|
|
165 |
|
166 |
.doc-section-info {
|
167 |
margin: 10px 0;
|
168 |
+
padding: 8px;
|
169 |
background: #f5f5f5;
|
170 |
border-radius: 4px;
|
171 |
}
|
172 |
|
173 |
.doc-section-info p {
|
174 |
margin: 5px 0;
|
175 |
+
font-size: 16px;
|
176 |
+
}
|
177 |
+
|
178 |
+
.subsection-info {
|
179 |
+
margin-left: 15px;
|
180 |
+
padding-left: 10px;
|
181 |
+
border-left: 2px solid #ddd;
|
182 |
+
margin-top: 5px;
|
183 |
+
}
|
184 |
+
|
185 |
+
.subsection-info p {
|
186 |
font-size: 14px;
|
187 |
+
color: #555;
|
188 |
}
|
189 |
|
190 |
.doc-content-container {
|
|
|
194 |
.doc-content-container h4 {
|
195 |
margin-bottom: 8px;
|
196 |
font-size: 16px;
|
|
|
197 |
}
|
198 |
|
199 |
.doc-content {
|