rwillats committed on
Commit
b833658
·
verified ·
1 Parent(s): c82d2ac

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. hate_speech_demo.py +46 -80
hate_speech_demo.py CHANGED
@@ -36,7 +36,7 @@ function openPolicyPopup() {{
36
  </script>
37
  """
38
 
39
- # Replace your process_retrieval_text function with this final version
40
  def process_retrieval_text(retrieval_text, user_input):
41
  """
42
  Process the retrieval text by identifying proper document boundaries
@@ -67,41 +67,16 @@ def process_retrieval_text(retrieval_text, user_input):
67
  content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
68
  content = content_match.group(1).strip() if content_match else "No content available"
69
 
70
- # Try to parse the document title, section and subsection from content
71
- doc_title = "Hate Speech Policy" # Default
72
- section_title = ""
73
- subsection_title = ""
74
-
75
- # Look for document title pattern
76
- doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', content, re.DOTALL)
77
- if doc_title_match:
78
- doc_title = doc_title_match.group(1).strip()
79
- # Remove this part from content
80
- content = content.replace(doc_title_match.group(0), "").strip()
81
-
82
- # Look for section pattern
83
- section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', content, re.DOTALL)
84
- if section_match:
85
- section_title = section_match.group(1).strip()
86
- # Remove this part from content
87
- content = content.replace(section_match.group(0), "").strip()
88
-
89
- # Look for subsection pattern
90
- subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', content, re.DOTALL)
91
- if subsection_match:
92
- subsection_title = subsection_match.group(1).strip()
93
- # Remove this part from content
94
- content = content.replace(subsection_match.group(0), "").strip()
95
-
96
  # Format with clear section headers and better spacing
97
  formatted_html = f"""
98
  <div class='doc-section'>
99
  <h3 class="doc-number">Evidence Section {i+1}</h3>
100
 
101
  <div class="doc-section-info">
102
- <p><strong>Document Title:</strong> {doc_title}</p>
103
- <p><strong>Section:</strong> {section_title if section_title else doc_name}</p>
104
- <p><strong>Subsection:</strong> {subsection_title if subsection_title else "Page " + page}</p>
 
105
  </div>
106
 
107
  <div class="doc-content-container">
@@ -113,57 +88,37 @@ def process_retrieval_text(retrieval_text, user_input):
113
  chunks.append(formatted_html)
114
  else:
115
  # Fallback to a simpler approach - split by double newlines
 
 
116
  chunks = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  # Format each chunk with better section styling
119
- for i, chunk in enumerate(retrieval_text.strip().split("\n\n")):
120
- if not chunk.strip():
121
- continue
122
-
123
- # Try to extract document structure if possible
124
- doc_title = "Hate Speech Policy" # Default
125
- section_title = ""
126
- subsection_title = ""
127
- content = chunk
128
-
129
- # Look for document title pattern
130
- doc_title_match = re.search(r'Document Title:\s*(.*?)(?:Section:|$)', chunk, re.DOTALL)
131
- if doc_title_match:
132
- doc_title = doc_title_match.group(1).strip()
133
- # Remove this part from content
134
- content = content.replace(doc_title_match.group(0), "").strip()
135
-
136
- # Look for section pattern
137
- section_match = re.search(r'Section:\s*(.*?)(?:Sub-Section:|$)', chunk, re.DOTALL)
138
- if section_match:
139
- section_title = section_match.group(1).strip()
140
- # Remove this part from content
141
- content = content.replace(section_match.group(0), "").strip()
142
-
143
- # Look for subsection pattern
144
- subsection_match = re.search(r'Sub-Section:\s*(.*?)(?:\n|$)', chunk, re.DOTALL)
145
- if subsection_match:
146
- subsection_title = subsection_match.group(1).strip()
147
- # Remove this part from content
148
- content = content.replace(subsection_match.group(0), "").strip()
149
-
150
- formatted_html = f"""
151
- <div class='doc-section'>
152
- <h3 class="doc-title">Evidence Section {i+1}</h3>
153
-
154
- <div class="doc-section-info">
155
- <p><strong>Document Title:</strong> {doc_title}</p>
156
- <p><strong>Section:</strong> {section_title if section_title else "Section " + str(i+1)}</p>
157
- <p><strong>Subsection:</strong> {subsection_title if subsection_title else "N/A"}</p>
158
- </div>
159
-
160
- <div class="doc-content-container">
161
- <h4>Content:</h4>
162
- <div class='doc-content'>{content}</div>
163
- </div>
164
- </div>
165
- """
166
- chunks.append(formatted_html)
167
 
168
  # Extract keywords from user input (longer than 3 chars)
169
  keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
@@ -210,14 +165,26 @@ def process_retrieval_text(retrieval_text, user_input):
210
 
211
  .doc-section-info {
212
  margin: 10px 0;
213
- padding: 10px;
214
  background: #f5f5f5;
215
  border-radius: 4px;
216
  }
217
 
218
  .doc-section-info p {
219
  margin: 5px 0;
 
 
 
 
 
 
 
 
 
 
 
220
  font-size: 14px;
 
221
  }
222
 
223
  .doc-content-container {
@@ -227,7 +194,6 @@ def process_retrieval_text(retrieval_text, user_input):
227
  .doc-content-container h4 {
228
  margin-bottom: 8px;
229
  font-size: 16px;
230
- color: #333;
231
  }
232
 
233
  .doc-content {
 
36
  </script>
37
  """
38
 
39
+ # Replace your process_retrieval_text function with this updated version
40
  def process_retrieval_text(retrieval_text, user_input):
41
  """
42
  Process the retrieval text by identifying proper document boundaries
 
67
  content_match = re.search(r'Content:\s*(.*)', doc_info, re.DOTALL)
68
  content = content_match.group(1).strip() if content_match else "No content available"
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # Format with clear section headers and better spacing
71
  formatted_html = f"""
72
  <div class='doc-section'>
73
  <h3 class="doc-number">Evidence Section {i+1}</h3>
74
 
75
  <div class="doc-section-info">
76
+ <p><strong>Section Title:</strong> {doc_name}</p>
77
+ <div class="subsection-info">
78
+ <p><strong>Subsection:</strong> Page {page}</p>
79
+ </div>
80
  </div>
81
 
82
  <div class="doc-content-container">
 
88
  chunks.append(formatted_html)
89
  else:
90
  # Fallback to a simpler approach - split by double newlines
91
+ # but combine any small chunks that appear to be part of the same document
92
+ raw_chunks = retrieval_text.strip().split("\n\n")
93
  chunks = []
94
+ current_chunk = ""
95
+
96
+ for chunk in raw_chunks:
97
+ # If it's a short chunk without a clear document marker, or appears to be a continuation,
98
+ # append to previous chunk
99
+ if (len(chunk) < 50 and not re.search(r'doc|document|evidence', chunk.lower())) or \
100
+ not chunk.strip().startswith(("Doc", "Document", "Evidence", "Source", "Content")):
101
+ if current_chunk:
102
+ current_chunk += "\n\n" + chunk
103
+ else:
104
+ current_chunk = chunk
105
+ else:
106
+ # This looks like a new document chunk
107
+ if current_chunk:
108
+ chunks.append(current_chunk)
109
+ current_chunk = chunk
110
+
111
+ # Add the last chunk if there is one
112
+ if current_chunk:
113
+ chunks.append(current_chunk)
114
 
115
  # Format each chunk with better section styling
116
+ chunks = [f"""
117
+ <div class='doc-section'>
118
+ <h3 class="doc-title">Evidence Section {i+1}</h3>
119
+ <div class='doc-content'>{chunk.strip()}</div>
120
+ </div>
121
+ """ for i, chunk in enumerate(chunks)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  # Extract keywords from user input (longer than 3 chars)
124
  keywords = re.findall(r'\b\w{4,}\b', user_input.lower())
 
165
 
166
  .doc-section-info {
167
  margin: 10px 0;
168
+ padding: 8px;
169
  background: #f5f5f5;
170
  border-radius: 4px;
171
  }
172
 
173
  .doc-section-info p {
174
  margin: 5px 0;
175
+ font-size: 16px;
176
+ }
177
+
178
+ .subsection-info {
179
+ margin-left: 15px;
180
+ padding-left: 10px;
181
+ border-left: 2px solid #ddd;
182
+ margin-top: 5px;
183
+ }
184
+
185
+ .subsection-info p {
186
  font-size: 14px;
187
+ color: #555;
188
  }
189
 
190
  .doc-content-container {
 
194
  .doc-content-container h4 {
195
  margin-bottom: 8px;
196
  font-size: 16px;
 
197
  }
198
 
199
  .doc-content {