zaldivards committed on
Commit
b715dbd
·
1 Parent(s): f713e02

feat: further chunk normalization

Browse files

- Normalize article names
- Collapse runs of consecutive spaces into a single space
- Add a prefix identifier to chunks that are
article continuations

Files changed (2) hide show
  1. app.py +51 -12
  2. sources/Constitucion de la Republica.pdf +0 -0
app.py CHANGED
@@ -11,11 +11,13 @@ from pypdf import PdfReader
11
  from transformers import AutoModel
12
 
13
 
14
- chunk_size = int(os.environ.get("CHUNK_SIZE", 1000))
15
  default_k = int(os.environ.get("DEFAULT_K", 5))
16
 
17
  model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
18
 
 
 
19
  docs = {}
20
 
21
 
@@ -66,9 +68,35 @@ def convert(filename: str) -> str:
66
  raise ValueError(f"Unsupported file type: {filename}")
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def generate_chunks(text: str, max_length: int) -> list[str]:
70
  """Generate chunks from a file's raw text. Chunks are calculated based
71
- on the `max_lenght` parameter and the split character (.)
72
 
73
  Parameters
74
  ----------
@@ -76,7 +104,7 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
76
  The raw text
77
  max_length : int
78
  Maximum number of characters a chunk can have. Note that chunks
79
- may not have this exact lenght, as another component is also
80
  involved in the splitting process
81
 
82
  Returns
@@ -85,20 +113,31 @@ def generate_chunks(text: str, max_length: int) -> list[str]:
85
  A list of chunks/nodes
86
  """
87
 
88
- segments = text.split(".")
 
 
 
 
 
89
  chunks = []
90
  chunk = ""
91
 
92
- for current_segment in segments:
93
- # try to normalize the current chunk
94
- current_segment = re.sub(r"\s+", " ", current_segment).strip()
95
- if len(chunk) < max_length:
 
 
 
 
96
  chunk += f". {current_segment}"
97
- else:
98
- chunks.append(chunk)
99
- chunk = current_segment
 
100
  if chunk:
101
- chunks.append(chunk)
 
102
  return chunks
103
 
104
 
 
11
  from transformers import AutoModel
12
 
13
 
14
+ chunk_size = int(os.environ.get("CHUNK_SIZE", 250))
15
  default_k = int(os.environ.get("DEFAULT_K", 5))
16
 
17
  model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-es", trust_remote_code=True)
18
 
19
+ headers = ("INDICE LEGISLATIVO", "ASAMBLEA LEGISLATIVA - REPUBLICA DE EL SALVADOR")
20
+
21
  docs = {}
22
 
23
 
 
68
  raise ValueError(f"Unsupported file type: {filename}")
69
 
70
 
71
def add_prefix(chunk: str, art_prefix: str) -> tuple[str, str]:
    """Add prefix to chunks that are continuation of a certain article

    A chunk is tagged with ``<<Articulo A|Articulo B|...>>`` listing the
    articles it belongs to, unless it consists of a single article whose
    heading sits at the very beginning of the chunk (the heading already
    identifies it).

    Parameters
    ----------
    chunk : str
        original chunk
    art_prefix : str
        current prefix, i.e. the last article heading seen in previous
        chunks. An empty string means no article has been seen yet

    Returns
    -------
    tuple[str, str]
        The updated chunk and the new prefix
    """
    # Article headings look like "Articulo 12 -"; only the "Articulo 12" part is captured.
    headings = list(re.finditer(r"(Articulo \d+)\s+-", chunk))

    if not headings:
        # Pure continuation of the article in progress. When no article has
        # been seen yet there is nothing to tag with, so the chunk is returned
        # untouched instead of receiving an empty "<<>>" marker.
        if not art_prefix:
            return chunk, art_prefix
        return f"<<{art_prefix}>>{chunk}", art_prefix

    names = [heading.group(1) for heading in headings]
    # The last heading in this chunk is the article any following chunk continues.
    next_prefix = names[-1]

    if len(names) == 1:
        # Use the position of the actual heading match rather than the first
        # textual occurrence of its name, which could be an earlier mention.
        if headings[0].start() <= 4:
            # The chunk starts with its own heading: no prefix needed.
            return chunk, next_prefix
        if art_prefix:
            # The chunk opens with the tail of the previous article.
            names.insert(0, art_prefix)

    # NOTE(review): when a chunk holds several headings the previous article is
    # not added to the tag even if the chunk opens with its tail -- this keeps
    # the original behavior; confirm whether that is intended.
    return f"<<{'|'.join(names)}>>{chunk}", next_prefix
95
+
96
+
97
def generate_chunks(text: str, max_length: int) -> list[str]:
    """Generate chunks from a file's raw text. Chunks are calculated based
    on the `max_length` parameter and the split character (.)

    Before splitting, article headings written as ``Art. X.`` are rewritten
    as ``Articulo X``, and redundant line breaks, underscore runs and the
    known page headers are removed. Every emitted chunk is lower-cased and,
    when it continues an article, prefixed through `add_prefix`.

    Parameters
    ----------
    text : str
        The raw text
    max_length : int
        Maximum number of characters a chunk can have. Note that chunks
        may not have this exact length, as another component is also
        involved in the splitting process

    Returns
    -------
    list[str]
        A list of chunks/nodes
    """
    # replace Art. X. with Articulo X in a single pass, so the heading's
    # periods do not act as sentence separators further down
    text = re.sub(r"Art\. (\d+)\.", r"Articulo \1 ", text)

    # remove line breaks that do not follow a word character, runs of
    # underscores and unwanted headers or footers; headers are escaped so they
    # are always matched literally
    noise_patterns = [r"(?<!\w)\n", r"_+", *(re.escape(header) for header in headers)]
    text = re.sub("|".join(noise_patterns), "", text)

    chunks: list[str] = []
    chunk = ""
    art_prefix = ""

    # split using period (.) but ignoring numbers such as 1.0, 2.000, etc
    for segment in re.split(r"(?<!\d)\.", text):
        # collapse every run of two or more spaces into a single space
        segment = re.sub(r" {2,}", " ", segment).strip()
        if not segment:
            # consecutive or trailing periods produce empty segments
            continue

        if not chunk:
            # first segment of a chunk: no ". " joiner in front of it
            chunk = segment
            continue

        # +2 accounts for the ". " joiner
        if len(chunk) + len(segment) + 2 < max_length:
            chunk = f"{chunk}. {segment}"
            continue

        chunk, art_prefix = add_prefix(chunk, art_prefix)
        chunks.append(chunk.lower())
        chunk = segment

    if chunk:
        chunk, _ = add_prefix(chunk, art_prefix)
        chunks.append(chunk.lower())
    return chunks
142
 
143
 
sources/Constitucion de la Republica.pdf CHANGED
Binary files a/sources/Constitucion de la Republica.pdf and b/sources/Constitucion de la Republica.pdf differ