peter2000 committed on
Commit
5a67f08
·
1 Parent(s): 9423e4e

Delete udfPreprocess/cleaning,py

Browse files
Files changed (1)
  1. udfPreprocess/cleaning,py +0 -144
udfPreprocess/cleaning,py DELETED
@@ -1,144 +0,0 @@
- import pandas as pd
- import numpy as np
- import string
- import nltk
- import spacy
- import en_core_web_sm
- import re
- import streamlit as st
-
- from haystack.nodes import PreProcessor
-
- '''basic cleaning - suitable for transformer models'''
- def basic(s):
-     """
-     :param s: string to be processed
-     :return: processed string; see comments in the source code for more info
-     """
-     # Text lowercase
-     #s = s.lower()
-     # Remove punctuation
-     #translator = str.maketrans(' ', ' ', string.punctuation)
-     #s = s.translate(translator)
-     # Remove URLs
-     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
-     s = re.sub(r"http\S+", " ", s)
-     # Remove new line characters
-     #s = re.sub('\n', ' ', s)
-
-     # Remove distracting single quotes
-     #s = re.sub("\'", " ", s)
-     # Remove all remaining numbers and non-alphanumeric characters
-     #s = re.sub(r'\d+', ' ', s)
-     #s = re.sub(r'\W+', ' ', s)
-
-     # Define custom words to replace:
-     #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
-
-     return s.strip()
-
-
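For orientation, a minimal sketch (an editor's illustration, not part of the deleted file) of what basic() does to a string containing URLs; the sample text is made up:

# Illustrative only; assumes basic() from the deleted module is importable.
sample = "See https://example.org/report for details.\nhttps://example.org/annex"
print(basic(sample))  # both URLs are blanked out and the result is stripped
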
- def preprocessingForSDG(document):
-
-     """
-     Takes a haystack document object, splits it into paragraphs and applies simple cleaning.
-     Returns the cleaned list of haystack document objects (one paragraph per object), a pandas
-     DataFrame of the paragraphs, all of the text joined into a single string, and a list of the
-     paragraph texts.
-     """
-
-     preprocessor = PreProcessor(
-         clean_empty_lines=True,
-         clean_whitespace=True,
-         clean_header_footer=True,
-         split_by="word",
-         split_length=100,
-         split_respect_sentence_boundary=True,
-         split_overlap=4
-     )
-     for i in document:
-         docs_processed = preprocessor.process([i])
-         for item in docs_processed:
-             item.content = basic(item.content)
-
-     st.write("your document has been split into", len(docs_processed), "paragraphs")
-
-     # create dataframe of text and list of all text
-     df = pd.DataFrame(docs_processed)
-     all_text = " ".join(df.content.to_list())
-     par_list = df.content.to_list()
-
-     return docs_processed, df, all_text, par_list
-
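A minimal usage sketch (an editor's illustration, not part of the deleted file), assuming Haystack 1.x Document objects and a running Streamlit app, since the function calls st.write; the file path is a placeholder:

# Illustrative only; assumes Haystack 1.x and a Streamlit context.
from haystack import Document

raw = open("report.txt", encoding="utf-8").read()  # "report.txt" is a made-up path
docs = [Document(content=raw)]

# ~100-word chunks with a 4-word overlap, respecting sentence boundaries
docs_processed, df, all_text, par_list = preprocessingForSDG(docs)
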
- def preprocessing(document):
-
-     """
-     Takes a haystack document object, splits it into paragraphs and applies simple cleaning.
-     Returns the cleaned list of haystack document objects (one paragraph per object), a pandas
-     DataFrame of the paragraphs, all of the text joined into a single string, and a list of the
-     paragraph texts.
-     """
-
-     preprocessor = PreProcessor(
-         clean_empty_lines=True,
-         clean_whitespace=True,
-         clean_header_footer=True,
-         split_by="sentence",
-         split_length=3,
-         split_respect_sentence_boundary=False,
-         split_overlap=1
-     )
-     for i in document:
-         docs_processed = preprocessor.process([i])
-         for item in docs_processed:
-             item.content = basic(item.content)
-
-     st.write("your document has been split into", len(docs_processed), "paragraphs")
-
-     # create dataframe of text and list of all text
-     df = pd.DataFrame(docs_processed)
-     all_text = " ".join(df.content.to_list())
-     par_list = df.content.to_list()
-
-     return docs_processed, df, all_text, par_list
-
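preprocessing() differs from preprocessingForSDG() only in its split settings (3 sentences per chunk with 1 sentence of overlap); continuing the sketch above, the two granularities can be compared like this:

# Illustrative only, reusing the docs list from the previous sketch.
_, df_words, _, _ = preprocessingForSDG(docs)  # ~100-word chunks
_, df_sents, _, _ = preprocessing(docs)        # 3-sentence chunks
print(len(df_words), "word-based chunks vs.", len(df_sents), "sentence-based chunks")
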
- '''processing with spacy - suitable for models such as tf-idf, word2vec'''
- def spacy_clean(alpha: str, use_nlp: bool = True) -> str:
-
-     """
-     Clean and tokenise a string using Spacy. Keeps only alphabetic characters, removes stopwords and
-     filters out all but proper nouns, nouns, verbs and adjectives.
-     Parameters
-     ----------
-     alpha : str
-         The input string.
-     use_nlp : bool, default True
-         Indicates whether Spacy needs to run the NLP pipeline itself. Enable this when using the
-         function on its own; set it to False if the input has already been processed by nlp.pipe.
-     Returns
-     -------
-     str
-         The lemmatised tokens joined into a single lowercased string.
-     Notes
-     -----
-     Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
-     Use together with nlp.pipe for batch processing.
-     """
-
-     nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
-
-     if use_nlp:
-         alpha = nlp(alpha)
-
-     beta = []
-     for tok in alpha:
-         if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
-             beta.append(tok.lemma_)
-
-     text = ' '.join(beta)
-     text = text.lower()
-     return text
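
Finally, a minimal sketch (an editor's illustration, not part of the deleted file) of spacy_clean(), both on a single string and in batch via spaCy's nlp.pipe, assuming en_core_web_sm is installed; the sample sentences are made up:

# Illustrative only; assumes the deleted module's imports are available.
print(spacy_clean("The panel discussed strengthened stakeholder engagement."))
# -> lemmas of the kept proper nouns, nouns, verbs and adjectives, lowercased

# Batch use: run nlp.pipe once, then pass the pre-processed Docs with use_nlp=False
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
paragraphs = ["First sample paragraph.", "Second sample paragraph."]
cleaned = [spacy_clean(doc, use_nlp=False) for doc in nlp.pipe(paragraphs)]

Note that spacy_clean() reloads the model on every call, so even the batch path pays that loading cost per paragraph.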