File size: 855 Bytes
825e978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import spacy
from tensorflow.keras.preprocessing.sequence import pad_sequences
nlp=spacy.load("en_core_web_lg")
MAX_LENGTH=10

def preprocess_texts(texts, *, maxlen=None, batch_size=1000):
    """Convert an iterable of raw text strings into a padded 3-D array of
    per-token word vectors.

    Each text is run through the module-level spaCy pipeline ``nlp``; tokens
    without a vector (OOV tokens) are dropped. The resulting per-sentence
    vector sequences are padded/truncated to a fixed length.

    Parameters
    ----------
    texts : iterable of str
        Raw input sentences.
    maxlen : int, optional (keyword-only)
        Fixed sequence length; defaults to the module constant ``MAX_LENGTH``.
    batch_size : int, optional (keyword-only)
        Batch size passed to ``nlp.pipe`` (batching is much faster than
        calling ``nlp`` per text).

    Returns
    -------
    numpy.ndarray
        Array of shape (len(texts), maxlen, vector_dim), dtype float32,
        zero-padded at the end ('post') and truncated at the end ('post').
    """
    if maxlen is None:
        maxlen = MAX_LENGTH

    # nlp.pipe processes texts in batches — much faster than one-at-a-time.
    # NOTE(review): a text with zero in-vocabulary tokens yields an empty
    # sequence, which pad_sequences turns into an all-zeros row.
    sentence_vectors = [
        [token.vector for token in doc if token.has_vector]
        for doc in nlp.pipe(texts, batch_size=batch_size)
    ]

    return pad_sequences(
        sentence_vectors,
        maxlen=maxlen,
        dtype='float32',
        padding='post',
        truncating='post',
    )

def wordEmbed(df, columns):
    """Replace text columns of *df* with their padded word-embedding arrays.

    For each column name in *columns*, a new column ``"processed" + col`` is
    added holding one (maxlen, vector_dim) float32 array per row, produced by
    :func:`preprocess_texts`. The original text columns are then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Mutated in place (columns added and dropped) and also returned.
    columns : iterable of str
        Names of the text columns to embed.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) DataFrame, for call-chaining convenience.
    """
    for col in columns:
        processed_array = preprocess_texts(df[col].tolist())
        # Iterate the array directly (one 2-D slice per row) instead of
        # indexing by range(len(df)) — simpler, and avoids silently
        # truncating if the lengths ever disagreed.
        df["processed" + col] = list(processed_array)
    df.drop(columns=columns, inplace=True)
    return df