Spaces:
Runtime error
Runtime error
File size: 1,852 Bytes
736b778 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import inspect
import os
import re

import pandas as pd
import torch
from datasets import Dataset
from huggingface_hub import login, notebook_login
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)
# Login to Hugging Face Hub (Make sure your Space is set to private if needed).
# This file runs as a plain script, where an interactive login prompt cannot be
# answered, so prefer a token supplied through the HF_TOKEN environment
# variable. Only fall back to the interactive flow when no token is configured
# (for example when the script is executed from a notebook).
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    notebook_login()
# Step 1: Load the luxury fashion catalogue (replace with the actual dataset file).
catalogue = pd.read_csv("luxury_apparel_data.csv")

# Retain only the columns of interest and discard rows missing any of them.
wanted_columns = ['brand', 'category', 'description', 'price']
df = catalogue[wanted_columns].dropna()


def _compose_query(row):
    """Render one catalogue row as a synthetic search query string."""
    return f"{row['brand']} {row['category']} under {row['price']} AED"


# Derive one synthetic search query per remaining row.
df['query'] = df.apply(_compose_query, axis=1)
# Step 2: Tokenization and weak labelling
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the checkpoint's own tag inventory so the label ids produced here line
# up with the classification head of the model loaded from `model_name`.
label2id = AutoConfig.from_pretrained(model_name).label2id
IGNORE_LABEL = -100  # label id that the token-classification loss skips


def tokenize_batch(batch):
    """Tokenize a batch of queries and attach per-token labels when possible.

    Args:
        batch: Mapping with a 'query' list of strings. When it also carries a
            'brand' list, each query is assumed to start with that brand
            (queries are built as "<brand> <category> under <price> AED").

    Returns:
        The tokenizer output without padding (the data collator pads each
        training batch). If 'brand' is present, a 'labels' entry is added:
        tokens overlapping the leading brand span get B-ORG / I-ORG, every
        other real token gets O, and special tokens get IGNORE_LABEL.

    NOTE(review): these are weak labels derived from how the query is
    constructed, not gold annotations; mapping brands to ORG is a choice made
    to stay compatible with the pretrained label set. Confirm it fits the goal.
    """
    # Offsets require a fast tokenizer, which AutoTokenizer returns by default.
    encoded = tokenizer(
        batch['query'],
        truncation=True,
        return_offsets_mapping=True,
    )
    # The offsets are only needed for labelling; they are not a model input.
    offset_mapping = encoded.pop('offset_mapping')
    if 'brand' not in batch:
        return encoded

    outside = label2id['O']
    begin = label2id['B-ORG']
    inside = label2id['I-ORG']

    labels = []
    for brand, offsets in zip(batch['brand'], offset_mapping):
        brand_end = len(str(brand))  # brand occupies characters [0, brand_end)
        row_labels = []
        seen_brand_token = False
        for start, end in offsets:
            if start == end:
                # Zero-width span: a special token such as [CLS] or [SEP].
                row_labels.append(IGNORE_LABEL)
            elif start < brand_end:
                row_labels.append(inside if seen_brand_token else begin)
                seen_brand_token = True
            else:
                row_labels.append(outside)
        labels.append(row_labels)

    encoded['labels'] = labels
    return encoded


# Convert dataframe into Hugging Face dataset. The brand column is carried
# along so tokenize_batch can label it; the raw text columns are dropped after
# mapping so only model inputs and labels remain.
hf_dataset = Dataset.from_pandas(df[['query', 'brand']], preserve_index=False)
hf_dataset = hf_dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=hf_dataset.column_names,
)
# Step 3: Fine-tune the Pretrained NER Model
model = AutoModelForTokenClassification.from_pretrained(model_name)

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Pass whichever name the
# installed version accepts so the script runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in _training_arg_names
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./luxury_ner_model",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=500,
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch, matching saves
)
# Newer transformers releases take the tokenizer through `processing_class`
# instead of `tokenizer`; pass whichever keyword this install supports.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class"
    if "processing_class" in _trainer_arg_names
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset,
    # NOTE(review): evaluation reuses the training data, so the reported
    # metrics say nothing about generalisation; consider a held-out split.
    eval_dataset=hf_dataset,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    **{_tokenizer_key: tokenizer},
)
trainer.train()
# Save the fine-tuned model and its tokenizer to the same Hugging Face Hub repo
# so the pair can be loaded together from one repo id.
hub_repo_id = "luxury-fashion-ner"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)