In [3]:
import pandas as pd 
import requests 
import datetime as dt
import re
import json
from tqdm import tqdm
import os

from openai import OpenAI

#### Calculate

In [12]:
# Resolve the API credentials: use the process environment when it already
# carries an OpenAI key, otherwise read them from the local JSON secrets file.
if "OPENAI_API_KEY" in os.environ:
    keys = os.environ
else:
    with open('secrets/keys.txt', 'r') as f:
        keys = json.load(f)

In [15]:
# save_path: directory where retrieve_classifications stores output_<name>.txt.
# content_path: directory where retrieve_classifications reads each article's
# "<item_id>_<title>.txt" file.
save_path, content_path = 'save', 'extract_sciences_po'


def _save_classifications(path, records):
    """Write `records` to `path` as JSON, replacing the file only once fully written."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w') as f:
        f.write(json.dumps(records))
    # os.replace swaps the file in a single step, so an interrupted run cannot
    # leave a half-written checkpoint that would break the next resume.
    os.replace(tmp_path, path)


def retrieve_classifications(name, mapping_prompt):
    """Classify every not-yet-processed article with the prompt registered under `name`.

    Articles are listed in 'extract_sciences_po.csv' (columns `item_id` and
    `titre` are read). The text of each article is loaded from
    `{content_path}/{item_id}_{normalised title}.txt` and sent, together with
    the system prompt, to the chat-completions API. Results accumulate in
    `{save_path}/output_{name}.txt` as a JSON list of records with the keys
    'item_id', 'categorie_principale' and 'categorie_secondaire'. The file is
    rewritten after each successful article, so an interrupted run resumes
    where it stopped.

    Parameters
    ----------
    name : str
        Key into `mapping_prompt`; also used to name the output file.
    mapping_prompt : dict
        Maps each name to a dict with at least 'client' ('deepseek' selects the
        DeepSeek endpoint, anything else selects OpenAI) and 'path_prompt'
        (path of the system-prompt file).

    Returns
    -------
    None
        Results are persisted to disk only.

    Notes
    -----
    A reply that cannot be parsed, or that lacks one of the two category keys,
    is reported on stdout and the article is skipped; it will be retried on the
    next run because it is not recorded in the output file. Failures to write
    the output file are not swallowed.
    """
    df = pd.read_csv('extract_sciences_po.csv')

    output_file = f"{save_path}/output_{name}.txt"
    # Without the directory every checkpoint write would fail, discarding the
    # result of an API call that has already been made.
    os.makedirs(save_path, exist_ok=True)

    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            out = json.load(f)
    else:
        out = []

    # Resume support: skip articles already present in the saved output.
    processed_ids = {record['item_id'] for record in out}
    df_to_process = df.loc[~df.item_id.isin(processed_ids)]

    if mapping_prompt[name]['client'] == 'deepseek':
        client = OpenAI(api_key=keys["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com")
        model = "deepseek-chat"
    else:
        client = OpenAI(api_key=keys['OPENAI_API_KEY'])
        model = "gpt-4o"

    with open(mapping_prompt[name]['path_prompt'], 'r') as f:
        prompt = f.read()

    with tqdm(total=df_to_process.shape[0]) as pbar:
        for _, row in df_to_process.iterrows():
            # Rebuild the article's file name from its id and normalised title.
            titre_brut = f"{row.item_id}_" + row.titre.lower().strip().replace("\xa0", ' ').replace(' : ', ':').replace(' ', '_').replace('/', '')

            with open(f'{content_path}/{titre_brut}.txt', 'r') as f:
                text = f.read()

            messages = [{"role": "system", "content": prompt},
                        {"role": "user", "content": text}]

            response = client.chat.completions.create(
                model=model,
                messages=messages,
                response_format={
                    'type': 'json_object'
                }
            )

            try:
                cat_json = json.loads(response.choices[0].message.content)
                record = {
                    'item_id': row.item_id,
                    'categorie_principale': cat_json['categorie_principale'],
                    'categorie_secondaire': cat_json['categorie_secondaire'],
                }
            except Exception as e:
                # Best effort: a malformed reply must not abort the whole batch,
                # but the cause is reported so it can be diagnosed.
                print(f'Error with article {row.item_id}: {e!r}')
            else:
                out.append(record)
                _save_classifications(output_file, out)

            pbar.update(1)



In [20]:
# Load the prompt registry (a JSON object keyed by prompt name) and run the
# classification once per registered prompt, printing each name as it starts.
with open('mapping_prompts.txt', 'r') as f:
    mapping = json.load(f)

for name in mapping:
    print(name)
    retrieve_classifications(name, mapping)

sans_titre_1


0it [00:00, ?it/s]

0it [00:00, ?it/s]


favarel_et_al


 21%|██▏       | 41/191 [05:28<16:54,  6.76s/it]

#### Ajouter images