{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import requests \n", "import datetime as dt\n", "import re\n", "import json\n", "from tqdm import tqdm\n", "import os\n", "\n", "from openai import OpenAI" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### SciencesPoC extract" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('extract_sciences_po.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0item_iddate_publishedurltitredescriptiontypeauthorsectionsubheadpremiumimage_url
00I4OEKQ6MHRBP3LQVVYDDXW6T6U2024-10-07 15:33:34https://www.liberation.fr/societe/familles/a-m...Centre de loisirs à Marseille : sept plaintes ...Une enquête pour violences sur mineurs et diff...articleCaroline DelabroyFamillesJusticeTruehttps://www.liberation.fr/resizer/E6tv2-_N7qhg...
11LVA4MZBQOBECNPZD323NV6O7K42024-10-09 14:53:55https://www.liberation.fr/sports/jeux-olympiqu...JO de Paris 2024 : pour la santé mentale des a...Avec la libération de la parole sur la santé m...articleMarie ThimonnierJeux olympiques et paralympiquesEcritureTruehttps://www.liberation.fr/resizer/uAqwfrqGpk93...
224FAEHUUZ5ZFAJKLFEV2LT5CBAQ2024-10-10 15:49:02https://www.liberation.fr/international/afriqu...Au Burkina Faso, la conscription pour «punir d...Au moins sept magistrats ont été réquisitionné...articleAgnès FaivreAfriqueRépressionFalsehttps://www.liberation.fr/resizer/_A-QaSGPPcHu...
334S4G6BKFRNER3LB22CLPAEWWKY2024-10-11 15:28:25https://www.liberation.fr/economie/social/ferm...Fermeture de l’usine automobile MA France : «O...Le 13 mai, l’entreprise sous-traitante de Stel...articleEléna RoneySocialLutte socialeTruehttps://www.liberation.fr/resizer/6hlzuHlwTJFQ...
44ZAFHRNAHJVC6THXRSBMCB4A24I2024-10-09 10:55:05https://www.liberation.fr/lifestyle/design/pre...Près de Rouen, jardiner dans «un jeu subtil av...Imaginé par Patrick et Sylvie Quibel il y a tr...article['Florian Bardou, envoyé spécial à Rouen', 'ph...DesignReportageTruehttps://www.liberation.fr/resizer/goppJdUChU09...
.......................................
511516AOT254SA2VDIDNF4YW7XPLWJ5E2024-10-12 15:46:00https://www.liberation.fr/culture/musique/char...Charlie Dalin : «Mon oncle était le chanteur d...Charlie Dalin, skipper qui s’apprête à embarqu...articleAlexis BernierMusiqueCasque t'écoutes?Falsehttps://www.liberation.fr/resizer/Oum9ZxntR4pt...
512517GUOUKHLPFZBK7GVR5XU7MXVD5A2024-10-12 16:32:37https://www.liberation.fr/societe/droits-des-f...Violences sexuelles : à la recherche de «co-vi...Avec un système de «matchs», le site Coabuse r...articleHélène CoutardDroits des femmes«Match»Truehttps://www.liberation.fr/resizer/F5PeR7sIedRA...
5135185HT6C24ZBVDOBFXPLA4HNVOTT42024-10-12 16:33:57https://www.liberation.fr/environnement/agricu...«Plus ils habitent près des champs, plus les t...Des traces de pesticides, dont certains interd...article[' LIBERATION', ' AFP']AgricultureContaminationFalsehttps://www.liberation.fr/resizer/VzMm-X_AuAhQ...
514519VLV6RSQ6U5E6XJ6AIRV26AEKO42024-10-12 16:43:33https://www.liberation.fr/economie/annonces-de...Annonces de la Chine contre la crise économiqu...Face aux problèmes structurels de l’économie, ...articleArnaud VaulerinEconomieAnalyseTruehttps://www.liberation.fr/resizer/ZKreJUwCgQPk...
515520FVCJ6DQ5HVDNDGC4F6F276NVFM2024-10-12 16:53:20https://www.liberation.fr/economie/medias/budg...Budget 2025 : les radios associatives dénoncen...Les radios associatives se sont alarmées vendr...article[' LIBERATION', ' AFP']MédiasMauvaises ondesFalsehttps://www.liberation.fr/resizer/KVsBqITY61oN...
\n", "

516 rows × 12 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 item_id date_published \\\n", "0 0 I4OEKQ6MHRBP3LQVVYDDXW6T6U 2024-10-07 15:33:34 \n", "1 1 LVA4MZBQOBECNPZD323NV6O7K4 2024-10-09 14:53:55 \n", "2 2 4FAEHUUZ5ZFAJKLFEV2LT5CBAQ 2024-10-10 15:49:02 \n", "3 3 4S4G6BKFRNER3LB22CLPAEWWKY 2024-10-11 15:28:25 \n", "4 4 ZAFHRNAHJVC6THXRSBMCB4A24I 2024-10-09 10:55:05 \n", ".. ... ... ... \n", "511 516 AOT254SA2VDIDNF4YW7XPLWJ5E 2024-10-12 15:46:00 \n", "512 517 GUOUKHLPFZBK7GVR5XU7MXVD5A 2024-10-12 16:32:37 \n", "513 518 5HT6C24ZBVDOBFXPLA4HNVOTT4 2024-10-12 16:33:57 \n", "514 519 VLV6RSQ6U5E6XJ6AIRV26AEKO4 2024-10-12 16:43:33 \n", "515 520 FVCJ6DQ5HVDNDGC4F6F276NVFM 2024-10-12 16:53:20 \n", "\n", " url \\\n", "0 https://www.liberation.fr/societe/familles/a-m... \n", "1 https://www.liberation.fr/sports/jeux-olympiqu... \n", "2 https://www.liberation.fr/international/afriqu... \n", "3 https://www.liberation.fr/economie/social/ferm... \n", "4 https://www.liberation.fr/lifestyle/design/pre... \n", ".. ... \n", "511 https://www.liberation.fr/culture/musique/char... \n", "512 https://www.liberation.fr/societe/droits-des-f... \n", "513 https://www.liberation.fr/environnement/agricu... \n", "514 https://www.liberation.fr/economie/annonces-de... \n", "515 https://www.liberation.fr/economie/medias/budg... \n", "\n", " titre \\\n", "0 Centre de loisirs à Marseille : sept plaintes ... \n", "1 JO de Paris 2024 : pour la santé mentale des a... \n", "2 Au Burkina Faso, la conscription pour «punir d... \n", "3 Fermeture de l’usine automobile MA France : «O... \n", "4 Près de Rouen, jardiner dans «un jeu subtil av... \n", ".. ... \n", "511 Charlie Dalin : «Mon oncle était le chanteur d... \n", "512 Violences sexuelles : à la recherche de «co-vi... \n", "513 «Plus ils habitent près des champs, plus les t... \n", "514 Annonces de la Chine contre la crise économiqu... \n", "515 Budget 2025 : les radios associatives dénoncen... \n", "\n", " description type \\\n", "0 Une enquête pour violences sur mineurs et diff... article \n", "1 Avec la libération de la parole sur la santé m... article \n", "2 Au moins sept magistrats ont été réquisitionné... article \n", "3 Le 13 mai, l’entreprise sous-traitante de Stel... article \n", "4 Imaginé par Patrick et Sylvie Quibel il y a tr... article \n", ".. ... ... \n", "511 Charlie Dalin, skipper qui s’apprête à embarqu... article \n", "512 Avec un système de «matchs», le site Coabuse r... article \n", "513 Des traces de pesticides, dont certains interd... article \n", "514 Face aux problèmes structurels de l’économie, ... article \n", "515 Les radios associatives se sont alarmées vendr... article \n", "\n", " author \\\n", "0 Caroline Delabroy \n", "1 Marie Thimonnier \n", "2 Agnès Faivre \n", "3 Eléna Roney \n", "4 ['Florian Bardou, envoyé spécial à Rouen', 'ph... \n", ".. ... \n", "511 Alexis Bernier \n", "512 Hélène Coutard \n", "513 [' LIBERATION', ' AFP'] \n", "514 Arnaud Vaulerin \n", "515 [' LIBERATION', ' AFP'] \n", "\n", " section subhead premium \\\n", "0 Familles Justice True \n", "1 Jeux olympiques et paralympiques Ecriture True \n", "2 Afrique Répression False \n", "3 Social Lutte sociale True \n", "4 Design Reportage True \n", ".. ... ... ... \n", "511 Musique Casque t'écoutes? False \n", "512 Droits des femmes «Match» True \n", "513 Agriculture Contamination False \n", "514 Economie Analyse True \n", "515 Médias Mauvaises ondes False \n", "\n", " image_url \n", "0 https://www.liberation.fr/resizer/E6tv2-_N7qhg... \n", "1 https://www.liberation.fr/resizer/uAqwfrqGpk93... \n", "2 https://www.liberation.fr/resizer/_A-QaSGPPcHu... \n", "3 https://www.liberation.fr/resizer/6hlzuHlwTJFQ... \n", "4 https://www.liberation.fr/resizer/goppJdUChU09... \n", ".. ... \n", "511 https://www.liberation.fr/resizer/Oum9ZxntR4pt... \n", "512 https://www.liberation.fr/resizer/F5PeR7sIedRA... \n", "513 https://www.liberation.fr/resizer/VzMm-X_AuAhQ... \n", "514 https://www.liberation.fr/resizer/ZKreJUwCgQPk... \n", "515 https://www.liberation.fr/resizer/KVsBqITY61oN... \n", "\n", "[516 rows x 12 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Connect to Deepseek endpoint" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "save_path = 'save'\n", "content_path = 'extract_sciences_po'\n", "\n", "\n", "def retrieve_classifications(name, mapping_prompt):\n", "\n", "\n", " if os.path.exists(f\"{save_path}/output_{name}.txt\"):\n", " with open(f\"{save_path}/output_{name}.txt\", 'r') as f : \n", " out_dict = json.loads(f.read())\n", " out_df = pd.DataFrame.from_dict(out_dict)\n", " out = out_dict\n", " else : \n", " out_df = pd.DataFrame(columns = ['item_id', 'categorie_principale', 'categorie_secondaire'])\n", " out = []\n", "\n", " df_to_process = df.loc[~df.item_id.isin(out_df.item_id)]\n", "\n", " if mapping_prompt[name]['client']=='deepseek':\n", " client = OpenAI(api_key=os.environ[\"DEEPSEEK_API_KEY\"], base_url=\"https://api.deepseek.com\")\n", " else:\n", " client=OpenAI()\n", "\n", " df_to_process = df.loc[~df.item_id.isin(out_df.item_id)]\n", "\n", "\n", " with open(mapping_prompt[name]['path_prompt'], 'r') as f:\n", " prompt = f.read()\n", "\n", " with tqdm(total=df_to_process.shape[0]) as pbar:\n", " for i, row in df_to_process.iterrows():\n", " titre_brut = f\"{row.item_id}_\"+row.titre.lower().strip().replace(f\"\\xa0\", ' ').replace(' : ', ':').replace(' ', '_').replace('/', '')\n", " \n", " with open(f'{content_path}/{titre_brut}.txt', 'r') as f:\n", " text = f.read()\n", "\n", " messages = [{\"role\": \"system\", \"content\": prompt},\n", " {\"role\": \"user\", \"content\": text}]\n", "\n", " response = client.chat.completions.create(\n", " model=\"deepseek-chat\",\n", " messages=messages,\n", " response_format={\n", " 'type': 'json_object'\n", " }\n", " )\n", " try : \n", " cat_json = json.loads(response.choices[0].message.content)\n", "\n", " out.append({\n", " 'item_id':row.item_id, \n", " 'categorie_principale': cat_json['categorie_principale'],\n", " 'categorie_secondaire': cat_json['categorie_secondaire'],\n", " })\n", " \n", " with open(f'{save_path}/output_{name}.txt', 'w+') as f : \n", " f.write(json.dumps(out))\n", "\n", " except Exception as e : \n", " print(f'Error with article {row.item_id}')\n", " pass\n", "\n", " \n", " pbar.update(1)\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'DEEPSEEK_API_KEY'", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mKeyError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[33m'\u001b[39m\u001b[33mmapping_prompts.txt\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mr\u001b[39m\u001b[33m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f : \n\u001b[32m 2\u001b[39m mapping = json.loads(f.read())\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43mretrieve_classifications\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43msans_titre_1\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapping\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 20\u001b[39m, in \u001b[36mretrieve_classifications\u001b[39m\u001b[34m(name, mapping_prompt)\u001b[39m\n\u001b[32m 17\u001b[39m df_to_process = df.loc[~df.item_id.isin(out_df.item_id)]\n\u001b[32m 19\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m mapping_prompt[name][\u001b[33m'\u001b[39m\u001b[33mclient\u001b[39m\u001b[33m'\u001b[39m]==\u001b[33m'\u001b[39m\u001b[33mdeepseek\u001b[39m\u001b[33m'\u001b[39m:\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m client = OpenAI(api_key=\u001b[43mos\u001b[49m\u001b[43m.\u001b[49m\u001b[43menviron\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mDEEPSEEK_API_KEY\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m, base_url=\u001b[33m\"\u001b[39m\u001b[33mhttps://api.deepseek.com\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 21\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 22\u001b[39m client=OpenAI()\n", "\u001b[36mFile \u001b[39m\u001b[32m:679\u001b[39m, in \u001b[36m__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n", "\u001b[31mKeyError\u001b[39m: 'DEEPSEEK_API_KEY'" ] } ], "source": [ "with open('mapping_prompts.txt', 'r') as f : \n", " mapping = json.loads(f.read())\n", "retrieve_classifications('sans_titre_1', mapping)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ajouter images" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 2 }