"""Presidio recognizer backed by the Azure AI Language (Text Analytics) PII API."""

import logging
import os
from typing import List, Optional

import dotenv
from azure.ai.textanalytics import PiiEntityCategory, TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
from presidio_analyzer import AnalysisExplanation, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts

# Logger shared with the rest of the Presidio Streamlit demo application.
logger = logging.getLogger("presidio-streamlit")


class AzureAIServiceWrapper(EntityRecognizer):
    """
    Presidio ``EntityRecognizer`` that delegates PII detection to Azure.

    Each call to :meth:`analyze` sends the text to the Azure Text Analytics
    ``recognize_pii_entities`` operation and converts the returned entities
    into Presidio ``RecognizerResult`` objects.
    """

    # Every PII category value exposed by the Azure SDK enum; used as the
    # default set of supported entities.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """
        Create the recognizer and, if needed, its Azure client.

        :param supported_entities: Azure PII categories to report. When empty
            or None, all values in ``TA_SUPPORTED_ENTITIES`` are used.
        :param supported_language: Language code passed to Azure on every
            request (defaults to ``"en"``).
        :param ta_client: An already constructed ``TextAnalyticsClient``.
            When None, a client is built from ``ta_key`` and ``ta_endpoint``.
        :param ta_key: Azure Text Analytics API key.
        :param ta_endpoint: Azure Text Analytics endpoint URL.
        """
        if not supported_entities:
            supported_entities = self.TA_SUPPORTED_ENTITIES

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure AI Language PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        # Only build a client when the caller did not inject one; the key and
        # endpoint are handed to the Azure SDK as-is.
        if ta_client is None:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str):
        """
        Build a ``TextAnalyticsClient`` from an API key and an endpoint.

        :param key: Azure Text Analytics API key.
        :param endpoint: Azure Text Analytics endpoint URL.
        :return: A ``TextAnalyticsClient`` using key-based credentials.
        """
        ta_credential = AzureKeyCredential(key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, credential=ta_credential
        )
        return text_analytics_client

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Detect PII in ``text`` using Azure and return Presidio results.

        :param text: The text to analyze.
        :param entities: Accepted for compatibility with the recognizer
            interface. It is not used for filtering here; results are
            filtered against ``self.supported_entities`` only.
        :param nlp_artifacts: Not used by this recognizer.
        :return: One ``RecognizerResult`` per Azure entity whose category is
            in ``self.supported_entities``.
        """
        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Surface per-document failures instead of dropping them
                # silently; the remaining documents are still processed.
                logger.warning(
                    "Azure Text Analytics returned an error for document %s: %s",
                    getattr(doc, "id", None),
                    getattr(doc, "error", None),
                )
                continue

            for entity in doc.entities:
                if entity.category not in self.supported_entities:
                    continue

                analysis_explanation = self._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=entity.category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=entity.category,
                        start=entity.offset,
                        # End position is derived from the matched text length.
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """
        Create the explanation attached to a detected entity.

        :param original_score: Confidence score reported by Azure.
        :param entity_type: The Azure PII category of the entity.
        :return: An ``AnalysisExplanation`` naming this recognizer class.
        """
        explanation = AnalysisExplanation(
            # ``__class__.__name__`` on the class object would yield the
            # metaclass name, so read the class's own ``__name__``.
            recognizer=AzureAIServiceWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )
        return explanation

    def load(self) -> None:
        """No-op: this recognizer has nothing to load."""
        pass


if __name__ == "__main__":
    # Demo: build an analyzer through the local ``presidio_helpers`` module
    # and run it once over a sample text containing several kinds of PII.
    import presidio_helpers

    # Read TA_KEY / TA_ENDPOINT from a .env file, if one is present.
    dotenv.load_dotenv()

    text = (
        " Here are a few example sentences we currently support: "
        "Hello, my name is David Johnson and I live in Maine. "
        "My credit card number is 4095-2609-9393-4932 and my crypto wallet id "
        "is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ. "
        "On September 18 I visited microsoft.com and sent an email to "
        "test@presidio.site, from the IP 192.168.0.1. "
        "My passport: 191280342 and my phone number: (212) 555-1234. "
        "This is a valid International Bank Account Number: "
        "IL150120690000003111111 . "
        "Can you please check the status on bank account 954567876544? "
        "Kate's social security number is 078-05-1126. "
        "Her driver license? it is 1234567A. "
    )

    analyzer = presidio_helpers.analyzer_engine(
        model_path="Azure Text Analytics PII",
        ta_key=os.environ["TA_KEY"],
        ta_endpoint=os.environ["TA_ENDPOINT"],
    )
    analyzer.analyze(text=text, language="en")