# presidio-de-identify / azure_ai_language_wrapper.py
# awacke1: Update azure_ai_language_wrapper.py (ea3f5eb, verified)
import os
from typing import List, Optional
import logging
import dotenv
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
from presidio_analyzer.nlp_engine import NlpArtifacts
# Module-level logger, registered under the "presidio-streamlit" logger name.
logger = logging.getLogger("presidio-streamlit")
class AzureAIServiceWrapper(EntityRecognizer):
    """
    Presidio ``EntityRecognizer`` backed by Azure Text Analytics PII detection.

    Each call to :meth:`analyze` sends the text to the client's
    ``recognize_pii_entities`` operation and converts the entities it returns
    into Presidio ``RecognizerResult`` objects.
    """

    # Imported in the class body so ``PiiEntityCategory`` stays reachable as a
    # class attribute; taken from the public package rather than the private
    # ``_models`` module.
    from azure.ai.textanalytics import PiiEntityCategory

    # Every PII category value exposed by the Azure SDK enum.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """
        Create the recognizer and, if needed, its Text Analytics client.

        :param supported_entities: Entity categories this recognizer reports.
            Defaults to every value in ``TA_SUPPORTED_ENTITIES`` when empty
            or ``None``.
        :param supported_language: Language code passed to Azure on each call.
        :param ta_client: Ready-to-use ``TextAnalyticsClient``. When omitted,
            one is built from ``ta_key`` and ``ta_endpoint``.
        :param ta_key: Azure key used to build a client when ``ta_client`` is
            not supplied.
        :param ta_endpoint: Azure endpoint used to build a client when
            ``ta_client`` is not supplied.
        """
        if not supported_entities:
            # Copy so the instance never aliases the class-level default list.
            supported_entities = list(self.TA_SUPPORTED_ENTITIES)

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure AI Language PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        if not ta_client:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str):
        """
        Build a ``TextAnalyticsClient`` from a key and an endpoint.

        :param key: Azure key wrapped in an ``AzureKeyCredential``.
        :param endpoint: Endpoint passed to the client constructor.
        :return: The newly created ``TextAnalyticsClient``.
        """
        ta_credential = AzureKeyCredential(key)
        return TextAnalyticsClient(endpoint=endpoint, credential=ta_credential)

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Detect PII in ``text`` using Azure and return Presidio results.

        :param text: The text to analyze.
        :param entities: Entity categories to report. When empty or ``None``,
            every category in ``self.supported_entities`` is reported.
        :param nlp_artifacts: Accepted for interface compatibility; not used.
        :return: One ``RecognizerResult`` per reported entity, carrying the
            category, the start/end offsets and the confidence score.
        """
        # Build the allow-list once: categories this recognizer supports,
        # narrowed to the caller's request when one was given.
        allowed = set(self.supported_entities)
        if entities:
            allowed &= set(entities)

        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Error documents carry no entities; record them instead of
                # dropping them silently, then move on.
                logger.warning(
                    "Azure Text Analytics returned an error document: %s",
                    doc.error,
                )
                continue

            for entity in doc.entities:
                if entity.category not in allowed:
                    continue

                analysis_explanation = self._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=entity.category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=entity.category,
                        start=entity.offset,
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """
        Build the explanation attached to a detected entity.

        :param original_score: Confidence score reported for the entity.
        :param entity_type: Category of the detected entity.
        :return: An ``AnalysisExplanation`` naming this recognizer, the score
            and a short textual reason.
        """
        return AnalysisExplanation(
            # ``__class__.__name__`` on a class object yields the metaclass
            # name ("type"); ``__name__`` is the recognizer's own name.
            recognizer=AzureAIServiceWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )

    def load(self) -> None:
        """No-op: the client is created in ``__init__``, nothing is loaded here."""
        pass
if __name__ == "__main__":
    # Demo entry point: run the Azure-backed analyzer over a sample text and
    # print every finding. Needs TA_KEY and TA_ENDPOINT in the environment
    # (a .env file is loaded first).
    import presidio_helpers

    dotenv.load_dotenv()

    # Sample text containing several kinds of PII.
    text = """
Here are a few example sentences we currently support:
Hello, my name is David Johnson and I live in Maine.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
My passport: 191280342 and my phone number: (212) 555-1234.
This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
"""

    analyzer = presidio_helpers.analyzer_engine(
        model_path="Azure Text Analytics PII",
        ta_key=os.environ["TA_KEY"],
        ta_endpoint=os.environ["TA_ENDPOINT"],
    )

    # Print the findings so the demo produces visible output.
    results = analyzer.analyze(text=text, language="en")
    for result in results:
        print(result)