Spaces:
Sleeping
Sleeping
Update azure_ai_language_wrapper.py
Browse files- azure_ai_language_wrapper.py +77 -7
azure_ai_language_wrapper.py
CHANGED
@@ -8,12 +8,19 @@ from azure.core.credentials import AzureKeyCredential
|
|
8 |
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
|
9 |
from presidio_analyzer.nlp_engine import NlpArtifacts
|
10 |
|
|
|
11 |
logger = logging.getLogger("presidio-streamlit")
|
12 |
|
13 |
|
14 |
class AzureAIServiceWrapper(EntityRecognizer):
|
|
|
|
|
|
|
|
|
|
|
15 |
from azure.ai.textanalytics._models import PiiEntityCategory
|
16 |
|
|
|
17 |
TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]
|
18 |
|
19 |
def __init__(
|
@@ -25,30 +32,45 @@ class AzureAIServiceWrapper(EntityRecognizer):
|
|
25 |
ta_endpoint: Optional[str] = None,
|
26 |
):
|
27 |
"""
|
28 |
-
|
29 |
-
:param
|
30 |
-
:param
|
31 |
-
:param
|
|
|
|
|
|
|
|
|
32 |
"""
|
33 |
-
|
34 |
if not supported_entities:
|
35 |
supported_entities = self.TA_SUPPORTED_ENTITIES
|
36 |
|
|
|
37 |
super().__init__(
|
38 |
supported_entities=supported_entities,
|
39 |
supported_language=supported_language,
|
40 |
name="Azure AI Language PII",
|
41 |
)
|
42 |
|
|
|
43 |
self.ta_key = ta_key
|
44 |
self.ta_endpoint = ta_endpoint
|
45 |
|
|
|
46 |
if not ta_client:
|
47 |
ta_client = self.__authenticate_client(ta_key, ta_endpoint)
|
48 |
self.ta_client = ta_client
|
49 |
|
50 |
@staticmethod
|
51 |
def __authenticate_client(key: str, endpoint: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
ta_credential = AzureKeyCredential(key)
|
53 |
text_analytics_client = TextAnalyticsClient(
|
54 |
endpoint=endpoint, credential=ta_credential
|
@@ -58,21 +80,43 @@ class AzureAIServiceWrapper(EntityRecognizer):
|
|
58 |
def analyze(
|
59 |
self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
|
60 |
) -> List[RecognizerResult]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
if not entities:
|
62 |
entities = []
|
|
|
|
|
63 |
response = self.ta_client.recognize_pii_entities(
|
64 |
[text], language=self.supported_language
|
65 |
)
|
|
|
|
|
66 |
results = [doc for doc in response if not doc.is_error]
|
67 |
recognizer_results = []
|
|
|
|
|
68 |
for res in results:
|
69 |
for entity in res.entities:
|
|
|
70 |
if entity.category not in self.supported_entities:
|
71 |
continue
|
72 |
-
|
|
|
|
|
73 |
original_score=entity.confidence_score,
|
74 |
entity_type=entity.category,
|
75 |
)
|
|
|
|
|
76 |
recognizer_results.append(
|
77 |
RecognizerResult(
|
78 |
entity_type=entity.category,
|
@@ -83,12 +127,21 @@ class AzureAIServiceWrapper(EntityRecognizer):
|
|
83 |
)
|
84 |
)
|
85 |
|
|
|
86 |
return recognizer_results
|
87 |
|
88 |
@staticmethod
|
89 |
def _build_explanation(
|
90 |
original_score: float, entity_type: str
|
91 |
) -> AnalysisExplanation:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
explanation = AnalysisExplanation(
|
93 |
recognizer=AzureAIServiceWrapper.__class__.__name__,
|
94 |
original_score=original_score,
|
@@ -97,13 +150,26 @@ class AzureAIServiceWrapper(EntityRecognizer):
|
|
97 |
return explanation
|
98 |
|
99 |
def load(self) -> None:
|
|
|
|
|
|
|
|
|
|
|
100 |
pass
|
101 |
|
102 |
|
103 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
104 |
import presidio_helpers
|
105 |
|
|
|
106 |
dotenv.load_dotenv()
|
|
|
|
|
107 |
text = """
|
108 |
Here are a few example sentences we currently support:
|
109 |
|
@@ -118,9 +184,13 @@ if __name__ == "__main__":
|
|
118 |
|
119 |
Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
|
120 |
"""
|
|
|
|
|
121 |
analyzer = presidio_helpers.analyzer_engine(
|
122 |
model_path="Azure Text Analytics PII",
|
123 |
ta_key=os.environ["TA_KEY"],
|
124 |
ta_endpoint=os.environ["TA_ENDPOINT"],
|
125 |
)
|
126 |
-
|
|
|
|
|
|
8 |
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
|
9 |
from presidio_analyzer.nlp_engine import NlpArtifacts
|
10 |
|
11 |
+
# 📜 Our trusty scribe, logging every move of our privacy-protecting saga
|
12 |
logger = logging.getLogger("presidio-streamlit")
|
13 |
|
14 |
|
15 |
class AzureAIServiceWrapper(EntityRecognizer):
|
16 |
+
"""
|
17 |
+
🦸♂️ The Azure AI Service Wrapper: A superhero class that wields Azure's Text Analytics
|
18 |
+
to zap PII/PHI from text like a privacy avenger! Built to integrate with Presidio's
|
19 |
+
analyzer, it’s ready to team up with your SFT app for world-saving AI missions. 💪
|
20 |
+
"""
|
21 |
from azure.ai.textanalytics._models import PiiEntityCategory
|
22 |
|
23 |
+
# 📋 Our hit list of PII entities Azure can tackle—SSNs, credit cards, you name it!
|
24 |
TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]
|
25 |
|
26 |
def __init__(
|
|
|
32 |
ta_endpoint: Optional[str] = None,
|
33 |
):
|
34 |
"""
|
35 |
+
🎬 Lights, camera, action! Initializes our Azure-powered PII slayer.
|
36 |
+
:param supported_entities: PII types to hunt (defaults to ALL the baddies).
|
37 |
+
:param supported_language: Language to analyze (English by default, mate! 🇬🇧).
|
38 |
+
:param ta_client: Pre-authenticated Azure client (or we’ll forge one ourselves).
|
39 |
+
:param ta_key: Secret key to unlock Azure’s vault of NLP magic.
|
40 |
+
:param ta_endpoint: The Azure portal where the PII-zapping happens.
|
41 |
+
*Clever quip*: Think of this as assembling Iron Man’s suit—credentials, endpoints,
|
42 |
+
and entity lists snap together for a privacy-protecting masterpiece! 😼
|
43 |
"""
|
44 |
+
# 🛡️ Default to all supported entities if none specified—maximum coverage!
|
45 |
if not supported_entities:
|
46 |
supported_entities = self.TA_SUPPORTED_ENTITIES
|
47 |
|
48 |
+
# 🧬 Inherit Presidio’s EntityRecognizer powers, branding ourselves as Azure’s finest
|
49 |
super().__init__(
|
50 |
supported_entities=supported_entities,
|
51 |
supported_language=supported_language,
|
52 |
name="Azure AI Language PII",
|
53 |
)
|
54 |
|
55 |
+
# 🔑 Stash the key and endpoint for Azure’s secret handshake
|
56 |
self.ta_key = ta_key
|
57 |
self.ta_endpoint = ta_endpoint
|
58 |
|
59 |
+
# 🤝 Authenticate if no client’s provided—time to summon Azure’s NLP beast!
|
60 |
if not ta_client:
|
61 |
ta_client = self.__authenticate_client(ta_key, ta_endpoint)
|
62 |
self.ta_client = ta_client
|
63 |
|
64 |
@staticmethod
|
65 |
def __authenticate_client(key: str, endpoint: str):
|
66 |
+
"""
|
67 |
+
🔓 Unlocks Azure’s treasure chest with a key and endpoint.
|
68 |
+
:param key: The magic password to Azure’s NLP kingdom.
|
69 |
+
:param endpoint: The gate to Azure’s Text Analytics realm.
|
70 |
+
:return: A shiny TextAnalyticsClient ready to rumble!
|
71 |
+
*Fun fact*: This is like getting VIP access to a privacy party—credentials
|
72 |
+
checked, and we’re in! 🎉
|
73 |
+
"""
|
74 |
ta_credential = AzureKeyCredential(key)
|
75 |
text_analytics_client = TextAnalyticsClient(
|
76 |
endpoint=endpoint, credential=ta_credential
|
|
|
80 |
def analyze(
|
81 |
self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
|
82 |
) -> List[RecognizerResult]:
|
83 |
+
"""
|
84 |
+
🕵️♀️ The main event: Scans text for PII like a hawk and returns the detected entities (no redaction happens here).
|
85 |
+
:param text: The text to scrub clean of sensitive data.
|
86 |
+
:param entities: Specific PII types requested by the caller; detected entities are filtered against self.supported_entities.
|
87 |
+
:param nlp_artifacts: Optional Presidio NLP goodies (we’re cool without ‘em).
|
88 |
+
:return: A list of RecognizerResult with PII locations and confidence scores.
|
89 |
+
*Superpower alert*: This method’s like X-ray vision for sensitive data—SSNs,
|
90 |
+
credit cards, and emails don’t stand a chance! 🦅
|
91 |
+
*SFT tease*: Imagine pairing this with your fine-tuned model for next-level AI! 😏
|
92 |
+
"""
|
93 |
+
# 🗳️ Default to empty entity list if none provided—flexibility is our jam
|
94 |
if not entities:
|
95 |
entities = []
|
96 |
+
|
97 |
+
# 🚀 Fire up Azure’s PII recognizer with the text and language
|
98 |
response = self.ta_client.recognize_pii_entities(
|
99 |
[text], language=self.supported_language
|
100 |
)
|
101 |
+
|
102 |
+
# ✅ Filter out any errors—only the good stuff makes the cut
|
103 |
results = [doc for doc in response if not doc.is_error]
|
104 |
recognizer_results = []
|
105 |
+
|
106 |
+
# 🔍 Loop through results, cherry-picking valid entities
|
107 |
for res in results:
|
108 |
for entity in res.entities:
|
109 |
+
# Skip unsupported entities—we’re picky like that
|
110 |
if entity.category not in self.supported_entities:
|
111 |
continue
|
112 |
+
|
113 |
+
# 📝 Craft a fancy explanation for why we flagged this PII
|
114 |
+
analysis_explanation = self._build_explanation(
|
115 |
original_score=entity.confidence_score,
|
116 |
entity_type=entity.category,
|
117 |
)
|
118 |
+
|
119 |
+
# 🎯 Log the hit: entity type, position, and confidence score
|
120 |
recognizer_results.append(
|
121 |
RecognizerResult(
|
122 |
entity_type=entity.category,
|
|
|
127 |
)
|
128 |
)
|
129 |
|
130 |
+
# 🏆 Return the haul of PII findings—mission accomplished!
|
131 |
return recognizer_results
|
132 |
|
133 |
@staticmethod
|
134 |
def _build_explanation(
|
135 |
original_score: float, entity_type: str
|
136 |
) -> AnalysisExplanation:
|
137 |
+
"""
|
138 |
+
📜 Writes a love letter explaining why we flagged a PII entity.
|
139 |
+
:param original_score: Confidence score from Azure’s NLP oracle.
|
140 |
+
:param entity_type: The type of PII we nabbed (e.g., SSN, PHONE_NUMBER).
|
141 |
+
:return: An AnalysisExplanation object with all the juicy details.
|
142 |
+
*Witty note*: This is like leaving a Post-it note saying, “Caught ya, sneaky
|
143 |
+
credit card number!” 😜
|
144 |
+
"""
|
145 |
explanation = AnalysisExplanation(
|
146 |
recognizer=AzureAIServiceWrapper.__class__.__name__,
|
147 |
original_score=original_score,
|
|
|
150 |
return explanation
|
151 |
|
152 |
def load(self) -> None:
|
153 |
+
"""
|
154 |
+
🛠️ Placeholder for loading resources—Azure’s already warmed up, so we chill.
|
155 |
+
*Cheeky remark*: Like a superhero on standby, we’re always ready to leap
|
156 |
+
into action. No prep needed! 😎
|
157 |
+
"""
|
158 |
pass
|
159 |
|
160 |
|
161 |
if __name__ == "__main__":
|
162 |
+
"""
|
163 |
+
🎮 Demo mode: Test-drive our PII zapper with sample text!
|
164 |
+
*Hugging Face nod*: Think of this as a mini HF Space—try it, love it, push it
|
165 |
+
to the Hub! 🤗
|
166 |
+
"""
|
167 |
import presidio_helpers
|
168 |
|
169 |
+
# 🔐 Load secrets from .env—because hardcoding keys is so last century
|
170 |
dotenv.load_dotenv()
|
171 |
+
|
172 |
+
# 📖 Our test story, packed with PII for our hero to vanquish
|
173 |
text = """
|
174 |
Here are a few example sentences we currently support:
|
175 |
|
|
|
184 |
|
185 |
Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
|
186 |
"""
|
187 |
+
|
188 |
+
# 🦸♀️ Summon the analyzer with Azure’s secret sauce
|
189 |
analyzer = presidio_helpers.analyzer_engine(
|
190 |
model_path="Azure Text Analytics PII",
|
191 |
ta_key=os.environ["TA_KEY"],
|
192 |
ta_endpoint=os.environ["TA_ENDPOINT"],
|
193 |
)
|
194 |
+
|
195 |
+
# 💥 Unleash the PII-hunting beast on our text
|
196 |
+
analyzer.analyze(text=text, language="en")
|