awacke1 committed on
Commit
ea3f5eb
·
verified ·
1 Parent(s): f100294

Update azure_ai_language_wrapper.py

Browse files
Files changed (1) hide show
  1. azure_ai_language_wrapper.py +77 -7
azure_ai_language_wrapper.py CHANGED
@@ -8,12 +8,19 @@ from azure.core.credentials import AzureKeyCredential
8
  from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
9
  from presidio_analyzer.nlp_engine import NlpArtifacts
10
 
 
11
  logger = logging.getLogger("presidio-streamlit")
12
 
13
 
14
  class AzureAIServiceWrapper(EntityRecognizer):
 
 
 
 
 
15
  from azure.ai.textanalytics._models import PiiEntityCategory
16
 
 
17
  TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]
18
 
19
  def __init__(
@@ -25,30 +32,45 @@ class AzureAIServiceWrapper(EntityRecognizer):
25
  ta_endpoint: Optional[str] = None,
26
  ):
27
  """
28
- Wrapper for the Azure Text Analytics client
29
- :param ta_client: object of type TextAnalyticsClient
30
- :param ta_key: Azure cognitive Services for Language key
31
- :param ta_endpoint: Azure cognitive Services for Language endpoint
 
 
 
 
32
  """
33
-
34
  if not supported_entities:
35
  supported_entities = self.TA_SUPPORTED_ENTITIES
36
 
 
37
  super().__init__(
38
  supported_entities=supported_entities,
39
  supported_language=supported_language,
40
  name="Azure AI Language PII",
41
  )
42
 
 
43
  self.ta_key = ta_key
44
  self.ta_endpoint = ta_endpoint
45
 
 
46
  if not ta_client:
47
  ta_client = self.__authenticate_client(ta_key, ta_endpoint)
48
  self.ta_client = ta_client
49
 
50
  @staticmethod
51
  def __authenticate_client(key: str, endpoint: str):
 
 
 
 
 
 
 
 
52
  ta_credential = AzureKeyCredential(key)
53
  text_analytics_client = TextAnalyticsClient(
54
  endpoint=endpoint, credential=ta_credential
@@ -58,21 +80,43 @@ class AzureAIServiceWrapper(EntityRecognizer):
58
  def analyze(
59
  self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
60
  ) -> List[RecognizerResult]:
 
 
 
 
 
 
 
 
 
 
 
61
  if not entities:
62
  entities = []
 
 
63
  response = self.ta_client.recognize_pii_entities(
64
  [text], language=self.supported_language
65
  )
 
 
66
  results = [doc for doc in response if not doc.is_error]
67
  recognizer_results = []
 
 
68
  for res in results:
69
  for entity in res.entities:
 
70
  if entity.category not in self.supported_entities:
71
  continue
72
- analysis_explanation = AzureAIServiceWrapper._build_explanation(
 
 
73
  original_score=entity.confidence_score,
74
  entity_type=entity.category,
75
  )
 
 
76
  recognizer_results.append(
77
  RecognizerResult(
78
  entity_type=entity.category,
@@ -83,12 +127,21 @@ class AzureAIServiceWrapper(EntityRecognizer):
83
  )
84
  )
85
 
 
86
  return recognizer_results
87
 
88
  @staticmethod
89
  def _build_explanation(
90
  original_score: float, entity_type: str
91
  ) -> AnalysisExplanation:
 
 
 
 
 
 
 
 
92
  explanation = AnalysisExplanation(
93
  recognizer=AzureAIServiceWrapper.__class__.__name__,
94
  original_score=original_score,
@@ -97,13 +150,26 @@ class AzureAIServiceWrapper(EntityRecognizer):
97
  return explanation
98
 
99
  def load(self) -> None:
 
 
 
 
 
100
  pass
101
 
102
 
103
  if __name__ == "__main__":
 
 
 
 
 
104
  import presidio_helpers
105
 
 
106
  dotenv.load_dotenv()
 
 
107
  text = """
108
  Here are a few example sentences we currently support:
109
 
@@ -118,9 +184,13 @@ if __name__ == "__main__":
118
 
119
  Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
120
  """
 
 
121
  analyzer = presidio_helpers.analyzer_engine(
122
  model_path="Azure Text Analytics PII",
123
  ta_key=os.environ["TA_KEY"],
124
  ta_endpoint=os.environ["TA_ENDPOINT"],
125
  )
126
- analyzer.analyze(text=text, language="en")
 
 
 
8
  from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
9
  from presidio_analyzer.nlp_engine import NlpArtifacts
10
 
11
+ # 📜 Our trusty scribe, logging every move of our privacy-protecting saga
12
  logger = logging.getLogger("presidio-streamlit")
13
 
14
 
15
  class AzureAIServiceWrapper(EntityRecognizer):
16
+ """
17
+ 🦸‍♂️ The Azure AI Service Wrapper: A superhero class that wields Azure's Text Analytics
18
+ to zap PII/PHI from text like a privacy avenger! Built to integrate with Presidio's
19
+ analyzer, it’s ready to team up with your SFT app for world-saving AI missions. 💪
20
+ """
21
  from azure.ai.textanalytics._models import PiiEntityCategory
22
 
23
+ # 📋 Our hit list of PII entities Azure can tackle—SSNs, credit cards, you name it!
24
  TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]
25
 
26
  def __init__(
 
32
  ta_endpoint: Optional[str] = None,
33
  ):
34
  """
35
+ 🎬 Lights, camera, action! Initializes our Azure-powered PII slayer.
36
+ :param supported_entities: PII types to hunt (defaults to ALL the baddies).
37
+ :param supported_language: Language to analyze (English by default, mate! 🇬🇧).
38
+ :param ta_client: Pre-authenticated Azure client (or we’ll forge one ourselves).
39
+ :param ta_key: Secret key to unlock Azure’s vault of NLP magic.
40
+ :param ta_endpoint: The Azure portal where the PII-zapping happens.
41
+ *Clever quip*: Think of this as assembling Iron Man’s suit—credentials, endpoints,
42
+ and entity lists snap together for a privacy-protecting masterpiece! 😼
43
  """
44
+ # 🛡️ Default to all supported entities if none specified—maximum coverage!
45
  if not supported_entities:
46
  supported_entities = self.TA_SUPPORTED_ENTITIES
47
 
48
+ # 🧬 Inherit Presidio’s EntityRecognizer powers, branding ourselves as Azure’s finest
49
  super().__init__(
50
  supported_entities=supported_entities,
51
  supported_language=supported_language,
52
  name="Azure AI Language PII",
53
  )
54
 
55
+ # 🔑 Stash the key and endpoint for Azure’s secret handshake
56
  self.ta_key = ta_key
57
  self.ta_endpoint = ta_endpoint
58
 
59
+ # 🤝 Authenticate if no client’s provided—time to summon Azure’s NLP beast!
60
  if not ta_client:
61
  ta_client = self.__authenticate_client(ta_key, ta_endpoint)
62
  self.ta_client = ta_client
63
 
64
  @staticmethod
65
  def __authenticate_client(key: str, endpoint: str):
66
+ """
67
+ 🔓 Unlocks Azure’s treasure chest with a key and endpoint.
68
+ :param key: The magic password to Azure’s NLP kingdom.
69
+ :param endpoint: The gate to Azure’s Text Analytics realm.
70
+ :return: A shiny TextAnalyticsClient ready to rumble!
71
+ *Fun fact*: This is like getting VIP access to a privacy party—credentials
72
+ checked, and we’re in! 🎉
73
+ """
74
  ta_credential = AzureKeyCredential(key)
75
  text_analytics_client = TextAnalyticsClient(
76
  endpoint=endpoint, credential=ta_credential
 
80
  def analyze(
81
  self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
82
  ) -> List[RecognizerResult]:
83
+ """
84
+ 🕵️‍♀️ The main event: Scans text for PII like a hawk and returns the detected entities (no redaction happens here).
85
+ :param text: The text to scrub clean of sensitive data.
86
+ :param entities: Requested PII types (defaults to an empty list; results are filtered by self.supported_entities).
87
+ :param nlp_artifacts: Optional Presidio NLP goodies (we’re cool without ‘em).
88
+ :return: A list of RecognizerResult with PII locations and confidence scores.
89
+ *Superpower alert*: This method’s like X-ray vision for sensitive data—SSNs,
90
+ credit cards, and emails don’t stand a chance! 🦅
91
+ *SFT tease*: Imagine pairing this with your fine-tuned model for next-level AI! 😏
92
+ """
93
+ # 🗳️ Default to empty entity list if none provided—flexibility is our jam
94
  if not entities:
95
  entities = []
96
+
97
+ # 🚀 Fire up Azure’s PII recognizer with the text and language
98
  response = self.ta_client.recognize_pii_entities(
99
  [text], language=self.supported_language
100
  )
101
+
102
+ # ✅ Filter out any errors—only the good stuff makes the cut
103
  results = [doc for doc in response if not doc.is_error]
104
  recognizer_results = []
105
+
106
+ # 🔍 Loop through results, cherry-picking valid entities
107
  for res in results:
108
  for entity in res.entities:
109
+ # Skip unsupported entities—we’re picky like that
110
  if entity.category not in self.supported_entities:
111
  continue
112
+
113
+ # 📝 Craft a fancy explanation for why we flagged this PII
114
+ analysis_explanation = self._build_explanation(
115
  original_score=entity.confidence_score,
116
  entity_type=entity.category,
117
  )
118
+
119
+ # 🎯 Log the hit: entity type, position, and confidence score
120
  recognizer_results.append(
121
  RecognizerResult(
122
  entity_type=entity.category,
 
127
  )
128
  )
129
 
130
+ # 🏆 Return the haul of PII findings—mission accomplished!
131
  return recognizer_results
132
 
133
  @staticmethod
134
  def _build_explanation(
135
  original_score: float, entity_type: str
136
  ) -> AnalysisExplanation:
137
+ """
138
+ 📜 Writes a love letter explaining why we flagged a PII entity.
139
+ :param original_score: Confidence score from Azure’s NLP oracle.
140
+ :param entity_type: The type of PII we nabbed (e.g., SSN, PHONE_NUMBER).
141
+ :return: An AnalysisExplanation object with all the juicy details.
142
+ *Witty note*: This is like leaving a Post-it note saying, “Caught ya, sneaky
143
+ credit card number!” 😜
144
+ """
145
  explanation = AnalysisExplanation(
146
  recognizer=AzureAIServiceWrapper.__class__.__name__,
147
  original_score=original_score,
 
150
  return explanation
151
 
152
  def load(self) -> None:
153
+ """
154
+ 🛠️ Placeholder for loading resources—Azure’s already warmed up, so we chill.
155
+ *Cheeky remark*: Like a superhero on standby, we’re always ready to leap
156
+ into action. No prep needed! 😎
157
+ """
158
  pass
159
 
160
 
161
  if __name__ == "__main__":
162
+ """
163
+ 🎮 Demo mode: Test-drive our PII zapper with sample text!
164
+ *Hugging Face nod*: Think of this as a mini HF Space—try it, love it, push it
165
+ to the Hub! 🤗
166
+ """
167
  import presidio_helpers
168
 
169
+ # 🔐 Load secrets from .env—because hardcoding keys is so last century
170
  dotenv.load_dotenv()
171
+
172
+ # 📖 Our test story, packed with PII for our hero to vanquish
173
  text = """
174
  Here are a few example sentences we currently support:
175
 
 
184
 
185
  Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
186
  """
187
+
188
+ # 🦸‍♀️ Summon the analyzer with Azure’s secret sauce
189
  analyzer = presidio_helpers.analyzer_engine(
190
  model_path="Azure Text Analytics PII",
191
  ta_key=os.environ["TA_KEY"],
192
  ta_endpoint=os.environ["TA_ENDPOINT"],
193
  )
194
+
195
+ # 💥 Unleash the PII-hunting beast on our text
196
+ analyzer.analyze(text=text, language="en")