# Kiran5's picture
# Track large files and images with Git LFS
# 54fa0c8
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
# from pii_inference.utils import PiiNERPipeline
from datasets import Dataset
# from pii_redaction.utils import get_replacements, redact_pii_batch
from privacy.util.code_detect.ner.pii_redaction.utils import get_replacements, redact_pii_batch
import json
import re
import os
from privacy.util.code_detect.ner.pii_inference.utils.pipeline import PiiNERPipeline
class codeNer:
    """Redact PII (names, emails, keys, ...) from source code with a NER model.

    Both public entry points accept raw code (``str`` or UTF-8 ``bytes``) plus a
    loaded ``transformers`` token-classification model and its tokenizer, run
    the project's ``PiiNERPipeline`` over the text, and replace detected
    entities via ``redact_pii_batch``.
    """

    # Sentence boundary: split after ". " unless it is the end of the text or
    # the next character is lowercase.  Raw string — the original '\.' in a
    # plain string is an invalid escape sequence (SyntaxWarning, future error).
    _SENTENCE_SPLIT = r'(?<=\. )(?!$|[a-z])'

    @staticmethod
    def _redact(code, model, tokenizer):
        """Run NER + redaction over *code*; return the list of redacted parts.

        Shared implementation for codeFile and codeText.  Relies on the
        project-level PiiNERPipeline / get_replacements / redact_pii_batch
        helpers imported at module scope.
        """
        # Accept bytes input transparently (e.g. uploaded file contents).
        if isinstance(code, bytes):
            code = code.decode('utf-8')
        pipeline = PiiNERPipeline(
            model,
            tokenizer=tokenizer,
            batch_size=1024,
            window_size=512,
            device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
            num_workers=1,
            id_to_label=model.config.id2label,
            window_overlap=False,
            bf16=True,
        )
        # Feed the pipeline sentence-sized chunks with stable integer ids.
        sentences = re.split(codeNer._SENTENCE_SPLIT, code)
        dataset = Dataset.from_dict(
            {"content": sentences, "id": list(range(len(sentences)))}
        )
        # The pipeline yields lazily; materialize so we can iterate twice.
        results = list(pipeline(dataset))
        examples = {
            "content": [res["content"] for res in results],
            "entities": [res["entities"] for res in results],
        }
        redacted = redact_pii_batch(examples, get_replacements())
        return redacted["new_content"]

    @staticmethod
    def codeFile(code, filename, model, tokenizer):
        """Redact *code* and save it next to *filename* as "<stem>_redacted<ext>".

        Returns ``(content, output_code_file)`` — the redacted text and the
        path it was written to.  The write is best-effort: on failure the
        error is reported and the redacted content is still returned.
        """
        parts = codeNer._redact(code, model, tokenizer)
        # Build the content BEFORE the try-block: the original assigned it
        # inside `try`, so a failed open() made the return raise NameError.
        content = ''.join(parts)
        root, ext = os.path.splitext(filename)
        output_code_file = root + "_redacted" + ext
        try:
            # Explicit UTF-8: input is decoded as UTF-8, so write it back the
            # same way instead of the platform default.
            with open(output_code_file, "w", encoding="utf-8") as file:
                file.write(content)
        except Exception as e:
            print(f"An error occurred while writing to the file: {e}")
        return content, output_code_file

    @staticmethod
    def codeText(code, model, tokenizer):
        """Redact *code* and return the redacted text as a single string."""
        return ''.join(codeNer._redact(code, model, tokenizer))
# if __name__ == "__main__":
# main()