Merge pull request #20 from seanpedrick-case/dev
Browse files
DynamoDB logging format/example and minor text revisions
- DocRedactApp_0.6.1.spec → DocRedactApp.spec +2 -2
- README.md +2 -0
- how_to_create_exe_dist.txt +2 -2
- load_dynamo_logs.py +56 -0
- load_s3_logs.py +3 -3
- pyproject.toml +7 -8
- tools/custom_csvlogger.py +2 -1
DocRedactApp_0.6.1.spec → DocRedactApp.spec
RENAMED
@@ -43,7 +43,7 @@ exe = EXE(
|
|
43 |
a.scripts,
|
44 |
[],
|
45 |
exclude_binaries=True,
|
46 |
-
name='DocRedactApp_0.6.1',
|
47 |
debug=False,
|
48 |
bootloader_ignore_signals=False,
|
49 |
strip=False,
|
@@ -62,5 +62,5 @@ coll = COLLECT(
|
|
62 |
strip=False,
|
63 |
upx=True,
|
64 |
upx_exclude=[],
|
65 |
-
name='DocRedactApp_0.6.1',
|
66 |
)
|
|
|
43 |
a.scripts,
|
44 |
[],
|
45 |
exclude_binaries=True,
|
46 |
+
name='DocRedactApp_0.6.2',
|
47 |
debug=False,
|
48 |
bootloader_ignore_signals=False,
|
49 |
strip=False,
|
|
|
62 |
strip=False,
|
63 |
upx=True,
|
64 |
upx_exclude=[],
|
65 |
+
name='DocRedactApp_0.6.2',
|
66 |
)
|
README.md
CHANGED
@@ -10,6 +10,8 @@ license: agpl-3.0
|
|
10 |
---
|
11 |
# Document redaction
|
12 |
|
|
|
|
|
13 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
|
14 |
|
15 |
To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
|
|
|
10 |
---
|
11 |
# Document redaction
|
12 |
|
13 |
+
version: 0.6.2
|
14 |
+
|
15 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
|
16 |
|
17 |
To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
|
how_to_create_exe_dist.txt
CHANGED
@@ -16,7 +16,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
|
|
16 |
|
17 |
9. Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
18 |
|
19 |
-
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name
|
20 |
|
21 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
22 |
|
@@ -32,7 +32,7 @@ a = Analysis(
|
|
32 |
|
33 |
hook-presidio-image-redactor.py
|
34 |
|
35 |
-
c) Back in command line, run this: pyinstaller --clean --noconfirm
|
36 |
|
37 |
|
38 |
10. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\redaction').
|
|
|
16 |
|
17 |
9. Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
18 |
|
19 |
+
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp app.py
|
20 |
|
21 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
22 |
|
|
|
32 |
|
33 |
hook-presidio-image-redactor.py
|
34 |
|
35 |
+
c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp.spec
|
36 |
|
37 |
|
38 |
10. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\redaction').
|
load_dynamo_logs.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import boto3
|
2 |
+
import csv
|
3 |
+
from decimal import Decimal
|
4 |
+
from boto3.dynamodb.conditions import Key
|
5 |
+
|
6 |
+
from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER
|
7 |
+
|
8 |
+
# Replace with your actual table name and region
|
9 |
+
TABLE_NAME = USAGE_LOG_DYNAMODB_TABLE_NAME # Choose as appropriate
|
10 |
+
REGION = AWS_REGION
|
11 |
+
CSV_OUTPUT = OUTPUT_FOLDER + 'dynamodb_logs_export.csv'
|
12 |
+
|
13 |
+
# Create DynamoDB resource
|
14 |
+
dynamodb = boto3.resource('dynamodb', region_name=REGION)
|
15 |
+
table = dynamodb.Table(TABLE_NAME)
|
16 |
+
|
17 |
+
# Helper function to convert Decimal to float or int
|
18 |
+
def convert_types(item):
|
19 |
+
for key, value in item.items():
|
20 |
+
if isinstance(value, Decimal):
|
21 |
+
# Convert to int if no decimal places, else float
|
22 |
+
item[key] = int(value) if value % 1 == 0 else float(value)
|
23 |
+
return item
|
24 |
+
|
25 |
+
# Paginated scan
|
26 |
+
def scan_table():
|
27 |
+
items = []
|
28 |
+
response = table.scan()
|
29 |
+
items.extend(response['Items'])
|
30 |
+
|
31 |
+
while 'LastEvaluatedKey' in response:
|
32 |
+
response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
|
33 |
+
items.extend(response['Items'])
|
34 |
+
|
35 |
+
return items
|
36 |
+
|
37 |
+
# Export to CSV
|
38 |
+
def export_to_csv(items, output_path):
|
39 |
+
if not items:
|
40 |
+
print("No items found.")
|
41 |
+
return
|
42 |
+
|
43 |
+
fieldnames = sorted(items[0].keys())
|
44 |
+
|
45 |
+
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
|
46 |
+
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
47 |
+
writer.writeheader()
|
48 |
+
|
49 |
+
for item in items:
|
50 |
+
writer.writerow(convert_types(item))
|
51 |
+
|
52 |
+
print(f"Exported {len(items)} items to {output_path}")
|
53 |
+
|
54 |
+
# Run export
|
55 |
+
items = scan_table()
|
56 |
+
export_to_csv(items, CSV_OUTPUT)
|
load_s3_logs.py
CHANGED
@@ -2,7 +2,7 @@ import boto3
|
|
2 |
import pandas as pd
|
3 |
from io import StringIO
|
4 |
from datetime import datetime
|
5 |
-
from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
|
6 |
|
7 |
# Combine together log files that can be then used for e.g. dashboarding and financial tracking.
|
8 |
|
@@ -71,7 +71,7 @@ if df_list:
|
|
71 |
concatenated_df = pd.concat(df_list, ignore_index=True)
|
72 |
|
73 |
# Save the concatenated DataFrame to a CSV file
|
74 |
-
concatenated_df.to_csv('consolidated_s3_logs.csv', index=False)
|
75 |
-
print("Consolidated CSV saved as 'consolidated_s3_logs.csv'")
|
76 |
else:
|
77 |
print("No log files found in the given date range.")
|
|
|
2 |
import pandas as pd
|
3 |
from io import StringIO
|
4 |
from datetime import datetime
|
5 |
+
from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, OUTPUT_FOLDER
|
6 |
|
7 |
# Combine together log files that can be then used for e.g. dashboarding and financial tracking.
|
8 |
|
|
|
71 |
concatenated_df = pd.concat(df_list, ignore_index=True)
|
72 |
|
73 |
# Save the concatenated DataFrame to a CSV file
|
74 |
+
concatenated_df.to_csv(OUTPUT_FOLDER + 'consolidated_s3_logs.csv', index=False)
|
75 |
+
print("Consolidated CSV saved as 'consolidated_s3_logs.csv'")
|
76 |
else:
|
77 |
print("No log files found in the given date range.")
|
pyproject.toml
CHANGED
@@ -3,11 +3,11 @@ requires = ["setuptools>=61.0", "wheel"]
|
|
3 |
build-backend = "setuptools.build_meta"
|
4 |
|
5 |
[project]
|
6 |
-
name = "doc_redaction"
|
7 |
-
version = "0.6.1"
|
8 |
-
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
-
readme = "README.md"
|
10 |
-
requires-python = ">=3.10"
|
11 |
|
12 |
dependencies = [
|
13 |
"pdfminer.six==20240706",
|
@@ -45,13 +45,12 @@ repository = "https://github.com/seanpedrick-case/doc_redaction"
|
|
45 |
[project.optional-dependencies]
|
46 |
dev = ["pytest"]
|
47 |
|
48 |
-
#
|
49 |
-
# For example, configuration for a linter like Ruff:
|
50 |
[tool.ruff]
|
51 |
line-length = 88
|
52 |
select = ["E", "F", "I"]
|
53 |
|
54 |
-
#
|
55 |
[tool.black]
|
56 |
line-length = 88
|
57 |
target-version = ['py310']
|
|
|
3 |
build-backend = "setuptools.build_meta"
|
4 |
|
5 |
[project]
|
6 |
+
name = "doc_redaction"
|
7 |
+
version = "0.6.2"
|
8 |
+
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
+
readme = "README.md"
|
10 |
+
requires-python = ">=3.10"
|
11 |
|
12 |
dependencies = [
|
13 |
"pdfminer.six==20240706",
|
|
|
45 |
[project.optional-dependencies]
|
46 |
dev = ["pytest"]
|
47 |
|
48 |
+
# Configuration for Ruff linter:
|
|
|
49 |
[tool.ruff]
|
50 |
line-length = 88
|
51 |
select = ["E", "F", "I"]
|
52 |
|
53 |
+
# Configuration for a Black formatter:
|
54 |
[tool.black]
|
55 |
line-length = 88
|
56 |
target-version = ['py310']
|
tools/custom_csvlogger.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
2 |
import contextlib
|
3 |
import csv
|
4 |
import datetime
|
|
|
5 |
import os
|
6 |
import re
|
7 |
import boto3
|
@@ -177,7 +178,7 @@ class CSVLogger_custom(FlaggingCallback):
|
|
177 |
csv_data.append(username)
|
178 |
|
179 |
|
180 |
-
timestamp =
|
181 |
csv_data.append(timestamp)
|
182 |
|
183 |
generated_id = str(uuid.uuid4())
|
|
|
2 |
import contextlib
|
3 |
import csv
|
4 |
import datetime
|
5 |
+
from datetime import datetime
|
6 |
import os
|
7 |
import re
|
8 |
import boto3
|
|
|
178 |
csv_data.append(username)
|
179 |
|
180 |
|
181 |
+
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
|
182 |
csv_data.append(timestamp)
|
183 |
|
184 |
generated_id = str(uuid.uuid4())
|