Spaces:
Running
Running
File size: 3,427 Bytes
d5f5654 2a50df2 8287176 d5f5654 8858793 d5f5654 8287176 d5f5654 ddf094a 299be58 ddf094a 299be58 ddf094a 299be58 ddf094a d5f5654 8287176 d5f5654 ddf094a 299be58 ddf094a 299be58 ddf094a 299be58 ddf094a d5f5654 8287176 d5f5654 ddf094a 299be58 ddf094a 8287176 299be58 8287176 299be58 ddf094a 299be58 ddf094a 299be58 47d2223 e7bc9a5 47d2223 299be58 47d2223 e7bc9a5 299be58 e7bc9a5 47d2223 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import json
import traceback
import mlcroissant as mlc
import func_timeout
# Time limit, in seconds, passed to func_timeout when fetching the first
# record of a record set during records validation.
WAIT_TIME = 5 * 60 # seconds
def validate_json(file_path):
    """Check that a file contains well-formed JSON and return the parsed data.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    locale encoding, so the result does not depend on the host configuration.
    A leading UTF-8 byte-order mark, if present, is ignored.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(passed, message, json_data)`` tuple. On success ``passed`` is
        ``True`` and ``json_data`` holds the parsed document. On failure
        ``passed`` is ``False``, ``message`` describes the problem and
        ``json_data`` is ``None``.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM, which
        # json.load would otherwise reject as invalid JSON.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            json_data = json.load(f)
    except json.JSONDecodeError as e:
        return False, f"Invalid JSON format: {str(e)}", None
    except Exception as e:
        # Covers missing files, permission problems, undecodable bytes, etc.
        return False, f"Error reading file: {str(e)}", None
    return True, "The file is valid JSON.", json_data
def validate_croissant(json_data):
    """Check parsed JSON-LD against the Croissant schema.

    The check consists of constructing ``mlc.Dataset`` from the data; the
    resulting object is discarded, only success or failure matters.

    Args:
        json_data: Parsed JSON-LD document.

    Returns:
        A ``(passed, message)`` tuple. On failure the message contains the
        exception text followed by the formatted traceback.
    """
    try:
        mlc.Dataset(jsonld=json_data)
    except mlc.ValidationError as err:
        trace = traceback.format_exc()
        return False, f"Validation failed: {str(err)}\n\n{trace}"
    except Exception as err:
        trace = traceback.format_exc()
        return False, f"Unexpected error during validation: {str(err)}\n\n{trace}"
    return True, "The dataset passes Croissant validation."
def validate_records(json_data):
    """Check that each record set can yield a first record within the time limit.

    A ``mlc.Dataset`` is built from ``json_data`` and, for every record set in
    its metadata, the first record is requested under a ``WAIT_TIME``-second
    timeout. Validation stops at the first record set that times out or raises.

    Args:
        json_data: Parsed JSON-LD document.

    Returns:
        A ``(passed, message)`` tuple. On success the message has one line per
        record set (or a note that there are none). On failure it names the
        offending record set and, for errors other than a timeout, includes
        the formatted traceback.
    """
    try:
        dataset = mlc.Dataset(jsonld=json_data)
        record_sets = dataset.metadata.record_sets
        if not record_sets:
            return True, "No record sets found to validate."

        results = []
        for record_set in record_sets:
            try:
                records = dataset.records(record_set=record_set.uuid)
                # Only the first record is pulled; the call is wrapped in a
                # timeout so a slow or hanging source cannot block forever.
                func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
                results.append(f"Record set '{record_set.uuid}' passed validation.")
            except func_timeout.exceptions.FunctionTimedOut:
                # Report the configured limit rather than a hard-coded number
                # so the message stays correct if WAIT_TIME changes.
                error_message = (
                    f"Record set '{record_set.uuid}' generation took too long "
                    f"(>{WAIT_TIME}s)"
                )
                return False, error_message
            except Exception as e:
                error_details = traceback.format_exc()
                error_message = (
                    f"Record set '{record_set.uuid}' failed: {str(e)}\n\n{error_details}"
                )
                return False, error_message
        return True, "\n".join(results)
    except Exception as e:
        # Failures outside the per-record-set loop, e.g. building the dataset.
        error_details = traceback.format_exc()
        error_message = (
            f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
        )
        return False, error_message
def generate_validation_report(filename, json_data, results):
    """Build a Markdown report summarising the validation results.

    Args:
        filename: Name of the validated file, echoed in the report.
        json_data: Parsed JSON-LD; dumped with 2-space indentation into a
            fenced ``json`` block at the end of the report.
        results: Iterable of ``(test_name, passed, message)`` tuples, one per
            validation step, rendered in the given order.

    Returns:
        The report as a single string with lines joined by newlines.
    """
    # Written as escapes so the two marks stay distinct even if the source
    # file is re-encoded; identical glyphs would hide failures in the report.
    pass_mark = "\u2713"  # CHECK MARK
    fail_mark = "\u2717"  # BALLOT X

    report = [
        "# CROISSANT VALIDATION REPORT",
        "=" * 80,
        "## VALIDATION RESULTS",
        "-" * 80,
        f"Starting validation for file: {filename}",
    ]

    # One heading, one pass/fail mark and one message per validation step.
    for test_name, passed, message in results:
        report.append(f"### {test_name}")
        report.append(pass_mark if passed else fail_mark)
        report.append(message.strip())  # drop leading/trailing whitespace

    # Append the JSON-LD itself for reference.
    report.extend(
        [
            "## JSON-LD REFERENCE",
            "=" * 80,
            "```json",
            json.dumps(json_data, indent=2),
            "```",
        ]
    )
    return "\n".join(report)