File size: 8,284 Bytes
06555b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
"""
Functionality to convert tabular data in Frictionless Data Packages to CSVW.
We translate [table schemas](https://specs.frictionlessdata.io/table-schema/) defined
for [data resources](https://specs.frictionlessdata.io/data-resource/) in a
[data package](https://specs.frictionlessdata.io/data-package/) to a CSVW TableGroup.
This functionality can be used together with the `frictionless describe` command to add
CSVW metadata to "raw" CSV tables.
"""
import json
import pathlib
def convert_column_spec(spec):
    """
    Convert a frictionless field descriptor into a CSVW column description.

    https://specs.frictionlessdata.io/table-schema/#field-descriptors

    :param spec: `dict` parsed from JSON representing a frictionless field descriptor. Must
        contain the key `name`.
    :return: `dict` with the keys `name` and `datatype` and - depending on the content of
        `spec` - `titles`, `dc:description`, `propertyUrl` and `required`.
    :raises NotImplementedError: If a numeric field sets `bareNumber` to `false`.
    """
    typemap = {
        'year': 'gYear',
        'yearmonth': 'gYearMonth',
    }
    # Field types which are passed through unchanged as CSVW base datatype.
    passthrough_types = [
        'string', 'number', 'integer', 'boolean', 'date', 'time', 'datetime', 'duration',
    ]
    number_format_props = ['decimalChar', 'groupChar']

    titles = [t for t in [spec.get('title')] if t]

    # "string" is the fallback for fields without type or with a type not handled below.
    datatype = {'base': 'string'}
    res = {'name': spec['name'], 'datatype': datatype}

    if 'type' in spec:
        ftype = spec['type']
        fformat = spec.get('format')
        if ftype == 'string' and fformat == 'binary':
            datatype['base'] = 'binary'
        elif ftype == 'string' and fformat == 'uri':
            datatype['base'] = 'anyURI'
        elif ftype in typemap:
            datatype['base'] = typemap[ftype]
        elif ftype in passthrough_types:
            datatype['base'] = ftype
            if ftype == 'string' and fformat:
                datatype['dc:format'] = fformat
            if ftype == 'boolean' and spec.get('trueValues') and spec.get('falseValues'):
                # Only the first true/false value is carried over into the "T|F" format.
                datatype['format'] = '{}|{}'.format(
                    spec['trueValues'][0], spec['falseValues'][0])
            if ftype in ['number', 'integer']:
                # `bareNumber: true` is the Table Schema default (plain numbers) and needs no
                # translation. Only `false` - numbers with leading/trailing non-numeric text -
                # cannot be expressed here.
                if spec.get('bareNumber') is False:  # pragma: no cover
                    raise NotImplementedError(
                        'bareNumber is not supported in CSVW. It may be possible to translate to '
                        'a number pattern, though. See '
                        'https://www.w3.org/TR/2015/REC-tabular-data-model-20151217/'
                        '#formats-for-numeric-types')
                if any(prop in spec for prop in number_format_props):
                    datatype['format'] = {
                        p: spec[p] for p in number_format_props if spec.get(p)}
        elif ftype in ['object', 'array']:
            datatype['base'] = 'json'
            datatype['dc:format'] = 'application/json'
        elif ftype == 'geojson':
            datatype['base'] = 'json'
            datatype['dc:format'] = 'application/geo+json'

    if titles:
        res['titles'] = titles

    if 'description' in spec:
        res['dc:description'] = [spec['description']]

    if 'rdfType' in spec:
        res['propertyUrl'] = spec['rdfType']

    constraints = spec.get('constraints', {})
    # In CSVW, "required" is a property of the column, not of its datatype.
    if 'required' in constraints:
        res['required'] = constraints['required']
    for prop in ['minLength', 'maxLength', 'minimum', 'maximum']:
        if prop in constraints:
            datatype[prop] = constraints[prop]

    # A format derived from the field type takes precedence over a pattern constraint.
    if ('pattern' in constraints) and ('format' not in datatype):
        datatype['format'] = constraints['pattern']
    # FIXME: we could transform the "enum" constraint for string into
    # a regular expression in the "format" property.
    return res
def convert_foreignKey(rsc_name, fk, resource_map):
    """
    Translate a frictionless foreign key into a CSVW foreign key definition.

    https://specs.frictionlessdata.io/table-schema/#foreign-keys

    :param rsc_name: `name` of the resource the foreign key is defined for. Used as target \
    resource when the `resource` of the reference is empty.
    :param fk: `dict` with the keys `fields` and `reference`, the latter being a `dict` with \
    the keys `fields` and `resource`.
    :param resource_map: `dict` mapping resource names to resource paths.
    :return: `dict` with `fields` renamed to `columnReference` and the referenced resource \
    name replaced by the mapped path.
    """
    local_columns = fk['fields']
    target = fk['reference']
    remote_columns = target['fields']
    # A falsy resource name marks a self-referential key, i.e. one pointing to `rsc_name`.
    target_name = target['resource'] or rsc_name
    return {
        'columnReference': local_columns,
        'reference': {
            'columnReference': remote_columns,
            'resource': resource_map[target_name],
        },
    }
def convert_table_schema(rsc_name, schema, resource_map):
    """
    Build the description of a CSVW table schema from a frictionless Table Schema.

    :param rsc_name: `name` property of the resource the schema belongs to. Needed to resolve \
    self-referential foreign keys.
    :param schema: `dict` parsed from JSON representing a frictionless Table Schema object.
    :param resource_map: `dict` mapping resource names to resource paths, needed to convert \
    foreign key constraints.
    :return: `dict` suitable for instantiating a `csvw.metadata.Schema` object.
    """
    converted = {'columns': [convert_column_spec(field) for field in schema['fields']]}

    # "missingValues" is called "null" in CSVW.
    if 'missingValues' in schema:
        converted['null'] = schema['missingValues']

    # The primary key is copied as is.
    if 'primaryKey' in schema:
        converted['primaryKey'] = schema['primaryKey']

    # Each foreign key needs its own conversion.
    if 'foreignKeys' in schema:
        converted['foreignKeys'] = [
            convert_foreignKey(rsc_name, fkey, resource_map) for fkey in schema['foreignKeys']]

    return converted
def convert_dialect(rsc):
    """
    Derive a CSVW dialect description from a frictionless data resource.

    Limitations: lineTerminator is not supported.

    https://specs.frictionlessdata.io/csv-dialect/

    :param rsc: `dict` representing a data resource; its `dialect` and `encoding` are read.
    :return: `dict` with the dialect properties found for the resource.
    """
    dialect = rsc.get('dialect', {})
    # Work around https://github.com/frictionlessdata/frictionless-py/issues/1506
    if 'csv' in dialect:
        dialect = dialect['csv']

    converted = {}

    delimiter = dialect.get('delimiter')
    if delimiter:
        converted['delimiter'] = delimiter

    encoding = rsc.get('encoding')
    if encoding:
        converted['encoding'] = encoding

    # Properties with identical names in both specifications are copied whenever present.
    shared_props = ('delimiter', 'quoteChar', 'doubleQuote', 'skipInitialSpace', 'header')
    converted.update((key, dialect[key]) for key in shared_props if key in dialect)

    # "commentChar" is called "commentPrefix" in CSVW.
    if 'commentChar' in dialect:
        converted['commentPrefix'] = dialect['commentChar']
    return converted
class DataPackage:
    """
    A frictionless Data Package descriptor which can be converted to a CSVW TableGroup.

    The parsed descriptor is available as `json`, the associated directory as `dir`.
    """
    def __init__(self, spec, directory=None):
        """
        :param spec: The data package descriptor; either another `DataPackage` (the descriptor \
        and directory of which are re-used), a `dict` (parsed JSON), a `pathlib.Path` of a JSON \
        file or a JSON formatted `str`.
        :param directory: Directory (`str` or `pathlib.Path`) to associate with the package. \
        Defaults to the parent directory of `spec` if `spec` is a path, to `'.'` otherwise.
        """
        if isinstance(spec, DataPackage):
            self.json = spec.json
            self.dir = spec.dir
            return

        if isinstance(spec, dict):
            # already a parsed JSON object
            default_dir = '.'
        elif isinstance(spec, pathlib.Path):
            default_dir = spec.parent
            spec = json.loads(spec.read_text(encoding='utf8'))
        else:  # assume a JSON formatted string
            spec = json.loads(spec)
            default_dir = '.'

        # Always normalise to `pathlib.Path`: `to_tablegroup` joins paths with the `/` operator,
        # which fails if a `directory` passed as `str` is stored unconverted.
        self.dir = pathlib.Path(directory or default_dir)
        self.json = spec

    def to_tablegroup(self, cls=None):
        """
        Convert the data package to a CSVW TableGroup.

        Only resources with a `path`, a `schema`, scheme "file" and format "csv" are turned \
        into tables.

        :param cls: Class to instantiate via its `fromvalue` method; defaults to \
        `csvw.TableGroup`.
        :return: The `cls` instance, with `_fname` set to `csvw-metadata.json` in `self.dir`.
        """
        from csvw import TableGroup

        md = {'@context': "http://www.w3.org/ns/csvw"}
        # Package metadata:
        # The complete original descriptor is kept as JSON string.
        md['dc:replaces'] = json.dumps(self.json)
        # Not converted yet:
        # version,
        # image,
        for flprop, csvwprop in [
            ('id', 'dc:identifier'),
            ('licenses', 'dc:license'),
            ('title', 'dc:title'),
            ('homepage', 'dcat:accessURL'),
            ('description', 'dc:description'),
            ('sources', 'dc:source'),
            ('contributors', 'dc:contributor'),
            ('profile', 'dc:conformsTo'),
            ('keywords', 'dc:subject'),
            ('created', 'dc:created'),
        ]:
            if flprop in self.json:
                md[csvwprop] = self.json[flprop]

        # The package name serves as fallback for the identifier or - if there is an
        # identifier - for the title.
        if 'name' in self.json:
            if 'id' not in self.json:
                md['dc:identifier'] = self.json['name']
            elif 'title' not in self.json:
                md['dc:title'] = self.json['name']

        # Data Resource metadata:
        resources = [rsc for rsc in self.json.get('resources', []) if 'path' in rsc]
        resource_map = {rsc['name']: rsc['path'] for rsc in resources if 'name' in rsc}
        for rsc in resources:
            schema = rsc.get('schema')
            if schema and \
                    rsc.get('scheme') == 'file' and \
                    rsc.get('format') == 'csv':
                # Table Schema:
                table = dict(
                    url=rsc['path'],
                    tableSchema=convert_table_schema(rsc.get('name'), schema, resource_map),
                    dialect=convert_dialect(rsc),
                )
                md.setdefault('tables', []).append(table)

        cls = cls or TableGroup
        res = cls.fromvalue(md)
        res._fname = self.dir / 'csvw-metadata.json'
        return res
|