krishna-k's picture
Upload folder using huggingface_hub
06555b5 verified
raw
history blame contribute delete
8.28 kB
import re
import copy
import html
import json
import string
import keyword
import pathlib
import warnings
import collections
import unicodedata
import attr
def is_url(s):
    """
    Check whether the string representation of `s` starts with an HTTP(S) scheme.

    :param s: Any object; it is converted with `str` before matching.
    :return: A `re.Match` object (truthy) if `str(s)` starts with `http://` or `https://`, \
    otherwise `None`.
    """
    text = str(s)
    scheme = re.compile(r'https?://')
    return scheme.match(text)
def converter(type_, default, s, allow_none=False, cond=None, allow_list=True):
    """
    Validate a property value against an expected type, falling back to a default.

    :param type_: Type (or tuple of types) the value must be an instance of.
    :param default: Value returned when `s` is invalid.
    :param s: The value to validate.
    :param allow_none: If True, `None` is accepted and returned as-is.
    :param cond: Optional callable; a value for which it returns a falsy result is invalid.
    :param allow_list: If True (and `type_` is not `list`), a list value is validated \
    item by item and invalid items are dropped.
    :return: `s` if valid, a list of the valid items for list input, otherwise `default`.

    A warning is issued for each invalid value.
    """
    if allow_list and type_ != list and isinstance(s, list):
        valid_items = []
        for item in s:
            # Items are validated with `None` as default, so invalid ones can be dropped.
            checked = converter(type_, None, item, cond=cond)
            if checked is not None:
                valid_items.append(checked)
        return valid_items

    if allow_none and s is None:
        return s

    # The checks are evaluated in this order on purpose: `cond` is only called for
    # values which already passed the type checks.
    is_valid = isinstance(s, type_)
    if is_valid and type_ == int and isinstance(s, bool):
        # `bool` is a subclass of `int`, but must not pass as integer.
        is_valid = False
    if is_valid and cond and not cond(s):
        is_valid = False

    if is_valid:
        return s
    warnings.warn('Invalid value for property: {}'.format(s))
    return default
def ensure_path(fname):
    """
    Coerce a file name to a `pathlib.Path`.

    :param fname: A `pathlib.Path` (returned unchanged) or a `str`.
    :return: `pathlib.Path` instance.
    :raises AssertionError: If `fname` is neither a `pathlib.Path` nor a `str`.
    """
    if isinstance(fname, pathlib.Path):
        return fname
    assert isinstance(fname, str)
    return pathlib.Path(fname)
def attr_defaults(cls):
    """
    Collect the default values of all `attr` fields of a class.

    :param cls: A class whose fields can be listed with `attr.fields`.
    :return: `collections.OrderedDict` mapping field names to default values, in field order. \
    Defaults specified as `attr.Factory` are resolved by calling the factory.
    """
    defaults = collections.OrderedDict()
    for fld in attr.fields(cls):
        value = fld.default
        # Factory defaults must be called to obtain a comparable default value.
        defaults[fld.name] = value.factory() if isinstance(value, attr.Factory) else value
    return defaults
def attr_asdict(obj, omit_defaults=True, omit_private=True):
    """
    Serialize the `attr` fields of an object to an ordered dict.

    :param obj: Instance of a class whose fields can be listed with `attr.fields`.
    :param omit_defaults: If True, fields whose value equals the field's default are skipped.
    :param omit_private: If True, fields whose name starts with an underscore are skipped.
    :return: `collections.OrderedDict` mapping field names to values. Values providing an \
    `asdict` method are replaced by the result of `value.asdict(omit_defaults=True)`.
    """
    cls = obj.__class__
    defaults = attr_defaults(cls)
    result = collections.OrderedDict()
    for fld in attr.fields(cls):
        name = fld.name
        if omit_private and name.startswith('_'):
            continue
        value = getattr(obj, name)
        if omit_defaults and value == defaults[name]:
            continue
        if hasattr(value, 'asdict'):
            # Nested objects are always serialized without their defaults.
            value = value.asdict(omit_defaults=True)
        result[name] = value
    return result
def normalize_name(s):
    """Convert a string into a valid python attribute name.

    This function is called to convert ASCII strings to something that can pass as
    python attribute name, to be used with namedtuples.

    Hyphens, dots and spaces are turned into underscores, each underscore-separated
    part is reduced to ASCII alphanumerics, names not starting with a letter or
    underscore get a leading underscore, and Python keywords get a trailing underscore.

    >>> str(normalize_name('class'))
    'class_'
    >>> str(normalize_name('for!'))
    'for_'
    >>> str(normalize_name('a-name'))
    'a_name'
    >>> str(normalize_name('a n\u00e4me'))
    'a_name'
    >>> str(normalize_name('Name'))
    'Name'
    >>> str(normalize_name(''))
    '_'
    >>> str(normalize_name('1'))
    '_1'
    """
    s = s.replace('-', '_').replace('.', '_').replace(' ', '_')
    if keyword.iskeyword(s):
        return s + '_'
    s = '_'.join(slug(ss, lowercase=False) for ss in s.split('_'))
    if not s:
        s = '_'
    if s[0] not in string.ascii_letters + '_':
        s = '_' + s
    if keyword.iskeyword(s):
        # Stripping punctuation or diacritics may have turned the name into a keyword
        # (e.g. 'for!' -> 'for'), which is not usable as namedtuple field name.
        s += '_'
    return s
def slug(s, remove_whitespace=True, lowercase=True):
    """Condensed version of s, containing only ASCII alphanumeric characters.

    Diacritics are stripped, ASCII punctuation is removed and remaining non-ASCII
    characters are dropped.

    :param s: The string to condense.
    :param remove_whitespace: If True, whitespace is removed; otherwise each run of \
    whitespace is replaced with a single space.
    :param lowercase: If True, the result is lowercased.

    >>> str(slug('A B. \u00e4C'))
    'abac'
    """
    # Decompose characters, so that diacritics become separate combining marks ("Mn"),
    # which can then be dropped.
    decomposed = unicodedata.normalize('NFD', s)
    text = ''.join(filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed))
    if lowercase:
        text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    whitespace_replacement = '' if remove_whitespace else ' '
    text = re.sub(r'\s+', whitespace_replacement, text)
    text = text.encode('ascii', 'ignore').decode('ascii')
    assert re.match('[ A-Za-z0-9]*$', text)
    return text
def qname2url(qname):
    """
    Expand a prefixed name like `dc:title` to a full URL.

    Only the leading prefix is expanded; any further occurrence of `prefix:` in the
    local part of the name is left untouched.

    :param qname: `str` of the form `prefix:local`.
    :return: The URL as `str` if `qname` starts with one of the known prefixes, \
    otherwise `None`.

    >>> qname2url('dc:title')
    'http://purl.org/dc/terms/title'
    >>> qname2url('unknown:title') is None
    True
    """
    namespaces = {
        'csvw': 'http://www.w3.org/ns/csvw#',
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
        'xsd': 'http://www.w3.org/2001/XMLSchema#',
        'dc': 'http://purl.org/dc/terms/',
        'dcat': 'http://www.w3.org/ns/dcat#',
        'prov': 'http://www.w3.org/ns/prov#',
    }
    # Split on the first colon only, so the local part is passed through verbatim.
    prefix, colon, local = qname.partition(':')
    if colon and prefix in namespaces:
        return namespaces[prefix] + local
    return None
def metadata2markdown(tg, link_files=False) -> str:
    """
    Render the metadata of a dataset as markdown.

    :param tg: Dataset description providing `common_props`, `_fname` and `tables` \
    (presumably a table group object - confirm against callers).
    :param link_files: If True, links to data files will be added, assuming the markdown is \
    stored in the same directory as the metadata file.
    :return: `str` with markdown formatted text
    """
    def qname2link(qname, as_html=False):
        """
        Render a property name as link, if its prefix can be expanded to a URL.
        """
        url = qname2url(qname)
        if url:
            if as_html:
                return '<a href="{}">{}</a>'.format(url, qname)
            return '[{}]({})'.format(qname, url)
        return qname

    def htmlify(obj, key=None):
        """
        For inclusion in tables we must use HTML for lists.
        """
        if isinstance(obj, list):
            return '<ol>{}</ol>'.format(
                ''.join('<li>{}</li>'.format(htmlify(item, key=key)) for item in obj))
        if isinstance(obj, dict):
            items = []
            for k, v in obj.items():
                items.append('<dt>{}</dt><dd>{}</dd>'.format(
                    qname2link(k, as_html=True), html.escape(str(v))))
            return '<dl>{}</dl>'.format(''.join(items))
        return str(obj)

    def properties(props):
        """
        Render common properties: description, image, then remaining ones as table.
        """
        # Work on a copy with empty values dropped, because items are popped below.
        props = {k: v for k, v in copy.deepcopy(props).items() if v}
        res = []
        desc = props.pop('dc:description', None)
        if desc:
            res.append(desc + '\n')
        img = props.pop('https://schema.org/image', None)
        if img:
            if isinstance(img, str):  # pragma: no cover
                # A plain string is taken as the image URL. It must be stored under the
                # same key that is looked up below.
                img = {'https://schema.org/contentUrl': img}
            res.append('![{}]({})\n'.format(
                img.get('https://schema.org/caption') or '',
                img.get('https://schema.org/contentUrl')))
        if props:
            res.append('property | value\n --- | ---')
            for k, v in props.items():
                res.append('{} | {}'.format(qname2link(k), htmlify(v, key=k)))
        return '\n'.join(res) + '\n'

    def colrow(col, fks, pk):
        """
        Render one row of the columns table: name, datatype and description.
        """
        dt = '`{}`'.format(col.datatype.base if col.datatype else 'string')
        if col.datatype:
            if col.datatype.format:
                if re.fullmatch(r'[\w\s]+(\|[\w\s]+)*', col.datatype.format):
                    # A format consisting of plain words separated by "|" is rendered
                    # as enumeration of choices.
                    dt += '<br>Valid choices:<br>'
                    dt += ''.join(' `{}`'.format(w) for w in col.datatype.format.split('|'))
                elif col.datatype.base == 'string':
                    dt += '<br>Regex: `{}`'.format(col.datatype.format)
            # Compare with None explicitly, because 0 is a legitimate bound.
            if col.datatype.minimum is not None:
                dt += '<br>&ge; {}'.format(col.datatype.minimum)
            if col.datatype.maximum is not None:
                dt += '<br>&le; {}'.format(col.datatype.maximum)
        if col.separator:
            dt = 'list of {} (separated by `{}`)'.format(dt, col.separator)
        desc = col.common_props.get('dc:description', '').replace('\n', ' ')

        if pk and col.name in pk:
            desc = (desc + '<br>') if desc else desc
            desc += 'Primary key'

        if col.name in fks:
            desc = (desc + '<br>') if desc else desc
            desc += 'References [{}::{}](#table-{})'.format(
                fks[col.name][1], fks[col.name][0], slug(fks[col.name][1]))

        return ' | '.join([
            '[{}]({})'.format(col.name, col.propertyUrl)
            if col.propertyUrl else '`{}`'.format(col.name),
            dt,
            desc,
        ])

    res = ['# {}\n'.format(tg.common_props.get('dc:title', 'Dataset'))]
    if tg._fname and link_files:
        res.append('> [!NOTE]\n> Described by [{0}]({0}).\n'.format(tg._fname.name))

    # The title has been used as heading already.
    res.append(properties({k: v for k, v in tg.common_props.items() if k != 'dc:title'}))

    for table in tg.tables:
        # Only single-column foreign keys are mentioned in the column descriptions.
        fks = {
            fk.columnReference[0]: (fk.reference.columnReference[0], fk.reference.resource.string)
            for fk in table.tableSchema.foreignKeys if len(fk.columnReference) == 1}
        header = '## <a name="table-{}"></a>Table '.format(slug(table.url.string))
        if link_files and tg._fname and tg._fname.parent.joinpath(table.url.string).exists():
            header += '[{0}]({0})\n'.format(table.url.string)
        else:  # pragma: no cover
            header += table.url.string
        res.append('\n' + header + '\n')
        res.append(properties(table.common_props))
        dialect = table.inherit('dialect')
        if dialect.asdict():
            res.append('\n**CSV dialect**: `{}`\n'.format(json.dumps(dialect.asdict())))
        res.append('\n### Columns\n')
        res.append('Name/Property | Datatype | Description')
        res.append(' --- | --- | --- ')
        for col in table.tableSchema.columns:
            res.append(colrow(col, fks, table.tableSchema.primaryKey))
    return '\n'.join(res)