File size: 5,177 Bytes
06555b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
"""
DSV data can be surprisingly diverse. While Python's `csv` module offers out-of-the-box support
for the basic formatting parameters, CSVW recognizes a couple more, like `skipColumns` or
`skipRows`.

.. seealso::

    - `<https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/#dialect-descriptions>`_
    - `<https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters>`_
    - `<https://specs.frictionlessdata.io/csv-dialect/>`_
"""
import attr
import warnings
import functools
from . import utils
# Names exported by `from <module> import *`.
__all__ = ['Dialect']
# Translates encoding names found in dialect descriptions into the codec name handed to Python.
# Lookups in this module use `ENCODING_MAP.get(name, name)`, so unlisted names pass through
# unchanged.
ENCODING_MAP = {
    'UTF-8-BOM': 'utf-8-sig',  # Recognize the name of this encoding in R.
}
# FIXME: replace with attrs.validators.ge(0) from attrs 21.3.0
def _non_negative(instance, attribute, value):
    """
    Validator with the `(instance, attribute, value)` signature rejecting negative values.

    :param instance: The object being validated (unused).
    :param attribute: The attribute being validated; its `name` is used in the error message.
    :param value: The value to check.
    :raises ValueError: If `value` is less than zero.
    """
    if not value < 0:
        return
    raise ValueError(  # pragma: no cover
        '{0} is not a valid {1}'.format(value, attribute.name))
# Validator list for attributes that must be an int >= 0: the type check comes first, then the
# range check implemented by `_non_negative`.
non_negative_int = [attr.validators.instance_of(int), _non_negative]
def convert_encoding(s):
    """
    Converter for the `encoding` attribute of a dialect.

    The value is first passed through `utils.converter(str, 'utf-8', s)`. The resulting name is
    accepted if Python can encode text with it (after translating it via `ENCODING_MAP`);
    otherwise a warning is issued and `'utf-8'` is used instead.

    :param s: The encoding name as specified for the dialect.
    :return: The encoding name (not translated via `ENCODING_MAP`), or `'utf-8'` if the name \
    is not a known encoding.
    """
    name = utils.converter(str, 'utf-8', s)
    codec = ENCODING_MAP.get(name, name)
    try:
        # Only the codec lookup matters here; the encoded bytes are discarded.
        'x'.encode(codec)
    except LookupError:
        warnings.warn('Invalid value for property: {}'.format(name))
        return 'utf-8'
    return name
# Lookup table backing `Dialect.trimmer`, keyed by the values allowed for `Dialect.trim`.
# Built once at import time so that accessing `Dialect.trimmer` is a plain dict lookup.
_TRIMMERS = {
    'true': lambda s: s.strip(),
    'false': lambda s: s,
    'start': lambda s: s.lstrip(),
    'end': lambda s: s.rstrip(),
}


@attr.s
class Dialect:
    """
    A CSV dialect specification.

    Each attribute is passed through its converter and then checked by its validator (if any)
    when an instance is created. Most converters delegate to `utils.converter`, which receives
    the expected type and the default value; see that function for how unsuitable values are
    handled.

    The derived values `escape_character`, `line_terminators` and `trimmer` are computed from
    the current attribute values on every access. They are deliberately not cached: instances
    are mutable, and a cached value would go stale as soon as one of the underlying attributes
    is reassigned.

    .. seealso:: `<https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/#dialect-descriptions>`_
    """

    # Name of the text encoding; unknown names are replaced by 'utf-8' (see `convert_encoding`).
    encoding = attr.ib(
        default='utf-8',
        converter=convert_encoding,
        validator=attr.validators.instance_of(str))

    # Line terminators; the first one is used as `lineterminator` for Python's csv module.
    lineTerminators = attr.ib(
        converter=functools.partial(utils.converter, list, ['\r\n', '\n']),
        default=attr.Factory(lambda: ['\r\n', '\n']))

    # Quote character; `None` is allowed.
    quoteChar = attr.ib(
        converter=functools.partial(utils.converter, str, '"', allow_none=True),
        default='"',
    )

    # Whether a quote character inside a quoted value is escaped by doubling it.
    doubleQuote = attr.ib(
        default=True,
        converter=functools.partial(utils.converter, bool, True),
        validator=attr.validators.instance_of(bool))

    skipRows = attr.ib(
        default=0,
        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
        validator=non_negative_int)

    # Comment prefix; `None` is allowed.
    commentPrefix = attr.ib(
        default='#',
        converter=functools.partial(utils.converter, str, '#', allow_none=True),
        validator=attr.validators.optional(attr.validators.instance_of(str)))

    header = attr.ib(
        default=True,
        converter=functools.partial(utils.converter, bool, True),
        validator=attr.validators.instance_of(bool))

    headerRowCount = attr.ib(
        default=1,
        converter=functools.partial(utils.converter, int, 1, cond=lambda s: s >= 0),
        validator=non_negative_int)

    delimiter = attr.ib(
        default=',',
        converter=functools.partial(utils.converter, str, ','),
        validator=attr.validators.instance_of(str))

    skipColumns = attr.ib(
        default=0,
        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
        validator=non_negative_int)

    skipBlankRows = attr.ib(
        default=False,
        converter=functools.partial(utils.converter, bool, False),
        validator=attr.validators.instance_of(bool))

    skipInitialSpace = attr.ib(
        default=False,
        converter=functools.partial(utils.converter, bool, False),
        validator=attr.validators.instance_of(bool))

    # One of 'true', 'false', 'start', 'end'. Booleans are accepted as well and are turned into
    # the strings 'true'/'false' before being handed to `utils.converter`.
    trim = attr.ib(
        default='false',
        validator=attr.validators.in_(['true', 'false', 'start', 'end']),
        converter=lambda v: utils.converter(
            (str, bool), 'false', str(v).lower() if isinstance(v, bool) else v))

    def updated(self, **kw):
        """
        Return a copy of the dialect with the attributes named in `kw` replaced.

        The copy is created from `attr.asdict(self)`; the new values are then assigned with
        `setattr` on the copy rather than being passed to the constructor.

        :param kw: Attribute names mapped to their new values.
        :return: A new instance of the same class; `self` is left unchanged.
        """
        res = self.__class__(**attr.asdict(self))
        for k, v in kw.items():
            setattr(res, k, v)
        return res

    @property
    def escape_character(self):
        """
        The character used to escape the quote character.

        :return: `None` if `quoteChar` is `None`; otherwise `'"'` if `doubleQuote` is true, \
        else a backslash.
        """
        if self.quoteChar is None:
            return None
        return '"' if self.doubleQuote else '\\'

    @property
    def line_terminators(self):
        """
        `lineTerminators` as a list.

        :return: A one-element list if `lineTerminators` is a single `str`, otherwise \
        `lineTerminators` itself.
        """
        if isinstance(self.lineTerminators, str):
            return [self.lineTerminators]
        return self.lineTerminators

    @property
    def trimmer(self):
        """
        A callable implementing the whitespace trimming selected by `trim`.

        :return: A one-argument callable: `strip` for 'true', `lstrip` for 'start', `rstrip` \
        for 'end', and the identity for 'false'.
        :raises KeyError: If `trim` holds none of the four allowed values.
        """
        return _TRIMMERS[self.trim]

    def asdict(self, omit_defaults=True):
        """
        Return the dialect as a `dict`, as computed by `utils.attr_asdict`.

        :param omit_defaults: Passed through to `utils.attr_asdict`.
        """
        return utils.attr_asdict(self, omit_defaults=omit_defaults)

    @property
    def python_encoding(self):
        """
        The encoding name to pass to Python: `encoding` translated via `ENCODING_MAP`, or
        `encoding` itself if the map has no entry for it.
        """
        return ENCODING_MAP.get(self.encoding, self.encoding)

    def as_python_formatting_parameters(self):
        """
        Translate the dialect into formatting parameters for Python's `csv` module.

        :return: A `dict` with the keys `delimiter`, `doublequote`, `escapechar`, \
        `lineterminator`, `quotechar`, `skipinitialspace` and `strict`.
        """
        return {
            'delimiter': self.delimiter,
            'doublequote': self.doubleQuote,
            # We have to hack around incompatible ways escape char is interpreted in csvw
            # and python's csv lib:
            # with doubled quotes no escape character is passed on at all.
            'escapechar': self.escape_character if not self.doubleQuote else None,
            'lineterminator': self.line_terminators[0],
            'quotechar': self.quoteChar,
            'skipinitialspace': self.skipInitialSpace,
            'strict': True,
        }
|