"""
DSV data can be surprisingly diverse. While Python's `csv` module offers out-of-the-box support
for the basic formatting parameters, CSVW recognizes a couple more, like `skipColumns` or
`skipRows`.

.. seealso::

    - `<https://www.w3.org/TR/tabular-metadata/#dialect-descriptions>`_
    - `<https://specs.frictionlessdata.io/csv-dialect/>`_
    - `<https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters>`_
"""
import attr
import warnings
import functools

from . import utils

__all__ = ['Dialect']

# Encoding names that Python's codec registry does not know, mapped to the Python codec name.
ENCODING_MAP = {
    'UTF-8-BOM': 'utf-8-sig',  # Recognize the name of this encoding in R.
}


# FIXME: replace with attrs.validators.ge(0) from attrs 21.3.0
def _non_negative(instance, attribute, value):
    """attrs validator: raise `ValueError` if `value` is smaller than 0."""
    if value < 0:  # pragma: no cover
        raise ValueError('{0} is not a valid {1}'.format(value, attribute.name))


# Validator chain shared by all counter-like properties: must be an `int` and must be >= 0.
non_negative_int = [attr.validators.instance_of(int), _non_negative]


def convert_encoding(s):
    """
    attrs converter for `Dialect.encoding`.

    The value is first passed through `utils.converter(str, 'utf-8', s)` (presumably this
    substitutes 'utf-8' for non-string input - see `utils`). The result is then probed by
    encoding a dummy string with it, after translating it through `ENCODING_MAP`.

    :param s: The encoding name as specified in the dialect description.
    :return: `s` if Python can encode text with it, 'utf-8' otherwise (a warning is emitted \
    in that case).
    """
    s = utils.converter(str, 'utf-8', s)
    try:
        # Only the probe can raise LookupError, so it is the only statement guarded here.
        'x'.encode(ENCODING_MAP.get(s, s))
    except LookupError:
        warnings.warn('Invalid value for property: {}'.format(s))
        return 'utf-8'
    return s


def _convert_trim(v):
    """
    attrs converter for `Dialect.trim`.

    Booleans are turned into their lowercase string form ('true' / 'false') before the value
    is handed to `utils.converter` with the accepted types `(str, bool)` and default 'false'.
    """
    if isinstance(v, bool):
        v = '{0}'.format(v).lower()
    return utils.converter((str, bool), 'false', v)


@attr.s
class Dialect(object):
    """
    A CSV dialect specification.

    Attribute names follow the CSVW property names (hence the camelCase). Every attribute is
    run through a converter built on `utils.converter` before the validators listed with it
    are applied.

    .. seealso:: `<https://www.w3.org/TR/tabular-metadata/#dialect-descriptions>`_
    """

    # Name of the text encoding; checked by `convert_encoding`. Default: 'utf-8'.
    encoding = attr.ib(
        default='utf-8',
        converter=convert_encoding,
        validator=attr.validators.instance_of(str))

    # List of line terminator strings. The Factory gives each instance its own default list.
    lineTerminators = attr.ib(
        converter=functools.partial(utils.converter, list, ['\r\n', '\n']),
        default=attr.Factory(lambda: ['\r\n', '\n']))

    # Quote character; `None` is passed to the converter as allowed (`allow_none=True`).
    quoteChar = attr.ib(
        converter=functools.partial(utils.converter, str, '"', allow_none=True),
        default='"',
    )

    # Boolean flag, default True; see `escape_character` for how it is used.
    doubleQuote = attr.ib(
        default=True,
        converter=functools.partial(utils.converter, bool, True),
        validator=attr.validators.instance_of(bool))

    # Non-negative int, default 0.
    skipRows = attr.ib(
        default=0,
        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
        validator=non_negative_int)

    # Comment prefix string or `None`, default '#'.
    commentPrefix = attr.ib(
        default='#',
        converter=functools.partial(utils.converter, str, '#', allow_none=True),
        validator=attr.validators.optional(attr.validators.instance_of(str)))

    # Boolean flag, default True.
    header = attr.ib(
        default=True,
        converter=functools.partial(utils.converter, bool, True),
        validator=attr.validators.instance_of(bool))

    # Non-negative int, default 1.
    headerRowCount = attr.ib(
        default=1,
        converter=functools.partial(utils.converter, int, 1, cond=lambda s: s >= 0),
        validator=non_negative_int)

    # Delimiter string, default ','.
    delimiter = attr.ib(
        default=',',
        converter=functools.partial(utils.converter, str, ','),
        validator=attr.validators.instance_of(str))

    # Non-negative int, default 0.
    skipColumns = attr.ib(
        default=0,
        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
        validator=non_negative_int)

    # Boolean flag, default False.
    skipBlankRows = attr.ib(
        default=False,
        converter=functools.partial(utils.converter, bool, False),
        validator=attr.validators.instance_of(bool))

    # Boolean flag, default False.
    skipInitialSpace = attr.ib(
        default=False,
        converter=functools.partial(utils.converter, bool, False),
        validator=attr.validators.instance_of(bool))

    # One of 'true', 'false', 'start', 'end' (booleans are accepted and stringified by the
    # converter). Default: 'false'. See `trimmer` for the corresponding string operations.
    trim = attr.ib(
        default='false',
        validator=attr.validators.in_(['true', 'false', 'start', 'end']),
        converter=_convert_trim)

    def updated(self, **kw):
        """
        Create a modified copy of the dialect.

        A new instance is built from this instance's attribute values, then each item of `kw`
        is assigned with `setattr`.

        NOTE: the overrides are plain attribute assignments on a class declared with bare
        `@attr.s`, so they are stored as given rather than being passed through the
        converters and validators declared above.

        :param kw: Attribute names mapped to the values to set on the copy.
        :return: The new `Dialect` instance; `self` is left untouched.
        """
        res = self.__class__(**attr.asdict(self))
        for k, v in kw.items():
            setattr(res, k, v)
        return res

    @functools.cached_property
    def escape_character(self):
        """
        `None` if `quoteChar` is `None`; otherwise '"' if `doubleQuote` is set, a backslash
        if not.

        Computed on first access and cached on the instance, so later changes to `quoteChar`
        or `doubleQuote` are not reflected.
        """
        return None if self.quoteChar is None else ('"' if self.doubleQuote else '\\')

    @functools.cached_property
    def line_terminators(self):
        """
        `lineTerminators` as a list: a single string is wrapped in a one-element list, any
        other value is returned unchanged. Cached on the instance after first access.
        """
        return [self.lineTerminators] \
            if isinstance(self.lineTerminators, str) else self.lineTerminators

    @functools.cached_property
    def trimmer(self):
        """
        The callable implementing `trim` for a single string: `str.strip` for 'true',
        identity for 'false', `str.lstrip` for 'start' and `str.rstrip` for 'end'.
        Cached on the instance after first access.
        """
        return {
            'true': lambda s: s.strip(),
            'false': lambda s: s,
            'start': lambda s: s.lstrip(),
            'end': lambda s: s.rstrip()
        }[self.trim]

    def asdict(self, omit_defaults=True):
        """
        Return the dialect as `dict`, as computed by `utils.attr_asdict`.

        :param omit_defaults: Passed through to `utils.attr_asdict` (presumably controls \
        whether attributes that still have their default value are left out - see `utils`).
        """
        return utils.attr_asdict(self, omit_defaults=omit_defaults)

    @property
    def python_encoding(self):
        """`encoding` translated through `ENCODING_MAP`; unmapped names are returned as is."""
        return ENCODING_MAP.get(self.encoding, self.encoding)

    def as_python_formatting_parameters(self):
        """
        Return the dialect as `dict` keyed by the names of the formatting parameters of
        Python's `csv` module.

        Only the first item of `line_terminators` is used as 'lineterminator', and 'strict'
        is always `True`.
        """
        return {
            'delimiter': self.delimiter,
            'doublequote': self.doubleQuote,
            # We have to hack around incompatible ways escape char is interpreted in csvw
            # and python's csv lib:
            'escapechar': self.escape_character if not self.doubleQuote else None,
            'lineterminator': self.line_terminators[0],
            'quotechar': self.quoteChar,
            'skipinitialspace': self.skipInitialSpace,
            'strict': True,
        }