From 21fd30b4002bc9f5b9dc08aacd492c5e9970c36c Mon Sep 17 00:00:00 2001
From: Benjamin Dauvergne
Date: Sat, 11 Apr 2020 12:54:37 +0200
Subject: [PATCH 2/2] csvdatasource: backports fix on csv.Sniffer (#41612)

https://bugs.python.org/issue30157
---
 passerelle/apps/csvdatasource/models.py | 156 ++++++++++++++++++++++++
 1 file changed, 156 insertions(+)

Reviewer notes (this area is not part of the commit):
* At import time the module replaces csv.Sniffer._guess_quote_and_delimiter
  when running on Python < 2.7.15, or on Python 3 < 3.7.
* Both variants return a 4-tuple
  (quotechar, doublequote, delimiter, skipinitialspace), or
  ('', False, None, 0) when no quoted text is found.
* The four sniffing patterns rely on the named groups "delim", "space" and
  "quote": the code reads them back through regexp.groupindex and the
  patterns themselves use the (?P=quote) / (?P=delim) backreferences.  The
  group names must therefore survive any transport of this patch; a pattern
  reading "(?P[" instead of "(?P<delim>[" does not compile.

diff --git a/passerelle/apps/csvdatasource/models.py b/passerelle/apps/csvdatasource/models.py
index ff32166f..8ed84903 100644
--- a/passerelle/apps/csvdatasource/models.py
+++ b/passerelle/apps/csvdatasource/models.py
@@ -17,6 +17,7 @@
 import io
 import os
 import re
+import sys
 import csv
 
 from collections import OrderedDict
@@ -46,6 +47,161 @@ identifier_re = re.compile(r"^[^\d\W]\w*\Z", re.UNICODE)
 
 code_cache = OrderedDict()
 
+# Backport of https://bugs.python.org/issue30157
+if sys.version_info < (2, 7, 15):
+    def _guess_quote_and_delimiter(self, data, delimiters):
+        """
+        Looks for text enclosed between two identical quotes
+        (the probable quotechar) which are preceded and followed
+        by the same character (the probable delimiter).
+        For example:
+                         ,'some text',
+        The quote with the most wins, same with the delimiter.
+        If there is no quotechar the delimiter can't be determined
+        this way.
+        """
+
+        matches = []
+        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
+                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
+                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
+                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
+            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
+            matches = regexp.findall(data)
+            if matches:
+                break
+
+        if not matches:
+            # (quotechar, doublequote, delimiter, skipinitialspace)
+            return ('', False, None, 0)
+        quotes = {}
+        delims = {}
+        spaces = 0
+        for m in matches:
+            n = regexp.groupindex['quote'] - 1
+            key = m[n]
+            if key:
+                quotes[key] = quotes.get(key, 0) + 1
+            try:
+                n = regexp.groupindex['delim'] - 1
+                key = m[n]
+            except KeyError:
+                continue
+            if key and (delimiters is None or key in delimiters):
+                delims[key] = delims.get(key, 0) + 1
+            try:
+                n = regexp.groupindex['space'] - 1
+            except KeyError:
+                continue
+            if m[n]:
+                spaces += 1
+
+        quotechar = reduce(lambda a, b, quotes = quotes:
+                           (quotes[a] > quotes[b]) and a or b, quotes.keys())
+
+        if delims:
+            delim = reduce(lambda a, b, delims = delims:
+                           (delims[a] > delims[b]) and a or b, delims.keys())
+            skipinitialspace = delims[delim] == spaces
+            if delim == '\n': # most likely a file with a single column
+                delim = ''
+        else:
+            # there is *no* delimiter, it's a single column of quoted data
+            delim = ''
+            skipinitialspace = 0
+
+        # if we see an extra quote between delimiters, we've got a
+        # double quoted format
+        dq_regexp = re.compile(
+                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
+                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
+
+
+
+        if dq_regexp.search(data):
+            doublequote = True
+        else:
+            doublequote = False
+
+        return (quotechar, doublequote, delim, skipinitialspace)
+    csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter
+elif six.PY3 and sys.version_info < (3, 7):
+    def _guess_quote_and_delimiter(self, data, delimiters):
+        """
+        Looks for text enclosed between two identical quotes
+        (the probable quotechar) which are preceded and followed
+        by the same character (the probable delimiter).
+        For example:
+                         ,'some text',
+        The quote with the most wins, same with the delimiter.
+        If there is no quotechar the delimiter can't be determined
+        this way.
+        """
+
+        matches = []
+        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
+                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
+                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
+                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
+            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
+            matches = regexp.findall(data)
+            if matches:
+                break
+
+        if not matches:
+            # (quotechar, doublequote, delimiter, skipinitialspace)
+            return ('', False, None, 0)
+        quotes = {}
+        delims = {}
+        spaces = 0
+        groupindex = regexp.groupindex
+        for m in matches:
+            n = groupindex['quote'] - 1
+            key = m[n]
+            if key:
+                quotes[key] = quotes.get(key, 0) + 1
+            try:
+                n = groupindex['delim'] - 1
+                key = m[n]
+            except KeyError:
+                continue
+            if key and (delimiters is None or key in delimiters):
+                delims[key] = delims.get(key, 0) + 1
+            try:
+                n = groupindex['space'] - 1
+            except KeyError:
+                continue
+            if m[n]:
+                spaces += 1
+
+        quotechar = max(quotes, key=quotes.get)
+
+        if delims:
+            delim = max(delims, key=delims.get)
+            skipinitialspace = delims[delim] == spaces
+            if delim == '\n': # most likely a file with a single column
+                delim = ''
+        else:
+            # there is *no* delimiter, it's a single column of quoted data
+            delim = ''
+            skipinitialspace = 0
+
+        # if we see an extra quote between delimiters, we've got a
+        # double quoted format
+        dq_regexp = re.compile(
+                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
+                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
+
+
+
+        if dq_regexp.search(data):
+            doublequote = True
+        else:
+            doublequote = False
+
+        return (quotechar, doublequote, delim, skipinitialspace)
+    csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter
+
 
 def get_code(expr):
     # limit size of code cache to 1024
-- 
2.24.0