0002-csvdatasource-backports-fix-on-csv.Sniffer-41612.patch

Benjamin Dauvergne, 11 avril 2020 13:43

Voir les différences: en ligne côte à côte

Subject: [PATCH 2/2] csvdatasource: backports fix on csv.Sniffer (#41612)

  https://bugs.python.org/issue30157

 passerelle/apps/csvdatasource/models.py | 156 ++++++++++++++++++++++++
 1 file changed, 156 insertions(+)

     import io
     import os
     import re
     import sys
     import csv
     from collections import OrderedDict
-...
     code_cache = OrderedDict()
     # Backport of https://bugs.python.org/issue30157
     if sys.version_info < (2, 7, 15):
         def _guess_quote_and_delimiter(self, data, delimiters):
             """
             Looks for text enclosed between two identical quotes
             (the probable quotechar) which are preceded and followed
             by the same character (the probable delimiter).
             For example:
                              ,'some text',
             The quote with the most wins, same with the delimiter.
             If there is no quotechar the delimiter can't be determined
             this way.
             """
             matches = []
             for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                           '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                           '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                           '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
                 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
                 matches = regexp.findall(data)
                 if matches:
                     break
             if not matches:
                 # (quotechar, doublequote, delimiter, skipinitialspace)
                 return ('', False, None, 0)
             quotes = {}
             delims = {}
             spaces = 0
             for m in matches:
                 n = regexp.groupindex['quote'] - 1
                 key = m[n]
                 if key:
                     quotes[key] = quotes.get(key, 0) + 1
                 try:
                     n = regexp.groupindex['delim'] - 1
                     key = m[n]
                 except KeyError:
                     continue
                 if key and (delimiters is None or key in delimiters):
                     delims[key] = delims.get(key, 0) + 1
                 try:
                     n = regexp.groupindex['space'] - 1
                 except KeyError:
                     continue
                 if m[n]:
                     spaces += 1
             quotechar = reduce(lambda a, b, quotes = quotes:
                                (quotes[a] > quotes[b]) and a or b, quotes.keys())
             if delims:
                 delim = reduce(lambda a, b, delims = delims:
                                (delims[a] > delims[b]) and a or b, delims.keys())
                 skipinitialspace = delims[delim] == spaces
                 if delim == '\n': # most likely a file with a single column
                     delim = ''
             else:
                 # there is *no* delimiter, it's a single column of quoted data
                 delim = ''
                 skipinitialspace = 0
             # if we see an extra quote between delimiters, we've got a
             # double quoted format
             dq_regexp = re.compile(
                                    r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                                    {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
             if dq_regexp.search(data):
                 doublequote = True
             else:
                 doublequote = False
             return (quotechar, doublequote, delim, skipinitialspace)
         csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter
     elif six.PY3 and sys.version_info < (3, 7):
         def _guess_quote_and_delimiter(self, data, delimiters):
             """
             Looks for text enclosed between two identical quotes
             (the probable quotechar) which are preceded and followed
             by the same character (the probable delimiter).
             For example:
                              ,'some text',
             The quote with the most wins, same with the delimiter.
             If there is no quotechar the delimiter can't be determined
             this way.
             """
             matches = []
             for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                           r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                           r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                           r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
                 regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
                 matches = regexp.findall(data)
                 if matches:
                     break
             if not matches:
                 # (quotechar, doublequote, delimiter, skipinitialspace)
                 return ('', False, None, 0)
             quotes = {}
             delims = {}
             spaces = 0
             groupindex = regexp.groupindex
             for m in matches:
                 n = groupindex['quote'] - 1
                 key = m[n]
                 if key:
                     quotes[key] = quotes.get(key, 0) + 1
                 try:
                     n = groupindex['delim'] - 1
                     key = m[n]
                 except KeyError:
                     continue
                 if key and (delimiters is None or key in delimiters):
                     delims[key] = delims.get(key, 0) + 1
                 try:
                     n = groupindex['space'] - 1
                 except KeyError:
                     continue
                 if m[n]:
                     spaces += 1
             quotechar = max(quotes, key=quotes.get)
             if delims:
                 delim = max(delims, key=delims.get)
                 skipinitialspace = delims[delim] == spaces
                 if delim == '\n': # most likely a file with a single column
                     delim = ''
             else:
                 # there is *no* delimiter, it's a single column of quoted data
                 delim = ''
                 skipinitialspace = 0
             # if we see an extra quote between delimiters, we've got a
             # double quoted format
             dq_regexp = re.compile(
                                    r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                                    {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
             if dq_regexp.search(data):
                 doublequote = True
             else:
                 doublequote = False
             return (quotechar, doublequote, delim, skipinitialspace)
         csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter
     def get_code(expr):
         # limit size of code cache to 1024
+    -

Projet

Général

Profil

Produits Entr'ouvert » Passerelle

0002-csvdatasource-backports-fix-on-csv.Sniffer-41612.patch