From 21fd30b4002bc9f5b9dc08aacd492c5e9970c36c Mon Sep 17 00:00:00 2001
From: Benjamin Dauvergne <bdauvergne@entrouvert.com>
Date: Sat, 11 Apr 2020 12:54:37 +0200
Subject: [PATCH 2/2] csvdatasource: backports fix on csv.Sniffer (#41612)

  https://bugs.python.org/issue30157
---
 passerelle/apps/csvdatasource/models.py | 156 ++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
diff --git a/passerelle/apps/csvdatasource/models.py b/passerelle/apps/csvdatasource/models.py
index ff32166f..8ed84903 100644
--- a/passerelle/apps/csvdatasource/models.py
+++ b/passerelle/apps/csvdatasource/models.py
@@ -17,6 +17,7 @@
 import io
 import os
 import re
+import sys
 import csv
 from collections import OrderedDict
 
@@ -46,6 +47,161 @@ identifier_re = re.compile(r"^[^\d\W]\w*\Z", re.UNICODE)
 
 code_cache = OrderedDict()
 
+# Backport of https://bugs.python.org/issue30157
+if sys.version_info < (2, 7, 15):
+    def _guess_quote_and_delimiter(self, data, delimiters):
+        """
+        Looks for text enclosed between two identical quotes
+        (the probable quotechar) which are preceded and followed
+        by the same character (the probable delimiter).
+        For example:
+                         ,'some text',
+        The quote with the most wins, same with the delimiter.
+        If there is no quotechar the delimiter can't be determined
+        this way.
+        """
+
+        matches = []
+        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
+                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
+                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
+                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
+            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
+            matches = regexp.findall(data)
+            if matches:
+                break
+
+        if not matches:
+            # (quotechar, doublequote, delimiter, skipinitialspace)
+            return ('', False, None, 0)
+        quotes = {}
+        delims = {}
+        spaces = 0
+        for m in matches:
+            n = regexp.groupindex['quote'] - 1
+            key = m[n]
+            if key:
+                quotes[key] = quotes.get(key, 0) + 1
+            try:
+                n = regexp.groupindex['delim'] - 1
+                key = m[n]
+            except KeyError:
+                continue
+            if key and (delimiters is None or key in delimiters):
+                delims[key] = delims.get(key, 0) + 1
+            try:
+                n = regexp.groupindex['space'] - 1
+            except KeyError:
+                continue
+            if m[n]:
+                spaces += 1
+
+        quotechar = reduce(lambda a, b, quotes = quotes:
+                           (quotes[a] > quotes[b]) and a or b, quotes.keys())
+
+        if delims:
+            delim = reduce(lambda a, b, delims = delims:
+                           (delims[a] > delims[b]) and a or b, delims.keys())
+            skipinitialspace = delims[delim] == spaces
+            if delim == '\n': # most likely a file with a single column
+                delim = ''
+        else:
+            # there is *no* delimiter, it's a single column of quoted data
+            delim = ''
+            skipinitialspace = 0
+
+        # if we see an extra quote between delimiters, we've got a
+        # double quoted format
+        dq_regexp = re.compile(
+                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
+                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
+
+
+
+        if dq_regexp.search(data):
+            doublequote = True
+        else:
+            doublequote = False
+
+        return (quotechar, doublequote, delim, skipinitialspace)
+    csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter
+elif six.PY3 and sys.version_info < (3, 7):
+    def _guess_quote_and_delimiter(self, data, delimiters):
+        """
+        Looks for text enclosed between two identical quotes
+        (the probable quotechar) which are preceded and followed
+        by the same character (the probable delimiter).
+        For example:
+                         ,'some text',
+        The quote with the most wins, same with the delimiter.
+        If there is no quotechar the delimiter can't be determined
+        this way.
+        """
+
+        matches = []
+        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
+                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
+                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
+                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
+            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
+            matches = regexp.findall(data)
+            if matches:
+                break
+
+        if not matches:
+            # (quotechar, doublequote, delimiter, skipinitialspace)
+            return ('', False, None, 0)
+        quotes = {}
+        delims = {}
+        spaces = 0
+        groupindex = regexp.groupindex
+        for m in matches:
+            n = groupindex['quote'] - 1
+            key = m[n]
+            if key:
+                quotes[key] = quotes.get(key, 0) + 1
+            try:
+                n = groupindex['delim'] - 1
+                key = m[n]
+            except KeyError:
+                continue
+            if key and (delimiters is None or key in delimiters):
+                delims[key] = delims.get(key, 0) + 1
+            try:
+                n = groupindex['space'] - 1
+            except KeyError:
+                continue
+            if m[n]:
+                spaces += 1
+
+        quotechar = max(quotes, key=quotes.get)
+
+        if delims:
+            delim = max(delims, key=delims.get)
+            skipinitialspace = delims[delim] == spaces
+            if delim == '\n': # most likely a file with a single column
+                delim = ''
+        else:
+            # there is *no* delimiter, it's a single column of quoted data
+            delim = ''
+            skipinitialspace = 0
+
+        # if we see an extra quote between delimiters, we've got a
+        # double quoted format
+        dq_regexp = re.compile(
+                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
+                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
+
+
+
+        if dq_regexp.search(data):
+            doublequote = True
+        else:
+            doublequote = False
+
+        return (quotechar, doublequote, delim, skipinitialspace)
+    csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter
+
 
 def get_code(expr):
     # limit size of code cache to 1024
-- 
2.24.0