Projet

Général

Profil

0002-csvdatasource-backports-fix-on-csv.Sniffer-41612.patch

Benjamin Dauvergne, 11 avril 2020 13:43

Télécharger (7,07 ko)

Voir les différences:

Subject: [PATCH 2/2] csvdatasource: backports fix on csv.Sniffer (#41612)

  https://bugs.python.org/issue30157
 passerelle/apps/csvdatasource/models.py | 156 ++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
passerelle/apps/csvdatasource/models.py
17 17
import io
18 18
import os
19 19
import re
20
import sys
20 21
import csv
21 22
from collections import OrderedDict
22 23

  
......
46 47

  
47 48
code_cache = OrderedDict()
48 49

  
50
# Backport of https://bugs.python.org/issue30157
51
def _guess_quote_and_delimiter(self, data, delimiters):
    """Guess the quote character and delimiter of a CSV sample.

    Looks for text enclosed between two identical quotes (the probable
    quotechar) which are preceded and followed by the same character
    (the probable delimiter).  For example:

                     ,'some text',

    The quote with the most occurrences wins, same with the delimiter.
    If there is no quotechar the delimiter can't be determined this way.

    This is a single implementation shared by the Python 2 and Python 3
    backports of https://bugs.python.org/issue30157; it only relies on
    constructs available on both.

    :param self: the ``csv.Sniffer`` instance (not read by this function).
    :param data: text sample to analyse.
    :param delimiters: container of acceptable delimiter characters, or
        ``None`` to accept any candidate.
    :return: tuple ``(quotechar, doublequote, delimiter, skipinitialspace)``;
        ``('', False, None, 0)`` when no quoted text is found.
    """
    matches = []
    # Patterns are tried from the most to the least specific one; the first
    # one that matches anything is used.  Raw strings keep ``\w`` from being
    # read as a (deprecated) string escape.
    for restr in (
        r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",
        r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',  #  ".*?",
        r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
        r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  #  ".*?" (no delim, no space)
    ):
        regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
        matches = regexp.findall(data)
        if matches:
            break

    if not matches:
        # (quotechar, doublequote, delimiter, skipinitialspace)
        return ('', False, None, 0)

    # Group positions depend only on the pattern that matched, so resolve
    # them once instead of on every match.  The last pattern has no
    # 'delim'/'space' group at all.
    groupindex = regexp.groupindex
    quote_idx = groupindex['quote'] - 1
    delim_idx = groupindex['delim'] - 1 if 'delim' in groupindex else None
    space_idx = groupindex['space'] - 1 if 'space' in groupindex else None

    quotes = {}
    delims = {}
    spaces = 0
    for m in matches:
        # With a single group findall() yields plain strings; indexing
        # position 0 of that one-character string still gives the quote.
        key = m[quote_idx]
        if key:
            quotes[key] = quotes.get(key, 0) + 1
        if delim_idx is None:
            continue
        key = m[delim_idx]
        if key and (delimiters is None or key in delimiters):
            delims[key] = delims.get(key, 0) + 1
        if space_idx is None:
            continue
        if m[space_idx]:
            spaces += 1

    quotechar = max(quotes, key=quotes.get)

    if delims:
        delim = max(delims, key=delims.get)
        # Only skip initial spaces when every occurrence of the winning
        # delimiter was followed by one.
        skipinitialspace = delims[delim] == spaces
        if delim == '\n':  # most likely a file with a single column
            delim = ''
    else:
        # there is *no* delimiter, it's a single column of quoted data
        delim = ''
        skipinitialspace = 0

    # if we see an extra quote between delimiters, we've got a
    # double quoted format
    dq_regexp = re.compile(
        r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)"
        % {'delim': re.escape(delim), 'quote': quotechar},
        re.MULTILINE,
    )
    doublequote = bool(dq_regexp.search(data))

    return (quotechar, doublequote, delim, skipinitialspace)


# Only interpreters that lack the upstream fix get patched: Python 2 before
# 2.7.15 and Python 3 before 3.7.  The major version is read from
# sys.version_info directly, so this check needs nothing beyond `sys`.
if sys.version_info < (2, 7, 15) or (3,) <= sys.version_info < (3, 7):
    csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter
204

  
49 205

  
50 206
def get_code(expr):
51 207
    # limit size of code cache to 1024
52
-