Tokenizer.cc
/*
 * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/* DEBUG: section 24 SBuf */

#include "squid.h"
#include "debug/Stream.h"
#include "parser/forward.h"
#include "parser/Tokenizer.h"
#include "sbuf/Stream.h"

#include <cctype>
#include <cerrno>
/// convenience method: consumes up to n bytes, counts, and returns them
SBuf
Parser::Tokenizer::consume(const SBuf::size_type n)
{
    // careful: n may be npos!
    debugs(24, 5, "consuming " << n << " bytes");
    const SBuf result = buf_.consume(n);
    parsed_ += result.length();
    return result;
}

/// convenience method: consume()s up to n bytes and returns their count
SBuf::size_type
Parser::Tokenizer::success(const SBuf::size_type n)
{
    return consume(n).length();
}

/// convenience method: consumes up to n last bytes and returns them
SBuf
Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
{
    debugs(24, 5, "consuming " << n << " bytes");

    // If n is npos, we consume everything from buf_ (and nothing from result).
    const SBuf::size_type parsed = (n == SBuf::npos) ? buf_.length() : n;

    SBuf result = buf_;
    buf_ = result.consume(buf_.length() - parsed);
    parsed_ += parsed;
    return result;
}

/// convenience method: consumes up to n last bytes and returns their count
SBuf::size_type
Parser::Tokenizer::successTrailing(const SBuf::size_type n)
{
    return consumeTrailing(n).length();
}

bool
Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
{
    const Tokenizer saved(*this);
    skipAll(delimiters);
    const SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
    if (tokenLen == SBuf::npos) {
        debugs(24, 8, "no token found for delimiters " << delimiters.name);
        *this = saved;
        return false;
    }
    returnedToken = consume(tokenLen); // cannot be empty
    skipAll(delimiters);
    debugs(24, DBG_DATA, "token found for delimiters " << delimiters.name << ": '" <<
           returnedToken << '\'');
    return true;
}
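
// Usage sketch (illustrative, not part of the original Tokenizer.cc): token()
// trims leading delimiters, extracts the next delimiter-free run, and then
// trims the delimiters that follow it. A hypothetical caller splitting a
// comma-separated list could look like this; the CharacterSet (label, chars)
// constructor and the SBuf C-string constructor are the standard Squid APIs.
//
//     static const CharacterSet commas("commas", ",");
//     Parser::Tokenizer tok(SBuf("alpha,,beta,gamma"));
//     SBuf item;
//     while (tok.token(item, commas))
//         debugs(24, 3, "got item: " << item); // alpha, then beta, then gamma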

bool
Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
    SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
    if (prefixLen == 0) {
        debugs(24, 8, "no prefix for set " << tokenChars.name);
        return false;
    }
    if (prefixLen == SBuf::npos && (atEnd() || limit == 0)) {
        debugs(24, 8, "no char in set " << tokenChars.name << " while looking for prefix");
        return false;
    }
    if (prefixLen == SBuf::npos && limit > 0) {
        debugs(24, 8, "whole haystack matched");
        prefixLen = limit;
    }
    debugs(24, 8, "found with length " << prefixLen);
    returnedToken = consume(prefixLen); // cannot be empty after the npos check
    return true;
}
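
// Usage sketch (illustrative, not part of the original file): prefix() consumes
// the longest leading run of tokenChars, up to limit bytes, and fails without
// consuming anything when the very first byte does not match. For example,
// pulling a decimal digit run off the front of the buffer with the predefined
// CharacterSet::DIGIT set:
//
//     Parser::Tokenizer tok(SBuf("8080/tcp"));
//     SBuf digits;
//     if (tok.prefix(digits, CharacterSet::DIGIT))
//         debugs(24, 3, "port digits: " << digits); // "8080"; "/tcp" remains unparsed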

SBuf
Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
    if (atEnd())
        throw InsufficientInput();

    SBuf result;

    if (!prefix(result, tokenChars, limit))
        throw TexcHere(ToSBuf("cannot parse ", description));

    if (atEnd())
        throw InsufficientInput();

    return result;
}
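
// Usage sketch (illustrative, not part of the original file): the throwing
// prefix() overload targets incremental parsers that may receive more data
// later. It throws InsufficientInput when the buffer is empty or when the match
// runs into the end of the buffer (more matching bytes may still arrive), and a
// TextException when the available bytes simply do not match. The buffer name
// below is hypothetical; CharacterSet::TCHAR is a standard predefined set.
//
//     Parser::Tokenizer tok(receivedSoFar); // receivedSoFar: caller-side SBuf
//     try {
//         const SBuf name = tok.prefix("header name", CharacterSet::TCHAR);
//         // ... continue parsing after a complete name ...
//     } catch (const Parser::InsufficientInput &) {
//         // buffer more input and retry from the saved state
//     }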

bool
Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
    SBuf span = buf_;

    if (limit < buf_.length())
        span.consume(buf_.length() - limit); // ignore the N prefix characters

    auto i = span.rbegin();
    SBuf::size_type found = 0;
    while (i != span.rend() && tokenChars[*i]) {
        ++i;
        ++found;
    }
    if (!found)
        return false;
    returnedToken = consumeTrailing(found);
    return true;
}
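
// Usage sketch (illustrative, not part of the original file): suffix() scans
// from the end of the buffer and consumes via consumeTrailing(), so it leaves
// the front of the buffer alone. For example, peeling a trailing digit run off
// "chunk-17":
//
//     Parser::Tokenizer tok(SBuf("chunk-17"));
//     SBuf serial;
//     if (tok.suffix(serial, CharacterSet::DIGIT))
//         debugs(24, 3, "serial: " << serial); // "17"; "chunk-" remains unparsed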

SBuf::size_type
Parser::Tokenizer::skipAll(const CharacterSet &tokenChars)
{
    const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
    if (prefixLen == 0) {
        debugs(24, 8, "no match when trying to skipAll " << tokenChars.name);
        return 0;
    }
    debugs(24, 8, "skipping all in " << tokenChars.name << " len " << prefixLen);
    return success(prefixLen);
}

bool
Parser::Tokenizer::skipOne(const CharacterSet &chars)
{
    if (!buf_.isEmpty() && chars[buf_[0]]) {
        debugs(24, 8, "skipping one-of " << chars.name);
        return success(1);
    }
    debugs(24, 8, "no match while skipping one-of " << chars.name);
    return false;
}

bool
Parser::Tokenizer::skipSuffix(const SBuf &tokenToSkip)
{
    if (buf_.length() < tokenToSkip.length())
        return false;

    SBuf::size_type offset = 0;
    if (tokenToSkip.length() < buf_.length())
        offset = buf_.length() - tokenToSkip.length();

    if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
        debugs(24, 8, "skipping " << tokenToSkip.length());
        return successTrailing(tokenToSkip.length());
    }
    return false;
}

bool
Parser::Tokenizer::skip(const SBuf &tokenToSkip)
{
    if (buf_.startsWith(tokenToSkip)) {
        debugs(24, 8, "skipping " << tokenToSkip.length());
        return success(tokenToSkip.length());
    }
    debugs(24, 8, "no match, not skipping '" << tokenToSkip << '\'');
    return false;
}

bool
Parser::Tokenizer::skip(const char tokenChar)
{
    if (!buf_.isEmpty() && buf_[0] == tokenChar) {
        debugs(24, 8, "skipping char '" << tokenChar << '\'');
        return success(1);
    }
    debugs(24, 8, "no match, not skipping char '" << tokenChar << '\'');
    return false;
}
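
// Usage sketch (illustrative, not part of the original file): the skip()
// overloads succeed only on an exact match at the start of the buffer and
// consume nothing otherwise, which makes them convenient between prefix()
// calls. A hypothetical key=value parse using predefined CharacterSets:
//
//     Parser::Tokenizer tok(SBuf("timeout=30"));
//     SBuf key, value;
//     if (tok.prefix(key, CharacterSet::ALPHA) &&
//             tok.skip('=') &&
//             tok.prefix(value, CharacterSet::DIGIT))
//         debugs(24, 3, key << " is " << value);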

bool
Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
{
    if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
        debugs(24, 8, "skipping one-of " << skippable.name);
        return successTrailing(1);
    }
    debugs(24, 8, "no match while skipping one-of " << skippable.name);
    return false;
}

SBuf::size_type
Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
{
    const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
    const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
                                      0 : (prefixEnd + 1);
    const SBuf::size_type suffixLen = buf_.length() - prefixLen;
    if (suffixLen == 0) {
        debugs(24, 8, "no match when trying to skip " << skippable.name);
        return 0;
    }
    debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
    return successTrailing(suffixLen);
}
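
// Usage sketch (illustrative, not part of the original file): the *Trailing
// variants operate on the end of the buffer, so skipAll() and skipAllTrailing()
// together trim surrounding whitespace without touching the middle. This
// assumes the predefined CharacterSet::WSP set and the Tokenizer::remaining()
// accessor for the yet-unparsed buffer.
//
//     Parser::Tokenizer tok(SBuf("  GET /index.html  "));
//     tok.skipAll(CharacterSet::WSP);         // drop leading spaces
//     tok.skipAllTrailing(CharacterSet::WSP); // drop trailing spaces
//     // tok.remaining() is now "GET /index.html"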

/* reworked from compat/strtoll.c */
bool
Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
{
    if (atEnd() || limit == 0)
        return false;

    const SBuf range(buf_.substr(0,limit));

    // XXX: account for buf_.size()
    bool neg = false;
    const char *s = range.rawContent();
    const char *end = range.rawContent() + range.length();

    if (allowSign) {
        if (*s == '-') {
            neg = true;
            ++s;
        } else if (*s == '+') {
            ++s;
        }
        if (s >= end) return false;
    }
    if (( base == 0 || base == 16) && *s == '0' && (s+1 < end ) &&
            tolower(*(s+1)) == 'x') {
        s += 2;
        base = 16;
    }
    if (base == 0) {
        if ( *s == '0') {
            base = 8;
            ++s;
        } else {
            base = 10;
        }
    }
    if (s >= end) return false;

    uint64_t cutoff;

    cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
    const int cutlim = cutoff % static_cast<int64_t>(base);
    cutoff /= static_cast<uint64_t>(base);

    int any = 0, c;
    int64_t acc = 0;
    do {
        c = *s;
        if (xisdigit(c)) {
            c -= '0';
        } else if (xisalpha(c)) {
            c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
        } else {
            break;
        }
        if (c >= base)
            break;
        if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
            any = -1;
        else {
            any = 1;
            acc *= base;
            acc += c;
        }
    } while (++s < end);

    if (any == 0) // nothing was parsed
        return false;
    if (any < 0) {
        acc = neg ? INT64_MIN : INT64_MAX;
        errno = ERANGE;
        return false;
    } else if (neg)
        acc = -acc;

    result = acc;
    return success(s - range.rawContent());
}
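
// Usage sketch (illustrative, not part of the original file): int64() follows
// strtoll() conventions, so base 0 auto-detects a "0x" hex prefix and a
// leading-zero octal prefix, and a failed parse leaves the buffer untouched:
//
//     Parser::Tokenizer tok(SBuf("0x1Fremainder"));
//     int64_t value = 0;
//     if (tok.int64(value, 0))
//         debugs(24, 3, "parsed " << value); // 31; "remainder" stays in the buffer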

int64_t
Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit)
{
    if (atEnd())
        throw InsufficientInput();

    int64_t result = 0;

    // Since we only support unsigned decimals, a parsing failure with a
    // non-empty input always implies invalid/malformed input (or a buggy
    // limit=0 caller). TODO: Support signed and non-decimal integers by
    // refactoring int64() to detect insufficient input.
    if (!int64(result, 10, false, limit))
        throw TexcHere(ToSBuf("cannot parse ", description));

    if (atEnd())
        throw InsufficientInput(); // more digits may be coming

    return result;
}
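
// Usage sketch (illustrative, not part of the original file): udec64() suits
// incremental parsers such as message-size parsing. It throws InsufficientInput
// when trailing digits might still be in flight, so callers typically rely on a
// known terminator or pass an explicit limit:
//
//     Parser::Tokenizer tok(SBuf("1024\r\nPAYLOAD"));
//     try {
//         const int64_t size = tok.udec64("payload size");
//         debugs(24, 3, "size: " << size); // 1024; "\r\n" terminates the digits
//     } catch (const Parser::InsufficientInput &) {
//         // wait for more bytes before deciding
//     }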