Tokenizer.cc
Go to the documentation of this file.
1/*
2 * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9/* DEBUG: section 24 SBuf */
10
11#include "squid.h"
12#include "debug/Stream.h"
13#include "parser/forward.h"
14#include "parser/Tokenizer.h"
15#include "sbuf/Stream.h"
16
17#include <cerrno>
18#if HAVE_CTYPE_H
19#include <ctype.h>
20#endif
21
23SBuf
25{
26 // careful: n may be npos!
27 debugs(24, 5, "consuming " << n << " bytes");
28 const SBuf result = buf_.consume(n);
29 parsed_ += result.length();
30 return result;
31}
32
36{
37 return consume(n).length();
38}
39
41SBuf
43{
44 debugs(24, 5, "consuming " << n << " bytes");
45
46 // If n is npos, we consume everything from buf_ (and nothing from result).
47 const SBuf::size_type parsed = (n == SBuf::npos) ? buf_.length() : n;
48
49 SBuf result = buf_;
50 buf_ = result.consume(buf_.length() - parsed);
51 parsed_ += parsed;
52 return result;
53}
54
58{
59 return consumeTrailing(n).length();
60}
61
62bool
63Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
64{
65 const Tokenizer saved(*this);
66 skipAll(delimiters);
67 const SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
68 if (tokenLen == SBuf::npos) {
69 debugs(24, 8, "no token found for delimiters " << delimiters.name);
70 *this = saved;
71 return false;
72 }
73 returnedToken = consume(tokenLen); // cannot be empty
74 skipAll(delimiters);
75 debugs(24, DBG_DATA, "token found for delimiters " << delimiters.name << ": '" <<
76 returnedToken << '\'');
77 return true;
78}
79
80bool
81Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
82{
83 SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
84 if (prefixLen == 0) {
85 debugs(24, 8, "no prefix for set " << tokenChars.name);
86 return false;
87 }
88 if (prefixLen == SBuf::npos && (atEnd() || limit == 0)) {
89 debugs(24, 8, "no char in set " << tokenChars.name << " while looking for prefix");
90 return false;
91 }
92 if (prefixLen == SBuf::npos && limit > 0) {
93 debugs(24, 8, "whole haystack matched");
94 prefixLen = limit;
95 }
96 debugs(24, 8, "found with length " << prefixLen);
97 returnedToken = consume(prefixLen); // cannot be empty after the npos check
98 return true;
99}
100
101SBuf
102Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit)
103{
104 if (atEnd())
105 throw InsufficientInput();
106
107 SBuf result;
108
109 if (!prefix(result, tokenChars, limit))
110 throw TexcHere(ToSBuf("cannot parse ", description));
111
112 if (atEnd())
113 throw InsufficientInput();
114
115 return result;
116}
117
118bool
119Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
120{
121 SBuf span = buf_;
122
123 if (limit < buf_.length())
124 span.consume(buf_.length() - limit); // ignore the N prefix characters
125
126 auto i = span.rbegin();
127 SBuf::size_type found = 0;
128 while (i != span.rend() && tokenChars[*i]) {
129 ++i;
130 ++found;
131 }
132 if (!found)
133 return false;
134 returnedToken = consumeTrailing(found);
135 return true;
136}
137
140{
141 const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
142 if (prefixLen == 0) {
143 debugs(24, 8, "no match when trying to skipAll " << tokenChars.name);
144 return 0;
145 }
146 debugs(24, 8, "skipping all in " << tokenChars.name << " len " << prefixLen);
147 return success(prefixLen);
148}
149
150bool
152{
153 if (!buf_.isEmpty() && chars[buf_[0]]) {
154 debugs(24, 8, "skipping one-of " << chars.name);
155 return success(1);
156 }
157 debugs(24, 8, "no match while skipping one-of " << chars.name);
158 return false;
159}
160
161bool
163{
164 if (buf_.length() < tokenToSkip.length())
165 return false;
166
167 SBuf::size_type offset = 0;
168 if (tokenToSkip.length() < buf_.length())
169 offset = buf_.length() - tokenToSkip.length();
170
171 if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
172 debugs(24, 8, "skipping " << tokenToSkip.length());
173 return successTrailing(tokenToSkip.length());
174 }
175 return false;
176}
177
178bool
179Parser::Tokenizer::skip(const SBuf &tokenToSkip)
180{
181 if (buf_.startsWith(tokenToSkip)) {
182 debugs(24, 8, "skipping " << tokenToSkip.length());
183 return success(tokenToSkip.length());
184 }
185 debugs(24, 8, "no match, not skipping '" << tokenToSkip << '\'');
186 return false;
187}
188
189bool
190Parser::Tokenizer::skip(const char tokenChar)
191{
192 if (!buf_.isEmpty() && buf_[0] == tokenChar) {
193 debugs(24, 8, "skipping char '" << tokenChar << '\'');
194 return success(1);
195 }
196 debugs(24, 8, "no match, not skipping char '" << tokenChar << '\'');
197 return false;
198}
199
200bool
202{
203 if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
204 debugs(24, 8, "skipping one-of " << skippable.name);
205 return successTrailing(1);
206 }
207 debugs(24, 8, "no match while skipping one-of " << skippable.name);
208 return false;
209}
210
213{
214 const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
215 const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
216 0 : (prefixEnd + 1);
217 const SBuf::size_type suffixLen = buf_.length() - prefixLen;
218 if (suffixLen == 0) {
219 debugs(24, 8, "no match when trying to skip " << skippable.name);
220 return 0;
221 }
222 debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
223 return successTrailing(suffixLen);
224}
225
226/* reworked from compat/strtoll.c */
227bool
228Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
229{
230 if (atEnd() || limit == 0)
231 return false;
232
233 const SBuf range(buf_.substr(0,limit));
234
235 // XXX: account for buf_.size()
236 bool neg = false;
237 const char *s = range.rawContent();
238 const char *end = range.rawContent() + range.length();
239
240 if (allowSign) {
241 if (*s == '-') {
242 neg = true;
243 ++s;
244 } else if (*s == '+') {
245 ++s;
246 }
247 if (s >= end) return false;
248 }
249 if (( base == 0 || base == 16) && *s == '0' && (s+1 < end ) &&
250 tolower(*(s+1)) == 'x') {
251 s += 2;
252 base = 16;
253 }
254 if (base == 0) {
255 if ( *s == '0') {
256 base = 8;
257 ++s;
258 } else {
259 base = 10;
260 }
261 }
262 if (s >= end) return false;
263
264 uint64_t cutoff;
265
266 cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
267 const int cutlim = cutoff % static_cast<int64_t>(base);
268 cutoff /= static_cast<uint64_t>(base);
269
270 int any = 0, c;
271 int64_t acc = 0;
272 do {
273 c = *s;
274 if (xisdigit(c)) {
275 c -= '0';
276 } else if (xisalpha(c)) {
277 c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
278 } else {
279 break;
280 }
281 if (c >= base)
282 break;
283 if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
284 any = -1;
285 else {
286 any = 1;
287 acc *= base;
288 acc += c;
289 }
290 } while (++s < end);
291
292 if (any == 0) // nothing was parsed
293 return false;
294 if (any < 0) {
295 acc = neg ? INT64_MIN : INT64_MAX;
296 errno = ERANGE;
297 return false;
298 } else if (neg)
299 acc = -acc;
300
301 result = acc;
302 return success(s - range.rawContent());
303}
304
305int64_t
306Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit)
307{
308 if (atEnd())
309 throw InsufficientInput();
310
311 int64_t result = 0;
312
313 // Since we only support unsigned decimals, a parsing failure with a
314 // non-empty input always implies invalid/malformed input (or a buggy
315 // limit=0 caller). TODO: Support signed and non-decimal integers by
316 // refactoring int64() to detect insufficient input.
317 if (!int64(result, 10, false, limit))
318 throw TexcHere(ToSBuf("cannot parse ", description));
319
320 if (atEnd())
321 throw InsufficientInput(); // more digits may be coming
322
323 return result;
324}
325
#define TexcHere(msg)
legacy convenience macro; it is not difficult to type Here() now
Definition: TextException.h:59
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:18
const char * name
optional set label for debugging (default: "anonymous")
Definition: CharacterSet.h:72
thrown by modern "incremental" parsers when they need more data
Definition: forward.h:18
SBuf::size_type skipAllTrailing(const CharacterSet &discardables)
Definition: Tokenizer.cc:212
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:81
SBuf buf_
yet unparsed input
Definition: Tokenizer.h:169
bool suffix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:119
SBuf::size_type successTrailing(const SBuf::size_type n)
convenience method: consumes up to n last bytes and returns their count
Definition: Tokenizer.cc:57
bool skipOne(const CharacterSet &discardables)
Definition: Tokenizer.cc:151
bool token(SBuf &returnedToken, const CharacterSet &delimiters)
Definition: Tokenizer.cc:63
SBuf consumeTrailing(const SBuf::size_type n)
convenience method: consumes up to n last bytes and returns them
Definition: Tokenizer.cc:42
SBuf::size_type success(const SBuf::size_type n)
convenience method: consume()s up to n bytes and returns their count
Definition: Tokenizer.cc:35
bool skipSuffix(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:162
SBuf::size_type skipAll(const CharacterSet &discardables)
Definition: Tokenizer.cc:139
bool int64(int64_t &result, int base=0, bool allowSign=true, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:228
SBuf consume(const SBuf::size_type n)
convenience method: consumes up to n bytes, counts, and returns them
Definition: Tokenizer.cc:24
bool skipOneTrailing(const CharacterSet &discardables)
Definition: Tokenizer.cc:201
SBuf::size_type parsed_
bytes successfully parsed, including skipped
Definition: Tokenizer.h:170
int64_t udec64(const char *description, SBuf::size_type limit=SBuf::npos)
int64() wrapper but limited to unsigned decimal integers (for now)
Definition: Tokenizer.cc:306
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:179
Definition: SBuf.h:94
const char * rawContent() const
Definition: SBuf.cc:509
static const size_type npos
Definition: SBuf.h:99
SBuf consume(size_type n=npos)
Definition: SBuf.cc:481
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:415
const_reverse_iterator rbegin() const
Definition: SBuf.h:591
MemBlob::size_type size_type
Definition: SBuf.h:96
const_reverse_iterator rend() const
Definition: SBuf.h:595
#define DBG_DATA
Definition: Stream.h:40
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Stream.h:193
SBuf ToSBuf(Args &&... args)
slowly stream-prints all arguments into a freshly allocated SBuf
Definition: Stream.h:63
#define INT64_MIN
Definition: types.h:79
#define INT64_MAX
Definition: types.h:89
#define xisupper(x)
Definition: xis.h:28
#define xisalpha(x)
Definition: xis.h:23
#define xisdigit(x)
Definition: xis.h:20

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors