Tokenizer.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 /* DEBUG: section 24 SBuf */
10 
11 #include "squid.h"
12 #include "Debug.h"
13 #include "parser/forward.h"
14 #include "parser/Tokenizer.h"
15 #include "sbuf/Stream.h"
16 
17 #include <cerrno>
18 #if HAVE_CTYPE_H
19 #include <ctype.h>
20 #endif
21 
23 SBuf
// Parser::Tokenizer::consume(n): removes and returns up to n bytes from the
// front of the unparsed buffer, counting them as parsed. (The signature line
// is missing from this rendering; see the member index entry for consume().)
25 {
26  // careful: n may be npos!
// npos asks SBuf::consume() to take the whole remaining buffer
27  debugs(24, 5, "consuming " << n << " bytes");
28  const SBuf result = buf_.consume(n);
// credit only the bytes actually removed (may be fewer than n)
29  parsed_ += result.length();
30  return result;
31 }
32 
// success(n): consume() up to n bytes and report how many were actually taken.
// (The signature line is missing from this rendering; per the member index this
// is Parser::Tokenizer::success(), defined at Tokenizer.cc:35.)
36 {
37  return consume(n).length();
38 }
39 
41 SBuf
// consumeTrailing(n): removes and returns up to n bytes from the END of the
// unparsed buffer, counting them as parsed. (The signature line is missing
// from this rendering; see the member index entry for consumeTrailing().)
43 {
44  debugs(24, 5, "consuming " << n << " bytes");
45 
46  // If n is npos, we consume everything from buf_ (and nothing from result).
47  const SBuf::size_type parsed = (n == SBuf::npos) ? buf_.length() : n;
48 
// result starts as the whole buffer; consuming its leading part both yields
// the new (shorter) buf_ and leaves result holding only the trailing bytes.
// NOTE(review): assumes n <= buf_.length() when n != npos; all callers in this
// file (suffix, skipSuffix, skipOneTrailing, skipAllTrailing) satisfy that.
49  SBuf result = buf_;
50  buf_ = result.consume(buf_.length() - parsed);
51  parsed_ += parsed;
52  return result;
53 }
54 
// successTrailing(n): consumeTrailing() up to n last bytes and report the
// count taken. (The signature line is missing from this rendering; per the
// member index this is Parser::Tokenizer::successTrailing(), Tokenizer.cc:57.)
58 {
59  return consumeTrailing(n).length();
60 }
61 
62 bool
63 Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
64 {
65  const Tokenizer saved(*this);
66  skipAll(delimiters);
67  const SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
68  if (tokenLen == SBuf::npos) {
69  debugs(24, 8, "no token found for delimiters " << delimiters.name);
70  *this = saved;
71  return false;
72  }
73  returnedToken = consume(tokenLen); // cannot be empty
74  skipAll(delimiters);
75  debugs(24, DBG_DATA, "token found for delimiters " << delimiters.name << ": '" <<
76  returnedToken << '\'');
77  return true;
78 }
79 
80 bool
81 Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
82 {
83  SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
84  if (prefixLen == 0) {
85  debugs(24, 8, "no prefix for set " << tokenChars.name);
86  return false;
87  }
88  if (prefixLen == SBuf::npos && (atEnd() || limit == 0)) {
89  debugs(24, 8, "no char in set " << tokenChars.name << " while looking for prefix");
90  return false;
91  }
92  if (prefixLen == SBuf::npos && limit > 0) {
93  debugs(24, 8, "whole haystack matched");
94  prefixLen = limit;
95  }
96  debugs(24, 8, "found with length " << prefixLen);
97  returnedToken = consume(prefixLen); // cannot be empty after the npos check
98  return true;
99 }
100 
101 SBuf
102 Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit)
103 {
104  if (atEnd())
105  throw InsufficientInput();
106 
107  SBuf result;
108 
109  if (!prefix(result, tokenChars, limit))
110  throw TexcHere(ToSBuf("cannot parse ", description));
111 
112  if (atEnd())
113  throw InsufficientInput();
114 
115  return result;
116 }
117 
118 bool
119 Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
120 {
121  SBuf span = buf_;
122 
123  if (limit < buf_.length())
124  span.consume(buf_.length() - limit); // ignore the N prefix characters
125 
126  auto i = span.rbegin();
127  SBuf::size_type found = 0;
128  while (i != span.rend() && tokenChars[*i]) {
129  ++i;
130  ++found;
131  }
132  if (!found)
133  return false;
134  returnedToken = consumeTrailing(found);
135  return true;
136 }
137 
// skipAll(tokenChars): consume the longest run of characters from tokenChars
// at the START of the buffer; returns the number of bytes skipped (0 if none).
// (The signature line is missing from this rendering; see the member index
// entry for skipAll(), Tokenizer.cc:139.)
140 {
// position of the first non-matching char == length of the matching run;
// npos means the whole buffer matches (success()/consume() cap npos safely)
141  const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
142  if (prefixLen == 0) {
143  debugs(24, 8, "no match when trying to skipAll " << tokenChars.name);
144  return 0;
145  }
146  debugs(24, 8, "skipping all in " << tokenChars.name << " len " << prefixLen);
147  return success(prefixLen);
148 }
149 
150 bool
// skipOne(chars): consume exactly one leading character iff it belongs to
// chars; returns true when a character was skipped. (The signature line is
// missing from this rendering; see the member index entry for skipOne().)
152 {
153  if (!buf_.isEmpty() && chars[buf_[0]]) {
154  debugs(24, 8, "skipping one-of " << chars.name);
155  return success(1);
156  }
157  debugs(24, 8, "no match while skipping one-of " << chars.name);
158  return false;
159 }
160 
161 bool
// skipSuffix(tokenToSkip): consume tokenToSkip iff the buffer ends with
// exactly that sequence; returns true when skipped. (The signature line is
// missing from this rendering; see the member index entry for skipSuffix().)
163 {
164  if (buf_.length() < tokenToSkip.length())
165  return false;
166 
// compare only the trailing tokenToSkip.length() bytes of the buffer
167  SBuf::size_type offset = 0;
168  if (tokenToSkip.length() < buf_.length())
169  offset = buf_.length() - tokenToSkip.length();
170 
171  if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
172  debugs(24, 8, "skipping " << tokenToSkip.length());
173  return successTrailing(tokenToSkip.length());
174  }
175  return false;
176 }
177 
178 bool
179 Parser::Tokenizer::skip(const SBuf &tokenToSkip)
180 {
181  if (buf_.startsWith(tokenToSkip)) {
182  debugs(24, 8, "skipping " << tokenToSkip.length());
183  return success(tokenToSkip.length());
184  }
185  debugs(24, 8, "no match, not skipping '" << tokenToSkip << '\'');
186  return false;
187 }
188 
189 bool
190 Parser::Tokenizer::skip(const char tokenChar)
191 {
192  if (!buf_.isEmpty() && buf_[0] == tokenChar) {
193  debugs(24, 8, "skipping char '" << tokenChar << '\'');
194  return success(1);
195  }
196  debugs(24, 8, "no match, not skipping char '" << tokenChar << '\'');
197  return false;
198 }
199 
200 bool
// skipOneTrailing(skippable): consume exactly one TRAILING character iff it
// belongs to skippable; returns true when skipped. (The signature line is
// missing from this rendering; see the member index for skipOneTrailing().)
202 {
203  if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
204  debugs(24, 8, "skipping one-of " << skippable.name);
205  return successTrailing(1);
206  }
207  debugs(24, 8, "no match while skipping one-of " << skippable.name);
208  return false;
209 }
210 
// skipAllTrailing(skippable): consume the longest run of skippable characters
// at the END of the buffer; returns the number of bytes skipped (0 if none).
// (The signature line is missing from this rendering; see the member index
// entry for skipAllTrailing(), Tokenizer.cc:212.)
213 {
// findLastNotOf: position of the last char NOT in the set; everything after
// that position is the skippable suffix. npos means the whole buffer matches.
214  const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
215  const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
216  0 : (prefixEnd + 1);
217  const SBuf::size_type suffixLen = buf_.length() - prefixLen;
218  if (suffixLen == 0) {
219  debugs(24, 8, "no match when trying to skip " << skippable.name);
220  return 0;
221  }
222  debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
223  return successTrailing(suffixLen);
224 }
225 
226 /* reworked from compat/strtoll.c */
// int64(): parses a (possibly signed, possibly 0x-prefixed) integer from the
// first `limit` bytes of the buffer into `result`, consuming the parsed bytes
// on success. base==0 auto-detects: "0x"->16, leading "0"->8, else 10.
// Returns false (consuming nothing) on empty input, no digits, or overflow
// (overflow also sets errno=ERANGE). The overflow cutoff/cutlim arithmetic
// below is order-sensitive; do not reorder.
227 bool
228 Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
229 {
230  if (atEnd() || limit == 0)
231  return false;
232 
// work on a copy of (at most) the first `limit` bytes; buf_ is only consumed
// at the very end, via success()
233  const SBuf range(buf_.substr(0,limit));
234 
235  //fixme: account for buf_.size()
236  bool neg = false;
237  const char *s = range.rawContent();
238  const char *end = range.rawContent() + range.length();
239 
// optional sign; a bare sign with nothing after it is a parse failure
240  if (allowSign) {
241  if (*s == '-') {
242  neg = true;
243  ++s;
244  } else if (*s == '+') {
245  ++s;
246  }
247  if (s >= end) return false;
248  }
// "0x"/"0X" prefix selects (or confirms) base 16
249  if (( base == 0 || base == 16) && *s == '0' && (s+1 < end ) &&
250  tolower(*(s+1)) == 'x') {
251  s += 2;
252  base = 16;
253  }
// auto-detect octal vs decimal when no base was requested
254  if (base == 0) {
255  if ( *s == '0') {
256  base = 8;
257  ++s;
258  } else {
259  base = 10;
260  }
261  }
262  if (s >= end) return false;
263 
// cutoff/cutlim: the largest magnitude/last-digit that fits without overflow;
// the negative bound is one larger in magnitude than the positive one
264  uint64_t cutoff;
265 
266  cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
267  const int cutlim = cutoff % static_cast<int64_t>(base);
268  cutoff /= static_cast<uint64_t>(base);
269 
// any: 0 = no digit seen yet, 1 = accumulating, -1 = overflowed
270  int any = 0, c;
271  int64_t acc = 0;
272  do {
273  c = *s;
274  if (xisdigit(c)) {
275  c -= '0';
276  } else if (xisalpha(c)) {
277  c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
278  } else {
279  break;
280  }
281  if (c >= base)
282  break;
283  if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
284  any = -1;
285  else {
286  any = 1;
287  acc *= base;
288  acc += c;
289  }
290  } while (++s < end);
291 
292  if (any == 0) // nothing was parsed
293  return false;
// overflow: report ERANGE and fail without consuming any input
294  if (any < 0) {
295  acc = neg ? INT64_MIN : INT64_MAX;
296  errno = ERANGE;
297  return false;
298  } else if (neg)
299  acc = -acc;
300 
301  result = acc;
// consume exactly the bytes that contributed to the parsed number
302  return success(s - range.rawContent());
303 }
304 
305 int64_t
306 Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit)
307 {
308  if (atEnd())
309  throw InsufficientInput();
310 
311  int64_t result = 0;
312 
313  // Since we only support unsigned decimals, a parsing failure with a
314  // non-empty input always implies invalid/malformed input (or a buggy
315  // limit=0 caller). TODO: Support signed and non-decimal integers by
316  // refactoring int64() to detect insufficient input.
317  if (!int64(result, 10, false, limit))
318  throw TexcHere(ToSBuf("cannot parse ", description));
319 
320  if (atEnd())
321  throw InsufficientInput(); // more digits may be coming
322 
323  return result;
324 }
325 
size_type findFirstOf(const CharacterSet &set, size_type startPos=0) const
Definition: SBuf.cc:733
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:404
const char * name
optional set label for debugging (default: "anonymous")
Definition: CharacterSet.h:69
int64_t udec64(const char *description, SBuf::size_type limit=SBuf::npos)
int64() wrapper but limited to unsigned decimal integers (for now)
Definition: Tokenizer.cc:306
SBuf consume(const SBuf::size_type n)
convenience method: consumes up to n bytes, counts, and returns them
Definition: Tokenizer.cc:24
size_type findFirstNotOf(const CharacterSet &set, size_type startPos=0) const
Definition: SBuf.cc:756
const_reverse_iterator rbegin() const
Definition: SBuf.h:580
Definition: SBuf.h:86
int i
Definition: membanger.c:49
SBuf ToSBuf(Args &&... args)
slowly stream-prints all arguments into a freshly allocated SBuf
Definition: Stream.h:124
bool isEmpty() const
Definition: SBuf.h:420
const_reverse_iterator rend() const
Definition: SBuf.h:584
bool startsWith(const SBuf &S, const SBufCaseSensitive isCaseSensitive=caseSensitive) const
Definition: SBuf.cc:452
size_type findLastNotOf(const CharacterSet &set, size_type endPos=npos) const
Definition: SBuf.cc:800
SBuf::size_type successTrailing(const SBuf::size_type n)
convenience method: consumes up to n last bytes and returns their count
Definition: Tokenizer.cc:57
bool skipSuffix(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:162
SBuf consumeTrailing(const SBuf::size_type n)
convenience method: consumes up to n last bytes and returns them
Definition: Tokenizer.cc:42
SBuf::size_type parsed_
bytes successfully parsed, including skipped
Definition: Tokenizer.h:170
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Debug.h:124
thrown by modern "incremental" parsers when they need more data
Definition: forward.h:18
SBuf::size_type skipAllTrailing(const CharacterSet &discardables)
Definition: Tokenizer.cc:212
SBuf buf_
yet unparsed input
Definition: Tokenizer.h:169
bool skipOneTrailing(const CharacterSet &discardables)
Definition: Tokenizer.cc:201
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:17
bool skipOne(const CharacterSet &discardables)
Definition: Tokenizer.cc:151
SBuf consume(size_type n=npos)
Definition: SBuf.cc:491
SBuf::size_type success(const SBuf::size_type n)
convenience method: consume()s up to n bytes and returns their count
Definition: Tokenizer.cc:35
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:179
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:81
#define INT64_MAX
Definition: strtoll.c:70
bool atEnd() const
whether the end of the buffer has been reached
Definition: Tokenizer.h:41
bool suffix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:119
int cmp(const SBuf &S, const size_type n) const
shorthand version for compare()
Definition: SBuf.h:264
SBuf substr(size_type pos, size_type n=npos) const
Definition: SBuf.cc:586
#define xisdigit(x)
Definition: xis.h:20
static const size_type npos
Definition: SBuf.h:92
#define TexcHere(msg)
legacy convenience macro; it is not difficult to type Here() now
Definition: TextException.h:55
SBuf::size_type skipAll(const CharacterSet &discardables)
Definition: Tokenizer.cc:139
#define xisupper(x)
Definition: xis.h:28
#define xisalpha(x)
Definition: xis.h:23
#define DBG_DATA
Definition: Debug.h:48
MemBlob::size_type size_type
Definition: SBuf.h:89
#define INT64_MIN
Definition: strtoll.c:60
const char * rawContent() const
Definition: SBuf.cc:519
bool token(SBuf &returnedToken, const CharacterSet &delimiters)
Definition: Tokenizer.cc:63
bool int64(int64_t &result, int base=0, bool allowSign=true, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:228

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors