testTokenizer.cc
Go to the documentation of this file.
1/*
2 * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9#include "squid.h"
10#include "base/CharacterSet.h"
11#include "compat/cppunit.h"
12#include "parser/Tokenizer.h"
13#include "unitTestMain.h"
14
// CppUnit fixture that groups the Parser::Tokenizer test cases defined in this file.
// NOTE(review): this listing skips original lines 17-23 and 26-27. The symbol index
// at the end of the page suggests they hold the CPPUNIT_TEST_SUITE/CPPUNIT_TEST
// registrations and the testTokenizerPrefix()/testTokenizerSuffix() declarations;
// confirm against the real file.
15class TestTokenizer : public CPPUNIT_NS::TestFixture
16{
24
25protected:
28 void testTokenizerSkip();
29 void testTokenizerToken();
30 void testTokenizerInt64();
31};
33
// Sample input shared by the test cases: an HTTP request line, two header
// lines, and a terminating empty line, each ending in CRLF.
34SBuf text("GET http://resource.com/path HTTP/1.1\r\n"
35 "Host: resource.com\r\n"
36 "Cookie: laijkpk3422r j1noin \r\n"
37 "\r\n");
// Character sets used as token/delimiter classes by the test cases.
// ASCII letters, both cases
38const CharacterSet alpha("alpha","abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
// space, CR and LF
39const CharacterSet whitespace("whitespace"," \r\n");
// CR and LF only
40const CharacterSet crlf("crlf","\r\n");
// horizontal tab only; the sample text contains no tab
41const CharacterSet tab("tab","\t");
// decimal digits
42const CharacterSet numbers("numbers","0123456789");
43
// Exercises Tokenizer::prefix(): a zero-length limit, successful matches,
// failed matches that must leave the output untouched, consuming the whole
// input, and a call on an exhausted buffer.
// NOTE(review): original lines 45, 49 and 52 are absent from this listing; they
// presumably hold the TestTokenizer::testTokenizerPrefix() signature and the
// declarations of `t` and `all` used below — confirm against the real file.
44void
46{
47 const SBuf canary("This text should not be changed.");
48
50 SBuf s;
51
// extend `all` with letters, CR/LF, digits and the punctuation ':' '.' '/'
53 all += alpha;
54 all += crlf;
55 all += numbers;
56 all.add(':').add('.').add('/');
57
58 // an empty prefix should return false (the full output buffer case)
59 s = canary;
60 const SBuf before = t.remaining();
// third argument is the length limit; 0 leaves no room for any token
61 CPPUNIT_ASSERT(!t.prefix(s, all, 0));
62 // ... and a false return value means no parameter changes
63 CPPUNIT_ASSERT_EQUAL(canary, s);
64 // ... and a false return value means no input buffer changes
65 CPPUNIT_ASSERT_EQUAL(before, t.remaining());
66
67 // successful prefix tokenization
68 CPPUNIT_ASSERT(t.prefix(s,alpha));
69 CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
70 CPPUNIT_ASSERT(t.prefix(s,whitespace));
71 CPPUNIT_ASSERT_EQUAL(SBuf(" "),s);
72
73 //no match (first char is not in the prefix set)
74 CPPUNIT_ASSERT(!t.prefix(s,whitespace));
75 CPPUNIT_ASSERT_EQUAL(SBuf(" "),s);
76
77 // one more match to set S to something meaningful
78 CPPUNIT_ASSERT(t.prefix(s,alpha));
79 CPPUNIT_ASSERT_EQUAL(SBuf("http"),s);
80
81 //no match (no characters from the character set in the prefix)
82 CPPUNIT_ASSERT(!t.prefix(s,tab));
83 CPPUNIT_ASSERT_EQUAL(SBuf("http"),s); //output SBuf left untouched
84
85 // match until the end of the sample
86 CPPUNIT_ASSERT(t.prefix(s,all));
87 CPPUNIT_ASSERT_EQUAL(SBuf(),t.remaining());
88
89 // empty prefix should return false (the empty input buffer case)
90 s = canary;
91 CPPUNIT_ASSERT(!t.prefix(s, all));
92 // ... and a false return value means no parameter changes
93 CPPUNIT_ASSERT_EQUAL(canary, s);
94}
95
// Exercises the skipping API: skipOne() with a character set, skip() with an
// SBuf and with a single char, and skipAll() with a character set. Each skip
// is verified by checking what prefix() or remaining() yields afterwards.
// NOTE(review): original lines 97 and 99 are absent from this listing; they
// presumably hold the TestTokenizer::testTokenizerSkip() signature and the
// declaration of `t` — confirm against the real file.
96void
98{
100 SBuf s;
101
102 // first scenario: patterns match
103 // prep for test
104 CPPUNIT_ASSERT(t.prefix(s,alpha));
105 CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
106
107 // test skipping one character from a character set
108 CPPUNIT_ASSERT(t.skipOne(whitespace));
109 // check that skip was right
110 CPPUNIT_ASSERT(t.prefix(s,alpha));
111 CPPUNIT_ASSERT_EQUAL(SBuf("http"),s);
112
113 //check skip prefix
114 CPPUNIT_ASSERT(t.skip(SBuf("://")));
115 // verify
116 CPPUNIT_ASSERT(t.prefix(s,alpha));
117 CPPUNIT_ASSERT_EQUAL(SBuf("resource"),s);
118
119 // no skip
// in the sample text "resource" is followed by ".com", so none of these match
120 CPPUNIT_ASSERT(!t.skipOne(alpha));
121 CPPUNIT_ASSERT(!t.skip(SBuf("://")));
122 CPPUNIT_ASSERT(!t.skip('a'));
123
124 // test skipping all characters from a character set while looking at .com
125 CPPUNIT_ASSERT(t.skip('.'));
// skipAll() reports how many characters it discarded: the three letters of "com"
126 CPPUNIT_ASSERT_EQUAL(static_cast<SBuf::size_type>(3), t.skipAll(alpha));
127 CPPUNIT_ASSERT(t.remaining().startsWith(SBuf("/path")));
128}
129
// Exercises Tokenizer::token(): repeatedly extracts whitespace-delimited
// tokens and checks each one against the expected piece of the sample text.
// NOTE(review): original lines 131 and 133 are absent from this listing; they
// presumably hold the TestTokenizer::testTokenizerToken() signature and the
// declaration of `t` — confirm against the real file.
130void
132{
134 SBuf s;
135
136 // first scenario: patterns match
137 CPPUNIT_ASSERT(t.token(s,whitespace));
138 CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
139 CPPUNIT_ASSERT(t.token(s,whitespace));
140 CPPUNIT_ASSERT_EQUAL(SBuf("http://resource.com/path"),s);
141 CPPUNIT_ASSERT(t.token(s,whitespace));
142 CPPUNIT_ASSERT_EQUAL(SBuf("HTTP/1.1"),s);
// this token comes from the second line of the sample, past the CRLF
143 CPPUNIT_ASSERT(t.token(s,whitespace));
144 CPPUNIT_ASSERT_EQUAL(SBuf("Host:"),s);
145
146}
147
// Exercises the end-of-buffer API: suffix(), skipOneTrailing() and
// skipSuffix(), including a zero-length limit, failed matches, consuming the
// whole input from the back, and calls on an exhausted buffer.
// NOTE(review): original lines 149, 153 and 156 are absent from this listing;
// they presumably hold the TestTokenizer::testTokenizerSuffix() signature and
// the declarations of `t` and `all` used below — confirm against the real file.
148void
150{
151 const SBuf canary("This text should not be changed.");
152
154 SBuf s;
155
// extend `all` with letters, CR/LF, digits and the punctuation ':' '.' '/'
157 all += alpha;
158 all += crlf;
159 all += numbers;
160 all.add(':').add('.').add('/');
161
162 // an empty suffix should return false (the full output buffer case)
163 s = canary;
164 const SBuf before = t.remaining();
// third argument is the length limit; 0 leaves no room for any token
165 CPPUNIT_ASSERT(!t.suffix(s, all, 0));
166 // ... and a false return value means no parameter changes
167 CPPUNIT_ASSERT_EQUAL(canary, s);
168 // ... and a false return value means no input buffer changes
169 CPPUNIT_ASSERT_EQUAL(before, t.remaining());
170
171 // consume suffix until the last CRLF, including that last CRLF
172 SBuf::size_type remaining = t.remaining().length();
// loop ends once no CR or LF is left anywhere in the remaining input
173 while (t.remaining().findLastOf(crlf) != SBuf::npos) {
174 CPPUNIT_ASSERT(t.remaining().length() > 0);
175 CPPUNIT_ASSERT(t.skipOneTrailing(all));
176 // ensure steady progress
// exactly one character must have been removed by the call above
177 CPPUNIT_ASSERT_EQUAL(remaining, t.remaining().length() + 1);
178 --remaining;
179 }
180
181 // no match (last char is not in the suffix set)
182 CPPUNIT_ASSERT(!t.suffix(s, crlf));
183 CPPUNIT_ASSERT(!t.suffix(s, whitespace));
184
185 // successful suffix tokenization
186 CPPUNIT_ASSERT(t.suffix(s, numbers));
187 CPPUNIT_ASSERT_EQUAL(SBuf("1"), s);
188 CPPUNIT_ASSERT(t.skipSuffix(SBuf("1.")));
189 CPPUNIT_ASSERT(t.skipSuffix(SBuf("/")));
190 CPPUNIT_ASSERT(t.suffix(s, alpha));
191 CPPUNIT_ASSERT_EQUAL(SBuf("HTTP"), s);
192 CPPUNIT_ASSERT(t.suffix(s, whitespace));
193 CPPUNIT_ASSERT_EQUAL(SBuf(" "), s);
194
195 // match until the end of the sample
196 CPPUNIT_ASSERT(t.suffix(s, all));
197 CPPUNIT_ASSERT_EQUAL(SBuf(), t.remaining());
198
199 // an empty buffer does not end with a token
200 s = canary;
201 CPPUNIT_ASSERT(!t.suffix(s, all));
202 CPPUNIT_ASSERT_EQUAL(canary, s); // no parameter changes
203
204 // we cannot skip an empty suffix, even in an empty buffer
205 CPPUNIT_ASSERT(!t.skipSuffix(SBuf()));
206}
207
208void
210{
211 // successful parse in base 10
212 {
213 int64_t rv;
214 Parser::Tokenizer t(SBuf("1234"));
215 const int64_t benchmark = 1234;
216 CPPUNIT_ASSERT(t.int64(rv, 10));
217 CPPUNIT_ASSERT_EQUAL(benchmark,rv);
218 CPPUNIT_ASSERT(t.buf().isEmpty());
219 }
220
221 // successful parse, autodetect base
222 {
223 int64_t rv;
224 Parser::Tokenizer t(SBuf("1234"));
225 const int64_t benchmark = 1234;
226 CPPUNIT_ASSERT(t.int64(rv));
227 CPPUNIT_ASSERT_EQUAL(benchmark,rv);
228 CPPUNIT_ASSERT(t.buf().isEmpty());
229 }
230
231 // successful parse, autodetect base
232 {
233 int64_t rv;
234 Parser::Tokenizer t(SBuf("01234"));
235 const int64_t benchmark = 01234;
236 CPPUNIT_ASSERT(t.int64(rv));
237 CPPUNIT_ASSERT_EQUAL(benchmark,rv);
238 CPPUNIT_ASSERT(t.buf().isEmpty());
239 }
240
241 // successful parse, autodetect base
242 {
243 int64_t rv;
244 Parser::Tokenizer t(SBuf("0x12f4"));
245 const int64_t benchmark = 0x12f4;
246 CPPUNIT_ASSERT(t.int64(rv));
247 CPPUNIT_ASSERT_EQUAL(benchmark,rv);
248 CPPUNIT_ASSERT(t.buf().isEmpty());
249 }
250
251 // API mismatch: don't eat leading space
252 {
253 int64_t rv;
254 Parser::Tokenizer t(SBuf(" 1234"));
255 CPPUNIT_ASSERT(!t.int64(rv));
256 CPPUNIT_ASSERT_EQUAL(SBuf(" 1234"), t.buf());
257 }
258
259 // API mismatch: don't eat multiple leading spaces
260 {
261 int64_t rv;
262 Parser::Tokenizer t(SBuf(" 1234"));
263 CPPUNIT_ASSERT(!t.int64(rv));
264 CPPUNIT_ASSERT_EQUAL(SBuf(" 1234"), t.buf());
265 }
266
267 // trailing spaces
268 {
269 int64_t rv;
270 Parser::Tokenizer t(SBuf("1234 foo"));
271 const int64_t benchmark = 1234;
272 CPPUNIT_ASSERT(t.int64(rv));
273 CPPUNIT_ASSERT_EQUAL(benchmark,rv);
274 CPPUNIT_ASSERT_EQUAL(SBuf(" foo"), t.buf());
275 }
276
277 // trailing nonspaces
278 {
279 int64_t rv;
280 Parser::Tokenizer t(SBuf("1234foo"));
281 const int64_t benchmark = 1234;
282 CPPUNIT_ASSERT(t.int64(rv));
283 CPPUNIT_ASSERT_EQUAL(benchmark,rv);
284 CPPUNIT_ASSERT_EQUAL(SBuf("foo"), t.buf());
285 }
286
287 // trailing nonspaces
288 {
289 int64_t rv;
290 Parser::Tokenizer t(SBuf("0x1234foo"));
291 const int64_t benchmark = 0x1234f;
292 CPPUNIT_ASSERT(t.int64(rv));
293 CPPUNIT_ASSERT_EQUAL(benchmark,rv);
294 CPPUNIT_ASSERT_EQUAL(SBuf("oo"), t.buf());
295 }
296
297 // overflow
298 {
299 int64_t rv;
300 Parser::Tokenizer t(SBuf("1029397752385698678762234"));
301 CPPUNIT_ASSERT(!t.int64(rv));
302 CPPUNIT_ASSERT_EQUAL(SBuf("1029397752385698678762234"), t.buf());
303 }
304
305 // buffered sub-string parsing
306 {
307 int64_t rv;
308 SBuf base("1029397752385698678762234");
309 const int64_t benchmark = 22;
310 Parser::Tokenizer t(base.substr(base.length()-4,2));
311 CPPUNIT_ASSERT_EQUAL(SBuf("22"),t.buf());
312 CPPUNIT_ASSERT(t.int64(rv));
313 CPPUNIT_ASSERT_EQUAL(benchmark,rv);
314 CPPUNIT_ASSERT(t.buf().isEmpty());
315 }
316
317 // base-16, prefix
318 {
319 int64_t rv;
320 SBuf base("deadbeefrow");
321 const int64_t benchmark=0xdeadbeef;
322 Parser::Tokenizer t(base);
323 CPPUNIT_ASSERT(t.int64(rv,16));
324 CPPUNIT_ASSERT_EQUAL(benchmark,rv);
325 CPPUNIT_ASSERT_EQUAL(SBuf("row"),t.buf());
326
327 }
328}
329
330int
331main(int argc, char *argv[])
332{
333 return TestProgram().run(argc, argv);
334}
335
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:18
CharacterSet & add(const unsigned char c)
add a given character to the character set
Definition: CharacterSet.cc:47
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:79
bool suffix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:117
bool skipOne(const CharacterSet &discardables)
Definition: Tokenizer.cc:161
bool token(SBuf &returnedToken, const CharacterSet &delimiters)
Definition: Tokenizer.cc:61
bool skipSuffix(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:172
SBuf::size_type skipAll(const CharacterSet &discardables)
Definition: Tokenizer.cc:137
bool int64(int64_t &result, int base=0, bool allowSign=true, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:238
SBuf buf() const
yet unparsed data
Definition: Tokenizer.h:35
const SBuf & remaining() const
the remaining unprocessed section of buffer
Definition: Tokenizer.h:44
bool skipOneTrailing(const CharacterSet &discardables)
Definition: Tokenizer.cc:211
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:189
Definition: SBuf.h:94
static const size_type npos
Definition: SBuf.h:99
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:415
bool isEmpty() const
Definition: SBuf.h:431
size_type findLastOf(const CharacterSet &set, size_type endPos=npos) const
Definition: SBuf.cc:769
bool startsWith(const SBuf &S, const SBufCaseSensitive isCaseSensitive=caseSensitive) const
Definition: SBuf.cc:442
SBuf substr(size_type pos, size_type n=npos) const
Definition: SBuf.cc:576
MemBlob::size_type size_type
Definition: SBuf.h:96
implements test program's main() function while enabling customization
Definition: unitTestMain.h:26
int run(int argc, char *argv[])
Definition: unitTestMain.h:44
CPPUNIT_TEST(testTokenizerPrefix)
CPPUNIT_TEST(testTokenizerSuffix)
void testTokenizerPrefix()
void testTokenizerSkip()
void testTokenizerToken()
CPPUNIT_TEST(testTokenizerToken)
CPPUNIT_TEST(testTokenizerSkip)
void testTokenizerInt64()
void testTokenizerSuffix()
CPPUNIT_TEST(testTokenizerInt64)
CPPUNIT_TEST_SUITE(TestTokenizer)
int main(int argc, char *argv[])
const CharacterSet tab("tab","\t")
SBuf text("GET http://resource.com/path HTTP/1.1\r\n" "Host: resource.com\r\n" "Cookie: laijkpk3422r j1noin \r\n" "\r\n")
CPPUNIT_TEST_SUITE_REGISTRATION(TestTokenizer)
const CharacterSet alpha("alpha","abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
const CharacterSet crlf("crlf","\r\n")
const CharacterSet whitespace("whitespace"," \r\n")
const CharacterSet numbers("numbers","0123456789")

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors