Uri.cc
Go to the documentation of this file.
1/*
2 * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9/* DEBUG: section 23 URL Parsing */
10
11#include "squid.h"
12#include "anyp/Uri.h"
13#include "base/Raw.h"
14#include "globals.h"
15#include "HttpRequest.h"
16#include "parser/Tokenizer.h"
17#include "rfc1738.h"
18#include "SquidConfig.h"
19#include "SquidMath.h"
20#include "SquidString.h"
21
// Characters accepted in a hostname when Config.onoff.allow_underscore is set:
// ASCII letters, digits, '-', '.', '_' and the "[:]" characters.
22static const char valid_hostname_chars_u[] =
23 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
24 "abcdefghijklmnopqrstuvwxyz"
25 "0123456789-._"
26 "[:]"
27 ;
// Characters accepted in a hostname otherwise: the same set without '_'.
28static const char valid_hostname_chars[] =
29 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
30 "abcdefghijklmnopqrstuvwxyz"
31 "0123456789-."
32 "[:]"
33 ;
34
// Returns a function-local static CharacterSet named "userinfo", built from
// the punctuation listed below plus further sets (see the note).
// NOTE(review): this listing is truncated. The embedded numbering skips 37
// (the declarator; the cross-reference index at the end of this file names it
// UserInfoChars()) and 48-49 (the remainder of the userInfoValid expression,
// which ends in a dangling '+' here). Restore those lines from the upstream
// file before compiling.
36static const CharacterSet &
38{
39 /*
40 * RFC 3986 section 3.2.1
41 *
42 * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
43 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
44 * pct-encoded = "%" HEXDIG HEXDIG
45 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
46 */
47 static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
50 return userInfoValid;
51}
52
56SBuf
57AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
58{
59 if (buf.isEmpty())
60 return buf;
61
62 Parser::Tokenizer tk(buf);
63 SBuf goodSection;
64 // optimization for the arguably common "no encoding necessary" case
65 if (tk.prefix(goodSection, ignore) && tk.atEnd())
66 return buf;
67
68 SBuf output;
69 output.reserveSpace(buf.length() * 3); // worst case: encode all chars
70 output.append(goodSection); // may be empty
71
72 while (!tk.atEnd()) {
73 // TODO: Add Tokenizer::parseOne(void).
74 const auto ch = tk.remaining()[0];
75 output.appendf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(ch))); // TODO: Optimize using a table
76 (void)tk.skip(ch);
77
78 if (tk.prefix(goodSection, ignore))
79 output.append(goodSection);
80 }
81
82 return output;
83}
84
85const SBuf &
87{
88 static SBuf star("*");
89 return star;
90}
91
92const SBuf &
94{
95 static SBuf slash("/");
96 return slash;
97}
98
99void
100AnyP::Uri::host(const char *src)
101{
102 hostAddr_.fromHost(src);
103 if (hostAddr_.isAnyAddr()) {
104 xstrncpy(host_, src, sizeof(host_));
105 hostIsNumeric_ = false;
106 } else {
107 hostAddr_.toHostStr(host_, sizeof(host_));
108 debugs(23, 3, "given IP: " << hostAddr_);
109 hostIsNumeric_ = 1;
110 }
111 touch();
112}
113
114SBuf
116{
117 if (hostIsNumeric()) {
118 static char ip[MAX_IPSTRLEN];
119 const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
120 return SBuf(ip, hostStrLen);
121 } else
122 return SBuf(host());
123}
124
125const SBuf &
127{
128 // RFC 3986 section 3.3 says path can be empty (path-abempty).
129 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
130 // at least when sending and using. We must still accept path-abempty as input.
131 if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
132 return SlashPath();
133
134 return path_;
135}
136
137void
139{
140 debugs(23, 5, "urlInitialize: Initializing...");
141 /* this ensures that the number of protocol strings is the same as
142 * the enum slots allocated because the last enum is always 'MAX'.
143 */
144 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
145 /*
146 * These test that our matchDomainName() function works the
147 * way we expect it to.
148 */
149 assert(0 == matchDomainName("foo.com", "foo.com"));
150 assert(0 == matchDomainName(".foo.com", "foo.com"));
151 assert(0 == matchDomainName("foo.com", ".foo.com"));
152 assert(0 == matchDomainName(".foo.com", ".foo.com"));
153 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
154 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
155 assert(0 != matchDomainName("x.foo.com", "foo.com"));
156 assert(0 != matchDomainName("foo.com", "x.foo.com"));
157 assert(0 != matchDomainName("bar.com", "foo.com"));
158 assert(0 != matchDomainName(".bar.com", "foo.com"));
159 assert(0 != matchDomainName(".bar.com", ".foo.com"));
160 assert(0 != matchDomainName("bar.com", ".foo.com"));
161 assert(0 < matchDomainName("zzz.com", "foo.com"));
162 assert(0 > matchDomainName("aaa.com", "foo.com"));
163 assert(0 == matchDomainName("FOO.com", "foo.COM"));
164 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
165 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
166 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
167
168 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
169 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
170 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
171 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
172
173 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
174 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
175 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
176 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
177
178 assert(0 != matchDomainName("foo.com", ""));
179 assert(0 != matchDomainName("foo.com", "", mdnHonorWildcards));
180 assert(0 != matchDomainName("foo.com", "", mdnRejectSubsubDomains));
181
182 /* more cases? */
183}
184
192static AnyP::UriScheme
194{
195 /*
196 * RFC 3986 section 3.1 paragraph 2:
197 *
198 * Scheme names consist of a sequence of characters beginning with a
199 * letter and followed by any combination of letters, digits, plus
200 * ("+"), period ("."), or hyphen ("-").
201 */
202 static const auto schemeChars = CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
203
204 SBuf str;
205 if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
206 const auto protocol = AnyP::UriScheme::FindProtocolType(str);
207 if (protocol == AnyP::PROTO_UNKNOWN)
208 return AnyP::UriScheme(protocol, str.c_str());
209 return AnyP::UriScheme(protocol, nullptr);
210 }
211
212 throw TextException("invalid URI scheme", Here());
213}
214
222bool
224{
225 /* For IPv4 addresses check for a dot */
226 /* For IPv6 addresses also check for a colon */
227 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
228 const uint64_t dlen = strlen(host);
229 const uint64_t want = dlen + Config.appendDomainLen;
230 if (want > SQUIDHOSTNAMELEN - 1) {
231 debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
232 return false;
233 }
234 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
235 }
236 return true;
237}
238
239/*
240 * Parse a URI/URL.
241 *
242 * It is assumed that the URL is complete -
243 * ie, the end of the string is the end of the URL. Don't pass a partial
244 * URL here as this routine doesn't have any way of knowing whether
245 * it is partial or not (ie, it handles the case of no trailing slash as
246 * being "end of host with implied path of /".
247 *
248 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
249 * then rather than a URL a hostname:port is looked for.
250 */
// Splits rawUrl into its components and stores them in this object.
//
// - OPTIONS or TRACE with a "*" target: stores the HTTP scheme, its default
//   port and the "*" path, then returns true.
// - CONNECT: expects exactly host ":" port (parseHost() and parsePort()).
// - Anything else: parses the scheme (uriParseScheme()); URNs are handed to
//   parseUrn(); other URIs must continue with "//" and are then split into
//   user-info, host, port and path using the local character buffers.
//
// After the split, the host is lower-cased and validated, the append-domain
// setting is applied, the port range is checked, and whitespace in the path
// is treated according to Config.uri_whitespace.
//
// Returns true after storing scheme, path, host, user-info and port; returns
// false when a check fails. Any exception thrown along the way is logged
// (section 23, level 2) and also turned into a false result.
//
// NOTE(review): this listing lost several lines inside the function (the
// embedded numbering skips 426, 441, 470, 473, 476, 481 and 485), so braces
// and the switch below do not balance as shown. See the notes at those places
// and restore the lines from the upstream file.
251bool
252AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
253{
254 try {
255
256 LOCAL_ARRAY(char, login, MAX_URL);
257 LOCAL_ARRAY(char, foundHost, MAX_URL);
258 LOCAL_ARRAY(char, urlpath, MAX_URL);
259 char *t = nullptr;
260 char *q = nullptr;
261 int foundPort;
262 int l;
263 int i;
264 const char *src;
265 char *dst;
266 foundHost[0] = urlpath[0] = login[0] = '\0';
267
268 if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
269 debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
270 return false;
271 }
272
273 if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
274 Asterisk().cmp(rawUrl) == 0) {
275 // XXX: these methods might also occur in HTTPS traffic. Handle this better.
276 setScheme(AnyP::PROTO_HTTP, nullptr);
277 port(getScheme().defaultPort());
278 path(Asterisk());
279 return true;
280 }
281
282 Parser::Tokenizer tok(rawUrl);
283 AnyP::UriScheme scheme;
284
285 if (method == Http::METHOD_CONNECT) {
286 // For CONNECTs, RFC 9110 Section 9.3.6 requires "only the host and
287 // port number of the tunnel destination, separated by a colon".
288
289 const auto rawHost = parseHost(tok);
290 Assure(rawHost.length() < sizeof(foundHost));
291 SBufToCstring(foundHost, rawHost);
292
293 if (!tok.skip(':'))
294 throw TextException("missing required :port in CONNECT target", Here());
295 foundPort = parsePort(tok);
296
297 if (!tok.remaining().isEmpty())
298 throw TextException("garbage after host:port in CONNECT target", Here());
299 } else {
300
301 scheme = uriParseScheme(tok);
302
303 if (scheme == AnyP::PROTO_NONE)
304 return false; // invalid scheme
305
306 if (scheme == AnyP::PROTO_URN) {
307 parseUrn(tok); // throws on any error
308 return true;
309 }
310
311 // URLs then have "//"
312 static const SBuf doubleSlash("//");
313 if (!tok.skip(doubleSlash))
314 return false;
315
316 auto B = tok.remaining();
317 const char *url = B.c_str();
318
319 /* Parse the URL: */
320 src = url;
321 i = 0;
322
323 /* Then everything until first /; that's host (and port; which we'll look for here later) */
324 // bug 1881: If we don't get a "/" then we imply it was there
325 // bug 3074: We could just be given a "?" or "#". These also imply "/"
326 // bug 3233: whitespace is also a hostname delimiter.
327 for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
328 *dst = *src;
329 }
330
331 /*
332 * We can't check for "i >= l" here because we could be at the end of the line
333 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
334 * been -given- a valid URL and the path is just '/'.
335 */
336 if (i > l)
337 return false;
338 *dst = '\0';
339
340 // We are looking at path-abempty.
341 if (*src != '/') {
342 // path-empty, including the end of the `src` c-string cases
343 urlpath[0] = '/';
344 dst = &urlpath[1];
345 } else {
346 dst = urlpath;
347 }
348 /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
349 for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
350 *dst = *src;
351 }
352
353 /* We -could- be at the end of the buffer here */
354 if (i > l)
355 return false;
356 *dst = '\0';
357
358 // If the parsed scheme has no (known) default port, and there is no
359 // explicit port, then we will reject the zero port during foundPort
360 // validation, often resulting in a misleading 400/ERR_INVALID_URL.
361 // TODO: Remove this hack when switching to Tokenizer-based parsing.
362 foundPort = scheme.defaultPort().value_or(0); // may be reset later
363
364 /* Is there any login information? (we should eventually parse it above) */
365 t = strrchr(foundHost, '@');
366 if (t != nullptr) {
367 strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
368 login[sizeof(login)-1] = '\0';
369 t = strrchr(login, '@');
370 *t = 0;
371 strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
372 foundHost[sizeof(foundHost)-1] = '\0';
373 // Bug 4498: URL-unescape the login info after extraction
374 rfc1738_unescape(login);
375 }
376
377 /* Is there any host information? (we should eventually parse it above) */
378 if (*foundHost == '[') {
379 /* strip any IPA brackets. valid under IPv6. */
380 dst = foundHost;
381 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
382 src = foundHost;
383 ++src;
384 l = strlen(foundHost);
385 i = 1;
386 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
387 *dst = *src;
388 }
389
390 /* we moved in-place, so truncate the actual hostname found */
391 *dst = '\0';
392 ++dst;
393
394 /* skip ahead to either start of port, or original EOS */
395 while (*dst != '\0' && *dst != ':')
396 ++dst;
397 t = dst;
398 } else {
399 t = strrchr(foundHost, ':');
400
401 if (t != strchr(foundHost,':') ) {
402 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
403 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
404 /* therefore we MUST accept the case where they are not bracketed at all. */
405 t = nullptr;
406 }
407 }
408
409 // Bug 3183 sanity check: If scheme is present, host must be too.
410 if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
411 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
412 return false;
413 }
414
415 if (t && *t == ':') {
416 *t = '\0';
417 ++t;
418 foundPort = atoi(t);
419 }
420 }
421
422 for (t = foundHost; *t; ++t)
423 *t = xtolower(*t);
424
425 if (stringHasWhitespace(foundHost)) {
// NOTE(review): line 426 is missing from this listing; it presumably opens an
// inner block (two closing braces follow the whitespace-removal loop below).
427 t = q = foundHost;
428 while (*t) {
429 if (!xisspace(*t)) {
430 *q = *t;
431 ++q;
432 }
433 ++t;
434 }
435 *q = '\0';
436 }
437 }
438
439 debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
440
// NOTE(review): line 441 (the start of this condition) is missing from this
// listing. The visible part rejects hosts containing characters outside the
// configured valid_hostname_chars* set.
442 strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
443 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
444 return false;
445 }
446
447 if (!urlAppendDomain(foundHost))
448 return false;
449
450 /* remove trailing dots from hostnames */
451 while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
452 foundHost[l] = '\0';
453
454 /* reject duplicate or leading dots */
455 if (strstr(foundHost, "..") || *foundHost == '.') {
456 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
457 return false;
458 }
459
460 if (foundPort < 1 || foundPort > 65535) {
461 debugs(23, 3, "Invalid port '" << foundPort << "'");
462 return false;
463 }
464
465 if (stringHasWhitespace(urlpath)) {
466 debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
467
// NOTE(review): the case labels of this switch (lines 470, 473, 476, 481 and
// 485) are missing from this listing; only the statements of each branch
// remain: reject, leave as is, escape, chop at the first whitespace, and (by
// default) remove all whitespace.
468 switch (Config.uri_whitespace) {
469
471 return false;
472
474 break;
475
477 t = rfc1738_escape_unescaped(urlpath);
478 xstrncpy(urlpath, t, MAX_URL);
479 break;
480
482 *(urlpath + strcspn(urlpath, w_space)) = '\0';
483 break;
484
486 default:
487 t = q = urlpath;
488 while (*t) {
489 if (!xisspace(*t)) {
490 *q = *t;
491 ++q;
492 }
493 ++t;
494 }
495 *q = '\0';
496 }
497 }
498
499 setScheme(scheme);
500 path(urlpath);
501 host(foundHost);
502 userInfo(SBuf(login));
503 port(foundPort);
504 return true;
505
506 } catch (...) {
507 debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
508 return false;
509 }
510}
511
526void
528{
529 static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
530 static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
531 SBuf nid;
532 if (!tok.prefix(nid, nidChars, 32))
533 throw TextException("NID not found", Here());
534
535 if (!tok.skip(':'))
536 throw TextException("NID too long or missing ':' delimiter", Here());
537
538 if (nid.length() < 2)
539 throw TextException("NID too short", Here());
540
541 if (!alphanum[*nid.begin()])
542 throw TextException("NID prefix is not alphanumeric", Here());
543
544 if (!alphanum[*nid.rbegin()])
545 throw TextException("NID suffix is not alphanumeric", Here());
546
547 setScheme(AnyP::PROTO_URN, nullptr);
548 host(nid.c_str());
549 // TODO validate path characters
550 path(tok.remaining());
551 debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
552}
553
557SBuf
559{
560 // host = IP-literal / IPv4address / reg-name
561
562 // XXX: CharacterSets below reject uri-host values containing whitespace
563 // (e.g., "10.0.0. 1"). That is not a bug, but the uri_whitespace directive
564 // can be interpreted as if it applies to uri-host and this code. TODO: Fix
565 // uri_whitespace and the code using it to exclude uri-host (and URI scheme,
566 // port, etc.) from that directive scope.
567
568 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
569 if (tok.skip('[')) {
570 // Add "." because IPv6address in RFC 3986 includes ls32, which includes
571 // IPv4address: ls32 = ( h16 ":" h16 ) / IPv4address
572 // This set rejects IPvFuture that needs a "v" character.
573 static const CharacterSet IPv6chars = (
574 CharacterSet::HEXDIG + CharacterSet("colon", ":") + CharacterSet("period", ".")).rename("IPv6");
575 SBuf ipv6ish;
576 if (!tok.prefix(ipv6ish, IPv6chars))
577 throw TextException("malformed or unsupported bracketed IP address in uri-host", Here());
578
579 if (!tok.skip(']'))
580 throw TextException("IPv6 address is missing a closing bracket in uri-host", Here());
581
582 // This rejects bracketed IPv4address and domain names because they lack ":".
583 if (ipv6ish.find(':') == SBuf::npos)
584 throw TextException("bracketed IPv6 address is missing a colon in uri-host", Here());
585
586 // This rejects bracketed non-IP addresses that our caller would have
587 // otherwise mistaken for a domain name (e.g., '[127.0.0:1]').
588 Ip::Address ipv6check;
589 if (!ipv6check.fromHost(ipv6ish.c_str()))
590 throw TextException("malformed bracketed IPv6 address in uri-host", Here());
591
592 return ipv6ish;
593 }
594
595 // no brackets implies we are looking at IPv4address or reg-name
596
597 // XXX: This code does not detect/reject some bad host values (e.g. "!#$%&"
598 // and "1.2.3.4.5"). TODO: Add more checks here, after migrating the
599 // non-CONNECT uri-host parsing code to use us.
600
601 SBuf otherHost; // IPv4address-ish or reg-name-ish;
602 // ":" is not in TCHAR so we will stop before any port specification
603 if (tok.prefix(otherHost, CharacterSet::TCHAR))
604 return otherHost;
605
606 throw TextException("malformed IPv4 address or host name in uri-host", Here());
607}
608
615int
617{
618 if (tok.skip('0'))
619 throw TextException("zero or zero-prefixed port", Here());
620
621 int64_t rawPort = 0;
622 if (!tok.int64(rawPort, 10, false)) // port = *DIGIT
623 throw TextException("malformed or missing port", Here());
624
625 Assure(rawPort > 0);
626 constexpr KnownPort portMax = 65535; // TODO: Make this a class-scope constant and REuse it.
627 constexpr auto portStorageMax = std::numeric_limits<Port::value_type>::max();
628 static_assert(!Less(portStorageMax, portMax), "Port type can represent the maximum valid port number");
629 if (Less(portMax, rawPort))
630 throw TextException("huge port", Here());
631
632 // TODO: Return KnownPort after migrating the non-CONNECT uri-host parsing
633 // code to use us (so that foundPort "int" disappears or starts using Port).
634 return NaturalCast<int>(rawPort);
635}
636
637void
639{
640 absolute_.clear();
641 authorityHttp_.clear();
642 authorityWithPort_.clear();
643}
644
645SBuf &
646AnyP::Uri::authority(bool requirePort) const
647{
648 if (authorityHttp_.isEmpty()) {
649
650 // both formats contain Host/IP
651 authorityWithPort_.append(host());
652 authorityHttp_ = authorityWithPort_;
653
654 if (port().has_value()) {
655 authorityWithPort_.appendf(":%hu", *port());
656 // authorityHttp_ only has :port for known non-default ports
657 if (port() != getScheme().defaultPort())
658 authorityHttp_ = authorityWithPort_;
659 }
660 // else XXX: We made authorityWithPort_ that does not have a port.
661 // TODO: Audit callers and refuse to give out broken authorityWithPort_.
662 }
663
664 return requirePort ? authorityWithPort_ : authorityHttp_;
665}
666
667SBuf &
669{
670 if (absolute_.isEmpty()) {
671 // TODO: most URL will be much shorter, avoid allocating this much
672 absolute_.reserveCapacity(MAX_URL);
673
674 absolute_.append(getScheme().image());
675 absolute_.append(":",1);
676 if (getScheme() != AnyP::PROTO_URN) {
677 absolute_.append("//", 2);
678 const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
679 getScheme() == AnyP::PROTO_UNKNOWN;
680
681 if (allowUserInfo && !userInfo().isEmpty()) {
682 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
683 .remove('%')
684 .rename("userinfo-reserved");
685 absolute_.append(Encode(userInfo(), uiChars));
686 absolute_.append("@", 1);
687 }
688 absolute_.append(authority());
689 } else {
690 absolute_.append(host());
691 absolute_.append(":", 1);
692 }
693 absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
694 }
695
696 return absolute_;
697}
698
699/* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
700 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
701 * and never copy the query-string part in the first place
702 */
// Copies url into a local MAX_URL-sized buffer (declared via LOCAL_ARRAY),
// optionally truncates it right after the first '?' (when
// Config.onoff.strip_query_terms is set and the request is neither a CONNECT
// nor a URN), and returns a pointer to that buffer.
// NOTE(review): this listing is truncated. The embedded numbering skips 704
// (the declarator; the cross-reference index at the end of this file gives
// urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod
// &method, const AnyP::UriScheme &scheme)) and 721 (the statement guarded by
// the stringHasCntl() check). As shown, that check would guard the final
// return instead. Restore both lines from the upstream file.
703char *
705{
706 LOCAL_ARRAY(char, buf, MAX_URL);
707
708 snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
709 buf[sizeof(buf)-1] = '\0';
710
711 // URN, CONNECT method, and non-stripped URIs can go straight out
712 if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
713 // strip anything AFTER a question-mark
714 // leaving the '?' in place
715 if (auto t = strchr(buf, '?')) {
716 *(++t) = '\0';
717 }
718 }
719
720 if (stringHasCntl(buf))
722
723 return buf;
724}
725
732const char *
734{
735 LOCAL_ARRAY(char, buf, MAX_URL);
736
737 // method CONNECT and port HTTPS
738 if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
739 snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
740 return buf;
741 }
742
743 // else do the normal complete canonical thing.
744 return request->canonicalCleanUrl();
745}
746
/// Reports whether url is a relative reference rather than an absolute URI.
///
/// \param url  NUL-terminated string to classify; may be nullptr
/// \retval false  url is nullptr, or a ':' appears in its first segment, that
///                is, before any '/', '?' or '#'
/// \retval true   everything else: an empty string, any string starting with
///                '/' (including "//authority" network-path references), and
///                strings whose first segment contains no colon
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // nothing to classify

    // RFC 3986 path forms: an empty reference (path-empty) and anything that
    // begins with a slash (path-absolute or a "//" network-path reference)
    // are relative.
    if (url[0] == '\0' || url[0] == '/')
        return true;

    // Look at the first character that is a colon, a segment/query/fragment
    // delimiter, or the terminating NUL. Only a colon reached before any of
    // the others makes the first segment look like a scheme.
    const char firstStop = url[strcspn(url, ":/?#")];
    return firstStop != ':'; // path-noscheme or path-rootless when true
}
794
795void
796AnyP::Uri::addRelativePath(const char *relUrl)
797{
798 // URN cannot be merged
799 if (getScheme() == AnyP::PROTO_URN)
800 return;
801
802 // TODO: Handle . and .. segment normalization
803
804 const auto lastSlashPos = path_.rfind('/');
805 // TODO: To optimize and simplify, add and use SBuf::replace().
806 const auto relUrlLength = strlen(relUrl);
807 if (lastSlashPos == SBuf::npos) {
808 // start replacing the whole path
809 path_.reserveCapacity(1 + relUrlLength);
810 path_.assign("/", 1);
811 } else {
812 // start replacing just the last segment
813 path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
814 path_.chop(0, lastSlashPos+1);
815 }
816 path_.append(relUrl, relUrlLength);
817}
818
// Compares host name h with domain name d, ignoring letter case.
//
// The two strings are compared character by character from their ends.
// Leading dots of h are skipped before the comparison; a leading dot in d
// makes d match hosts that extend it to the left (sub-domains).
//
// h      host name; must not be nullptr
// d      domain name; must not be nullptr
// flags  mdnRejectSubsubDomains: where h extends a dot-led d, accept only one
//        additional label and reject h values given with a leading dot;
//        mdnHonorWildcards: treat a "*." label in h at the first point of
//        difference as matching
//
// Returns 0 on a match. Otherwise returns a negative or positive value:
// -1 when h is empty after skipping its dots, 1 when d is empty, and a sign
// derived from the first differing characters (with '.' handled specially)
// in the remaining cases, so the result can also be used for ordering.
819int
820matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
821{
822 int dl;
823 int hl;
824
825 const bool hostIncludesSubdomains = (*h == '.');
826 while ('.' == *h)
827 ++h;
828
829 hl = strlen(h);
830
831 if (hl == 0)
832 return -1;
833
834 dl = strlen(d);
835 if (dl == 0)
836 return 1;
837
838 /*
839 * Start at the ends of the two strings and work towards the
840 * beginning.
841 */
// hl and dl are decremented before each comparison, so inside the loop they
// index the characters that were just found equal.
842 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
843 if (hl == 0 && dl == 0) {
844 /*
845 * We made it all the way to the beginning of both
846 * strings without finding any difference.
847 */
848 return 0;
849 }
850
851 if (0 == hl) {
852 /*
853 * The host string is shorter than the domain string.
854 * There is only one case when this can be a match.
855 * If the domain is just one character longer, and if
856 * that character is a leading '.' then we call it a
857 * match.
858 */
859
860 if (1 == dl && '.' == d[0])
861 return 0;
862 else
863 return -1;
864 }
865
866 if (0 == dl) {
867 /*
868 * The domain string is shorter than the host string.
869 * This is a match only if the first domain character
870 * is a leading '.'.
871 */
872
873 if ('.' == d[0]) {
874 if (flags & mdnRejectSubsubDomains) {
875 // Check for sub-sub domain and reject
// scan the rest of h leftwards; hl ends at -1 when no further '.' exists
876 while(--hl >= 0 && h[hl] != '.');
877 if (hl < 0) {
878 // No sub-sub domain found, but reject if there is a
879 // leading dot in given host string (which is removed
880 // before the check is started).
881 return hostIncludesSubdomains ? 1 : 0;
882 } else
883 return 1; // sub-sub domain, reject
884 } else
885 return 0;
886 } else
887 return 1;
888 }
889 }
890
891 /*
892 * We found different characters in the same position (from the end).
893 */
894
895 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
896 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
897 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
898 if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
899 return 0;
900
901 /*
902 * If one of those character is '.' then its special. In order
903 * for splay tree sorting to work properly, "x-foo.com" must
904 * be greater than ".foo.com" even though '-' is less than '.'.
905 */
906 if ('.' == d[dl])
907 return 1;
908
909 if ('.' == h[hl])
910 return -1;
911
912 return (xtolower(h[hl]) - xtolower(d[dl]));
913}
914
915/*
916 * return true if we can serve requests for this method.
917 */
// Decides whether the request's method can be served for the request's URI
// scheme: PURGE is always accepted; otherwise the answer depends on the
// scheme (see the switch below).
// NOTE(review): this listing is truncated. The embedded numbering skips 919
// (the declarator; the cross-reference index at the end of this file gives
// urlCheckRequest(const HttpRequest *r)) as well as 932, 937-938, 953, 958,
// 960 and 964 (conditions and case labels). The early "return true", the
// OPTIONS/TRACE handling, both method lists, and the case that owns the
// USE_OPENSSL/USE_GNUTLS section are therefore incomplete as shown. Restore
// those lines from the upstream file.
918bool
920{
921 /* protocol "independent" methods
922 *
923 * actually these methods are specific to HTTP:
924 * they are methods we receive on our HTTP port,
925 * and if we had a FTP listener would not be relevant
926 * there.
927 *
928 * So, we should delegate them to HTTP. The problem is that we
929 * do not have a default protocol from the client side of HTTP.
930 */
931
933 return true;
934
935 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
936 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
939
940 if (r->method == Http::METHOD_PURGE)
941 return true;
942
943 /* does method match the protocol? */
944 switch (r->url.getScheme()) {
945
946 case AnyP::PROTO_URN:
947 case AnyP::PROTO_HTTP:
948 return true;
949
950 case AnyP::PROTO_FTP:
951 if (r->method == Http::METHOD_PUT ||
952 r->method == Http::METHOD_GET ||
954 return true;
955 return false;
956
957 case AnyP::PROTO_WAIS:
959 if (r->method == Http::METHOD_GET ||
961 return true;
962 return false;
963
965#if USE_OPENSSL || USE_GNUTLS
966 return true;
967#else
968 /*
969 * Squid can't originate an SSL connection, so it should
970 * never receive an "https:" URL. It should always be
971 * CONNECT instead.
972 */
973 return false;
974#endif
975
976 default:
977 return false;
978 }
979
980 /* notreached */
981 return false;
982}
983
// Constructor fragment: stores aScheme as the scheme, marks the host as
// non-numeric, and makes host_ an empty string.
// NOTE(review): the declarator line (embedded number 984) is missing from
// this listing; only the member-initializer list and the body remain.
985 scheme_(aScheme),
986 hostIsNumeric_(false)
987{
988 *host_=0;
989}
990
991// TODO: fix code duplication with AnyP::Uri::parse()
// Returns a cleaned copy of uri, produced according to Config.uri_whitespace.
// Every visible branch creates the result with xstrndup(); the function
// asserts that a result was produced.
// presumably the caller releases the result with xfree() -- TODO confirm
// NOTE(review): this listing is truncated. The embedded numbering skips
// 997-998, 1003-1004, 1013 and 1019-1020, so some case labels, the
// declaration of "flags", and the end of the rfc1738_do_escape() call in the
// CHOP branch are missing. Restore those lines from the upstream file.
992char *
993AnyP::Uri::cleanup(const char *uri)
994{
995 char *cleanedUri = nullptr;
996 switch (Config.uri_whitespace) {
999 cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
1000 break;
1001 }
1002
1005 break;
1006
1007 case URI_WHITESPACE_CHOP: {
// pos is the length of the leading part of uri that has no whitespace
1008 const auto pos = strcspn(uri, w_space);
1009 char *choppedUri = nullptr;
1010 if (pos < strlen(uri))
1011 choppedUri = xstrndup(uri, pos + 1);
1012 cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
1014 cleanedUri[pos] = '\0';
1015 xfree(choppedUri);
1016 break;
1017 }
1018
1021 default: {
1022 // TODO: avoid duplication with urlParse()
1023 const char *t;
// tmp_uri receives uri with all whitespace characters removed
1024 char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
1025 char *q = tmp_uri;
1026 t = uri;
1027 while (*t) {
1028 if (!xisspace(*t)) {
1029 *q = *t;
1030 ++q;
1031 }
1032 ++t;
1033 }
1034 *q = '\0';
1035 cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
1036 xfree(tmp_uri);
1037 break;
1038 }
1039 }
1040
1041 assert(cleanedUri);
1042 return cleanedUri;
1043}
1044
#define Assure(condition)
Definition: Assure.h:35
#define Here()
source code location of the caller
Definition: Here.h:15
#define SQUIDSBUFPH
Definition: SBuf.h:31
void SBufToCstring(char *d, const SBuf &s)
Definition: SBuf.h:752
#define SQUIDSBUFPRINT(s)
Definition: SBuf.h:32
class SquidConfig Config
Definition: SquidConfig.cc:12
constexpr bool Less(const A a, const B b)
whether integer a is less than integer b, with correct overflow handling
Definition: SquidMath.h:48
int stringHasWhitespace(const char *)
Definition: String.cc:287
int stringHasCntl(const char *)
Definition: String.cc:294
std::ostream & CurrentException(std::ostream &os)
prints active (i.e., thrown but not yet handled) exception
bool urlCheckRequest(const HttpRequest *r)
Definition: Uri.cc:919
static const char valid_hostname_chars[]
Definition: Uri.cc:28
static const char valid_hostname_chars_u[]
Definition: Uri.cc:22
bool urlIsRelative(const char *url)
Definition: Uri.cc:760
void urlInitialize(void)
Definition: Uri.cc:138
int matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
Definition: Uri.cc:820
char * urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
Definition: Uri.cc:704
static AnyP::UriScheme uriParseScheme(Parser::Tokenizer &tok)
Definition: Uri.cc:193
static const CharacterSet & UserInfoChars()
Characters which are valid within a URI userinfo section.
Definition: Uri.cc:37
const char * urlCanonicalFakeHttps(const HttpRequest *request)
Definition: Uri.cc:733
bool urlAppendDomain(char *host)
apply append_domain config to the given hostname
Definition: Uri.cc:223
MatchDomainNameFlags
Definition: Uri.h:228
@ mdnRejectSubsubDomains
Definition: Uri.h:231
@ mdnHonorWildcards
Definition: Uri.h:230
#define assert(EX)
Definition: assert.h:17
static AnyP::ProtocolType FindProtocolType(const SBuf &)
Definition: UriScheme.cc:52
Port defaultPort() const
Definition: UriScheme.cc:71
SBuf image() const
Definition: UriScheme.h:57
static const SBuf & SlashPath()
the static '/' default URL-path
Definition: Uri.cc:93
SBuf parseHost(Parser::Tokenizer &) const
Definition: Uri.cc:558
void parseUrn(Parser::Tokenizer &)
Definition: Uri.cc:527
AnyP::UriScheme const & getScheme() const
Definition: Uri.h:67
void touch()
clear the cached URI display forms
Definition: Uri.cc:638
SBuf & authority(bool requirePort=false) const
Definition: Uri.cc:646
void path(const char *p)
Definition: Uri.h:101
char host_[SQUIDHOSTNAMELEN]
string representation of the URI authority name or IP
Definition: Uri.h:179
const char * host(void) const
Definition: Uri.h:85
Uri()
Definition: Uri.h:35
static char * cleanup(const char *uri)
Definition: Uri.cc:993
void addRelativePath(const char *relUrl)
Definition: Uri.cc:796
int parsePort(Parser::Tokenizer &) const
Definition: Uri.cc:616
SBuf & absolute() const
Definition: Uri.cc:668
static const SBuf & Asterisk()
the static '*' pseudo-URI
Definition: Uri.cc:86
void port(const Port p)
reset authority port subcomponent
Definition: Uri.h:95
const SBuf & path() const
Definition: Uri.cc:126
void host(const char *src)
Definition: Uri.cc:100
bool parse(const HttpRequestMethod &, const SBuf &url)
Definition: Uri.cc:252
SBuf hostOrIp() const
Definition: Uri.cc:115
static SBuf Encode(const SBuf &, const CharacterSet &expected)
Definition: Uri.cc:57
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:18
static const CharacterSet TCHAR
Definition: CharacterSet.h:105
CharacterSet & rename(const char *label)
change name; handy in const declarations that use operators
Definition: CharacterSet.h:61
static const CharacterSet DIGIT
Definition: CharacterSet.h:84
static const CharacterSet ALPHA
Definition: CharacterSet.h:76
static const CharacterSet HEXDIG
Definition: CharacterSet.h:88
CharacterSet & remove(const unsigned char c)
remove a given character from the character set
Definition: CharacterSet.cc:54
int64_t getInt64(Http::HdrType id) const
Definition: HttpHeader.cc:1134
HttpRequestMethod method
Definition: HttpRequest.h:114
char * canonicalCleanUrl() const
Definition: HttpRequest.cc:814
AnyP::Uri url
the request URI
Definition: HttpRequest.h:115
HttpHeader header
Definition: Message.h:74
bool fromHost(const char *hostWithoutPort)
Definition: Address.cc:898
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:79
const SBuf & remaining() const
the remaining unprocessed section of buffer
Definition: Tokenizer.h:44
bool atEnd() const
whether the end of the buffer has been reached
Definition: Tokenizer.h:41
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:189
Definition: Raw.h:21
Definition: SBuf.h:94
const char * rawContent() const
Definition: SBuf.cc:509
static const size_type npos
Definition: SBuf.h:99
char at(size_type pos) const
Definition: SBuf.h:249
const char * c_str()
Definition: SBuf.cc:516
void reserveCapacity(size_type minCapacity)
Definition: SBuf.cc:105
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:415
SBuf & appendf(const char *fmt,...) PRINTF_FORMAT_ARG2
Definition: SBuf.cc:229
size_type find(char c, size_type startPos=0) const
Definition: SBuf.cc:584
bool isEmpty() const
Definition: SBuf.h:431
const_iterator begin() const
Definition: SBuf.h:583
SBuf & append(const SBuf &S)
Definition: SBuf.cc:185
const_reverse_iterator rbegin() const
Definition: SBuf.h:591
void reserveSpace(size_type minSpace)
Definition: SBuf.h:440
size_t appendDomainLen
Definition: SquidConfig.h:223
int strip_query_terms
Definition: SquidConfig.h:300
struct SquidConfig::@106 onoff
char * appendDomain
Definition: SquidConfig.h:222
int uri_whitespace
Definition: SquidConfig.h:457
int check_hostnames
Definition: SquidConfig.h:316
int allow_underscore
Definition: SquidConfig.h:317
an std::runtime_error with thrower location info
Definition: TextException.h:21
A const & max(A const &lhs, A const &rhs)
#define w_space
#define MYNAME
Definition: Stream.h:236
#define DBG_IMPORTANT
Definition: Stream.h:38
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Stream.h:194
#define URI_WHITESPACE_CHOP
Definition: defines.h:129
#define URI_WHITESPACE_STRIP
Definition: defines.h:126
#define URI_WHITESPACE_DENY
Definition: defines.h:130
#define URI_WHITESPACE_ALLOW
Definition: defines.h:127
#define URI_WHITESPACE_ENCODE
Definition: defines.h:128
#define MAX_URL
Definition: defines.h:78
static int port
Definition: ldap_backend.cc:70
#define MAX_IPSTRLEN
Length of buffer that needs to be allocated to hold a null-terminated IP-string.
Definition: forward.h:25
static uint32 B
Definition: md4.c:43
const char * ProtocolType_str[]
uint16_t KnownPort
validated/supported port number; these values are never zero
Definition: UriScheme.h:23
@ PROTO_NONE
Definition: ProtocolType.h:24
@ PROTO_HTTPS
Definition: ProtocolType.h:27
@ PROTO_UNKNOWN
Definition: ProtocolType.h:41
@ PROTO_HTTP
Definition: ProtocolType.h:25
@ PROTO_FTP
Definition: ProtocolType.h:26
@ PROTO_WHOIS
Definition: ProtocolType.h:36
@ PROTO_MAX
Definition: ProtocolType.h:42
@ PROTO_URN
Definition: ProtocolType.h:35
@ PROTO_WAIS
Definition: ProtocolType.h:30
@ METHOD_TRACE
Definition: MethodType.h:30
@ METHOD_PUT
Definition: MethodType.h:27
@ METHOD_OPTIONS
Definition: MethodType.h:31
@ METHOD_CONNECT
Definition: MethodType.h:29
@ METHOD_GET
Definition: MethodType.h:25
@ METHOD_PURGE
Definition: MethodType.h:92
@ METHOD_HEAD
Definition: MethodType.h:28
#define xfree
#define xmalloc
#define RFC1738_ESCAPE_NOSPACE
Definition: rfc1738.h:22
char * rfc1738_do_escape(const char *url, int flags)
Definition: rfc1738.c:56
#define RFC1738_ESCAPE_UNESCAPED
Definition: rfc1738.h:25
#define rfc1738_escape_unescaped(x)
Definition: rfc1738.h:59
void rfc1738_unescape(char *url)
Definition: rfc1738.c:146
#define SQUIDHOSTNAMELEN
Definition: rfc2181.h:30
#define LOCAL_ARRAY(type, name, size)
Definition: squid.h:68
Definition: parse.c:160
#define xisspace(x)
Definition: xis.h:15
#define xtolower(x)
Definition: xis.h:17
char * xstrncpy(char *dst, const char *src, size_t n)
Definition: xstring.cc:37
char * xstrndup(const char *s, size_t n)
Definition: xstring.cc:56

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors