Uri.cc
Go to the documentation of this file.
1/*
2 * Copyright (C) 1996-2022 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9/* DEBUG: section 23 URL Parsing */
10
11#include "squid.h"
12#include "anyp/Uri.h"
13#include "base/Raw.h"
14#include "globals.h"
15#include "HttpRequest.h"
16#include "parser/Tokenizer.h"
17#include "rfc1738.h"
18#include "SquidConfig.h"
19#include "SquidString.h"
20
// Bytes accepted in a host name when Config.onoff.allow_underscore is set:
// ASCII letters, digits, '-', '.', '_', and the '[', ':' and ']' bytes.
// AnyP::Uri::parse() checks a found host against this set with strspn().
21static const char valid_hostname_chars_u[] =
22 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
23 "abcdefghijklmnopqrstuvwxyz"
24 "0123456789-._"
25 "[:]"
26 ;
// Same as valid_hostname_chars_u but without the underscore; used when
// Config.onoff.allow_underscore is not set.
27static const char valid_hostname_chars[] =
28 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
29 "abcdefghijklmnopqrstuvwxyz"
30 "0123456789-."
31 "[:]"
32 ;
33
// Characters which are valid within a URI userinfo section.
// NOTE(review): this listing lacks the declarator line (the cross-reference
// index names this function UserInfoChars()) and the tail of the
// userInfoValid initializer after the trailing '+'. Presumably the missing
// operands add the ALPHA and DIGIT sets named in the RFC excerpt below --
// confirm against the real file.
35static const CharacterSet &
37{
38 /*
39 * RFC 3986 section 3.2.1
40 *
41 * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
42 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
43 * pct-encoded = "%" HEXDIG HEXDIG
44 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
45 */
// Built once (function-local static) and returned by reference on every call.
46 static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
49 return userInfoValid;
50}
51
55SBuf
56AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
57{
58 if (buf.isEmpty())
59 return buf;
60
61 Parser::Tokenizer tk(buf);
62 SBuf goodSection;
63 // optimization for the arguably common "no encoding necessary" case
64 if (tk.prefix(goodSection, ignore) && tk.atEnd())
65 return buf;
66
67 SBuf output;
68 output.reserveSpace(buf.length() * 3); // worst case: encode all chars
69 output.append(goodSection); // may be empty
70
71 while (!tk.atEnd()) {
72 // TODO: Add Tokenizer::parseOne(void).
73 const auto ch = tk.remaining()[0];
74 output.appendf("%%%02X", static_cast<unsigned int>(ch)); // TODO: Optimize using a table
75 (void)tk.skip(ch);
76
77 if (tk.prefix(goodSection, ignore))
78 output.append(goodSection);
79 }
80
81 return output;
82}
83
84const SBuf &
86{
87 static SBuf star("*");
88 return star;
89}
90
91const SBuf &
93{
94 static SBuf slash("/");
95 return slash;
96}
97
98void
99AnyP::Uri::host(const char *src)
100{
101 hostAddr_.fromHost(src);
102 if (hostAddr_.isAnyAddr()) {
103 xstrncpy(host_, src, sizeof(host_));
104 hostIsNumeric_ = false;
105 } else {
106 hostAddr_.toHostStr(host_, sizeof(host_));
107 debugs(23, 3, "given IP: " << hostAddr_);
108 hostIsNumeric_ = 1;
109 }
110 touch();
111}
112
113SBuf
115{
116 if (hostIsNumeric()) {
117 static char ip[MAX_IPSTRLEN];
118 const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
119 return SBuf(ip, hostStrLen);
120 } else
121 return SBuf(host());
122}
123
124const SBuf &
126{
127 // RFC 3986 section 3.3 says path can be empty (path-abempty).
128 // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
129 // at least when sending and using. We must still accept path-abempty as input.
130 if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
131 return SlashPath();
132
133 return path_;
134}
135
136void
138{
139 debugs(23, 5, "urlInitialize: Initializing...");
140 /* this ensures that the number of protocol strings is the same as
141 * the enum slots allocated because the last enum is always 'MAX'.
142 */
143 assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
144 /*
145 * These test that our matchDomainName() function works the
146 * way we expect it to.
147 */
148 assert(0 == matchDomainName("foo.com", "foo.com"));
149 assert(0 == matchDomainName(".foo.com", "foo.com"));
150 assert(0 == matchDomainName("foo.com", ".foo.com"));
151 assert(0 == matchDomainName(".foo.com", ".foo.com"));
152 assert(0 == matchDomainName("x.foo.com", ".foo.com"));
153 assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
154 assert(0 != matchDomainName("x.foo.com", "foo.com"));
155 assert(0 != matchDomainName("foo.com", "x.foo.com"));
156 assert(0 != matchDomainName("bar.com", "foo.com"));
157 assert(0 != matchDomainName(".bar.com", "foo.com"));
158 assert(0 != matchDomainName(".bar.com", ".foo.com"));
159 assert(0 != matchDomainName("bar.com", ".foo.com"));
160 assert(0 < matchDomainName("zzz.com", "foo.com"));
161 assert(0 > matchDomainName("aaa.com", "foo.com"));
162 assert(0 == matchDomainName("FOO.com", "foo.COM"));
163 assert(0 < matchDomainName("bfoo.com", "afoo.com"));
164 assert(0 > matchDomainName("afoo.com", "bfoo.com"));
165 assert(0 < matchDomainName("x-foo.com", ".foo.com"));
166
167 assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
168 assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
169 assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
170 assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
171
172 assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
173 assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
174 assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
175 assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
176
177 /* more cases? */
178}
179
// Parses the scheme at the start of tok and returns it as an AnyP::UriScheme.
// Throws TextException("invalid URI scheme") when no acceptable scheme
// followed by ':' is found.
// NOTE(review): this listing lacks the declarator line (the cross-reference
// index gives it as uriParseScheme(Parser::Tokenizer &tok)) and the tail of
// the schemeChars initializer after the #endif -- confirm against the real
// file before relying on the exact character set.
187static AnyP::UriScheme
189{
190 /*
191 * RFC 3986 section 3.1 paragraph 2:
192 *
193 * Scheme names consist of a sequence of characters beginning with a
194 * letter and followed by any combination of letters, digits, plus
195 * ("+"), period ("."), or hyphen ("-").
196 *
197 * The underscore ("_") required to match "cache_object://" squid
198 * special URI scheme.
199 */
200 static const auto schemeChars =
201#if USE_HTTP_VIOLATIONS
202 CharacterSet("special", "_") +
203#endif
205
206 SBuf str;
// Accept up to 16 scheme characters followed by ':'; the first character
// must be a letter.
207 if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
208 const auto protocol = AnyP::UriScheme::FindProtocolType(str);
// An unrecognized scheme is constructed with its raw text; a recognized one
// is constructed with a null text pointer.
209 if (protocol == AnyP::PROTO_UNKNOWN)
210 return AnyP::UriScheme(protocol, str.c_str());
211 return AnyP::UriScheme(protocol, nullptr);
212 }
213
214 throw TextException("invalid URI scheme", Here());
215}
216
224bool
226{
227 /* For IPv4 addresses check for a dot */
228 /* For IPv6 addresses also check for a colon */
229 if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
230 const uint64_t dlen = strlen(host);
231 const uint64_t want = dlen + Config.appendDomainLen;
232 if (want > SQUIDHOSTNAMELEN - 1) {
233 debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
234 return false;
235 }
236 strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
237 }
238 return true;
239}
240
241/*
242 * Parse a URI/URL.
243 *
244 * It is assumed that the URL is complete -
245 * ie, the end of the string is the end of the URL. Don't pass a partial
246 * URL here as this routine doesn't have any way of knowing whether
247 * it is partial or not (ie, it handles the case of no trailing slash as
248 * being "end of host with implied path of /".
249 *
250 * method is used to switch parsers. If method is Http::METHOD_CONNECT,
251 * then rather than a URL a hostname:port is looked for.
252 */
// Parses rawUrl into this Uri (scheme, userinfo, host, port, path).
// \param method selects the grammar: CONNECT expects "host:port" or
//               "[host]:port"; OPTIONS/TRACE additionally accept "*"
// \param rawUrl the complete request-target text
// \returns true on success; false when the input is rejected, including
//          when any exception is thrown while parsing (caught below)
// NOTE(review): several lines of this function are absent from this listing
// (its own numbering skips 429, 444 and the case labels inside the
// uri_whitespace switch); the notes at those places are hedged accordingly.
253bool
254AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
255{
256 try {
257
258 LOCAL_ARRAY(char, login, MAX_URL);
259 LOCAL_ARRAY(char, foundHost, MAX_URL);
260 LOCAL_ARRAY(char, urlpath, MAX_URL);
261 char *t = nullptr;
262 char *q = nullptr;
263 int foundPort;
264 int l;
265 int i;
266 const char *src;
267 char *dst;
268 foundHost[0] = urlpath[0] = login[0] = '\0';
269
// Reject input whose length plus the append_domain length exceeds the
// MAX_URL-sized scratch buffers above.
270 if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
271 debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
272 return false;
273 }
274
// "OPTIONS *" and "TRACE *": store an http scheme with its default port and
// the "*" path; no host is set on this path.
275 if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
276 Asterisk().cmp(rawUrl) == 0) {
277 // XXX: these methods might also occur in HTTPS traffic. Handle this better.
278 setScheme(AnyP::PROTO_HTTP, nullptr);
279 port(getScheme().defaultPort());
280 path(Asterisk());
281 return true;
282 }
283
284 Parser::Tokenizer tok(rawUrl);
285 AnyP::UriScheme scheme;
286
287 if (method == Http::METHOD_CONNECT) {
288 /*
289 * RFC 7230 section 5.3.3: authority-form = authority
290 * "excluding any userinfo and its "@" delimiter"
291 *
292 * RFC 3986 section 3.2: authority = [ userinfo "@" ] host [ ":" port ]
293 *
294 * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
295 */
296 foundPort = 443;
297
298 // XXX: use tokenizer
299 auto B = tok.buf();
300 const char *url = B.c_str();
301
// Try the bracketed "[host]:port" form first, then plain "host:port".
// A result of 1 (host only) is accepted and leaves foundPort at 443.
302 if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
303 if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
304 return false;
305
306 } else {
307
308 scheme = uriParseScheme(tok);
309
310 if (scheme == AnyP::PROTO_NONE)
311 return false; // invalid scheme
312
313 if (scheme == AnyP::PROTO_URN) {
314 parseUrn(tok); // throws on any error
315 return true;
316 }
317
318 // URLs then have "//"
319 static const SBuf doubleSlash("//");
320 if (!tok.skip(doubleSlash))
321 return false;
322
323 auto B = tok.remaining();
324 const char *url = B.c_str();
325
326 /* Parse the URL: */
327 src = url;
328 i = 0;
329
330 /* Then everything until first /; that's host (and port; which we'll look for here later) */
331 // bug 1881: If we don't get a "/" then we imply it was there
332 // bug 3074: We could just be given a "?" or "#". These also imply "/"
333 // bug 3233: whitespace is also a hostname delimiter.
334 for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
335 *dst = *src;
336 }
337
338 /*
339 * We can't check for "i >= l" here because we could be at the end of the line
340 * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
341 * been -given- a valid URL and the path is just '/'.
342 */
343 if (i > l)
344 return false;
345 *dst = '\0';
346
347 // We are looking at path-abempty.
348 if (*src != '/') {
349 // path-empty, including the end of the `src` c-string cases
350 urlpath[0] = '/';
351 dst = &urlpath[1];
352 } else {
353 dst = urlpath;
354 }
355 /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
356 for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
357 *dst = *src;
358 }
359
360 /* We -could- be at the end of the buffer here */
361 if (i > l)
362 return false;
363 *dst = '\0';
364
365 foundPort = scheme.defaultPort(); // may be reset later
366
367 /* Is there any login information? (we should eventually parse it above) */
// Split at the LAST '@': the text before it becomes login, the text after
// it becomes foundHost.
368 t = strrchr(foundHost, '@');
369 if (t != nullptr) {
370 strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
371 login[sizeof(login)-1] = '\0';
372 t = strrchr(login, '@');
373 *t = 0;
374 strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
375 foundHost[sizeof(foundHost)-1] = '\0';
376 // Bug 4498: URL-unescape the login info after extraction
377 rfc1738_unescape(login);
378 }
379
380 /* Is there any host information? (we should eventually parse it above) */
381 if (*foundHost == '[') {
382 /* strip any IPA brackets. valid under IPv6. */
383 dst = foundHost;
384 /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
385 src = foundHost;
386 ++src;
387 l = strlen(foundHost);
388 i = 1;
389 for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
390 *dst = *src;
391 }
392
393 /* we moved in-place, so truncate the actual hostname found */
394 *dst = '\0';
395 ++dst;
396
397 /* skip ahead to either start of port, or original EOS */
398 while (*dst != '\0' && *dst != ':')
399 ++dst;
400 t = dst;
401 } else {
402 t = strrchr(foundHost, ':');
403
// More than one ':' without brackets: no port is split off (t is reset).
404 if (t != strchr(foundHost,':') ) {
405 /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
406 /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
407 /* therefore we MUST accept the case where they are not bracketed at all. */
408 t = nullptr;
409 }
410 }
411
412 // Bug 3183 sanity check: If scheme is present, host must be too.
413 if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
414 debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
415 return false;
416 }
417
// Cut the host at the port delimiter and parse the port with atoi().
418 if (t && *t == ':') {
419 *t = '\0';
420 ++t;
421 foundPort = atoi(t);
422 }
423 }
424
// Lower-case the host in place.
425 for (t = foundHost; *t; ++t)
426 *t = xtolower(*t);
427
// NOTE(review): the line between these two (listing line 429) is missing.
// It must open the inner brace closed below and presumably restricts the
// whitespace removal to one uri_whitespace setting -- confirm.
428 if (stringHasWhitespace(foundHost)) {
430 t = q = foundHost;
431 while (*t) {
432 if (!xisspace(*t)) {
433 *q = *t;
434 ++q;
435 }
436 ++t;
437 }
438 *q = '\0';
439 }
440 }
441
442 debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
443
// NOTE(review): the opening of this condition (listing line 444) is missing.
// The visible part rejects a host containing any byte outside the
// configured valid_hostname_chars set.
445 strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
446 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
447 return false;
448 }
449
450 if (!urlAppendDomain(foundHost))
451 return false;
452
453 /* remove trailing dots from hostnames */
454 while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
455 foundHost[l] = '\0';
456
457 /* reject duplicate or leading dots */
458 if (strstr(foundHost, "..") || *foundHost == '.') {
459 debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
460 return false;
461 }
462
463 if (foundPort < 1 || foundPort > 65535) {
464 debugs(23, 3, "Invalid port '" << foundPort << "'");
465 return false;
466 }
467
468 if (stringHasWhitespace(urlpath)) {
469 debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
470
// NOTE(review): the case labels of this switch are missing from the listing.
// The visible branch bodies, in order: reject the URL; leave the path as
// is; escape the path; truncate the path at the first whitespace; and
// (default) remove whitespace characters from the path.
471 switch (Config.uri_whitespace) {
472
474 return false;
475
477 break;
478
480 t = rfc1738_escape_unescaped(urlpath);
481 xstrncpy(urlpath, t, MAX_URL);
482 break;
483
485 *(urlpath + strcspn(urlpath, w_space)) = '\0';
486 break;
487
489 default:
490 t = q = urlpath;
491 while (*t) {
492 if (!xisspace(*t)) {
493 *q = *t;
494 ++q;
495 }
496 ++t;
497 }
498 *q = '\0';
499 }
500 }
501
// Everything validated: commit the parsed components to this object.
502 setScheme(scheme);
503 path(urlpath);
504 host(foundHost);
505 userInfo(SBuf(login));
506 port(foundPort);
507 return true;
508
509 } catch (...) {
510 debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
511 return false;
512 }
513}
514
529void
531{
532 static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
533 static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
534 SBuf nid;
535 if (!tok.prefix(nid, nidChars, 32))
536 throw TextException("NID not found", Here());
537
538 if (!tok.skip(':'))
539 throw TextException("NID too long or missing ':' delimiter", Here());
540
541 if (nid.length() < 2)
542 throw TextException("NID too short", Here());
543
544 if (!alphanum[*nid.begin()])
545 throw TextException("NID prefix is not alphanumeric", Here());
546
547 if (!alphanum[*nid.rbegin()])
548 throw TextException("NID suffix is not alphanumeric", Here());
549
550 setScheme(AnyP::PROTO_URN, nullptr);
551 host(nid.c_str());
552 // TODO validate path characters
553 path(tok.remaining());
554 debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
555}
556
557void
559{
560 absolute_.clear();
561 authorityHttp_.clear();
562 authorityWithPort_.clear();
563}
564
565SBuf &
566AnyP::Uri::authority(bool requirePort) const
567{
568 if (authorityHttp_.isEmpty()) {
569
570 // both formats contain Host/IP
571 authorityWithPort_.append(host());
572 authorityHttp_ = authorityWithPort_;
573
574 // authorityForm_ only has :port if it is non-default
575 authorityWithPort_.appendf(":%u",port());
576 if (port() != getScheme().defaultPort())
577 authorityHttp_ = authorityWithPort_;
578 }
579
580 return requirePort ? authorityWithPort_ : authorityHttp_;
581}
582
583SBuf &
585{
586 if (absolute_.isEmpty()) {
587 // TODO: most URL will be much shorter, avoid allocating this much
588 absolute_.reserveCapacity(MAX_URL);
589
590 absolute_.append(getScheme().image());
591 absolute_.append(":",1);
592 if (getScheme() != AnyP::PROTO_URN) {
593 absolute_.append("//", 2);
594 const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
595 getScheme() == AnyP::PROTO_UNKNOWN;
596
597 if (allowUserInfo && !userInfo().isEmpty()) {
598 static const CharacterSet uiChars = CharacterSet(UserInfoChars())
599 .remove('%')
600 .rename("userinfo-reserved");
601 absolute_.append(Encode(userInfo(), uiChars));
602 absolute_.append("@", 1);
603 }
604 absolute_.append(authority());
605 } else {
606 absolute_.append(host());
607 absolute_.append(":", 1);
608 }
609 absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
610 }
611
612 return absolute_;
613}
614
615/* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
616 * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
617 * and never copy the query-string part in the first place
618 */
// Copies url into a MAX_URL-sized buffer declared with LOCAL_ARRAY and
// returns a pointer to that buffer. When Config.onoff.strip_query_terms is
// set, and the request is neither CONNECT nor a URN, everything after the
// first '?' is removed.
// NOTE(review): this listing lacks the declarator line (the cross-reference
// index gives it as urlCanonicalCleanWithoutRequest(const SBuf &url,
// const HttpRequestMethod &method, const AnyP::UriScheme &scheme)) and the
// statement controlled by the stringHasCntl() test near the end.
619char *
621{
622 LOCAL_ARRAY(char, buf, MAX_URL);
623
// snprintf truncates an over-long url to the buffer size
624 snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
625 buf[sizeof(buf)-1] = '\0';
626
627 // URN, CONNECT method, and non-stripped URIs can go straight out
628 if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
629 // strip anything AFTER a question-mark
630 // leaving the '?' in place
631 if (auto t = strchr(buf, '?')) {
632 *(++t) = '\0';
633 }
634 }
635
// NOTE(review): the body of this if (listing line 637) is missing;
// presumably it escapes the control characters in buf -- confirm.
636 if (stringHasCntl(buf))
638
639 return buf;
640}
641
648const char *
650{
651 LOCAL_ARRAY(char, buf, MAX_URL);
652
653 // method CONNECT and port HTTPS
654 if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
655 snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
656 return buf;
657 }
658
659 // else do the normal complete canonical thing.
660 return request->canonicalCleanUrl();
661}
662
/// Tells whether url is a relative reference rather than a URI with a scheme.
/// \param url a NUL-terminated string, or nullptr
/// \returns false for nullptr and for text whose first segment contains a
///          ':'; true for everything else, including the empty string and
///          any text that starts with '/'
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 section 5.2.3
     *
     * path          = path-abempty    ; begins with "/" or is empty
     *               / path-absolute   ; begins with "/" but not "//"
     *               / path-noscheme   ; begins with a non-colon segment
     *               / path-rootless   ; begins with a segment
     *               / path-empty      ; zero characters
     */

    // path-empty, plus both kinds of leading-slash references: the
    // network-path reference ("//...", aka 'scheme-relative URI') and the
    // path-absolute one ("/...", aka 'absolute-path reference')
    if (url[0] == '\0' || url[0] == '/')
        return true;

    // Locate the first delimiter of interest. The first segment ends at '/',
    // '?', '#' or the end of the string; a ':' found before that point is
    // forbidden in the first segment of a relative reference.
    const auto firstDelimiter = strcspn(url, ":/?#");
    return url[firstDelimiter] != ':'; // path-noscheme, path-abempty, path-rootless
}
710
711void
712AnyP::Uri::addRelativePath(const char *relUrl)
713{
714 // URN cannot be merged
715 if (getScheme() == AnyP::PROTO_URN)
716 return;
717
718 // TODO: Handle . and .. segment normalization
719
720 const auto lastSlashPos = path_.rfind('/');
721 // TODO: To optimize and simplify, add and use SBuf::replace().
722 const auto relUrlLength = strlen(relUrl);
723 if (lastSlashPos == SBuf::npos) {
724 // start replacing the whole path
725 path_.reserveCapacity(1 + relUrlLength);
726 path_.assign("/", 1);
727 } else {
728 // start replacing just the last segment
729 path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
730 path_.chop(0, lastSlashPos+1);
731 }
732 path_.append(relUrl, relUrlLength);
733}
734
735int
736matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
737{
738 int dl;
739 int hl;
740
741 const bool hostIncludesSubdomains = (*h == '.');
742 while ('.' == *h)
743 ++h;
744
745 hl = strlen(h);
746
747 if (hl == 0)
748 return -1;
749
750 dl = strlen(d);
751
752 /*
753 * Start at the ends of the two strings and work towards the
754 * beginning.
755 */
756 while (xtolower(h[--hl]) == xtolower(d[--dl])) {
757 if (hl == 0 && dl == 0) {
758 /*
759 * We made it all the way to the beginning of both
760 * strings without finding any difference.
761 */
762 return 0;
763 }
764
765 if (0 == hl) {
766 /*
767 * The host string is shorter than the domain string.
768 * There is only one case when this can be a match.
769 * If the domain is just one character longer, and if
770 * that character is a leading '.' then we call it a
771 * match.
772 */
773
774 if (1 == dl && '.' == d[0])
775 return 0;
776 else
777 return -1;
778 }
779
780 if (0 == dl) {
781 /*
782 * The domain string is shorter than the host string.
783 * This is a match only if the first domain character
784 * is a leading '.'.
785 */
786
787 if ('.' == d[0]) {
788 if (flags & mdnRejectSubsubDomains) {
789 // Check for sub-sub domain and reject
790 while(--hl >= 0 && h[hl] != '.');
791 if (hl < 0) {
792 // No sub-sub domain found, but reject if there is a
793 // leading dot in given host string (which is removed
794 // before the check is started).
795 return hostIncludesSubdomains ? 1 : 0;
796 } else
797 return 1; // sub-sub domain, reject
798 } else
799 return 0;
800 } else
801 return 1;
802 }
803 }
804
805 /*
806 * We found different characters in the same position (from the end).
807 */
808
809 // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
810 // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
811 // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
812 if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
813 return 0;
814
815 /*
816 * If one of those character is '.' then its special. In order
817 * for splay tree sorting to work properly, "x-foo.com" must
818 * be greater than ".foo.com" even though '-' is less than '.'.
819 */
820 if ('.' == d[dl])
821 return 1;
822
823 if ('.' == h[hl])
824 return -1;
825
826 return (xtolower(h[hl]) - xtolower(d[dl]));
827}
828
829/*
830 * return true if we can serve requests for this method.
831 */
// Decides whether the request r can be served, from its method and URL scheme.
// NOTE(review): this listing lacks the declarator line (the cross-reference
// index gives it as urlCheckRequest(const HttpRequest *r)), the conditions
// that guard the first "return true" and the OPTIONS/TRACE handling, and
// several case labels and method comparisons in the switch. Only what is
// visible is described here.
832bool
834{
835 /* protocol "independent" methods
836 *
837 * actually these methods are specific to HTTP:
838 * they are methods we receive on our HTTP port,
839 * and if we had a FTP listener would not be relevant
840 * there.
841 *
842 * So, we should delegate them to HTTP. The problem is that we
843 * do not have a default protocol from the client side of HTTP.
844 */
845
847 return true;
848
849 // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
850 // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
853
// PURGE is accepted regardless of the URL scheme
854 if (r->method == Http::METHOD_PURGE)
855 return true;
856
857 /* does method match the protocol? */
858 switch (r->url.getScheme()) {
859
// urn and http (and at least one label missing here) accept any method
// that reaches this switch
860 case AnyP::PROTO_URN:
861 case AnyP::PROTO_HTTP:
863 return true;
864
// ftp: PUT, GET and one further method (comparison missing from the listing)
865 case AnyP::PROTO_FTP:
866 if (r->method == Http::METHOD_PUT ||
867 r->method == Http::METHOD_GET ||
869 return true;
870 return false;
871
// wais (and a label missing here): GET and one further method
872 case AnyP::PROTO_WAIS:
874 if (r->method == Http::METHOD_GET ||
876 return true;
877 return false;
878
// NOTE(review): the case label for this branch is missing; the comment
// below indicates that it covers "https:" URLs, which are accepted only in
// builds with OpenSSL or GnuTLS support.
880#if USE_OPENSSL || USE_GNUTLS
881 return true;
882#else
883 /*
884 * Squid can't originate an SSL connection, so it should
885 * never receive an "https:" URL. It should always be
886 * CONNECT instead.
887 */
888 return false;
889#endif
890
891 default:
892 return false;
893 }
894
895 /* notreached */
896 return false;
897}
898
// Constructor tail: stores aScheme as the scheme, marks the host as
// non-numeric, zeroes the port and makes host_ an empty C string.
// NOTE(review): the declarator line that names the constructor and declares
// aScheme is missing from this listing.
900 scheme_(aScheme),
901 hostIsNumeric_(false),
902 port_(0)
903{
904 *host_=0;
905}
906
907// TODO: fix code duplication with AnyP::Uri::parse()
// Returns a cleaned copy of uri, produced with xstrndup() and chosen by
// Config.uri_whitespace. The result is asserted to be non-null.
// Presumably the caller releases it with xfree() -- confirm against callers.
// NOTE(review): several case labels and statements of the switch are
// missing from this listing (its own numbering skips 913-914, 919-920, 929
// and 935-936); only the visible parts are described.
908char *
909AnyP::Uri::cleanup(const char *uri)
910{
911 char *cleanedUri = nullptr;
912 switch (Config.uri_whitespace) {
// (label and the declaration of flags missing) escape uri, copying at most
// MAX_URL bytes of the result
915 cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
916 break;
917 }
918
921 break;
922
// CHOP: cut uri at its first whitespace character, then escape what is left
923 case URI_WHITESPACE_CHOP: {
924 const auto pos = strcspn(uri, w_space);
925 char *choppedUri = nullptr;
926 if (pos < strlen(uri))
927 choppedUri = xstrndup(uri, pos + 1);
928 cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
930 cleanedUri[pos] = '\0';
931 xfree(choppedUri);
932 break;
933 }
934
// default: copy uri without its whitespace characters into a temporary
// buffer, escape that copy, then free the temporary
937 default: {
938 // TODO: avoid duplication with urlParse()
939 const char *t;
940 char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
941 char *q = tmp_uri;
942 t = uri;
943 while (*t) {
944 if (!xisspace(*t)) {
945 *q = *t;
946 ++q;
947 }
948 ++t;
949 }
950 *q = '\0';
951 cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
952 xfree(tmp_uri);
953 break;
954 }
955 }
956
957 assert(cleanedUri);
958 return cleanedUri;
959}
960
#define false
Definition: GnuRegex.c:240
#define Here()
source code location of the caller
Definition: Here.h:15
#define SQUIDSBUFPH
Definition: SBuf.h:31
#define SQUIDSBUFPRINT(s)
Definition: SBuf.h:32
class SquidConfig Config
Definition: SquidConfig.cc:12
int stringHasWhitespace(const char *)
Definition: String.cc:366
int stringHasCntl(const char *)
Definition: String.cc:373
std::ostream & CurrentException(std::ostream &os)
prints active (i.e., thrown but not yet handled) exception
bool urlCheckRequest(const HttpRequest *r)
Definition: Uri.cc:833
static const char valid_hostname_chars[]
Definition: Uri.cc:27
static const char valid_hostname_chars_u[]
Definition: Uri.cc:21
bool urlIsRelative(const char *url)
Definition: Uri.cc:676
void urlInitialize(void)
Definition: Uri.cc:137
int matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
Definition: Uri.cc:736
char * urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
Definition: Uri.cc:620
static AnyP::UriScheme uriParseScheme(Parser::Tokenizer &tok)
Definition: Uri.cc:188
static const CharacterSet & UserInfoChars()
Characters which are valid within a URI userinfo section.
Definition: Uri.cc:36
const char * urlCanonicalFakeHttps(const HttpRequest *request)
Definition: Uri.cc:649
bool urlAppendDomain(char *host)
apply append_domain config to the given hostname
Definition: Uri.cc:225
MatchDomainNameFlags
Definition: Uri.h:223
@ mdnRejectSubsubDomains
Definition: Uri.h:226
@ mdnHonorWildcards
Definition: Uri.h:225
#define assert(EX)
Definition: assert.h:19
unsigned short defaultPort() const
Definition: UriScheme.cc:71
static AnyP::ProtocolType FindProtocolType(const SBuf &)
Definition: UriScheme.cc:52
SBuf image() const
Definition: UriScheme.h:50
static const SBuf & SlashPath()
the static '/' default URL-path
Definition: Uri.cc:92
void parseUrn(Parser::Tokenizer &)
Definition: Uri.cc:530
AnyP::UriScheme const & getScheme() const
Definition: Uri.h:67
void touch()
clear the cached URI display forms
Definition: Uri.cc:558
SBuf & authority(bool requirePort=false) const
Definition: Uri.cc:566
void path(const char *p)
Definition: Uri.h:99
char host_[SQUIDHOSTNAMELEN]
string representation of the URI authority name or IP
Definition: Uri.h:174
const char * host(void) const
Definition: Uri.h:85
Uri()
Definition: Uri.h:35
static char * cleanup(const char *uri)
Definition: Uri.cc:909
void addRelativePath(const char *relUrl)
Definition: Uri.cc:712
SBuf & absolute() const
Definition: Uri.cc:584
static const SBuf & Asterisk()
the static '*' pseudo-URI
Definition: Uri.cc:85
const SBuf & path() const
Definition: Uri.cc:125
bool parse(const HttpRequestMethod &, const SBuf &url)
Definition: Uri.cc:254
SBuf hostOrIp() const
Definition: Uri.cc:114
static SBuf Encode(const SBuf &, const CharacterSet &expected)
Definition: Uri.cc:56
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:18
CharacterSet & rename(const char *label)
change name; handy in const declarations that use operators
Definition: CharacterSet.h:61
static const CharacterSet DIGIT
Definition: CharacterSet.h:84
static const CharacterSet ALPHA
Definition: CharacterSet.h:76
CharacterSet & remove(const unsigned char c)
remove a given character from the character set
Definition: CharacterSet.cc:54
int64_t getInt64(Http::HdrType id) const
Definition: HttpHeader.cc:1130
HttpRequestMethod method
Definition: HttpRequest.h:114
AnyP::Uri url
the request URI
Definition: HttpRequest.h:115
HttpHeader header
Definition: Message.h:74
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:81
const SBuf & remaining() const
the remaining unprocessed section of buffer
Definition: Tokenizer.h:44
bool atEnd() const
whether the end of the buffer has been reached
Definition: Tokenizer.h:41
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:179
Definition: Raw.h:21
Definition: SBuf.h:94
const char * rawContent() const
Definition: SBuf.cc:509
static const size_type npos
Definition: SBuf.h:99
char at(size_type pos) const
Definition: SBuf.h:249
const char * c_str()
Definition: SBuf.cc:516
void reserveCapacity(size_type minCapacity)
Definition: SBuf.cc:105
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:415
bool isEmpty() const
Definition: SBuf.h:431
const_iterator begin() const
Definition: SBuf.h:583
SBuf & append(const SBuf &S)
Definition: SBuf.cc:185
const_reverse_iterator rbegin() const
Definition: SBuf.h:591
void reserveSpace(size_type minSpace)
Definition: SBuf.h:440
SBuf & appendf(const char *fmt,...)
Definition: SBuf.cc:229
size_t appendDomainLen
Definition: SquidConfig.h:221
struct SquidConfig::@111 onoff
int strip_query_terms
Definition: SquidConfig.h:298
char * appendDomain
Definition: SquidConfig.h:220
int uri_whitespace
Definition: SquidConfig.h:455
int check_hostnames
Definition: SquidConfig.h:314
int allow_underscore
Definition: SquidConfig.h:315
an std::runtime_error with thrower location info
Definition: TextException.h:21
#define w_space
#define MYNAME
Definition: Stream.h:238
#define DBG_IMPORTANT
Definition: Stream.h:41
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Stream.h:196
#define URI_WHITESPACE_CHOP
Definition: defines.h:129
#define URI_WHITESPACE_STRIP
Definition: defines.h:126
#define URI_WHITESPACE_DENY
Definition: defines.h:130
#define URI_WHITESPACE_ALLOW
Definition: defines.h:127
#define URI_WHITESPACE_ENCODE
Definition: defines.h:128
#define MAX_URL
Definition: defines.h:78
static int port
Definition: ldap_backend.cc:70
#define MAX_IPSTRLEN
Length of buffer that needs to be allocated to hold a null-terminated IP-string.
Definition: forward.h:25
static uint32 B
Definition: md4.c:43
const char * ProtocolType_str[]
@ PROTO_NONE
Definition: ProtocolType.h:24
@ PROTO_HTTPS
Definition: ProtocolType.h:27
@ PROTO_UNKNOWN
Definition: ProtocolType.h:42
@ PROTO_HTTP
Definition: ProtocolType.h:25
@ PROTO_FTP
Definition: ProtocolType.h:26
@ PROTO_WHOIS
Definition: ProtocolType.h:37
@ PROTO_MAX
Definition: ProtocolType.h:43
@ PROTO_CACHE_OBJECT
Definition: ProtocolType.h:31
@ PROTO_URN
Definition: ProtocolType.h:36
@ PROTO_WAIS
Definition: ProtocolType.h:30
@ METHOD_TRACE
Definition: MethodType.h:30
@ METHOD_PUT
Definition: MethodType.h:27
@ METHOD_OPTIONS
Definition: MethodType.h:31
@ METHOD_CONNECT
Definition: MethodType.h:29
@ METHOD_GET
Definition: MethodType.h:25
@ METHOD_PURGE
Definition: MethodType.h:92
@ METHOD_HEAD
Definition: MethodType.h:28
#define xfree
#define xmalloc
#define RFC1738_ESCAPE_NOSPACE
Definition: rfc1738.h:22
char * rfc1738_do_escape(const char *url, int flags)
Definition: rfc1738.c:56
#define RFC1738_ESCAPE_UNESCAPED
Definition: rfc1738.h:25
#define rfc1738_escape_unescaped(x)
Definition: rfc1738.h:59
void rfc1738_unescape(char *url)
Definition: rfc1738.c:146
#define SQUIDHOSTNAMELEN
Definition: rfc2181.h:30
#define LOCAL_ARRAY(type, name, size)
Definition: squid.h:68
char * url
Definition: tcp-banger2.c:114
char method[16]
Definition: tcp-banger2.c:115
Definition: parse.c:160
struct _request * request(char *urlin)
Definition: tcp-banger2.c:291
#define xisspace(x)
Definition: xis.h:17
#define xtolower(x)
Definition: xis.h:19
char * xstrncpy(char *dst, const char *src, size_t n)
Definition: xstring.cc:37
char * xstrndup(const char *s, size_t n)
Definition: xstring.cc:56

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors