Uri.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 1996-2022 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 /* DEBUG: section 23 URL Parsing */
10 
11 #include "squid.h"
12 #include "anyp/Uri.h"
13 #include "base/Raw.h"
14 #include "globals.h"
15 #include "HttpRequest.h"
16 #include "parser/Tokenizer.h"
17 #include "rfc1738.h"
18 #include "SquidConfig.h"
19 #include "SquidString.h"
20 
/// Bytes accepted in a hostname when underscores are tolerated.
/// AnyP::Uri::parse() checks hosts against this set when
/// Config.onoff.allow_underscore is enabled.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" // letters
    "0123456789" // digits
    "-._" // hyphen, dot, underscore
    "[:]"; // square brackets and colon (presumably for IP literals)

/// Bytes accepted in a hostname when underscores are not tolerated:
/// the same set as valid_hostname_chars_u minus the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" // letters
    "0123456789" // digits
    "-." // hyphen, dot
    "[:]"; // square brackets and colon (presumably for IP literals)
33 
35 static const CharacterSet &
37 {
38  /*
39  * RFC 3986 section 3.2.1
40  *
41  * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
42  * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
43  * pct-encoded = "%" HEXDIG HEXDIG
44  * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
45  */
46  static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
49  return userInfoValid;
50 }
51 
55 SBuf
56 AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
57 {
58  if (buf.isEmpty())
59  return buf;
60 
61  Parser::Tokenizer tk(buf);
62  SBuf goodSection;
63  // optimization for the arguably common "no encoding necessary" case
64  if (tk.prefix(goodSection, ignore) && tk.atEnd())
65  return buf;
66 
67  SBuf output;
68  output.reserveSpace(buf.length() * 3); // worst case: encode all chars
69  output.append(goodSection); // may be empty
70 
71  while (!tk.atEnd()) {
72  // TODO: Add Tokenizer::parseOne(void).
73  const auto ch = tk.remaining()[0];
74  output.appendf("%%%02X", static_cast<unsigned int>(ch)); // TODO: Optimize using a table
75  (void)tk.skip(ch);
76 
77  if (tk.prefix(goodSection, ignore))
78  output.append(goodSection);
79  }
80 
81  return output;
82 }
83 
84 const SBuf &
86 {
87  static SBuf star("*");
88  return star;
89 }
90 
91 const SBuf &
93 {
94  static SBuf slash("/");
95  return slash;
96 }
97 
98 void
99 AnyP::Uri::host(const char *src)
100 {
101  hostAddr_.fromHost(src);
102  if (hostAddr_.isAnyAddr()) {
103  xstrncpy(host_, src, sizeof(host_));
104  hostIsNumeric_ = false;
105  } else {
106  hostAddr_.toHostStr(host_, sizeof(host_));
107  debugs(23, 3, "given IP: " << hostAddr_);
108  hostIsNumeric_ = 1;
109  }
110  touch();
111 }
112 
113 SBuf
115 {
116  if (hostIsNumeric()) {
117  static char ip[MAX_IPSTRLEN];
118  const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
119  return SBuf(ip, hostStrLen);
120  } else
121  return SBuf(host());
122 }
123 
124 const SBuf &
126 {
127  // RFC 3986 section 3.3 says path can be empty (path-abempty).
128  // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
129  // at least when sending and using. We must still accept path-abempty as input.
130  if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
131  return SlashPath();
132 
133  return path_;
134 }
135 
136 void
138 {
139  debugs(23, 5, "urlInitialize: Initializing...");
140  /* this ensures that the number of protocol strings is the same as
141  * the enum slots allocated because the last enum is always 'MAX'.
142  */
143  assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
144  /*
145  * These test that our matchDomainName() function works the
146  * way we expect it to.
147  */
148  assert(0 == matchDomainName("foo.com", "foo.com"));
149  assert(0 == matchDomainName(".foo.com", "foo.com"));
150  assert(0 == matchDomainName("foo.com", ".foo.com"));
151  assert(0 == matchDomainName(".foo.com", ".foo.com"));
152  assert(0 == matchDomainName("x.foo.com", ".foo.com"));
153  assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
154  assert(0 != matchDomainName("x.foo.com", "foo.com"));
155  assert(0 != matchDomainName("foo.com", "x.foo.com"));
156  assert(0 != matchDomainName("bar.com", "foo.com"));
157  assert(0 != matchDomainName(".bar.com", "foo.com"));
158  assert(0 != matchDomainName(".bar.com", ".foo.com"));
159  assert(0 != matchDomainName("bar.com", ".foo.com"));
160  assert(0 < matchDomainName("zzz.com", "foo.com"));
161  assert(0 > matchDomainName("aaa.com", "foo.com"));
162  assert(0 == matchDomainName("FOO.com", "foo.COM"));
163  assert(0 < matchDomainName("bfoo.com", "afoo.com"));
164  assert(0 > matchDomainName("afoo.com", "bfoo.com"));
165  assert(0 < matchDomainName("x-foo.com", ".foo.com"));
166 
167  assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
168  assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
169  assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
170  assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
171 
172  assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
173  assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
174  assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
175  assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
176 
177  /* more cases? */
178 }
179 
187 static AnyP::UriScheme
189 {
190  /*
191  * RFC 3986 section 3.1 paragraph 2:
192  *
193  * Scheme names consist of a sequence of characters beginning with a
194  * letter and followed by any combination of letters, digits, plus
195  * ("+"), period ("."), or hyphen ("-").
196  *
197  * The underscore ("_") required to match "cache_object://" squid
198  * special URI scheme.
199  */
200  static const auto schemeChars =
201 #if USE_HTTP_VIOLATIONS
202  CharacterSet("special", "_") +
203 #endif
205 
206  SBuf str;
207  if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
208  const auto protocol = AnyP::UriScheme::FindProtocolType(str);
209  if (protocol == AnyP::PROTO_UNKNOWN)
210  return AnyP::UriScheme(protocol, str.c_str());
211  return AnyP::UriScheme(protocol, nullptr);
212  }
213 
214  throw TextException("invalid URI scheme", Here());
215 }
216 
224 bool
225 urlAppendDomain(char *host)
226 {
227  /* For IPv4 addresses check for a dot */
228  /* For IPv6 addresses also check for a colon */
229  if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
230  const uint64_t dlen = strlen(host);
231  const uint64_t want = dlen + Config.appendDomainLen;
232  if (want > SQUIDHOSTNAMELEN - 1) {
233  debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
234  return false;
235  }
236  strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
237  }
238  return true;
239 }
240 
241 /*
242  * Parse a URI/URL.
243  *
244  * It is assumed that the URL is complete -
245  * i.e., the end of the string is the end of the URL. Don't pass a partial
246  * URL here as this routine doesn't have any way of knowing whether
247  * it is partial or not (i.e., it handles the case of no trailing slash as
248  * being "end of host with implied path of /").
249  *
250  * method is used to switch parsers. If method is Http::METHOD_CONNECT,
251  * then rather than a URL a hostname:port is looked for.
252  */
253 bool
254 AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
255 {
256  try {
257 
258  LOCAL_ARRAY(char, login, MAX_URL);
259  LOCAL_ARRAY(char, foundHost, MAX_URL);
260  LOCAL_ARRAY(char, urlpath, MAX_URL);
261  char *t = NULL;
262  char *q = NULL;
263  int foundPort;
264  int l;
265  int i;
266  const char *src;
267  char *dst;
268  foundHost[0] = urlpath[0] = login[0] = '\0';
269 
270  if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
271  debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
272  return false;
273  }
274 
275  if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
276  Asterisk().cmp(rawUrl) == 0) {
277  // XXX: these methods might also occur in HTTPS traffic. Handle this better.
278  setScheme(AnyP::PROTO_HTTP, nullptr);
279  port(getScheme().defaultPort());
280  path(Asterisk());
281  return true;
282  }
283 
284  Parser::Tokenizer tok(rawUrl);
285  AnyP::UriScheme scheme;
286 
287  if (method == Http::METHOD_CONNECT) {
288  /*
289  * RFC 7230 section 5.3.3: authority-form = authority
290  * "excluding any userinfo and its "@" delimiter"
291  *
292  * RFC 3986 section 3.2: authority = [ userinfo "@" ] host [ ":" port ]
293  *
294  * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
295  */
296  foundPort = 443;
297 
298  // XXX: use tokenizer
299  auto B = tok.buf();
300  const char *url = B.c_str();
301 
302  if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
303  if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
304  return false;
305 
306  } else {
307 
308  scheme = uriParseScheme(tok);
309 
310  if (scheme == AnyP::PROTO_NONE)
311  return false; // invalid scheme
312 
313  if (scheme == AnyP::PROTO_URN) {
314  parseUrn(tok); // throws on any error
315  return true;
316  }
317 
318  // URLs then have "//"
319  static const SBuf doubleSlash("//");
320  if (!tok.skip(doubleSlash))
321  return false;
322 
323  auto B = tok.remaining();
324  const char *url = B.c_str();
325 
326  /* Parse the URL: */
327  src = url;
328  i = 0;
329 
330  /* Then everything until first /; that's host (and port; which we'll look for here later) */
331  // bug 1881: If we don't get a "/" then we imply it was there
332  // bug 3074: We could just be given a "?" or "#". These also imply "/"
333  // bug 3233: whitespace is also a hostname delimiter.
334  for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
335  *dst = *src;
336  }
337 
338  /*
339  * We can't check for "i >= l" here because we could be at the end of the line
340  * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
341  * been -given- a valid URL and the path is just '/'.
342  */
343  if (i > l)
344  return false;
345  *dst = '\0';
346 
347  // We are looking at path-abempty.
348  if (*src != '/') {
349  // path-empty, including the end of the `src` c-string cases
350  urlpath[0] = '/';
351  dst = &urlpath[1];
352  } else {
353  dst = urlpath;
354  }
355  /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
356  for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
357  *dst = *src;
358  }
359 
360  /* We -could- be at the end of the buffer here */
361  if (i > l)
362  return false;
363  *dst = '\0';
364 
365  foundPort = scheme.defaultPort(); // may be reset later
366 
367  /* Is there any login information? (we should eventually parse it above) */
368  t = strrchr(foundHost, '@');
369  if (t != NULL) {
370  strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
371  login[sizeof(login)-1] = '\0';
372  t = strrchr(login, '@');
373  *t = 0;
374  strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
375  foundHost[sizeof(foundHost)-1] = '\0';
376  // Bug 4498: URL-unescape the login info after extraction
377  rfc1738_unescape(login);
378  }
379 
380  /* Is there any host information? (we should eventually parse it above) */
381  if (*foundHost == '[') {
382  /* strip any IPA brackets. valid under IPv6. */
383  dst = foundHost;
384  /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
385  src = foundHost;
386  ++src;
387  l = strlen(foundHost);
388  i = 1;
389  for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
390  *dst = *src;
391  }
392 
393  /* we moved in-place, so truncate the actual hostname found */
394  *dst = '\0';
395  ++dst;
396 
397  /* skip ahead to either start of port, or original EOS */
398  while (*dst != '\0' && *dst != ':')
399  ++dst;
400  t = dst;
401  } else {
402  t = strrchr(foundHost, ':');
403 
404  if (t != strchr(foundHost,':') ) {
405  /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
406  /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
407  /* therefore we MUST accept the case where they are not bracketed at all. */
408  t = NULL;
409  }
410  }
411 
412  // Bug 3183 sanity check: If scheme is present, host must be too.
413  if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
414  debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
415  return false;
416  }
417 
418  if (t && *t == ':') {
419  *t = '\0';
420  ++t;
421  foundPort = atoi(t);
422  }
423  }
424 
425  for (t = foundHost; *t; ++t)
426  *t = xtolower(*t);
427 
428  if (stringHasWhitespace(foundHost)) {
430  t = q = foundHost;
431  while (*t) {
432  if (!xisspace(*t)) {
433  *q = *t;
434  ++q;
435  }
436  ++t;
437  }
438  *q = '\0';
439  }
440  }
441 
442  debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
443 
445  strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
446  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
447  return false;
448  }
449 
450  if (!urlAppendDomain(foundHost))
451  return false;
452 
453  /* remove trailing dots from hostnames */
454  while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
455  foundHost[l] = '\0';
456 
457  /* reject duplicate or leading dots */
458  if (strstr(foundHost, "..") || *foundHost == '.') {
459  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
460  return false;
461  }
462 
463  if (foundPort < 1 || foundPort > 65535) {
464  debugs(23, 3, "Invalid port '" << foundPort << "'");
465  return false;
466  }
467 
468  if (stringHasWhitespace(urlpath)) {
469  debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
470 
471  switch (Config.uri_whitespace) {
472 
473  case URI_WHITESPACE_DENY:
474  return false;
475 
477  break;
478 
480  t = rfc1738_escape_unescaped(urlpath);
481  xstrncpy(urlpath, t, MAX_URL);
482  break;
483 
484  case URI_WHITESPACE_CHOP:
485  *(urlpath + strcspn(urlpath, w_space)) = '\0';
486  break;
487 
489  default:
490  t = q = urlpath;
491  while (*t) {
492  if (!xisspace(*t)) {
493  *q = *t;
494  ++q;
495  }
496  ++t;
497  }
498  *q = '\0';
499  }
500  }
501 
502  setScheme(scheme);
503  path(urlpath);
504  host(foundHost);
505  userInfo(SBuf(login));
506  port(foundPort);
507  return true;
508 
509  } catch (...) {
510  debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
511  return false;
512  }
513 }
514 
529 void
531 {
532  static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
533  static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
534  SBuf nid;
535  if (!tok.prefix(nid, nidChars, 32))
536  throw TextException("NID not found", Here());
537 
538  if (!tok.skip(':'))
539  throw TextException("NID too long or missing ':' delimiter", Here());
540 
541  if (nid.length() < 2)
542  throw TextException("NID too short", Here());
543 
544  if (!alphanum[*nid.begin()])
545  throw TextException("NID prefix is not alphanumeric", Here());
546 
547  if (!alphanum[*nid.rbegin()])
548  throw TextException("NID suffix is not alphanumeric", Here());
549 
550  setScheme(AnyP::PROTO_URN, nullptr);
551  host(nid.c_str());
552  // TODO validate path characters
553  path(tok.remaining());
554  debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
555 }
556 
557 void
559 {
560  absolute_.clear();
561  authorityHttp_.clear();
562  authorityWithPort_.clear();
563 }
564 
565 SBuf &
566 AnyP::Uri::authority(bool requirePort) const
567 {
568  if (authorityHttp_.isEmpty()) {
569 
570  // both formats contain Host/IP
571  authorityWithPort_.append(host());
572  authorityHttp_ = authorityWithPort_;
573 
574  // authorityForm_ only has :port if it is non-default
575  authorityWithPort_.appendf(":%u",port());
576  if (port() != getScheme().defaultPort())
577  authorityHttp_ = authorityWithPort_;
578  }
579 
580  return requirePort ? authorityWithPort_ : authorityHttp_;
581 }
582 
583 SBuf &
585 {
586  if (absolute_.isEmpty()) {
587  // TODO: most URL will be much shorter, avoid allocating this much
588  absolute_.reserveCapacity(MAX_URL);
589 
590  absolute_.append(getScheme().image());
591  absolute_.append(":",1);
592  if (getScheme() != AnyP::PROTO_URN) {
593  absolute_.append("//", 2);
594  const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
595  getScheme() == AnyP::PROTO_UNKNOWN;
596 
597  if (allowUserInfo && !userInfo().isEmpty()) {
598  static const CharacterSet uiChars = CharacterSet(UserInfoChars())
599  .remove('%')
600  .rename("userinfo-reserved");
601  absolute_.append(Encode(userInfo(), uiChars));
602  absolute_.append("@", 1);
603  }
604  absolute_.append(authority());
605  } else {
606  absolute_.append(host());
607  absolute_.append(":", 1);
608  }
609  absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
610  }
611 
612  return absolute_;
613 }
614 
615 /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
616  * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
617  * and never copy the query-string part in the first place
618  */
619 char *
621 {
622  LOCAL_ARRAY(char, buf, MAX_URL);
623 
624  snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
625  buf[sizeof(buf)-1] = '\0';
626 
627  // URN, CONNECT method, and non-stripped URIs can go straight out
628  if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
629  // strip anything AFTER a question-mark
630  // leaving the '?' in place
631  if (auto t = strchr(buf, '?')) {
632  *(++t) = '\0';
633  }
634  }
635 
636  if (stringHasCntl(buf))
638 
639  return buf;
640 }
641 
648 const char *
650 {
651  LOCAL_ARRAY(char, buf, MAX_URL);
652 
653  // method CONNECT and port HTTPS
654  if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
655  snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
656  return buf;
657  }
658 
659  // else do the normal complete canonical thing.
660  return request->canonicalCleanUrl();
661 }
662 
/// Tells whether the given string is a relative reference rather than a URI
/// that starts with a "scheme:" prefix.
/// \param url  NUL-terminated string to inspect; may be nullptr
/// \retval false  url is nullptr, or a colon appears before the first
///                '/', '?', or '#'
/// \retval true   everything else, including the empty string
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 section 5.2.3
     *
     * path = path-abempty  ; begins with "/" or is empty
     *      / path-absolute ; begins with "/" but not "//"
     *      / path-noscheme ; begins with a non-colon segment
     *      / path-rootless ; begins with a segment
     *      / path-empty    ; zero characters
     */

    // Find what terminates the first segment: a colon, one of the '/', '?',
    // or '#' delimiters, or the end of the string. This single scan covers:
    // - path-empty: stops at once on the terminating NUL;
    // - network-path ("//...") and path-absolute ("/..."): stop at once on '/';
    // - path-noscheme and path-rootless: stop after the first segment.
    const char *segmentEnd = url + strcspn(url, ":/?#");

    // colon is forbidden in the first segment of a relative reference
    return *segmentEnd != ':';
}
710 
711 void
712 AnyP::Uri::addRelativePath(const char *relUrl)
713 {
714  // URN cannot be merged
715  if (getScheme() == AnyP::PROTO_URN)
716  return;
717 
718  // TODO: Handle . and .. segment normalization
719 
720  const auto lastSlashPos = path_.rfind('/');
721  // TODO: To optimize and simplify, add and use SBuf::replace().
722  const auto relUrlLength = strlen(relUrl);
723  if (lastSlashPos == SBuf::npos) {
724  // start replacing the whole path
725  path_.reserveCapacity(1 + relUrlLength);
726  path_.assign("/", 1);
727  } else {
728  // start replacing just the last segment
729  path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
730  path_.chop(0, lastSlashPos+1);
731  }
732  path_.append(relUrl, relUrlLength);
733 }
734 
735 int
736 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
737 {
738  int dl;
739  int hl;
740 
741  const bool hostIncludesSubdomains = (*h == '.');
742  while ('.' == *h)
743  ++h;
744 
745  hl = strlen(h);
746 
747  if (hl == 0)
748  return -1;
749 
750  dl = strlen(d);
751 
752  /*
753  * Start at the ends of the two strings and work towards the
754  * beginning.
755  */
756  while (xtolower(h[--hl]) == xtolower(d[--dl])) {
757  if (hl == 0 && dl == 0) {
758  /*
759  * We made it all the way to the beginning of both
760  * strings without finding any difference.
761  */
762  return 0;
763  }
764 
765  if (0 == hl) {
766  /*
767  * The host string is shorter than the domain string.
768  * There is only one case when this can be a match.
769  * If the domain is just one character longer, and if
770  * that character is a leading '.' then we call it a
771  * match.
772  */
773 
774  if (1 == dl && '.' == d[0])
775  return 0;
776  else
777  return -1;
778  }
779 
780  if (0 == dl) {
781  /*
782  * The domain string is shorter than the host string.
783  * This is a match only if the first domain character
784  * is a leading '.'.
785  */
786 
787  if ('.' == d[0]) {
788  if (flags & mdnRejectSubsubDomains) {
789  // Check for sub-sub domain and reject
790  while(--hl >= 0 && h[hl] != '.');
791  if (hl < 0) {
792  // No sub-sub domain found, but reject if there is a
793  // leading dot in given host string (which is removed
794  // before the check is started).
795  return hostIncludesSubdomains ? 1 : 0;
796  } else
797  return 1; // sub-sub domain, reject
798  } else
799  return 0;
800  } else
801  return 1;
802  }
803  }
804 
805  /*
806  * We found different characters in the same position (from the end).
807  */
808 
809  // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
810  // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
811  // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
812  if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
813  return 0;
814 
815  /*
816  * If one of those character is '.' then its special. In order
817  * for splay tree sorting to work properly, "x-foo.com" must
818  * be greater than ".foo.com" even though '-' is less than '.'.
819  */
820  if ('.' == d[dl])
821  return 1;
822 
823  if ('.' == h[hl])
824  return -1;
825 
826  return (xtolower(h[hl]) - xtolower(d[dl]));
827 }
828 
829 /*
830  * return true if we can serve requests for this method.
831  */
832 bool
834 {
835  /* protocol "independent" methods
836  *
837  * actually these methods are specific to HTTP:
838  * they are methods we receive on our HTTP port,
839  * and if we had a FTP listener would not be relevant
840  * there.
841  *
842  * So, we should delegate them to HTTP. The problem is that we
843  * do not have a default protocol from the client side of HTTP.
844  */
845 
846  if (r->method == Http::METHOD_CONNECT)
847  return true;
848 
849  // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
850  // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
853 
854  if (r->method == Http::METHOD_PURGE)
855  return true;
856 
857  /* does method match the protocol? */
858  switch (r->url.getScheme()) {
859 
860  case AnyP::PROTO_URN:
861  case AnyP::PROTO_HTTP:
863  return true;
864 
865  case AnyP::PROTO_FTP:
866  if (r->method == Http::METHOD_PUT ||
867  r->method == Http::METHOD_GET ||
868  r->method == Http::METHOD_HEAD )
869  return true;
870  return false;
871 
872  case AnyP::PROTO_GOPHER:
873  case AnyP::PROTO_WAIS:
874  case AnyP::PROTO_WHOIS:
875  if (r->method == Http::METHOD_GET ||
877  return true;
878  return false;
879 
880  case AnyP::PROTO_HTTPS:
881 #if USE_OPENSSL || USE_GNUTLS
882  return true;
883 #else
884  /*
885  * Squid can't originate an SSL connection, so it should
886  * never receive an "https:" URL. It should always be
887  * CONNECT instead.
888  */
889  return false;
890 #endif
891 
892  default:
893  return false;
894  }
895 
896  /* notreached */
897  return false;
898 }
899 
901  scheme_(aScheme),
902  hostIsNumeric_(false),
903  port_(0)
904 {
905  *host_=0;
906 }
907 
908 // TODO: fix code duplication with AnyP::Uri::parse()
909 char *
910 AnyP::Uri::cleanup(const char *uri)
911 {
912  char *cleanedUri = nullptr;
913  switch (Config.uri_whitespace) {
914  case URI_WHITESPACE_ALLOW: {
916  cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
917  break;
918  }
919 
922  break;
923 
924  case URI_WHITESPACE_CHOP: {
925  const auto pos = strcspn(uri, w_space);
926  char *choppedUri = nullptr;
927  if (pos < strlen(uri))
928  choppedUri = xstrndup(uri, pos + 1);
929  cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
931  cleanedUri[pos] = '\0';
932  xfree(choppedUri);
933  break;
934  }
935 
936  case URI_WHITESPACE_DENY:
938  default: {
939  // TODO: avoid duplication with urlParse()
940  const char *t;
941  char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
942  char *q = tmp_uri;
943  t = uri;
944  while (*t) {
945  if (!xisspace(*t)) {
946  *q = *t;
947  ++q;
948  }
949  ++t;
950  }
951  *q = '\0';
952  cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
953  xfree(tmp_uri);
954  break;
955  }
956  }
957 
958  assert(cleanedUri);
959  return cleanedUri;
960 }
961 
static char * cleanup(const char *uri)
Definition: Uri.cc:910
#define URI_WHITESPACE_ENCODE
Definition: defines.h:135
char method[16]
Definition: tcp-banger2.c:115
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:81
#define Here()
source code location of the caller
Definition: Here.h:15
static const char valid_hostname_chars_u[]
Definition: Uri.cc:21
@ METHOD_HEAD
Definition: MethodType.h:28
AnyP::Uri url
the request URI
Definition: HttpRequest.h:115
#define xmalloc
int stringHasCntl(const char *)
Definition: String.cc:373
#define URI_WHITESPACE_STRIP
Definition: defines.h:133
const_reverse_iterator rbegin() const
Definition: SBuf.h:591
#define URI_WHITESPACE_CHOP
Definition: defines.h:136
static AnyP::ProtocolType FindProtocolType(const SBuf &)
Definition: UriScheme.cc:52
char host_[SQUIDHOSTNAMELEN]
string representation of the URI authority name or IP
Definition: Uri.h:174
#define LOCAL_ARRAY(type, name, size)
Definition: squid.h:75
HttpHeader header
Definition: Message.h:75
int check_hostnames
Definition: SquidConfig.h:322
@ mdnHonorWildcards
Definition: Uri.h:225
bool urlCheckRequest(const HttpRequest *r)
Definition: Uri.cc:833
bool isEmpty() const
Definition: SBuf.h:431
void reserveSpace(size_type minSpace)
Definition: SBuf.h:440
@ PROTO_NONE
Definition: ProtocolType.h:24
SBuf hostOrIp() const
Definition: Uri.cc:114
bool parse(const HttpRequestMethod &, const SBuf &url)
Definition: Uri.cc:254
const char * ProtocolType_str[]
Definition: SBuf.h:94
bool atEnd() const
whether the end of the buffer has been reached
Definition: Tokenizer.h:41
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:179
char * xstrncpy(char *dst, const char *src, size_t n)
Definition: xstring.cc:37
int matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
Definition: Uri.cc:736
#define xtolower(x)
Definition: xis.h:19
#define w_space
static int port
Definition: ldap_backend.cc:70
@ PROTO_UNKNOWN
Definition: ProtocolType.h:43
static const CharacterSet ALPHA
Definition: CharacterSet.h:76
AnyP::UriScheme const & getScheme() const
Definition: Uri.h:67
@ METHOD_OPTIONS
Definition: MethodType.h:31
#define MAX_IPSTRLEN
Length of buffer that needs to be allocated to hold a null-terminated IP-string.
Definition: forward.h:25
@ PROTO_URN
Definition: ProtocolType.h:37
Definition: Raw.h:21
#define MAX_URL
Definition: defines.h:78
const SBuf & remaining() const
the remaining unprocessed section of buffer
Definition: Tokenizer.h:44
static const CharacterSet & UserInfoChars()
Characters which are valid within a URI userinfo section.
Definition: Uri.cc:36
#define URI_WHITESPACE_DENY
Definition: defines.h:137
int strip_query_terms
Definition: SquidConfig.h:306
#define NULL
Definition: types.h:166
const char * rawContent() const
Definition: SBuf.cc:509
@ PROTO_MAX
Definition: ProtocolType.h:44
void rfc1738_unescape(char *url)
Definition: rfc1738.c:146
#define SQUIDSBUFPRINT(s)
Definition: SBuf.h:32
char at(size_type pos) const
Definition: SBuf.h:249
MatchDomainNameFlags
Definition: Uri.h:223
@ METHOD_CONNECT
Definition: MethodType.h:29
const_iterator begin() const
Definition: SBuf.h:583
int64_t getInt64(Http::HdrType id) const
Definition: HttpHeader.cc:1162
void parseUrn(Parser::Tokenizer &)
Definition: Uri.cc:530
#define assert(EX)
Definition: assert.h:19
SBuf image() const
Definition: UriScheme.h:50
@ METHOD_PUT
Definition: MethodType.h:27
std::ostream & CurrentException(std::ostream &os)
prints active (i.e., thrown but not yet handled) exception
@ METHOD_TRACE
Definition: MethodType.h:30
@ PROTO_GOPHER
Definition: ProtocolType.h:30
static const CharacterSet DIGIT
Definition: CharacterSet.h:84
const char * c_str()
Definition: SBuf.cc:516
SBuf & authority(bool requirePort=false) const
Definition: Uri.cc:566
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:415
char * xstrndup(const char *s, size_t n)
Definition: xstring.cc:56
SBuf & append(const SBuf &S)
Definition: SBuf.cc:185
#define xfree
void addRelativePath(const char *relUrl)
Definition: Uri.cc:712
CharacterSet & rename(const char *label)
change name; handy in const declarations that use operators
Definition: CharacterSet.h:61
int allow_underscore
Definition: SquidConfig.h:323
static const size_type npos
Definition: SBuf.h:99
static const char valid_hostname_chars[]
Definition: Uri.cc:27
@ METHOD_PURGE
Definition: MethodType.h:92
const char * urlCanonicalFakeHttps(const HttpRequest *request)
Definition: Uri.cc:649
CharacterSet & remove(const unsigned char c)
remove a given character from the character set
Definition: CharacterSet.cc:54
@ PROTO_CACHE_OBJECT
Definition: ProtocolType.h:32
bool urlAppendDomain(char *host)
apply append_domain config to the given hostname
Definition: Uri.cc:225
#define rfc1738_escape_unescaped(x)
Definition: rfc1738.h:59
static const SBuf & SlashPath()
the static '/' default URL-path
Definition: Uri.cc:92
void urlInitialize(void)
Definition: Uri.cc:137
SBuf & absolute() const
Definition: Uri.cc:584
@ PROTO_WHOIS
Definition: ProtocolType.h:38
@ PROTO_HTTPS
Definition: ProtocolType.h:27
HttpRequestMethod method
Definition: HttpRequest.h:114
void path(const char *p)
Definition: Uri.h:99
@ PROTO_FTP
Definition: ProtocolType.h:26
@ PROTO_HTTP
Definition: ProtocolType.h:25
void reserveCapacity(size_type minCapacity)
Definition: SBuf.cc:105
Definition: parse.c:160
struct SquidConfig::@110 onoff
@ mdnRejectSubsubDomains
Definition: Uri.h:226
unsigned short defaultPort() const
Definition: UriScheme.cc:71
an std::runtime_error with thrower location info
Definition: TextException.h:21
@ PROTO_WAIS
Definition: ProtocolType.h:31
char * url
Definition: tcp-banger2.c:114
Uri()
Definition: Uri.h:35
#define DBG_IMPORTANT
Definition: Stream.h:41
#define RFC1738_ESCAPE_UNESCAPED
Definition: rfc1738.h:25
#define MYNAME
Definition: Stream.h:238
bool urlIsRelative(const char *url)
Definition: Uri.cc:676
char * rfc1738_do_escape(const char *url, int flags)
Definition: rfc1738.c:56
SBuf & appendf(const char *fmt,...)
Definition: SBuf.cc:229
void touch()
clear the cached URI display forms
Definition: Uri.cc:558
char * urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
Definition: Uri.cc:620
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:18
#define xisspace(x)
Definition: xis.h:17
#define RFC1738_ESCAPE_NOSPACE
Definition: rfc1738.h:22
char * appendDomain
Definition: SquidConfig.h:220
static uint32 B
Definition: md4.c:43
size_t appendDomainLen
Definition: SquidConfig.h:221
const SBuf & path() const
Definition: Uri.cc:125
@ METHOD_GET
Definition: MethodType.h:25
int stringHasWhitespace(const char *)
Definition: String.cc:366
const char * host(void) const
Definition: Uri.h:85
#define false
Definition: GnuRegex.c:233
static const SBuf & Asterisk()
the static '*' pseudo-URI
Definition: Uri.cc:85
int uri_whitespace
Definition: SquidConfig.h:463
struct _request * request(char *urlin)
Definition: tcp-banger2.c:291
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Stream.h:196
static SBuf Encode(const SBuf &, const CharacterSet &expected)
Definition: Uri.cc:56
#define SQUIDHOSTNAMELEN
Definition: rfc2181.h:30
#define SQUIDSBUFPH
Definition: SBuf.h:31
class SquidConfig Config
Definition: SquidConfig.cc:12
static AnyP::UriScheme uriParseScheme(Parser::Tokenizer &tok)
Definition: Uri.cc:188
#define URI_WHITESPACE_ALLOW
Definition: defines.h:134

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors