Uri.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 1996-2020 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 /* DEBUG: section 23 URL Parsing */
10 
11 #include "squid.h"
12 #include "anyp/Uri.h"
13 #include "globals.h"
14 #include "HttpRequest.h"
15 #include "parser/Tokenizer.h"
16 #include "rfc1738.h"
17 #include "SquidConfig.h"
18 #include "SquidString.h"
19 
// Characters accepted in a hostname when underscores are tolerated; the
// hostname sanity check selects this table when Config.onoff.allow_underscore
// is set.
20 static const char valid_hostname_chars_u[] =
21  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
22  "abcdefghijklmnopqrstuvwxyz"
23  "0123456789-._"
24  "[:]"
25  ;
// Default hostname character table: identical to the one above except that
// "_" is not accepted.
26 static const char valid_hostname_chars[] =
27  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
28  "abcdefghijklmnopqrstuvwxyz"
29  "0123456789-."
30  "[:]"
31  ;
32 
34 static const CharacterSet &
36 {
37  /*
38  * RFC 3986 section 3.2.1
39  *
40  * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
41  * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
42  * pct-encoded = "%" HEXDIG HEXDIG
43  * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
44  */
45  static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
48  return userInfoValid;
49 }
50 
54 SBuf
55 AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
56 {
57  if (buf.isEmpty())
58  return buf;
59 
61  SBuf goodSection;
62  // optimization for the arguably common "no encoding necessary" case
63  if (tk.prefix(goodSection, ignore) && tk.atEnd())
64  return buf;
65 
66  SBuf output;
67  output.reserveSpace(buf.length() * 3); // worst case: encode all chars
68  output.append(goodSection); // may be empty
69 
70  while (!tk.atEnd()) {
71  // TODO: Add Tokenizer::parseOne(void).
72  const auto ch = tk.remaining()[0];
73  output.appendf("%%%02X", static_cast<unsigned int>(ch)); // TODO: Optimize using a table
74  (void)tk.skip(ch);
75 
76  if (tk.prefix(goodSection, ignore))
77  output.append(goodSection);
78  }
79 
80  return output;
81 }
82 
83 const SBuf &
85 {
86  static SBuf star("*");
87  return star;
88 }
89 
90 const SBuf &
92 {
93  static SBuf slash("/");
94  return slash;
95 }
96 
97 void
98 AnyP::Uri::host(const char *src)
99 {
100  hostAddr_.setEmpty();
101  hostAddr_ = src;
102  if (hostAddr_.isAnyAddr()) {
103  xstrncpy(host_, src, sizeof(host_));
104  hostIsNumeric_ = false;
105  } else {
106  hostAddr_.toHostStr(host_, sizeof(host_));
107  debugs(23, 3, "given IP: " << hostAddr_);
108  hostIsNumeric_ = 1;
109  }
110  touch();
111 }
112 
113 SBuf
115 {
116  static char ip[MAX_IPSTRLEN];
117  if (hostIsNumeric())
118  return SBuf(hostIP().toStr(ip, sizeof(ip)));
119  else
120  return SBuf(host());
121 }
122 
123 const SBuf &
125 {
126  // RFC 3986 section 3.3 says path can be empty (path-abempty).
127  // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
128  // at least when sending and using. We must still accept path-abempty as input.
129  if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
130  return SlashPath();
131 
132  return path_;
133 }
134 
135 void
137 {
138  debugs(23, 5, "urlInitialize: Initializing...");
139  /* this ensures that the number of protocol strings is the same as
140  * the enum slots allocated because the last enum is always 'MAX'.
141  */
142  assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
143  /*
144  * These test that our matchDomainName() function works the
145  * way we expect it to.
146  */
147  assert(0 == matchDomainName("foo.com", "foo.com"));
148  assert(0 == matchDomainName(".foo.com", "foo.com"));
149  assert(0 == matchDomainName("foo.com", ".foo.com"));
150  assert(0 == matchDomainName(".foo.com", ".foo.com"));
151  assert(0 == matchDomainName("x.foo.com", ".foo.com"));
152  assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
153  assert(0 != matchDomainName("x.foo.com", "foo.com"));
154  assert(0 != matchDomainName("foo.com", "x.foo.com"));
155  assert(0 != matchDomainName("bar.com", "foo.com"));
156  assert(0 != matchDomainName(".bar.com", "foo.com"));
157  assert(0 != matchDomainName(".bar.com", ".foo.com"));
158  assert(0 != matchDomainName("bar.com", ".foo.com"));
159  assert(0 < matchDomainName("zzz.com", "foo.com"));
160  assert(0 > matchDomainName("aaa.com", "foo.com"));
161  assert(0 == matchDomainName("FOO.com", "foo.COM"));
162  assert(0 < matchDomainName("bfoo.com", "afoo.com"));
163  assert(0 > matchDomainName("afoo.com", "bfoo.com"));
164  assert(0 < matchDomainName("x-foo.com", ".foo.com"));
165 
166  assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
167  assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
168  assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
169  assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
170 
171  assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
172  assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
173  assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
174  assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
175 
176  /* more cases? */
177 }
178 
// Parses the scheme at the start of a URI held by the tokenizer `tok`.
// Accepts at most 16 scheme characters followed by ':' and requires the first
// character to be ALPHA. A scheme that maps to PROTO_UNKNOWN keeps its raw
// text in the returned UriScheme; known protocols are returned without it.
// Throws TextException when no valid scheme is found.
// NOTE(review): this listing omits the declarator line (between the return
// type and "{") and the remainder of the schemeChars initializer after
// "#endif"; neither is reconstructed here.
186 static AnyP::UriScheme
188 {
189  /*
190  * RFC 3986 section 3.1 paragraph 2:
191  *
192  * Scheme names consist of a sequence of characters beginning with a
193  * letter and followed by any combination of letters, digits, plus
194  * ("+"), period ("."), or hyphen ("-").
195  *
196  * The underscore ("_") required to match "cache_object://" squid
197  * special URI scheme.
198  */
199  static const auto schemeChars =
200 #if USE_HTTP_VIOLATIONS
201  CharacterSet("special", "_") +
202 #endif
204 
205  SBuf str;
206  if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
207  const auto protocol = AnyP::UriScheme::FindProtocolType(str);
208  if (protocol == AnyP::PROTO_UNKNOWN)
209  return AnyP::UriScheme(protocol, str.c_str());
210  return AnyP::UriScheme(protocol, nullptr);
211  }
212 
213  throw TextException("invalid URI scheme", Here());
214 }
215 
223 bool
224 urlAppendDomain(char *host)
225 {
226  /* For IPv4 addresses check for a dot */
227  /* For IPv6 addresses also check for a colon */
228  if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
229  const uint64_t dlen = strlen(host);
230  const uint64_t want = dlen + Config.appendDomainLen;
231  if (want > SQUIDHOSTNAMELEN - 1) {
232  debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
233  return false;
234  }
235  strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
236  }
237  return true;
238 }
239 
240 /*
241  * Parse a URI/URL.
242  *
243  * It is assumed that the URL is complete -
244  * ie, the end of the string is the end of the URL. Don't pass a partial
245  * URL here as this routine doesn't have any way of knowing whether
246  * it is partial or not (ie, it handles the case of no trailing slash as
247  * being "end of host with implied path of /".
248  *
249  * method is used to switch parsers. If method is Http::METHOD_CONNECT,
250  * then rather than a URL a hostname:port is looked for.
251  */
// On success the scheme, path, host, userinfo and port found in rawUrl are
// stored on this object and true is returned. Every rejection path, and any
// exception thrown while parsing, results in false.
// NOTE(review): this listing omits the declarator line and the lines numbered
// 274, 428, 443, 484, 487 and 496, so one `if` condition, one nested `if`,
// and three `case` labels below appear without their opening text.
252 bool
254 {
255  try {
256 
257  LOCAL_ARRAY(char, login, MAX_URL);
258  LOCAL_ARRAY(char, foundHost, MAX_URL);
259  LOCAL_ARRAY(char, urlpath, MAX_URL);
260  char *t = NULL;
261  char *q = NULL;
262  int foundPort;
263  int l;
264  int i;
265  const char *src;
266  char *dst;
267  foundHost[0] = urlpath[0] = login[0] = '\0';
268 
// reserve room for a possible append_domain suffix as well
269  if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
270  debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
271  return false;
272  }
273 
// NOTE(review): the first half of this condition (listing line 274) is
// missing from this view; what remains matches the "*" pseudo-URI.
275  Asterisk().cmp(rawUrl) == 0) {
276  // XXX: these methods might also occur in HTTPS traffic. Handle this better.
277  setScheme(AnyP::PROTO_HTTP, nullptr);
278  port(getScheme().defaultPort());
279  path(Asterisk());
280  return true;
281  }
282 
283  Parser::Tokenizer tok(rawUrl);
284  AnyP::UriScheme scheme;
285 
286  if (method == Http::METHOD_CONNECT) {
287  /*
288  * RFC 7230 section 5.3.3: authority-form = authority
289  * "excluding any userinfo and its "@" delimiter"
290  *
291  * RFC 3986 section 3.2: authority = [ userinfo "@" ] host [ ":" port ]
292  *
293  * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
294  */
295  foundPort = 443;
296 
297  // XXX: use tokenizer
298  auto B = tok.buf();
299  const char *url = B.c_str();
300 
// try the bracketed "[host]:port" form first, then plain "host:port"
301  if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
302  if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
303  return false;
304 
305  } else {
306 
307  scheme = uriParseScheme(tok);
308 
309  if (scheme == AnyP::PROTO_NONE)
310  return false; // invalid scheme
311 
312  if (scheme == AnyP::PROTO_URN) {
313  parseUrn(tok); // throws on any error
314  return true;
315  }
316 
317  // URLs then have "//"
318  static const SBuf doubleSlash("//");
319  if (!tok.skip(doubleSlash))
320  return false;
321 
322  auto B = tok.remaining();
323  const char *url = B.c_str();
324 
325  /* Parse the URL: */
326  src = url;
327  i = 0;
328 
329  /* Then everything until first /; that's host (and port; which we'll look for here later) */
330  // bug 1881: If we don't get a "/" then we imply it was there
331  // bug 3074: We could just be given a "?" or "#". These also imply "/"
332  // bug 3233: whitespace is also a hostname delimiter.
333  for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
334  *dst = *src;
335  }
336 
337  /*
338  * We can't check for "i >= l" here because we could be at the end of the line
339  * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
340  * been -given- a valid URL and the path is just '/'.
341  */
342  if (i > l)
343  return false;
344  *dst = '\0';
345 
346  // We are looking at path-abempty.
347  if (*src != '/') {
348  // path-empty, including the end of the `src` c-string cases
349  urlpath[0] = '/';
350  dst = &urlpath[1];
351  } else {
352  dst = urlpath;
353  }
354  /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
355  for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
356  *dst = *src;
357  }
358 
359  /* We -could- be at the end of the buffer here */
360  if (i > l)
361  return false;
362  *dst = '\0';
363 
364  foundPort = scheme.defaultPort(); // may be reset later
365 
366  /* Is there any login information? (we should eventually parse it above) */
367  t = strrchr(foundHost, '@');
368  if (t != NULL) {
369  strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
370  login[sizeof(login)-1] = '\0';
// split at the last '@': login keeps the left part, foundHost the right part
371  t = strrchr(login, '@');
372  *t = 0;
373  strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
374  foundHost[sizeof(foundHost)-1] = '\0';
375  // Bug 4498: URL-unescape the login info after extraction
376  rfc1738_unescape(login);
377  }
378 
379  /* Is there any host information? (we should eventually parse it above) */
380  if (*foundHost == '[') {
381  /* strip any IPA brackets. valid under IPv6. */
382  dst = foundHost;
383  /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
384  src = foundHost;
385  ++src;
386  l = strlen(foundHost);
387  i = 1;
388  for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
389  *dst = *src;
390  }
391 
392  /* we moved in-place, so truncate the actual hostname found */
393  *dst = '\0';
394  ++dst;
395 
396  /* skip ahead to either start of port, or original EOS */
397  while (*dst != '\0' && *dst != ':')
398  ++dst;
399  t = dst;
400  } else {
401  t = strrchr(foundHost, ':');
402 
// more than one ':' without brackets: treat the whole text as the host
403  if (t != strchr(foundHost,':') ) {
404  /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
405  /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
406  /* therefore we MUST accept the case where they are not bracketed at all. */
407  t = NULL;
408  }
409  }
410 
411  // Bug 3183 sanity check: If scheme is present, host must be too.
412  if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
413  debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
414  return false;
415  }
416 
417  if (t && *t == ':') {
418  *t = '\0';
419  ++t;
420  foundPort = atoi(t);
421  }
422  }
423 
424  for (t = foundHost; *t; ++t)
425  *t = xtolower(*t);
426 
427  if (stringHasWhitespace(foundHost)) {
// NOTE(review): listing line 428 (the opener matching the extra closing
// brace below) is missing from this view. The loop removes every whitespace
// character from foundHost in place.
429  t = q = foundHost;
430  while (*t) {
431  if (!xisspace(*t)) {
432  *q = *t;
433  ++q;
434  }
435  ++t;
436  }
437  *q = '\0';
438  }
439  }
440 
441  debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
442 
// NOTE(review): the start of this condition (listing line 443) is missing
// from this view. The visible part rejects hosts containing any character
// outside the selected valid_hostname_chars table.
444  strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
445  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
446  return false;
447  }
448 
449  if (!urlAppendDomain(foundHost))
450  return false;
451 
452  /* remove trailing dots from hostnames */
453  while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
454  foundHost[l] = '\0';
455 
456  /* reject duplicate or leading dots */
457  if (strstr(foundHost, "..") || *foundHost == '.') {
458  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
459  return false;
460  }
461 
462  if (foundPort < 1 || foundPort > 65535) {
463  debugs(23, 3, "Invalid port '" << foundPort << "'");
464  return false;
465  }
466 
467 #if HARDCODE_DENY_PORTS
468  /* These ports are filtered in the default squid.conf, but
469  * maybe someone wants them hardcoded... */
470  if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
471  debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
472  return false;
473  }
474 #endif
475 
476  if (stringHasWhitespace(urlpath)) {
477  debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
478 
// NOTE(review): three case labels (listing lines 484, 487 and 496) are
// missing from this view, so the statements after "return false", after the
// first "break" and before "default" appear unlabeled.
479  switch (Config.uri_whitespace) {
480 
481  case URI_WHITESPACE_DENY:
482  return false;
483 
485  break;
486 
488  t = rfc1738_escape_unescaped(urlpath);
489  xstrncpy(urlpath, t, MAX_URL);
490  break;
491 
492  case URI_WHITESPACE_CHOP:
// truncate the path at its first whitespace character
493  *(urlpath + strcspn(urlpath, w_space)) = '\0';
494  break;
495 
497  default:
// drop every whitespace character from the path in place
498  t = q = urlpath;
499  while (*t) {
500  if (!xisspace(*t)) {
501  *q = *t;
502  ++q;
503  }
504  ++t;
505  }
506  *q = '\0';
507  }
508  }
509 
510  setScheme(scheme);
511  path(urlpath);
512  host(foundHost);
513  userInfo(SBuf(login));
514  port(foundPort);
515  return true;
516 
517  } catch (...) {
518  debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
519  return false;
520  }
521 }
522 
537 void
539 {
540  static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
541  static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
542  SBuf nid;
543  if (!tok.prefix(nid, nidChars, 32))
544  throw TextException("NID not found", Here());
545 
546  if (!tok.skip(':'))
547  throw TextException("NID too long or missing ':' delimiter", Here());
548 
549  if (nid.length() < 2)
550  throw TextException("NID too short", Here());
551 
552  if (!alphanum[*nid.begin()])
553  throw TextException("NID prefix is not alphanumeric", Here());
554 
555  if (!alphanum[*nid.rbegin()])
556  throw TextException("NID suffix is not alphanumeric", Here());
557 
558  setScheme(AnyP::PROTO_URN, nullptr);
559  host(nid.c_str());
560  // TODO validate path characters
561  path(tok.remaining());
562  debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
563 }
564 
565 void
567 {
568  absolute_.clear();
569  authorityHttp_.clear();
570  authorityWithPort_.clear();
571 }
572 
573 SBuf &
574 AnyP::Uri::authority(bool requirePort) const
575 {
576  if (authorityHttp_.isEmpty()) {
577 
578  // both formats contain Host/IP
579  authorityWithPort_.append(host());
580  authorityHttp_ = authorityWithPort_;
581 
582  // authorityForm_ only has :port if it is non-default
583  authorityWithPort_.appendf(":%u",port());
584  if (port() != getScheme().defaultPort())
585  authorityHttp_ = authorityWithPort_;
586  }
587 
588  return requirePort ? authorityWithPort_ : authorityHttp_;
589 }
590 
591 SBuf &
593 {
594  if (absolute_.isEmpty()) {
595  // TODO: most URL will be much shorter, avoid allocating this much
596  absolute_.reserveCapacity(MAX_URL);
597 
598  absolute_.append(getScheme().image());
599  absolute_.append(":",1);
600  if (getScheme() != AnyP::PROTO_URN) {
601  absolute_.append("//", 2);
602  const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
603  getScheme() == AnyP::PROTO_UNKNOWN;
604 
605  if (allowUserInfo && !userInfo().isEmpty()) {
606  static const CharacterSet uiChars = CharacterSet(UserInfoChars())
607  .remove('%')
608  .rename("userinfo-reserved");
609  absolute_.append(Encode(userInfo(), uiChars));
610  absolute_.append("@", 1);
611  }
612  absolute_.append(authority());
613  } else {
614  absolute_.append(host());
615  absolute_.append(":", 1);
616  }
617  absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
618  }
619 
620  return absolute_;
621 }
622 
623 /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
624  * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
625  * and never copy the query-string part in the first place
626  */
// Copies `url` into a MAX_URL-sized LOCAL_ARRAY buffer, optionally cuts it
// right after the first '?', and returns a pointer to that buffer.
// (The buffer is presumably static storage, since it is returned to the
// caller -- confirm against the LOCAL_ARRAY definition.)
// NOTE(review): this listing omits the declarator line, the `if (...) {`
// line that opens the query-stripping block (listing line 636), and the
// statement guarded by stringHasCntl() (listing line 645).
627 char *
629 {
630  LOCAL_ARRAY(char, buf, MAX_URL);
631 
632  snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
633  buf[sizeof(buf)-1] = '\0';
634 
635  // URN, CONNECT method, and non-stripped URIs can go straight out
637  // strip anything AFTER a question-mark
638  // leaving the '?' in place
639  if (auto t = strchr(buf, '?')) {
640  *(++t) = '\0';
641  }
642  }
643 
644  if (stringHasCntl(buf))
646 
647  return buf;
648 }
649 
656 const char *
658 {
659  LOCAL_ARRAY(char, buf, MAX_URL);
660 
661  // method CONNECT and port HTTPS
662  if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
663  snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
664  return buf;
665  }
666 
667  // else do the normal complete canonical thing.
668  return request->canonicalCleanUrl();
669 }
670 
/// Tells whether a URL string is a relative reference (RFC 3986 section 4.2)
/// rather than an absolute URI that starts with a scheme.
/// \param url  NUL-terminated string, may be null
/// \returns false for a null pointer and for any string whose first segment
///          (the text before the first '/', '?' or '#') contains a ':';
///          true for everything else, including the empty string
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 section 5.2.3
     *
     * path          = path-abempty    ; begins with "/" or is empty
     *               / path-absolute   ; begins with "/" but not "//"
     *               / path-noscheme   ; begins with a non-colon segment
     *               / path-rootless   ; begins with a segment
     *               / path-empty      ; zero characters
     */

    // A leading "/" (path-absolute, or "//" network-path reference) and the
    // empty string (path-empty) have an empty first segment, so they fall
    // out of the general rule below: no colon in the first segment.
    const auto firstSegmentLength = strcspn(url, "/?#");

    // colon is forbidden in first segment
    return memchr(url, ':', firstSegmentLength) == nullptr;
}
718 
719 void
720 AnyP::Uri::addRelativePath(const char *relUrl)
721 {
722  // URN cannot be merged
723  if (getScheme() == AnyP::PROTO_URN)
724  return;
725 
726  // TODO: Handle . and .. segment normalization
727 
728  const auto lastSlashPos = path_.rfind('/');
729  // TODO: To optimize and simplify, add and use SBuf::replace().
730  const auto relUrlLength = strlen(relUrl);
731  if (lastSlashPos == SBuf::npos) {
732  // start replacing the whole path
733  path_.reserveCapacity(1 + relUrlLength);
734  path_.assign("/", 1);
735  } else {
736  // start replacing just the last segment
737  path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
738  path_.chop(0, lastSlashPos+1);
739  }
740  path_.append(relUrl, relUrlLength);
741 }
742 
743 int
744 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
745 {
746  int dl;
747  int hl;
748 
749  const bool hostIncludesSubdomains = (*h == '.');
750  while ('.' == *h)
751  ++h;
752 
753  hl = strlen(h);
754 
755  if (hl == 0)
756  return -1;
757 
758  dl = strlen(d);
759 
760  /*
761  * Start at the ends of the two strings and work towards the
762  * beginning.
763  */
764  while (xtolower(h[--hl]) == xtolower(d[--dl])) {
765  if (hl == 0 && dl == 0) {
766  /*
767  * We made it all the way to the beginning of both
768  * strings without finding any difference.
769  */
770  return 0;
771  }
772 
773  if (0 == hl) {
774  /*
775  * The host string is shorter than the domain string.
776  * There is only one case when this can be a match.
777  * If the domain is just one character longer, and if
778  * that character is a leading '.' then we call it a
779  * match.
780  */
781 
782  if (1 == dl && '.' == d[0])
783  return 0;
784  else
785  return -1;
786  }
787 
788  if (0 == dl) {
789  /*
790  * The domain string is shorter than the host string.
791  * This is a match only if the first domain character
792  * is a leading '.'.
793  */
794 
795  if ('.' == d[0]) {
796  if (flags & mdnRejectSubsubDomains) {
797  // Check for sub-sub domain and reject
798  while(--hl >= 0 && h[hl] != '.');
799  if (hl < 0) {
800  // No sub-sub domain found, but reject if there is a
801  // leading dot in given host string (which is removed
802  // before the check is started).
803  return hostIncludesSubdomains ? 1 : 0;
804  } else
805  return 1; // sub-sub domain, reject
806  } else
807  return 0;
808  } else
809  return 1;
810  }
811  }
812 
813  /*
814  * We found different characters in the same position (from the end).
815  */
816 
817  // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
818  // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
819  // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
820  if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
821  return 0;
822 
823  /*
824  * If one of those character is '.' then its special. In order
825  * for splay tree sorting to work properly, "x-foo.com" must
826  * be greater than ".foo.com" even though '-' is less than '.'.
827  */
828  if ('.' == d[dl])
829  return 1;
830 
831  if ('.' == h[hl])
832  return -1;
833 
834  return (xtolower(h[hl]) - xtolower(d[dl]));
835 }
836 
837 /*
838  * return true if we can serve requests for this method.
839  */
// Returns 1 when the request's method is acceptable for its URL scheme and 0
// otherwise. CONNECT and PURGE are accepted before the scheme is examined.
// NOTE(review): this listing omits the declarator line, the statement(s)
// announced by the OPTIONS/TRACE comment (listing lines 860-861) and one case
// label before "rc = 1" (listing line 873).
840 int
842 {
843  int rc = 0;
844  /* protocol "independent" methods
845  *
846  * actually these methods are specific to HTTP:
847  * they are methods we receive on our HTTP port,
848  * and if we had a FTP listener would not be relevant
849  * there.
850  *
851  * So, we should delegate them to HTTP. The problem is that we
852  * do not have a default protocol from the client side of HTTP.
853  */
854 
855  if (r->method == Http::METHOD_CONNECT)
856  return 1;
857 
858  // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
859  // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
862 
863  if (r->method == Http::METHOD_PURGE)
864  return 1;
865 
866  /* does method match the protocol? */
867  switch (r->url.getScheme()) {
868 
869  case AnyP::PROTO_URN:
870 
871  case AnyP::PROTO_HTTP:
872 
// these schemes accept every method
874  rc = 1;
875  break;
876 
877  case AnyP::PROTO_FTP:
878 
879  if (r->method == Http::METHOD_PUT)
880  rc = 1;
881 
// no break above: FTP continues into the GET/HEAD checks shared with the
// schemes below
882  case AnyP::PROTO_GOPHER:
883 
884  case AnyP::PROTO_WAIS:
885 
886  case AnyP::PROTO_WHOIS:
887  if (r->method == Http::METHOD_GET)
888  rc = 1;
889  else if (r->method == Http::METHOD_HEAD)
890  rc = 1;
891 
892  break;
893 
894  case AnyP::PROTO_HTTPS:
895 #if USE_OPENSSL
896  rc = 1;
897 #elif USE_GNUTLS
898  rc = 1;
899 #else
900  /*
901  * Squid can't originate an SSL connection, so it should
902  * never receive an "https:" URL. It should always be
903  * CONNECT instead.
904  */
905  rc = 0;
906 #endif
907  break;
908 
909  default:
910  break;
911  }
912 
913  return rc;
914 }
915 
// Constructor tail: stores the given scheme, marks the host as non-numeric,
// sets the port to 0 and makes host_ an empty string.
// NOTE(review): the constructor declarator (listing line 916) is missing
// from this view; only the member-initializer list and the body remain.
917  scheme_(aScheme),
918  hostIsNumeric_(false),
919  port_(0)
920 {
921  *host_=0;
922 }
923 
924 // TODO: fix code duplication with AnyP::Uri::parse()
// Produces a cleaned copy of `uri` according to Config.uri_whitespace:
// whitespace is kept, escaped, chopped off or stripped, and the remaining
// text is run through the rfc1738 escaping helpers.
// Returns a buffer obtained from xstrndup() (presumably the caller must free
// it -- confirm against callers); asserts that a result was produced.
// NOTE(review): three case labels (listing lines 931, 934 and 952) are
// missing from this view.
925 char *
926 AnyP::Uri::cleanup(const char *uri)
927 {
928  int flags = 0;
929  char *cleanedUri = nullptr;
930  switch (Config.uri_whitespace) {
932  flags |= RFC1738_ESCAPE_NOSPACE;
933  // fall through to next case
935  flags |= RFC1738_ESCAPE_UNESCAPED;
936  cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
937  break;
938 
939  case URI_WHITESPACE_CHOP: {
940  flags |= RFC1738_ESCAPE_UNESCAPED;
// pos is the offset of the first whitespace character, or strlen(uri) if none
941  const auto pos = strcspn(uri, w_space);
942  char *choppedUri = nullptr;
943  if (pos < strlen(uri))
944  choppedUri = xstrndup(uri, pos + 1);
945  cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, flags), MAX_URL);
// NOTE(review): pos was measured on the unescaped input; if escaping
// lengthens the text before pos, this cut lands inside the escaped string --
// confirm this is intended.
946  cleanedUri[pos] = '\0';
947  xfree(choppedUri);
948  }
949  break;
950 
951  case URI_WHITESPACE_DENY:
953  default: {
954  // TODO: avoid duplication with urlParse()
955  const char *t;
// copy uri without its whitespace characters, then escape the copy
956  char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
957  char *q = tmp_uri;
958  t = uri;
959  while (*t) {
960  if (!xisspace(*t)) {
961  *q = *t;
962  ++q;
963  }
964  ++t;
965  }
966  *q = '\0';
967  cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
968  xfree(tmp_uri);
969  }
970  break;
971  }
972 
973  assert(cleanedUri);
974  return cleanedUri;
975 }
976 
static char * cleanup(const char *uri)
Definition: Uri.cc:926
#define URI_WHITESPACE_ENCODE
Definition: defines.h:196
char method[16]
Definition: tcp-banger2.c:115
#define MYNAME
Definition: Debug.h:170
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:81
#define Here()
source code location of the caller
Definition: Here.h:15
static const char valid_hostname_chars_u[]
Definition: Uri.cc:20
@ METHOD_HEAD
Definition: MethodType.h:28
AnyP::Uri url
the request URI
Definition: HttpRequest.h:115
static const AnyP::ProtocolType FindProtocolType(const SBuf &)
Definition: UriScheme.cc:52
#define xmalloc
int stringHasCntl(const char *)
Definition: String.cc:387
#define URI_WHITESPACE_STRIP
Definition: defines.h:194
const_reverse_iterator rbegin() const
Definition: SBuf.h:580
#define URI_WHITESPACE_CHOP
Definition: defines.h:197
char host_[SQUIDHOSTNAMELEN]
string representation of the URI authority name or IP
Definition: Uri.h:174
#define LOCAL_ARRAY(type, name, size)
Definition: squid.h:75
HttpHeader header
Definition: Message.h:75
int check_hostnames
Definition: SquidConfig.h:322
@ mdnHonorWildcards
Definition: Uri.h:225
void reserveSpace(size_type minSpace)
Definition: SBuf.h:429
@ PROTO_NONE
Definition: ProtocolType.h:24
SBuf hostOrIp() const
Definition: Uri.cc:114
bool parse(const HttpRequestMethod &, const SBuf &url)
Definition: Uri.cc:253
const char * ProtocolType_str[]
Definition: SBuf.h:86
bool atEnd() const
whether the end of the buffer has been reached
Definition: Tokenizer.h:41
double const StatHist & B
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:179
char * xstrncpy(char *dst, const char *src, size_t n)
Definition: xstring.cc:37
int urlCheckRequest(const HttpRequest *r)
Definition: Uri.cc:841
#define DBG_CRITICAL
Definition: Debug.h:45
int matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
Definition: Uri.cc:744
#define xtolower(x)
Definition: xis.h:19
#define DBG_IMPORTANT
Definition: Debug.h:46
#define w_space
static int port
Definition: ldap_backend.cc:69
@ PROTO_UNKNOWN
Definition: ProtocolType.h:43
static const CharacterSet ALPHA
Definition: CharacterSet.h:76
@ METHOD_OPTIONS
Definition: MethodType.h:31
#define MAX_IPSTRLEN
Length of buffer that needs to be allocated to old a null-terminated IP-string.
Definition: forward.h:23
@ PROTO_URN
Definition: ProtocolType.h:37
Definition: Debug.h:188
#define MAX_URL
Definition: defines.h:118
const SBuf & remaining() const
the remaining unprocessed section of buffer
Definition: Tokenizer.h:44
static const CharacterSet & UserInfoChars()
Characters which are valid within a URI userinfo section.
Definition: Uri.cc:35
#define URI_WHITESPACE_DENY
Definition: defines.h:198
int strip_query_terms
Definition: SquidConfig.h:305
#define NULL
Definition: types.h:166
const char * rawContent() const
Definition: SBuf.cc:519
@ PROTO_MAX
Definition: ProtocolType.h:44
void rfc1738_unescape(char *url)
Definition: rfc1738.c:146
#define SQUIDSBUFPRINT(s)
Definition: SBuf.h:32
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Debug.h:128
char at(size_type pos) const
Definition: SBuf.h:238
MatchDomainNameFlags
Definition: Uri.h:223
@ METHOD_CONNECT
Definition: MethodType.h:29
const_iterator begin() const
Definition: SBuf.h:572
int64_t getInt64(Http::HdrType id) const
Definition: HttpHeader.cc:1178
void parseUrn(Parser::Tokenizer &)
Definition: Uri.cc:538
#define assert(EX)
Definition: assert.h:19
SBuf image() const
Definition: UriScheme.h:50
@ METHOD_PUT
Definition: MethodType.h:27
std::ostream & CurrentException(std::ostream &os)
prints active (i.e., thrown but not yet handled) exception
const AnyP::UriScheme & getScheme() const
Definition: Uri.h:67
@ METHOD_TRACE
Definition: MethodType.h:30
@ PROTO_GOPHER
Definition: ProtocolType.h:30
static const CharacterSet DIGIT
Definition: CharacterSet.h:84
const char * c_str()
Definition: SBuf.cc:526
SBuf & authority(bool requirePort=false) const
Definition: Uri.cc:574
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:404
char * xstrndup(const char *s, size_t n)
Definition: xstring.cc:56
SBuf & append(const SBuf &S)
Definition: SBuf.cc:195
#define xfree
void addRelativePath(const char *relUrl)
Definition: Uri.cc:720
struct SquidConfig::@112 onoff
CharacterSet & rename(const char *label)
change name; handy in const declarations that use operators
Definition: CharacterSet.h:61
int allow_underscore
Definition: SquidConfig.h:323
static const size_type npos
Definition: SBuf.h:92
static const char valid_hostname_chars[]
Definition: Uri.cc:26
@ METHOD_PURGE
Definition: MethodType.h:94
const char * urlCanonicalFakeHttps(const HttpRequest *request)
Definition: Uri.cc:657
CharacterSet & remove(const unsigned char c)
remove a given character from the character set
Definition: CharacterSet.cc:54
@ PROTO_CACHE_OBJECT
Definition: ProtocolType.h:32
bool urlAppendDomain(char *host)
apply append_domain config to the given hostname
Definition: Uri.cc:224
#define rfc1738_escape_unescaped(x)
Definition: rfc1738.h:59
static const SBuf & SlashPath()
the static '/' default URL-path
Definition: Uri.cc:91
void urlInitialize(void)
Definition: Uri.cc:136
SBuf & absolute() const
Definition: Uri.cc:592
@ PROTO_WHOIS
Definition: ProtocolType.h:38
@ PROTO_HTTPS
Definition: ProtocolType.h:27
HttpRequestMethod method
Definition: HttpRequest.h:114
void path(const char *p)
Definition: Uri.h:99
@ PROTO_FTP
Definition: ProtocolType.h:26
@ PROTO_HTTP
Definition: ProtocolType.h:25
void reserveCapacity(size_type minCapacity)
Definition: SBuf.cc:105
Definition: parse.c:160
@ mdnRejectSubsubDomains
Definition: Uri.h:226
unsigned short defaultPort() const
Definition: UriScheme.cc:71
an std::runtime_error with thrower location info
Definition: TextException.h:19
@ PROTO_WAIS
Definition: ProtocolType.h:31
char * url
Definition: tcp-banger2.c:114
size_t HttpReply *STUB StoreEntry const KeyScope scope const HttpRequestMethod & method
Definition: stub_store.cc:105
Uri()
Definition: Uri.h:35
#define RFC1738_ESCAPE_UNESCAPED
Definition: rfc1738.h:25
bool urlIsRelative(const char *url)
Definition: Uri.cc:684
char * rfc1738_do_escape(const char *url, int flags)
Definition: rfc1738.c:56
SBuf & appendf(const char *fmt,...)
Definition: SBuf.cc:239
char * urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
Definition: Uri.cc:628
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:17
#define xisspace(x)
Definition: xis.h:17
#define RFC1738_ESCAPE_NOSPACE
Definition: rfc1738.h:22
char * appendDomain
Definition: SquidConfig.h:218
size_t appendDomainLen
Definition: SquidConfig.h:219
const SBuf & path() const
Definition: Uri.cc:124
@ METHOD_GET
Definition: MethodType.h:25
int stringHasWhitespace(const char *)
Definition: String.cc:380
const char * host(void) const
Definition: Uri.h:85
#define false
Definition: GnuRegex.c:233
static const SBuf & Asterisk()
the static '*' pseudo-URI
Definition: Uri.cc:84
int uri_whitespace
Definition: SquidConfig.h:461
struct _request * request(char *urlin)
Definition: tcp-banger2.c:291
static SBuf Encode(const SBuf &, const CharacterSet &expected)
Definition: Uri.cc:55
void const char * buf
Definition: stub_helper.cc:16
#define SQUIDHOSTNAMELEN
Definition: rfc2181.h:30
void touch()
clear the cached URI display forms
Definition: Uri.cc:566
#define SQUIDSBUFPH
Definition: SBuf.h:31
class SquidConfig Config
Definition: SquidConfig.cc:12
static AnyP::UriScheme uriParseScheme(Parser::Tokenizer &tok)
Definition: Uri.cc:187
#define URI_WHITESPACE_ALLOW
Definition: defines.h:195

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors