Uri.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 /* DEBUG: section 23 URL Parsing */
10 
11 #include "squid.h"
12 #include "anyp/Uri.h"
13 #include "globals.h"
14 #include "HttpRequest.h"
15 #include "parser/Tokenizer.h"
16 #include "rfc1738.h"
17 #include "SquidConfig.h"
18 #include "SquidString.h"
19 
// Characters accepted in a URL hostname when underscores are allowed:
// ASCII letters, digits, '-', '.', '_' and the "[:]" used by bracketed
// IP literals.
static const char valid_hostname_chars_u[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._"
    "[:]";

// The same set as valid_hostname_chars_u, minus the underscore.
static const char valid_hostname_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-."
    "[:]";
32 
33 const SBuf &
35 {
36  static SBuf star("*");
37  return star;
38 }
39 
40 const SBuf &
42 {
43  static SBuf slash("/");
44  return slash;
45 }
46 
47 void
48 AnyP::Uri::host(const char *src)
49 {
51  hostAddr_ = src;
52  if (hostAddr_.isAnyAddr()) {
53  xstrncpy(host_, src, sizeof(host_));
54  hostIsNumeric_ = false;
55  } else {
56  hostAddr_.toHostStr(host_, sizeof(host_));
57  debugs(23, 3, "given IP: " << hostAddr_);
58  hostIsNumeric_ = 1;
59  }
60  touch();
61 }
62 
63 SBuf
65 {
66  static char ip[MAX_IPSTRLEN];
67  if (hostIsNumeric())
68  return SBuf(hostIP().toStr(ip, sizeof(ip)));
69  else
70  return SBuf(host());
71 }
72 
73 const SBuf &
75 {
76  // RFC 3986 section 3.3 says path can be empty (path-abempty).
77  // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
78  // at least when sending and using. We must still accept path-abempty as input.
80  return SlashPath();
81 
82  return path_;
83 }
84 
85 void
87 {
88  debugs(23, 5, "urlInitialize: Initializing...");
89  /* this ensures that the number of protocol strings is the same as
90  * the enum slots allocated because the last enum is always 'MAX'.
91  */
92  assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
93  /*
94  * These test that our matchDomainName() function works the
95  * way we expect it to.
96  */
97  assert(0 == matchDomainName("foo.com", "foo.com"));
98  assert(0 == matchDomainName(".foo.com", "foo.com"));
99  assert(0 == matchDomainName("foo.com", ".foo.com"));
100  assert(0 == matchDomainName(".foo.com", ".foo.com"));
101  assert(0 == matchDomainName("x.foo.com", ".foo.com"));
102  assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
103  assert(0 != matchDomainName("x.foo.com", "foo.com"));
104  assert(0 != matchDomainName("foo.com", "x.foo.com"));
105  assert(0 != matchDomainName("bar.com", "foo.com"));
106  assert(0 != matchDomainName(".bar.com", "foo.com"));
107  assert(0 != matchDomainName(".bar.com", ".foo.com"));
108  assert(0 != matchDomainName("bar.com", ".foo.com"));
109  assert(0 < matchDomainName("zzz.com", "foo.com"));
110  assert(0 > matchDomainName("aaa.com", "foo.com"));
111  assert(0 == matchDomainName("FOO.com", "foo.COM"));
112  assert(0 < matchDomainName("bfoo.com", "afoo.com"));
113  assert(0 > matchDomainName("afoo.com", "bfoo.com"));
114  assert(0 < matchDomainName("x-foo.com", ".foo.com"));
115 
116  assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
117  assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
118  assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
119  assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
120 
121  assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
122  assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
123  assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
124  assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
125 
126  /* more cases? */
127 }
128 
136 static AnyP::UriScheme
138 {
139  /*
140  * RFC 3986 section 3.1 paragraph 2:
141  *
142  * Scheme names consist of a sequence of characters beginning with a
143  * letter and followed by any combination of letters, digits, plus
144  * ("+"), period ("."), or hyphen ("-").
145  *
146  * The underscore ("_") required to match "cache_object://" squid
147  * special URI scheme.
148  */
149  static const auto schemeChars =
150 #if USE_HTTP_VIOLATIONS
151  CharacterSet("special", "_") +
152 #endif
154 
155  SBuf str;
156  if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
157  const auto protocol = AnyP::UriScheme::FindProtocolType(str);
158  if (protocol == AnyP::PROTO_UNKNOWN)
159  return AnyP::UriScheme(protocol, str.c_str());
160  return AnyP::UriScheme(protocol, nullptr);
161  }
162 
163  throw TextException("invalid URI scheme", Here());
164 }
165 
173 bool
175 {
176  /* For IPv4 addresses check for a dot */
177  /* For IPv6 addresses also check for a colon */
178  if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
179  const uint64_t dlen = strlen(host);
180  const uint64_t want = dlen + Config.appendDomainLen;
181  if (want > SQUIDHOSTNAMELEN - 1) {
182  debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
183  return false;
184  }
185  strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
186  }
187  return true;
188 }
189 
190 /*
191  * Parse a URI/URL.
192  *
193  * It is assumed that the URL is complete,
194  * i.e., the end of the string is the end of the URL. Don't pass a partial
195  * URL here as this routine doesn't have any way of knowing whether
196  * it is partial or not (i.e., it handles the case of no trailing slash as
197  * being "end of host with implied path of /").
198  *
199  * method is used to switch parsers. If method is Http::METHOD_CONNECT,
200  * then rather than a URL a hostname:port is looked for.
201  */
202 bool
204 {
205  try {
206 
207  LOCAL_ARRAY(char, login, MAX_URL);
208  LOCAL_ARRAY(char, foundHost, MAX_URL);
209  LOCAL_ARRAY(char, urlpath, MAX_URL);
210  char *t = NULL;
211  char *q = NULL;
212  int foundPort;
213  int l;
214  int i;
215  const char *src;
216  char *dst;
217  foundHost[0] = urlpath[0] = login[0] = '\0';
218 
219  if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
220  debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
221  return false;
222  }
223 
224  if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
225  Asterisk().cmp(rawUrl) == 0) {
226  // XXX: these methods might also occur in HTTPS traffic. Handle this better.
227  setScheme(AnyP::PROTO_HTTP, nullptr);
229  path(Asterisk());
230  return true;
231  }
232 
233  Parser::Tokenizer tok(rawUrl);
234  AnyP::UriScheme scheme;
235 
236  if (method == Http::METHOD_CONNECT) {
237  /*
238  * RFC 7230 section 5.3.3: authority-form = authority
239  * "excluding any userinfo and its "@" delimiter"
240  *
241  * RFC 3986 section 3.2: authority = [ userinfo "@" ] host [ ":" port ]
242  *
243  * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
244  */
245  foundPort = 443;
246 
247  // XXX: use tokenizer
248  auto B = tok.buf();
249  const char *url = B.c_str();
250 
251  if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
252  if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
253  return false;
254 
255  } else {
256 
257  scheme = uriParseScheme(tok);
258 
259  if (scheme == AnyP::PROTO_NONE)
260  return false; // invalid scheme
261 
262  if (scheme == AnyP::PROTO_URN) {
263  parseUrn(tok); // throws on any error
264  return true;
265  }
266 
267  // URLs then have "//"
268  static const SBuf doubleSlash("//");
269  if (!tok.skip(doubleSlash))
270  return false;
271 
272  auto B = tok.remaining();
273  const char *url = B.c_str();
274 
275  /* Parse the URL: */
276  src = url;
277  i = 0;
278 
279  /* Then everything until first /; thats host (and port; which we'll look for here later) */
280  // bug 1881: If we don't get a "/" then we imply it was there
281  // bug 3074: We could just be given a "?" or "#". These also imply "/"
282  // bug 3233: whitespace is also a hostname delimiter.
283  for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
284  *dst = *src;
285  }
286 
287  /*
288  * We can't check for "i >= l" here because we could be at the end of the line
289  * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
290  * been -given- a valid URL and the path is just '/'.
291  */
292  if (i > l)
293  return false;
294  *dst = '\0';
295 
296  // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
297  if (*src == '?' || *src == '#' || *src == '\0') {
298  urlpath[0] = '/';
299  dst = &urlpath[1];
300  } else {
301  dst = urlpath;
302  }
303  /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
304  for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
305  *dst = *src;
306  }
307 
308  /* We -could- be at the end of the buffer here */
309  if (i > l)
310  return false;
311  /* If the URL path is empty we set it to be "/" */
312  if (dst == urlpath) {
313  *dst = '/';
314  ++dst;
315  }
316  *dst = '\0';
317 
318  foundPort = scheme.defaultPort(); // may be reset later
319 
320  /* Is there any login information? (we should eventually parse it above) */
321  t = strrchr(foundHost, '@');
322  if (t != NULL) {
323  strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
324  login[sizeof(login)-1] = '\0';
325  t = strrchr(login, '@');
326  *t = 0;
327  strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
328  foundHost[sizeof(foundHost)-1] = '\0';
329  // Bug 4498: URL-unescape the login info after extraction
330  rfc1738_unescape(login);
331  }
332 
333  /* Is there any host information? (we should eventually parse it above) */
334  if (*foundHost == '[') {
335  /* strip any IPA brackets. valid under IPv6. */
336  dst = foundHost;
337  /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
338  src = foundHost;
339  ++src;
340  l = strlen(foundHost);
341  i = 1;
342  for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
343  *dst = *src;
344  }
345 
346  /* we moved in-place, so truncate the actual hostname found */
347  *dst = '\0';
348  ++dst;
349 
350  /* skip ahead to either start of port, or original EOS */
351  while (*dst != '\0' && *dst != ':')
352  ++dst;
353  t = dst;
354  } else {
355  t = strrchr(foundHost, ':');
356 
357  if (t != strchr(foundHost,':') ) {
358  /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
359  /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
360  /* therefore we MUST accept the case where they are not bracketed at all. */
361  t = NULL;
362  }
363  }
364 
365  // Bug 3183 sanity check: If scheme is present, host must be too.
366  if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
367  debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
368  return false;
369  }
370 
371  if (t && *t == ':') {
372  *t = '\0';
373  ++t;
374  foundPort = atoi(t);
375  }
376  }
377 
378  for (t = foundHost; *t; ++t)
379  *t = xtolower(*t);
380 
381  if (stringHasWhitespace(foundHost)) {
383  t = q = foundHost;
384  while (*t) {
385  if (!xisspace(*t)) {
386  *q = *t;
387  ++q;
388  }
389  ++t;
390  }
391  *q = '\0';
392  }
393  }
394 
395  debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
396 
398  strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
399  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
400  return false;
401  }
402 
403  if (!urlAppendDomain(foundHost))
404  return false;
405 
406  /* remove trailing dots from hostnames */
407  while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
408  foundHost[l] = '\0';
409 
410  /* reject duplicate or leading dots */
411  if (strstr(foundHost, "..") || *foundHost == '.') {
412  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
413  return false;
414  }
415 
416  if (foundPort < 1 || foundPort > 65535) {
417  debugs(23, 3, "Invalid port '" << foundPort << "'");
418  return false;
419  }
420 
421 #if HARDCODE_DENY_PORTS
422  /* These ports are filtered in the default squid.conf, but
423  * maybe someone wants them hardcoded... */
424  if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
425  debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
426  return false;
427  }
428 #endif
429 
430  if (stringHasWhitespace(urlpath)) {
431  debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
432 
433  switch (Config.uri_whitespace) {
434 
435  case URI_WHITESPACE_DENY:
436  return false;
437 
439  break;
440 
442  t = rfc1738_escape_unescaped(urlpath);
443  xstrncpy(urlpath, t, MAX_URL);
444  break;
445 
446  case URI_WHITESPACE_CHOP:
447  *(urlpath + strcspn(urlpath, w_space)) = '\0';
448  break;
449 
451  default:
452  t = q = urlpath;
453  while (*t) {
454  if (!xisspace(*t)) {
455  *q = *t;
456  ++q;
457  }
458  ++t;
459  }
460  *q = '\0';
461  }
462  }
463 
464  setScheme(scheme);
465  path(urlpath);
466  host(foundHost);
467  userInfo(SBuf(login));
468  port(foundPort);
469  return true;
470 
471  } catch (...) {
472  debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
473  return false;
474  }
475 }
476 
491 void
493 {
494  static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
495  static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
496  SBuf nid;
497  if (!tok.prefix(nid, nidChars, 32))
498  throw TextException("NID not found", Here());
499 
500  if (!tok.skip(':'))
501  throw TextException("NID too long or missing ':' delimiter", Here());
502 
503  if (nid.length() < 2)
504  throw TextException("NID too short", Here());
505 
506  if (!alphanum[*nid.begin()])
507  throw TextException("NID prefix is not alphanumeric", Here());
508 
509  if (!alphanum[*nid.rbegin()])
510  throw TextException("NID suffix is not alphanumeric", Here());
511 
512  setScheme(AnyP::PROTO_URN, nullptr);
513  host(nid.c_str());
514  // TODO validate path characters
515  path(tok.remaining());
516  debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
517 }
518 
519 void
521 {
522  absolute_.clear();
525 }
526 
527 SBuf &
528 AnyP::Uri::authority(bool requirePort) const
529 {
530  if (authorityHttp_.isEmpty()) {
531 
532  // both formats contain Host/IP
535 
536  // authorityForm_ only has :port if it is non-default
538  if (port() != getScheme().defaultPort())
540  }
541 
542  return requirePort ? authorityWithPort_ : authorityHttp_;
543 }
544 
545 SBuf &
547 {
548  if (absolute_.isEmpty()) {
549  // TODO: most URL will be much shorter, avoid allocating this much
551 
552  absolute_.append(getScheme().image());
553  absolute_.append(":",1);
554  if (getScheme() != AnyP::PROTO_URN) {
555  absolute_.append("//", 2);
556  const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
558 
559  if (allowUserInfo && !userInfo().isEmpty()) {
561  absolute_.append("@", 1);
562  }
564  } else {
565  absolute_.append(host());
566  absolute_.append(":", 1);
567  }
568  absolute_.append(path());
569  }
570 
571  return absolute_;
572 }
573 
578 char *
580 {
581  LOCAL_ARRAY(char, buf, MAX_URL);
582 
583  snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
584  buf[sizeof(buf)-1] = '\0';
585 
586  // URN, CONNECT method, and non-stripped URIs can go straight out
587  if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
588  // strip anything AFTER a question-mark
589  // leaving the '?' in place
590  if (auto t = strchr(buf, '?')) {
591  *(++t) = '\0';
592  }
593  }
594 
595  if (stringHasCntl(buf))
597 
598  return buf;
599 }
600 
607 const char *
609 {
610  LOCAL_ARRAY(char, buf, MAX_URL);
611 
612  // method CONNECT and port HTTPS
613  if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
614  snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
615  return buf;
616  }
617 
618  // else do the normal complete canonical thing.
619  return request->canonicalCleanUrl();
620 }
621 
622 /*
623  * Test if a URL is relative.
624  *
625  * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
626  * appear before a ':'.
627  */
/// Reports whether url looks relative: scanning from the start, the first
/// ':' or '/' encountered (if any) must not be a ':'.
/// A NULL or empty url is reported as not relative.
bool
urlIsRelative(const char *url)
{
    if (!url || !*url)
        return false;

    // stop at the first ':' or '/', or at the terminator when neither occurs
    const char *stop = url + strcspn(url, ":/");
    return *stop != ':';
}
647 
648 /*
649  * Convert a relative URL to an absolute URL using the context of a given
650  * request.
651  *
652  * It is assumed that you have already ensured that the URL is relative.
653  *
654  * If NULL is returned it is an indication that the method in use in the
655  * request does not distinguish between relative and absolute and you should
656  * use the url unchanged.
657  *
658  * If non-NULL is returned, it is up to the caller to free the resulting
659  * memory using safe_free().
660  */
661 char *
662 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
663 {
664 
665  if (req->method.id() == Http::METHOD_CONNECT) {
666  return (NULL);
667  }
668 
669  char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
670 
671  if (req->url.getScheme() == AnyP::PROTO_URN) {
672  // XXX: this is what the original code did, but it seems to break the
673  // intended behaviour of this function. It returns the stored URN path,
674  // not converting the given one into a URN...
675  snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
676  return (urlbuf);
677  }
678 
679  SBuf authorityForm = req->url.authority(); // host[:port]
680  const SBuf &scheme = req->url.getScheme().image();
681  size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
682  SQUIDSBUFPRINT(scheme),
683  SQUIDSBUFPRINT(req->url.userInfo()),
684  !req->url.userInfo().isEmpty() ? "@" : "",
685  SQUIDSBUFPRINT(authorityForm));
686 
687  // if the first char is '/' assume its a relative path
688  // XXX: this breaks on scheme-relative URLs,
689  // but we should not see those outside ESI, and rarely there.
690  // XXX: also breaks on any URL containing a '/' in the query-string portion
691  if (relUrl[0] == '/') {
692  xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
693  } else {
694  SBuf path = req->url.path();
695  SBuf::size_type lastSlashPos = path.rfind('/');
696 
697  if (lastSlashPos == SBuf::npos) {
698  // replace the whole path with the given bit(s)
699  urlbuf[urllen] = '/';
700  ++urllen;
701  xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
702  } else {
703  // replace only the last (file?) segment with the given bit(s)
704  ++lastSlashPos;
705  if (lastSlashPos > MAX_URL - urllen - 1) {
706  // XXX: crops bits in the middle of the combined URL.
707  lastSlashPos = MAX_URL - urllen - 1;
708  }
709  SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
710  urllen += lastSlashPos;
711  if (urllen + 1 < MAX_URL) {
712  xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
713  }
714  }
715  }
716 
717  return (urlbuf);
718 }
719 
720 int
721 matchDomainName(const char *h, const char *d, uint8_t flags)
722 {
723  int dl;
724  int hl;
725 
726  const bool hostIncludesSubdomains = (*h == '.');
727  while ('.' == *h)
728  ++h;
729 
730  hl = strlen(h);
731 
732  if (hl == 0)
733  return -1;
734 
735  dl = strlen(d);
736 
737  /*
738  * Start at the ends of the two strings and work towards the
739  * beginning.
740  */
741  while (xtolower(h[--hl]) == xtolower(d[--dl])) {
742  if (hl == 0 && dl == 0) {
743  /*
744  * We made it all the way to the beginning of both
745  * strings without finding any difference.
746  */
747  return 0;
748  }
749 
750  if (0 == hl) {
751  /*
752  * The host string is shorter than the domain string.
753  * There is only one case when this can be a match.
754  * If the domain is just one character longer, and if
755  * that character is a leading '.' then we call it a
756  * match.
757  */
758 
759  if (1 == dl && '.' == d[0])
760  return 0;
761  else
762  return -1;
763  }
764 
765  if (0 == dl) {
766  /*
767  * The domain string is shorter than the host string.
768  * This is a match only if the first domain character
769  * is a leading '.'.
770  */
771 
772  if ('.' == d[0]) {
773  if (flags & mdnRejectSubsubDomains) {
774  // Check for sub-sub domain and reject
775  while(--hl >= 0 && h[hl] != '.');
776  if (hl < 0) {
777  // No sub-sub domain found, but reject if there is a
778  // leading dot in given host string (which is removed
779  // before the check is started).
780  return hostIncludesSubdomains ? 1 : 0;
781  } else
782  return 1; // sub-sub domain, reject
783  } else
784  return 0;
785  } else
786  return 1;
787  }
788  }
789 
790  /*
791  * We found different characters in the same position (from the end).
792  */
793 
794  // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
795  // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
796  // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
797  if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
798  return 0;
799 
800  /*
801  * If one of those character is '.' then its special. In order
802  * for splay tree sorting to work properly, "x-foo.com" must
803  * be greater than ".foo.com" even though '-' is less than '.'.
804  */
805  if ('.' == d[dl])
806  return 1;
807 
808  if ('.' == h[hl])
809  return -1;
810 
811  return (xtolower(h[hl]) - xtolower(d[dl]));
812 }
813 
814 /*
815  * return true if we can serve requests for this method.
816  */
817 int
819 {
820  int rc = 0;
821  /* protocol "independent" methods
822  *
823  * actually these methods are specific to HTTP:
824  * they are methods we receive on our HTTP port,
825  * and if we had a FTP listener would not be relevant
826  * there.
827  *
828  * So, we should delegate them to HTTP. The problem is that we
829  * do not have a default protocol from the client side of HTTP.
830  */
831 
832  if (r->method == Http::METHOD_CONNECT)
833  return 1;
834 
835  // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
836  // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
839 
840  if (r->method == Http::METHOD_PURGE)
841  return 1;
842 
843  /* does method match the protocol? */
844  switch (r->url.getScheme()) {
845 
846  case AnyP::PROTO_URN:
847 
848  case AnyP::PROTO_HTTP:
849 
851  rc = 1;
852  break;
853 
854  case AnyP::PROTO_FTP:
855 
856  if (r->method == Http::METHOD_PUT)
857  rc = 1;
858 
859  case AnyP::PROTO_GOPHER:
860 
861  case AnyP::PROTO_WAIS:
862 
863  case AnyP::PROTO_WHOIS:
864  if (r->method == Http::METHOD_GET)
865  rc = 1;
866  else if (r->method == Http::METHOD_HEAD)
867  rc = 1;
868 
869  break;
870 
871  case AnyP::PROTO_HTTPS:
872 #if USE_OPENSSL
873  rc = 1;
874 #elif USE_GNUTLS
875  rc = 1;
876 #else
877  /*
878  * Squid can't originate an SSL connection, so it should
879  * never receive an "https:" URL. It should always be
880  * CONNECT instead.
881  */
882  rc = 0;
883 #endif
884  break;
885 
886  default:
887  break;
888  }
889 
890  return rc;
891 }
892 
893 /*
894  * Quick-n-dirty host extraction from a URL. Steps:
895  * Look for a colon
896  * Skip any '/' after the colon
897  * Copy up to SQUIDHOSTNAMELEN bytes to Host[]
898  * Look for an ending '/' or ':' and terminate
899  * Look for login info preceded by '@'
900  */
901 
903 {
904 
905 public:
906  char * extract(char const *url);
907 
908 private:
909  static char Host [SQUIDHOSTNAMELEN];
910  void init(char const *);
911  void findHostStart();
912  void trimTrailingChars();
913  void trimAuth();
914  char const *hostStart;
915  char const *url;
916 };
917 
918 char *
919 urlHostname(const char *url)
920 {
921  return URLHostName().extract(url);
922 }
923 
925 
926 void
927 URLHostName::init(char const *aUrl)
928 {
929  Host[0] = '\0';
930  url = aUrl;
931 }
932 
933 void
935 {
936  if (NULL == (hostStart = strchr(url, ':')))
937  return;
938 
939  ++hostStart;
940 
941  while (*hostStart != '\0' && *hostStart == '/')
942  ++hostStart;
943 
944  if (*hostStart == ']')
945  ++hostStart;
946 }
947 
948 void
950 {
951  char *t;
952 
953  if ((t = strchr(Host, '/')))
954  *t = '\0';
955 
956  if ((t = strrchr(Host, ':')))
957  *t = '\0';
958 
959  if ((t = strchr(Host, ']')))
960  *t = '\0';
961 }
962 
963 void
965 {
966  char *t;
967 
968  if ((t = strrchr(Host, '@'))) {
969  ++t;
970  memmove(Host, t, strlen(t) + 1);
971  }
972 }
973 
974 char *
975 URLHostName::extract(char const *aUrl)
976 {
977  init(aUrl);
978  findHostStart();
979 
980  if (hostStart == NULL)
981  return NULL;
982 
983  xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
984 
985  trimTrailingChars();
986 
987  trimAuth();
988 
989  return Host;
990 }
991 
993  scheme_(aScheme),
995  port_(0)
996 {
997  *host_=0;
998 }
999 
1000 // TODO: fix code duplication with AnyP::Uri::parse()
1001 char *
1002 AnyP::Uri::cleanup(const char *uri)
1003 {
1004  int flags = 0;
1005  char *cleanedUri = nullptr;
1006  switch (Config.uri_whitespace) {
1007  case URI_WHITESPACE_ALLOW:
1008  flags |= RFC1738_ESCAPE_NOSPACE;
1009  // fall through to next case
1010  case URI_WHITESPACE_ENCODE:
1011  flags |= RFC1738_ESCAPE_UNESCAPED;
1012  cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
1013  break;
1014 
1015  case URI_WHITESPACE_CHOP: {
1016  flags |= RFC1738_ESCAPE_UNESCAPED;
1017  const auto pos = strcspn(uri, w_space);
1018  char *choppedUri = nullptr;
1019  if (pos < strlen(uri))
1020  choppedUri = xstrndup(uri, pos + 1);
1021  cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, flags), MAX_URL);
1022  cleanedUri[pos] = '\0';
1023  xfree(choppedUri);
1024  }
1025  break;
1026 
1027  case URI_WHITESPACE_DENY:
1028  case URI_WHITESPACE_STRIP:
1029  default: {
1030  // TODO: avoid duplication with urlParse()
1031  const char *t;
1032  char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
1033  char *q = tmp_uri;
1034  t = uri;
1035  while (*t) {
1036  if (!xisspace(*t)) {
1037  *q = *t;
1038  ++q;
1039  }
1040  ++t;
1041  }
1042  *q = '\0';
1043  cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
1044  xfree(tmp_uri);
1045  }
1046  break;
1047  }
1048 
1049  assert(cleanedUri);
1050  return cleanedUri;
1051 }
1052 
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:404
char * urlHostname(const char *url)
Definition: Uri.cc:919
#define rfc1738_escape_unescaped(x)
Definition: rfc1738.h:59
#define assert(EX)
Definition: assert.h:17
SBuf & authority(bool requirePort=false) const
Definition: Uri.cc:528
SBuf & appendf(const char *fmt,...)
Definition: SBuf.cc:239
void trimAuth()
Definition: Uri.cc:964
char * urlMakeAbsolute(const HttpRequest *req, const char *relUrl)
Definition: Uri.cc:662
SBuf image() const
Definition: UriScheme.h:50
const_reverse_iterator rbegin() const
Definition: SBuf.h:580
static AnyP::UriScheme uriParseScheme(Parser::Tokenizer &tok)
Definition: Uri.cc:137
bool urlIsRelative(const char *url)
Definition: Uri.cc:629
void path(const char *p)
Definition: Uri.h:97
Definition: SBuf.h:86
static uint32 B
Definition: md4.c:43
#define RFC1738_ESCAPE_UNESCAPED
Definition: rfc1738.h:25
const char * urlCanonicalFakeHttps(const HttpRequest *request)
Definition: Uri.cc:608
unsigned short port() const
Definition: Uri.h:93
HttpRequestMethod method
Definition: HttpRequest.h:114
int stringHasWhitespace(const char *)
Definition: String.cc:380
struct _request * request(char *urlin)
Definition: tcp-banger2.c:291
int i
Definition: membanger.c:49
SBuf & append(const SBuf &S)
Definition: SBuf.cc:195
static const char valid_hostname_chars_u[]
Definition: Uri.cc:20
char * xstrndup(const char *s, size_t n)
Definition: xstring.cc:56
unsigned short defaultPort() const
Definition: UriScheme.cc:71
int allow_underscore
Definition: SquidConfig.h:320
SBuf path_
URI path segment.
Definition: Uri.h:164
static const SBuf & SlashPath()
the static &#39;/&#39; default URL-path
Definition: Uri.cc:41
SBuf authorityHttp_
RFC 7230 section 5.3.3 authority, maybe without default-port.
Definition: Uri.h:167
void clear()
Definition: SBuf.cc:178
bool isEmpty() const
Definition: SBuf.h:420
char * appendDomain
Definition: SquidConfig.h:215
#define xisspace(x)
Definition: xis.h:17
#define DBG_CRITICAL
Definition: Debug.h:45
char * p
Definition: membanger.c:43
void SBufToCstring(char *d, const SBuf &s)
Definition: SBuf.h:741
static const CharacterSet ALPHA
Definition: CharacterSet.h:73
static const SBuf & Asterisk()
the static &#39;*&#39; pseudo-URI
Definition: Uri.cc:34
#define w_space
char * urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
Definition: Uri.cc:579
static const char valid_hostname_chars[]
Definition: Uri.cc:26
SBuf & absolute() const
Definition: Uri.cc:546
const char * host(void) const
Definition: Uri.h:83
size_t appendDomainLen
Definition: SquidConfig.h:216
bool urlAppendDomain(char *host)
apply append_domain config to the given hostname
Definition: Uri.cc:174
char * rfc1738_do_escape(const char *url, int flags)
Definition: rfc1738.c:56
void urlInitialize(void)
Definition: Uri.cc:86
char const * hostStart
Definition: Uri.cc:914
int urlCheckRequest(const HttpRequest *r)
Definition: Uri.cc:818
void port(unsigned short p)
Definition: Uri.h:92
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Debug.h:128
#define DBG_IMPORTANT
Definition: Debug.h:46
#define URI_WHITESPACE_ALLOW
Definition: defines.h:195
Uri()
Definition: Uri.h:35
static const CharacterSet DIGIT
Definition: CharacterSet.h:81
void setScheme(const AnyP::ProtocolType &p, const char *str)
convert the URL scheme to that given
Definition: Uri.h:70
void touch()
clear the cached URI display forms
Definition: Uri.cc:520
void trimTrailingChars()
Definition: Uri.cc:949
char at(size_type pos) const
Definition: SBuf.h:238
int uri_whitespace
Definition: SquidConfig.h:456
int hostIsNumeric(void) const
Definition: Uri.h:84
AnyP::Uri url
the request URI
Definition: HttpRequest.h:115
#define RFC1738_ESCAPE_NOSPACE
Definition: rfc1738.h:22
char * xstrncpy(char *dst, const char *src, size_t n)
Definition: xstring.cc:37
SBuf buf() const
yet unparsed data
Definition: Tokenizer.h:35
void userInfo(const SBuf &s)
Definition: Uri.h:79
const SBuf & path() const
Definition: Uri.cc:74
std::ostream & CurrentException(std::ostream &os)
prints active (i.e., thrown but not yet handled) exception
#define URI_WHITESPACE_STRIP
Definition: defines.h:194
bool parse(const HttpRequestMethod &, const SBuf &url)
Definition: Uri.cc:203
char * canonicalCleanUrl() const
Definition: HttpRequest.cc:775
static const AnyP::ProtocolType FindProtocolType(const SBuf &)
Definition: UriScheme.cc:52
const char * c_str()
Definition: SBuf.cc:526
const SBuf & remaining() const
the remaining unprocessed section of buffer
Definition: Tokenizer.h:44
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:17
unsigned short port_
URL port.
Definition: Uri.h:161
Ip::Address hostAddr_
binary representation of the URI authority if it is a raw-IP
Definition: Uri.h:159
void init(char const *)
Definition: Uri.cc:927
int check_hostnames
Definition: SquidConfig.h:319
void setEmpty()
Fast reset of the stored content to what would be after default constructor.
Definition: Address.cc:184
Definition: parse.c:160
void host(const char *src)
Definition: Uri.cc:48
#define URI_WHITESPACE_ENCODE
Definition: defines.h:196
Http::MethodType id() const
Definition: RequestMethod.h:76
#define LOCAL_ARRAY(type, name, size)
Definition: leakcheck.h:18
bool isAnyAddr() const
Definition: Address.cc:170
char const * url
Definition: Uri.cc:915
void const char * buf
Definition: stub_helper.cc:16
struct SquidConfig::@112 onoff
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:179
an std::runtime_error with thrower location info
Definition: TextException.h:19
static char * cleanup(const char *uri)
Definition: Uri.cc:1002
HttpHeader header
Definition: Message.h:75
#define URI_WHITESPACE_CHOP
Definition: defines.h:197
void findHostStart()
Definition: Uri.cc:934
bool hostIsNumeric_
whether the authority &#39;host&#39; is a raw-IP
Definition: Uri.h:158
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:81
char * extract(char const *url)
Definition: Uri.cc:975
#define MYNAME
Definition: Debug.h:170
SBuf authorityWithPort_
RFC 7230 section 5.3.3 authority with explicit port.
Definition: Uri.h:168
#define xmalloc
SBuf substr(size_type pos, size_type n=npos) const
Definition: SBuf.cc:586
const_iterator begin() const
Definition: SBuf.h:572
#define SQUIDHOSTNAMELEN
Definition: rfc2181.h:30
char host_[SQUIDHOSTNAMELEN]
string representation of the URI authority name or IP
Definition: Uri.h:157
size_t HttpReply *STUB StoreEntry const KeyScope scope const HttpRequestMethod & method
Definition: stub_store.cc:108
static const size_type npos
Definition: SBuf.h:92
void rfc1738_unescape(char *url)
Definition: rfc1738.c:146
const char * ProtocolType_str[]
SBuf hostOrIp() const
Definition: Uri.cc:64
const SBuf & userInfo() const
Definition: Uri.h:80
#define MAX_IPSTRLEN
Length of buffer that needs to be allocated to old a null-terminated IP-string.
Definition: forward.h:23
Definition: Debug.h:188
#define MAX_URL
Definition: defines.h:118
AnyP::UriScheme scheme_
Definition: Uri.h:151
void defaultPort()
reset the port to the default port number for the current scheme
Definition: Uri.h:95
#define URI_WHITESPACE_DENY
Definition: defines.h:198
Ip::Address const & hostIP(void) const
Definition: Uri.h:85
void reserveCapacity(size_type minCapacity)
Definition: SBuf.cc:105
AnyP::UriScheme const & getScheme() const
Definition: Uri.h:67
#define SQUIDSBUFPH
Definition: SBuf.h:31
unsigned int toHostStr(char *buf, const unsigned int len) const
Definition: Address.cc:852
int matchDomainName(const char *h, const char *d, uint8_t flags)
Definition: Uri.cc:721
#define xfree
MemBlob::size_type size_type
Definition: SBuf.h:89
#define SQUIDSBUFPRINT(s)
Definition: SBuf.h:32
#define xtolower(x)
Definition: xis.h:19
void parseUrn(Parser::Tokenizer &)
Definition: Uri.cc:492
#define Here()
source code location of the caller
Definition: Here.h:15
SBuf absolute_
RFC 7230 section 5.3.2 absolute-URI.
Definition: Uri.h:169
class SquidConfig Config
Definition: SquidConfig.cc:12
static char Host[SQUIDHOSTNAMELEN]
Definition: Uri.cc:909
#define NULL
Definition: types.h:166
int strip_query_terms
Definition: SquidConfig.h:302
int stringHasCntl(const char *)
Definition: String.cc:387
const char * rawContent() const
Definition: SBuf.cc:519
#define false
Definition: GnuRegex.c:233
int64_t getInt64(Http::HdrType id) const
Definition: HttpHeader.cc:1151
size_type rfind(char c, size_type endPos=npos) const
Definition: SBuf.cc:702

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors