Uri.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 /* DEBUG: section 23 URL Parsing */
10 
11 #include "squid.h"
12 #include "anyp/Uri.h"
13 #include "globals.h"
14 #include "HttpRequest.h"
15 #include "rfc1738.h"
16 #include "SquidConfig.h"
17 #include "SquidString.h"
18 
// Characters accepted in a host name when underscores are allowed: ASCII
// letters, digits, '-', '.', '_' and the "[:]" characters.
// AnyP::Uri::parse() selects this set when Config.onoff.allow_underscore is set.
// NOTE(review): "[:]" presumably admits bracketed IPv6 literals -- confirm.
19 static const char valid_hostname_chars_u[] =
20  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
21  "abcdefghijklmnopqrstuvwxyz"
22  "0123456789-._"
23  "[:]"
24  ;
// Same set as valid_hostname_chars_u but without '_'.
// AnyP::Uri::parse() selects this set when Config.onoff.allow_underscore is off.
25 static const char valid_hostname_chars[] =
26  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
27  "abcdefghijklmnopqrstuvwxyz"
28  "0123456789-."
29  "[:]"
30  ;
31 
32 const SBuf &
34 {
35  static SBuf star("*");
36  return star;
37 }
38 
39 const SBuf &
41 {
42  static SBuf slash("/");
43  return slash;
44 }
45 
46 void
47 AnyP::Uri::host(const char *src)
48 {
50  hostAddr_ = src;
51  if (hostAddr_.isAnyAddr()) {
52  xstrncpy(host_, src, sizeof(host_));
53  hostIsNumeric_ = false;
54  } else {
55  hostAddr_.toHostStr(host_, sizeof(host_));
56  debugs(23, 3, "given IP: " << hostAddr_);
57  hostIsNumeric_ = 1;
58  }
59  touch();
60 }
61 
62 const SBuf &
64 {
65  // RFC 3986 section 3.3 says path can be empty (path-abempty).
66  // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
67  // at least when sending and using. We must still accept path-abempty as input.
69  return SlashPath();
70 
71  return path_;
72 }
73 
74 void
76 {
77  debugs(23, 5, "urlInitialize: Initializing...");
78  /* this ensures that the number of protocol strings is the same as
79  * the enum slots allocated because the last enum is always 'MAX'.
80  */
81  assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
82  /*
83  * These test that our matchDomainName() function works the
84  * way we expect it to.
85  */
86  assert(0 == matchDomainName("foo.com", "foo.com"));
87  assert(0 == matchDomainName(".foo.com", "foo.com"));
88  assert(0 == matchDomainName("foo.com", ".foo.com"));
89  assert(0 == matchDomainName(".foo.com", ".foo.com"));
90  assert(0 == matchDomainName("x.foo.com", ".foo.com"));
91  assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
92  assert(0 != matchDomainName("x.foo.com", "foo.com"));
93  assert(0 != matchDomainName("foo.com", "x.foo.com"));
94  assert(0 != matchDomainName("bar.com", "foo.com"));
95  assert(0 != matchDomainName(".bar.com", "foo.com"));
96  assert(0 != matchDomainName(".bar.com", ".foo.com"));
97  assert(0 != matchDomainName("bar.com", ".foo.com"));
98  assert(0 < matchDomainName("zzz.com", "foo.com"));
99  assert(0 > matchDomainName("aaa.com", "foo.com"));
100  assert(0 == matchDomainName("FOO.com", "foo.COM"));
101  assert(0 < matchDomainName("bfoo.com", "afoo.com"));
102  assert(0 > matchDomainName("afoo.com", "bfoo.com"));
103  assert(0 < matchDomainName("x-foo.com", ".foo.com"));
104 
105  assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
106  assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
107  assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
108  assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
109 
110  assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
111  assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
112  assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
113  assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
114 
115  /* more cases? */
116 }
117 
123 urlParseProtocol(const char *b)
124 {
125  // make e point to the ':' character
126  const char *e = b + strcspn(b, ":");
127  int len = e - b;
128 
129  /* test common stuff first */
130 
131  if (strncasecmp(b, "http", len) == 0)
132  return AnyP::PROTO_HTTP;
133 
134  if (strncasecmp(b, "ftp", len) == 0)
135  return AnyP::PROTO_FTP;
136 
137  if (strncasecmp(b, "https", len) == 0)
138  return AnyP::PROTO_HTTPS;
139 
140  if (strncasecmp(b, "file", len) == 0)
141  return AnyP::PROTO_FTP;
142 
143  if (strncasecmp(b, "coap", len) == 0)
144  return AnyP::PROTO_COAP;
145 
146  if (strncasecmp(b, "coaps", len) == 0)
147  return AnyP::PROTO_COAPS;
148 
149  if (strncasecmp(b, "gopher", len) == 0)
150  return AnyP::PROTO_GOPHER;
151 
152  if (strncasecmp(b, "wais", len) == 0)
153  return AnyP::PROTO_WAIS;
154 
155  if (strncasecmp(b, "cache_object", len) == 0)
157 
158  if (strncasecmp(b, "urn", len) == 0)
159  return AnyP::PROTO_URN;
160 
161  if (strncasecmp(b, "whois", len) == 0)
162  return AnyP::PROTO_WHOIS;
163 
164  if (len > 0)
165  return AnyP::PROTO_UNKNOWN;
166 
167  return AnyP::PROTO_NONE;
168 }
169 
170 /*
171  * Parse a URI/URL.
172  *
173  * Stores parsed values in the `request` argument.
174  *
175  * This abuses HttpRequest as a way of representing the parsed url
176  * and its components.
177  * method is used to switch parsers and to init the HttpRequest.
178  * If method is Http::METHOD_CONNECT, then rather than a URL a hostname:port is
179  * looked for.
180  * The url is non const so that if its too long we can NULL-terminate it in place.
181  */
182 
183 /*
184  * This routine parses a URL. It is assumed that the URL is complete -
185  * i.e., the end of the string is the end of the URL. Don't pass a partial
186  * URL here as this routine doesn't have any way of knowing whether
187  * it is partial or not (i.e., it handles the case of no trailing slash as
188  * being "end of host with implied path of /").
189  */
190 bool
191 AnyP::Uri::parse(const HttpRequestMethod& method, const char *url)
192 {
193  LOCAL_ARRAY(char, proto, MAX_URL);
194  LOCAL_ARRAY(char, login, MAX_URL);
195  LOCAL_ARRAY(char, foundHost, MAX_URL);
196  LOCAL_ARRAY(char, urlpath, MAX_URL);
197  char *t = NULL;
198  char *q = NULL;
199  int foundPort;
201  int l;
202  int i;
203  const char *src;
204  char *dst;
205  proto[0] = foundHost[0] = urlpath[0] = login[0] = '\0';
206 
207  if ((l = strlen(url)) + Config.appendDomainLen > (MAX_URL - 1)) {
208  debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
209  return false;
210  }
211  if (method == Http::METHOD_CONNECT) {
212  /*
213  * RFC 7230 section 5.3.3: authority-form = authority
214  * "excluding any userinfo and its "@" delimiter"
215  *
216  * RFC 3986 section 3.2: authority = [ userinfo "@" ] host [ ":" port ]
217  *
218  * As an HTTP(S) proxy we assume HTTPS (443) if no port provided.
219  */
220  foundPort = 443;
221 
222  if (sscanf(url, "[%[^]]]:%d", foundHost, &foundPort) < 1)
223  if (sscanf(url, "%[^:]:%d", foundHost, &foundPort) < 1)
224  return false;
225 
226  } else if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
227  AnyP::Uri::Asterisk().cmp(url) == 0) {
228  parseFinish(AnyP::PROTO_HTTP, nullptr, url, foundHost, SBuf(), 80 /* HTTP default port */);
229  return true;
230  } else if (strncmp(url, "urn:", 4) == 0) {
231  debugs(23, 3, "Split URI '" << url << "' into proto='urn', path='" << (url+4) << "'");
232  debugs(50, 5, "urn=" << (url+4));
233  setScheme(AnyP::PROTO_URN, nullptr);
234  path(url + 4);
235  return true;
236  } else {
237  /* Parse the URL: */
238  src = url;
239  i = 0;
240  /* Find first : - everything before is protocol */
241  for (i = 0, dst = proto; i < l && *src != ':'; ++i, ++src, ++dst) {
242  *dst = *src;
243  }
244  if (i >= l)
245  return false;
246  *dst = '\0';
247 
248  /* Then its :// */
249  if ((i+3) > l || *src != ':' || *(src + 1) != '/' || *(src + 2) != '/')
250  return false;
251  i += 3;
252  src += 3;
253 
254  /* Then everything until first /; thats host (and port; which we'll look for here later) */
255  // bug 1881: If we don't get a "/" then we imply it was there
256  // bug 3074: We could just be given a "?" or "#". These also imply "/"
257  // bug 3233: whitespace is also a hostname delimiter.
258  for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
259  *dst = *src;
260  }
261 
262  /*
263  * We can't check for "i >= l" here because we could be at the end of the line
264  * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
265  * been -given- a valid URL and the path is just '/'.
266  */
267  if (i > l)
268  return false;
269  *dst = '\0';
270 
271  // bug 3074: received 'path' starting with '?', '#', or '\0' implies '/'
272  if (*src == '?' || *src == '#' || *src == '\0') {
273  urlpath[0] = '/';
274  dst = &urlpath[1];
275  } else {
276  dst = urlpath;
277  }
278  /* Then everything from / (inclusive) until \r\n or \0 - thats urlpath */
279  for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
280  *dst = *src;
281  }
282 
283  /* We -could- be at the end of the buffer here */
284  if (i > l)
285  return false;
286  /* If the URL path is empty we set it to be "/" */
287  if (dst == urlpath) {
288  *dst = '/';
289  ++dst;
290  }
291  *dst = '\0';
292 
293  protocol = urlParseProtocol(proto);
294  foundPort = AnyP::UriScheme(protocol).defaultPort();
295 
296  /* Is there any login information? (we should eventually parse it above) */
297  t = strrchr(foundHost, '@');
298  if (t != NULL) {
299  strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
300  login[sizeof(login)-1] = '\0';
301  t = strrchr(login, '@');
302  *t = 0;
303  strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
304  foundHost[sizeof(foundHost)-1] = '\0';
305  // Bug 4498: URL-unescape the login info after extraction
306  rfc1738_unescape(login);
307  }
308 
309  /* Is there any host information? (we should eventually parse it above) */
310  if (*foundHost == '[') {
311  /* strip any IPA brackets. valid under IPv6. */
312  dst = foundHost;
313  /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
314  src = foundHost;
315  ++src;
316  l = strlen(foundHost);
317  i = 1;
318  for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
319  *dst = *src;
320  }
321 
322  /* we moved in-place, so truncate the actual hostname found */
323  *dst = '\0';
324  ++dst;
325 
326  /* skip ahead to either start of port, or original EOS */
327  while (*dst != '\0' && *dst != ':')
328  ++dst;
329  t = dst;
330  } else {
331  t = strrchr(foundHost, ':');
332 
333  if (t != strchr(foundHost,':') ) {
334  /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
335  /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
336  /* therefore we MUST accept the case where they are not bracketed at all. */
337  t = NULL;
338  }
339  }
340 
341  // Bug 3183 sanity check: If scheme is present, host must be too.
342  if (protocol != AnyP::PROTO_NONE && foundHost[0] == '\0') {
343  debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
344  return false;
345  }
346 
347  if (t && *t == ':') {
348  *t = '\0';
349  ++t;
350  foundPort = atoi(t);
351  }
352  }
353 
354  for (t = foundHost; *t; ++t)
355  *t = xtolower(*t);
356 
357  if (stringHasWhitespace(foundHost)) {
359  t = q = foundHost;
360  while (*t) {
361  if (!xisspace(*t)) {
362  *q = *t;
363  ++q;
364  }
365  ++t;
366  }
367  *q = '\0';
368  }
369  }
370 
371  debugs(23, 3, "Split URL '" << url << "' into proto='" << proto << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
372 
374  strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
375  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
376  return false;
377  }
378 
379  /* For IPV6 addresses also check for a colon */
380  if (Config.appendDomain && !strchr(foundHost, '.') && !strchr(foundHost, ':'))
381  strncat(foundHost, Config.appendDomain, SQUIDHOSTNAMELEN - strlen(foundHost) - 1);
382 
383  /* remove trailing dots from hostnames */
384  while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
385  foundHost[l] = '\0';
386 
387  /* reject duplicate or leading dots */
388  if (strstr(foundHost, "..") || *foundHost == '.') {
389  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
390  return false;
391  }
392 
393  if (foundPort < 1 || foundPort > 65535) {
394  debugs(23, 3, "Invalid port '" << foundPort << "'");
395  return false;
396  }
397 
398 #if HARDCODE_DENY_PORTS
399  /* These ports are filtered in the default squid.conf, but
400  * maybe someone wants them hardcoded... */
401  if (foundPort == 7 || foundPort == 9 || foundPort == 19) {
402  debugs(23, DBG_CRITICAL, MYNAME << "Deny access to port " << foundPort);
403  return false;
404  }
405 #endif
406 
407  if (stringHasWhitespace(urlpath)) {
408  debugs(23, 2, "URI has whitespace: {" << url << "}");
409 
410  switch (Config.uri_whitespace) {
411 
412  case URI_WHITESPACE_DENY:
413  return false;
414 
416  break;
417 
419  t = rfc1738_escape_unescaped(urlpath);
420  xstrncpy(urlpath, t, MAX_URL);
421  break;
422 
423  case URI_WHITESPACE_CHOP:
424  *(urlpath + strcspn(urlpath, w_space)) = '\0';
425  break;
426 
428  default:
429  t = q = urlpath;
430  while (*t) {
431  if (!xisspace(*t)) {
432  *q = *t;
433  ++q;
434  }
435  ++t;
436  }
437  *q = '\0';
438  }
439  }
440 
441  parseFinish(protocol, proto, urlpath, foundHost, SBuf(login), foundPort);
442  return true;
443 }
444 
446 void
448  const char *const protoStr, // for unknown protocols
449  const char *const aUrlPath,
450  const char *const aHost,
451  const SBuf &aLogin,
452  const int aPort)
453 {
454  setScheme(protocol, protoStr);
455  path(aUrlPath);
456  host(aHost);
457  userInfo(aLogin);
458  port(aPort);
459 }
460 
461 void
463 {
464  absolute_.clear();
467 }
468 
469 SBuf &
470 AnyP::Uri::authority(bool requirePort) const
471 {
472  if (authorityHttp_.isEmpty()) {
473 
474  // both formats contain Host/IP
477 
478  // authorityForm_ only has :port if it is non-default
480  if (port() != getScheme().defaultPort())
482  }
483 
484  return requirePort ? authorityWithPort_ : authorityHttp_;
485 }
486 
487 SBuf &
489 {
490  if (absolute_.isEmpty()) {
491  // TODO: most URL will be much shorter, avoid allocating this much
493 
494  absolute_.append(getScheme().image());
495  absolute_.append(":",1);
496  if (getScheme() != AnyP::PROTO_URN) {
497  absolute_.append("//", 2);
498  const bool omitUserInfo = getScheme() == AnyP::PROTO_HTTP ||
500  userInfo().isEmpty();
501  if (!omitUserInfo) {
503  absolute_.append("@", 1);
504  }
506  }
507  absolute_.append(path());
508  }
509 
510  return absolute_;
511 }
512 
517 char *
519 {
520  LOCAL_ARRAY(char, buf, MAX_URL);
521 
522  snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
523  buf[sizeof(buf)-1] = '\0';
524 
525  // URN, CONNECT method, and non-stripped URIs can go straight out
526  if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
527  // strip anything AFTER a question-mark
528  // leaving the '?' in place
529  if (auto t = strchr(buf, '?')) {
530  *(++t) = '\0';
531  }
532  }
533 
534  if (stringHasCntl(buf))
536 
537  return buf;
538 }
539 
546 const char *
548 {
549  LOCAL_ARRAY(char, buf, MAX_URL);
550 
551  // method CONNECT and port HTTPS
552  if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
553  snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
554  return buf;
555  }
556 
557  // else do the normal complete canonical thing.
558  return request->canonicalCleanUrl();
559 }
560 
/*
 * Test if a URL is relative.
 *
 * RFC 3986 section 4.2: the first path segment of a relative reference
 * cannot contain a ':' (it would be mistaken for a scheme name). So the
 * string is absolute only when a ':' appears before the first '/', '?'
 * or '#'. A ':' inside the query or fragment does not make it absolute.
 *
 * Returns false for a NULL pointer and for an empty string.
 */
bool
urlIsRelative(const char *url)
{
    if (url == nullptr || *url == '\0')
        return false;

    // Scan the first segment only: it ends at '/', '?', '#' or end of string.
    for (const char *p = url; *p != '\0' && *p != '/' && *p != '?' && *p != '#'; ++p) {
        if (*p == ':')
            return false; // "scheme:" prefix found
    }

    return true;
}
586 
587 /*
588  * Convert a relative URL to an absolute URL using the context of a given
589  * request.
590  *
591  * It is assumed that you have already ensured that the URL is relative.
592  *
593  * If NULL is returned it is an indication that the method in use in the
594  * request does not distinguish between relative and absolute and you should
595  * use the url unchanged.
596  *
597  * If non-NULL is returned, it is up to the caller to free the resulting
598  * memory using safe_free().
599  */
600 char *
601 urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
602 {
603 
604  if (req->method.id() == Http::METHOD_CONNECT) {
605  return (NULL);
606  }
607 
608  char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
609 
610  if (req->url.getScheme() == AnyP::PROTO_URN) {
611  // XXX: this is what the original code did, but it seems to break the
612  // intended behaviour of this function. It returns the stored URN path,
613  // not converting the given one into a URN...
614  snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
615  return (urlbuf);
616  }
617 
618  SBuf authorityForm = req->url.authority(); // host[:port]
619  const SBuf &scheme = req->url.getScheme().image();
620  size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
621  SQUIDSBUFPRINT(scheme),
622  SQUIDSBUFPRINT(req->url.userInfo()),
623  !req->url.userInfo().isEmpty() ? "@" : "",
624  SQUIDSBUFPRINT(authorityForm));
625 
626  // if the first char is '/' assume its a relative path
627  // XXX: this breaks on scheme-relative URLs,
628  // but we should not see those outside ESI, and rarely there.
629  // XXX: also breaks on any URL containing a '/' in the query-string portion
630  if (relUrl[0] == '/') {
631  xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
632  } else {
633  SBuf path = req->url.path();
634  SBuf::size_type lastSlashPos = path.rfind('/');
635 
636  if (lastSlashPos == SBuf::npos) {
637  // replace the whole path with the given bit(s)
638  urlbuf[urllen] = '/';
639  ++urllen;
640  xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
641  } else {
642  // replace only the last (file?) segment with the given bit(s)
643  ++lastSlashPos;
644  if (lastSlashPos > MAX_URL - urllen - 1) {
645  // XXX: crops bits in the middle of the combined URL.
646  lastSlashPos = MAX_URL - urllen - 1;
647  }
648  SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
649  urllen += lastSlashPos;
650  if (urllen + 1 < MAX_URL) {
651  xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
652  }
653  }
654  }
655 
656  return (urlbuf);
657 }
658 
659 int
660 matchDomainName(const char *h, const char *d, uint8_t flags)
661 {
662  int dl;
663  int hl;
664 
665  const bool hostIncludesSubdomains = (*h == '.');
666  while ('.' == *h)
667  ++h;
668 
669  hl = strlen(h);
670 
671  if (hl == 0)
672  return -1;
673 
674  dl = strlen(d);
675 
676  /*
677  * Start at the ends of the two strings and work towards the
678  * beginning.
679  */
680  while (xtolower(h[--hl]) == xtolower(d[--dl])) {
681  if (hl == 0 && dl == 0) {
682  /*
683  * We made it all the way to the beginning of both
684  * strings without finding any difference.
685  */
686  return 0;
687  }
688 
689  if (0 == hl) {
690  /*
691  * The host string is shorter than the domain string.
692  * There is only one case when this can be a match.
693  * If the domain is just one character longer, and if
694  * that character is a leading '.' then we call it a
695  * match.
696  */
697 
698  if (1 == dl && '.' == d[0])
699  return 0;
700  else
701  return -1;
702  }
703 
704  if (0 == dl) {
705  /*
706  * The domain string is shorter than the host string.
707  * This is a match only if the first domain character
708  * is a leading '.'.
709  */
710 
711  if ('.' == d[0]) {
712  if (flags & mdnRejectSubsubDomains) {
713  // Check for sub-sub domain and reject
714  while(--hl >= 0 && h[hl] != '.');
715  if (hl < 0) {
716  // No sub-sub domain found, but reject if there is a
717  // leading dot in given host string (which is removed
718  // before the check is started).
719  return hostIncludesSubdomains ? 1 : 0;
720  } else
721  return 1; // sub-sub domain, reject
722  } else
723  return 0;
724  } else
725  return 1;
726  }
727  }
728 
729  /*
730  * We found different characters in the same position (from the end).
731  */
732 
733  // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
734  // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
735  // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
736  if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
737  return 0;
738 
739  /*
740  * If one of those character is '.' then its special. In order
741  * for splay tree sorting to work properly, "x-foo.com" must
742  * be greater than ".foo.com" even though '-' is less than '.'.
743  */
744  if ('.' == d[dl])
745  return 1;
746 
747  if ('.' == h[hl])
748  return -1;
749 
750  return (xtolower(h[hl]) - xtolower(d[dl]));
751 }
752 
753 /*
754  * return true if we can serve requests for this method.
755  */
756 int
758 {
759  int rc = 0;
760  /* protocol "independent" methods
761  *
762  * actually these methods are specific to HTTP:
763  * they are methods we recieve on our HTTP port,
764  * and if we had a FTP listener would not be relevant
765  * there.
766  *
767  * So, we should delegate them to HTTP. The problem is that we
768  * do not have a default protocol from the client side of HTTP.
769  */
770 
771  if (r->method == Http::METHOD_CONNECT)
772  return 1;
773 
774  // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
775  // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
778 
779  if (r->method == Http::METHOD_PURGE)
780  return 1;
781 
782  /* does method match the protocol? */
783  switch (r->url.getScheme()) {
784 
785  case AnyP::PROTO_URN:
786 
787  case AnyP::PROTO_HTTP:
788 
790  rc = 1;
791  break;
792 
793  case AnyP::PROTO_FTP:
794 
795  if (r->method == Http::METHOD_PUT)
796  rc = 1;
797 
798  case AnyP::PROTO_GOPHER:
799 
800  case AnyP::PROTO_WAIS:
801 
802  case AnyP::PROTO_WHOIS:
803  if (r->method == Http::METHOD_GET)
804  rc = 1;
805  else if (r->method == Http::METHOD_HEAD)
806  rc = 1;
807 
808  break;
809 
810  case AnyP::PROTO_HTTPS:
811 #if USE_OPENSSL
812  rc = 1;
813 #elif USE_GNUTLS
814  rc = 1;
815 #else
816  /*
817  * Squid can't originate an SSL connection, so it should
818  * never receive an "https:" URL. It should always be
819  * CONNECT instead.
820  */
821  rc = 0;
822 #endif
823  break;
824 
825  default:
826  break;
827  }
828 
829  return rc;
830 }
831 
832 /*
833  * Quick-n-dirty host extraction from a URL. Steps:
834  * Look for a colon
835  * Skip any '/' after the colon
836  * Copy the next SQUID_MAXHOSTNAMELEN bytes to host[]
837  * Look for an ending '/' or ':' and terminate
838  * Look for login info preceded by '@'
839  */
840 
842 {
843 
844 public:
845  char * extract(char const *url);
846 
847 private:
848  static char Host [SQUIDHOSTNAMELEN];
849  void init(char const *);
850  void findHostStart();
851  void trimTrailingChars();
852  void trimAuth();
853  char const *hostStart;
854  char const *url;
855 };
856 
857 char *
858 urlHostname(const char *url)
859 {
860  return URLHostName().extract(url);
861 }
862 
864 
865 void
866 URLHostName::init(char const *aUrl)
867 {
868  Host[0] = '\0';
869  url = aUrl;
870 }
871 
872 void
874 {
875  if (NULL == (hostStart = strchr(url, ':')))
876  return;
877 
878  ++hostStart;
879 
880  while (*hostStart != '\0' && *hostStart == '/')
881  ++hostStart;
882 
883  if (*hostStart == ']')
884  ++hostStart;
885 }
886 
887 void
889 {
890  char *t;
891 
892  if ((t = strchr(Host, '/')))
893  *t = '\0';
894 
895  if ((t = strrchr(Host, ':')))
896  *t = '\0';
897 
898  if ((t = strchr(Host, ']')))
899  *t = '\0';
900 }
901 
902 void
904 {
905  char *t;
906 
907  if ((t = strrchr(Host, '@'))) {
908  ++t;
909  memmove(Host, t, strlen(t) + 1);
910  }
911 }
912 
913 char *
914 URLHostName::extract(char const *aUrl)
915 {
916  init(aUrl);
917  findHostStart();
918 
919  if (hostStart == NULL)
920  return NULL;
921 
922  xstrncpy(Host, hostStart, SQUIDHOSTNAMELEN);
923 
924  trimTrailingChars();
925 
926  trimAuth();
927 
928  return Host;
929 }
930 
932  scheme_(aScheme),
934  port_(0)
935 {
936  *host_=0;
937 }
938 
939 // TODO: fix code duplication with AnyP::Uri::parse()
940 char *
941 AnyP::Uri::cleanup(const char *uri)
942 {
943  int flags = 0;
944  char *cleanedUri = nullptr;
945  switch (Config.uri_whitespace) {
947  flags |= RFC1738_ESCAPE_NOSPACE;
948  // fall through to next case
950  flags |= RFC1738_ESCAPE_UNESCAPED;
951  cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
952  break;
953 
954  case URI_WHITESPACE_CHOP: {
955  flags |= RFC1738_ESCAPE_UNESCAPED;
956  const auto pos = strcspn(uri, w_space);
957  char *choppedUri = nullptr;
958  if (pos < strlen(uri))
959  choppedUri = xstrndup(uri, pos + 1);
960  cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri, flags), MAX_URL);
961  cleanedUri[pos] = '\0';
962  xfree(choppedUri);
963  }
964  break;
965 
966  case URI_WHITESPACE_DENY:
968  default: {
969  // TODO: avoid duplication with urlParse()
970  const char *t;
971  char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
972  char *q = tmp_uri;
973  t = uri;
974  while (*t) {
975  if (!xisspace(*t)) {
976  *q = *t;
977  ++q;
978  }
979  ++t;
980  }
981  *q = '\0';
982  cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
983  xfree(tmp_uri);
984  }
985  break;
986  }
987 
988  assert(cleanedUri);
989  return cleanedUri;
990 }
991 
char * urlHostname(const char *url)
Definition: Uri.cc:858
#define rfc1738_escape_unescaped(x)
Definition: rfc1738.h:59
#define assert(EX)
Definition: assert.h:17
SBuf & authority(bool requirePort=false) const
Definition: Uri.cc:470
SBuf & appendf(const char *fmt,...)
Definition: SBuf.cc:239
void trimAuth()
Definition: Uri.cc:903
char * urlMakeAbsolute(const HttpRequest *req, const char *relUrl)
Definition: Uri.cc:601
SBuf image() const
Definition: UriScheme.h:50
bool urlIsRelative(const char *url)
Definition: Uri.cc:568
void path(const char *p)
Definition: Uri.h:86
Definition: SBuf.h:86
#define RFC1738_ESCAPE_UNESCAPED
Definition: rfc1738.h:25
const char * urlCanonicalFakeHttps(const HttpRequest *request)
Definition: Uri.cc:547
unsigned short port() const
Definition: Uri.h:84
HttpRequestMethod method
Definition: HttpRequest.h:114
int stringHasWhitespace(const char *)
Definition: String.cc:380
struct _request * request(char *urlin)
Definition: tcp-banger2.c:291
int i
Definition: membanger.c:49
SBuf & append(const SBuf &S)
Definition: SBuf.cc:195
static const char valid_hostname_chars_u[]
Definition: Uri.cc:19
char * xstrndup(const char *s, size_t n)
Definition: xstring.cc:56
unsigned short defaultPort() const
Definition: UriScheme.cc:52
int allow_underscore
Definition: SquidConfig.h:320
SBuf path_
URI path segment.
Definition: Uri.h:153
static const SBuf & SlashPath()
the static '/' default URL-path
Definition: Uri.cc:40
SBuf authorityHttp_
RFC 7230 section 5.3.3 authority, maybe without default-port.
Definition: Uri.h:156
void clear()
Definition: SBuf.cc:178
bool isEmpty() const
Definition: SBuf.h:420
char * appendDomain
Definition: SquidConfig.h:215
#define xisspace(x)
Definition: xis.h:17
#define DBG_CRITICAL
Definition: Debug.h:45
char * p
Definition: membanger.c:43
void SBufToCstring(char *d, const SBuf &s)
Definition: SBuf.h:741
static const SBuf & Asterisk()
the static '*' pseudo-URI
Definition: Uri.cc:33
#define w_space
struct SquidConfig::@111 onoff
char * urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
Definition: Uri.cc:518
static const char valid_hostname_chars[]
Definition: Uri.cc:25
SBuf & absolute() const
Definition: Uri.cc:488
const char * host(void) const
Definition: Uri.h:79
size_t appendDomainLen
Definition: SquidConfig.h:216
char * rfc1738_do_escape(const char *url, int flags)
Definition: rfc1738.c:56
void urlInitialize(void)
Definition: Uri.cc:75
char const * hostStart
Definition: Uri.cc:853
int urlCheckRequest(const HttpRequest *r)
Definition: Uri.cc:757
void port(unsigned short p)
Definition: Uri.h:83
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Debug.h:124
#define DBG_IMPORTANT
Definition: Debug.h:46
#define URI_WHITESPACE_ALLOW
Definition: defines.h:195
Uri()
Definition: Uri.h:35
bool parse(const HttpRequestMethod &, const char *url)
Definition: Uri.cc:191
void setScheme(const AnyP::ProtocolType &p, const char *str)
convert the URL scheme to that given
Definition: Uri.h:70
void touch()
clear the cached URI display forms
Definition: Uri.cc:462
void trimTrailingChars()
Definition: Uri.cc:888
int uri_whitespace
Definition: SquidConfig.h:456
AnyP::Uri url
the request URI
Definition: HttpRequest.h:115
#define RFC1738_ESCAPE_NOSPACE
Definition: rfc1738.h:22
char * xstrncpy(char *dst, const char *src, size_t n)
Definition: xstring.cc:37
void userInfo(const SBuf &s)
Definition: Uri.h:75
const SBuf & path() const
Definition: Uri.cc:63
#define URI_WHITESPACE_STRIP
Definition: defines.h:194
char * canonicalCleanUrl() const
Definition: HttpRequest.cc:777
unsigned short port_
URL port.
Definition: Uri.h:150
Ip::Address hostAddr_
binary representation of the URI authority if it is a raw-IP
Definition: Uri.h:148
void init(char const *)
Definition: Uri.cc:866
int check_hostnames
Definition: SquidConfig.h:319
void setEmpty()
Fast reset of the stored content to what would be after default constructor.
Definition: Address.cc:184
void host(const char *src)
Definition: Uri.cc:47
#define URI_WHITESPACE_ENCODE
Definition: defines.h:196
Http::MethodType id() const
Definition: RequestMethod.h:73
#define LOCAL_ARRAY(type, name, size)
Definition: leakcheck.h:18
bool isAnyAddr() const
Definition: Address.cc:170
int unsigned int const char *desc STUB void int len
Definition: stub_fd.cc:20
void parseFinish(const AnyP::ProtocolType, const char *const, const char *const, const char *const, const SBuf &, const int)
Update the URL object with parsed URI data.
Definition: Uri.cc:447
char const * url
Definition: Uri.cc:854
void const char * buf
Definition: stub_helper.cc:16
static char * cleanup(const char *uri)
Definition: Uri.cc:941
HttpHeader header
Definition: Message.h:75
#define URI_WHITESPACE_CHOP
Definition: defines.h:197
void findHostStart()
Definition: Uri.cc:873
bool hostIsNumeric_
whether the authority 'host' is a raw-IP
Definition: Uri.h:147
char * extract(char const *url)
Definition: Uri.cc:914
#define MYNAME
Definition: Debug.h:166
SBuf authorityWithPort_
RFC 7230 section 5.3.3 authority with explicit port.
Definition: Uri.h:157
#define xmalloc
AnyP::ProtocolType urlParseProtocol(const char *b)
Definition: Uri.cc:123
SBuf substr(size_type pos, size_type n=npos) const
Definition: SBuf.cc:586
#define SQUIDHOSTNAMELEN
Definition: rfc2181.h:30
char host_[SQUIDHOSTNAMELEN]
string representation of the URI authority name or IP
Definition: Uri.h:146
size_t HttpReply *STUB StoreEntry const KeyScope scope const HttpRequestMethod & method
Definition: stub_store.cc:112
static const size_type npos
Definition: SBuf.h:92
ProtocolType
Definition: ProtocolType.h:22
void rfc1738_unescape(char *url)
Definition: rfc1738.c:146
const char * ProtocolType_str[]
const SBuf & userInfo() const
Definition: Uri.h:76
#define MAX_URL
Definition: defines.h:118
AnyP::UriScheme scheme_
Definition: Uri.h:140
#define URI_WHITESPACE_DENY
Definition: defines.h:198
void reserveCapacity(size_type minCapacity)
Definition: SBuf.cc:105
AnyP::UriScheme const & getScheme() const
Definition: Uri.h:67
#define SQUIDSBUFPH
Definition: SBuf.h:31
unsigned int toHostStr(char *buf, const unsigned int len) const
Definition: Address.cc:852
int matchDomainName(const char *h, const char *d, uint8_t flags)
Definition: Uri.cc:660
#define xfree
MemBlob::size_type size_type
Definition: SBuf.h:89
#define SQUIDSBUFPRINT(s)
Definition: SBuf.h:32
#define xtolower(x)
Definition: xis.h:19
SBuf absolute_
RFC 7230 section 5.3.2 absolute-URI.
Definition: Uri.h:158
class SquidConfig Config
Definition: SquidConfig.cc:12
static char Host[SQUIDHOSTNAMELEN]
Definition: Uri.cc:848
#define NULL
Definition: types.h:166
int strip_query_terms
Definition: SquidConfig.h:302
int stringHasCntl(const char *)
Definition: String.cc:387
#define false
Definition: GnuRegex.c:233
int64_t getInt64(Http::HdrType id) const
Definition: HttpHeader.cc:1148
size_type rfind(char c, size_type endPos=npos) const
Definition: SBuf.cc:702

 

Introduction

Documentation

Support

Miscellaneous

Web Site Translations

Mirrors