=== modified file 'src/HttpMsg.cc' --- src/HttpMsg.cc 2010-08-24 20:35:02 +0000 +++ src/HttpMsg.cc 2010-08-31 13:05:01 +0000 @@ -401,6 +401,10 @@ hdr->req_start = hdr->req_end = -1; hdr->hdr_start = hdr->hdr_end = -1; debugs(74, 5, "httpParseInit: Request buffer is " << buf); + hdr->m_start = hdr->m_end = -1; + hdr->u_start = hdr->u_end = -1; + hdr->v_start = hdr->v_end = -1; + hdr->v_maj = hdr->v_min = 0; } #if MSGDODEBUG @@ -447,188 +451,234 @@ /** * Attempt to parse the request line. - * - * This will set the values in hmsg that it determines. One may end up - * with a partially-parsed buffer; the return value tells you whether - * the values are valid or not. - * - * \retval 1 if parsed correctly - * \retval 0 if more is needed - * \retval -1 if error - * - * TODO: - * * have it indicate "error" and "not enough" as two separate conditions! - * * audit this code as off-by-one errors are probably everywhere! + * Governed by: + * RFC 1945 section 5.1 + * RFC 2616 section 5.1 + * + * Parsing state is stored between calls. However the current implementation + * begins parsing from scratch on every call. + * The return value tells you whether the parsing state fields are valid or not. + * + * \retval -1 an error occurred. request_parse_status indicates HTTP status result. + * \retval 1 successful parse + * \retval 0 more data is needed to complete the parse */ int HttpParserParseReqLine(HttpParser *hmsg) { - int i = 0; + int i = 0, maj, min; int retcode = 0; - unsigned int maj = 0, min = 0; - int last_whitespace = -1, line_end = -1; + int second_word = -1; + int first_whitespace = -1, last_whitespace = -1, line_end = -1; + // NP: line_end tracks the last byte BEFORE the \r\n or \n - debugs(74, 5, "httpParserParseReqLine: parsing " << hmsg->buf); + debugs(74, 5, HERE << "parsing " << hmsg->buf); PROF_start(HttpParserParseReqLine); - /* Find \r\n - end of URL+Version (and the request) */ + + // Single-pass parse: (provided we have the whole line anyways) + + hmsg->req_start = 0; +#if 0 && USE_HTTP_VIOLATION + // NOTE: original parser version ignored prefix whitespace before the method. + // for now this hack retains that behaviour. WHY do we need this !? + // + for(;hmsg->req_start < hmsg->bufsiz && xisspace(hmsg->buf[hmsg->req_start]); hmsg->req_start++); +#endif hmsg->req_end = -1; for (i = 0; i < hmsg->bufsiz; i++) { + // track first and last whitespace (SP only) + if (hmsg->buf[i] == ' ') { + last_whitespace = i; + if (first_whitespace < hmsg->req_start) + first_whitespace = i; + } + + // track next non-SP/non-HT byte after first_whitespace + if (second_word < first_whitespace && hmsg->buf[i] != ' ' && hmsg->buf[i] != '\t') { + second_word = i; + } + + // locate line terminator if (hmsg->buf[i] == '\n') { hmsg->req_end = i; + line_end = i - 1; break; } - if (i < hmsg->bufsiz - 1 && hmsg->buf[i] == '\r' && hmsg->buf[i + 1] == '\n') { - hmsg->req_end = i + 1; - break; + if (i < hmsg->bufsiz - 1 && hmsg->buf[i] == '\r') { + if (hmsg->buf[i + 1] == '\n') { + hmsg->req_end = i + 1; + line_end = i - 1; + break; + } else { + // RFC 2616 section 5.1 + // "No CR or LF is allowed except in the final CRLF sequence" + hmsg->request_parse_status = HTTP_BAD_REQUEST; + return -1; + } } } if (hmsg->req_end == -1) { - retcode = 0; - goto finish; - } - assert(hmsg->buf[hmsg->req_end] == '\n'); - /* Start at the beginning again */ - i = 0; - - /* Find first non-whitespace - beginning of method */ - for (; i < hmsg->req_end && (xisspace(hmsg->buf[i])); i++); - if (i >= hmsg->req_end) { - retcode = 0; - goto finish; - } - hmsg->m_start = i; - hmsg->req_start = i; - - /* Find first whitespace - end of method */ - for (; i < hmsg->req_end && (! xisspace(hmsg->buf[i])); i++); - if (i >= hmsg->req_end) { - retcode = 0; - goto finish; - } - hmsg->m_end = i - 1; - - /* Find first non-whitespace - beginning of URL+Version */ - for (; i < hmsg->req_end && (xisspace(hmsg->buf[i])); i++); - if (i >= hmsg->req_end) { - retcode = 0; - goto finish; - } - hmsg->u_start = i; - - /* Find \r\n or \n - thats the end of the line. Keep track of the last whitespace! */ - for (; i <= hmsg->req_end; i++) { - /* If \n - its end of line */ - if (hmsg->buf[i] == '\n') { - line_end = i; - break; - } - /* XXX could be off-by-one wrong! */ - if (hmsg->buf[i] == '\r' && (i + 1) <= hmsg->req_end && hmsg->buf[i+1] == '\n') { - line_end = i; - break; - } - /* If its a whitespace, note it as it'll delimit our version */ - if (hmsg->buf[i] == ' ' || hmsg->buf[i] == '\t') { - last_whitespace = i; - } - } - if (i > hmsg->req_end) { - retcode = 0; - goto finish; + PROF_stop(HttpParserParseReqLine); + debugs(74, 5, "Parser: retval 0: from " << hmsg->req_start << + "->" << hmsg->req_end << ": needs more data to complete first line."); + return 0; // This is the only spot in this function where (0) is valid result. + } + + // NP: we have now seen EOL, more-data (0) cannot occur. + // From here on any failure is -1, success is 1 + + + // Input Validation: + + // Process what we now know about the line structure into field offsets + // generating HTTP status for any aborts as we go. + + // First non-whitespace = beginning of method + if (hmsg->req_start > line_end) { + hmsg->request_parse_status = HTTP_BAD_REQUEST; + retcode = -1; + goto finish; + } + hmsg->m_start = hmsg->req_start; + + // First whitespace = end of method + if (first_whitespace > line_end || first_whitespace < hmsg->req_start) { + hmsg->request_parse_status = HTTP_BAD_REQUEST; // no method + retcode = -1; + goto finish; + } + hmsg->m_end = first_whitespace - 1; + if (hmsg->m_end < hmsg->m_start) { + hmsg->request_parse_status = HTTP_BAD_REQUEST; // missing URI? + retcode = -1; + goto finish; + } + + // First non-whitespace after first SP = beginning of URL+Version + if (second_word > line_end || second_word < hmsg->req_start) { + hmsg->request_parse_status = HTTP_BAD_REQUEST; // missing URI + retcode = -1; + goto finish; + } + hmsg->u_start = second_word; + + // RFC 1945: SP and version following URI are optional, marking version 0.9 + // we identify this by the last whitespace being earlier than URI start + if (last_whitespace < second_word && last_whitespace >= hmsg->req_start) { + hmsg->v_maj = 0; + hmsg->v_min = 9; + hmsg->u_end = line_end; + hmsg->request_parse_status = HTTP_OK; // HTTP/0.9 + retcode = 1; + goto finish; + } else { + // otherwise last whitespace is somewhere after end of URI. + hmsg->u_end = last_whitespace; + // crop any trailing whitespace in the area we think of as URI + for(;hmsg->u_end >= hmsg->u_start && xisspace(hmsg->buf[hmsg->u_end]); hmsg->u_end--); + } + if (hmsg->u_end < hmsg->u_start) { + hmsg->request_parse_status = HTTP_BAD_REQUEST; // missing URI + retcode = -1; + goto finish; + } + + // Last whitespace SP = before start of protocol/version + if (last_whitespace >= line_end) { + hmsg->request_parse_status = HTTP_BAD_REQUEST; // missing version + retcode = -1; + goto finish; + } + hmsg->v_start = last_whitespace + 1; + hmsg->v_end = line_end; + + // We only accept HTTP protocol requests right now. + // TODO: accept other protocols; RFC 2326 (RTSP protocol) etc + if ((hmsg->v_end - hmsg->v_start +1) < 5 || strncasecmp(&hmsg->buf[hmsg->v_start], "HTTP/", 5) != 0) { +#if USE_HTTP_VIOLATIONS + // being lax; old parser accepted strange versions + hmsg->v_maj = 0; + hmsg->v_min = 9; + hmsg->u_end = line_end; + hmsg->request_parse_status = HTTP_OK; // treat as HTTP/0.9 + retcode = 1; + goto finish; +#else + hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED; // protocol not supported / implemented. + retcode = -1; + goto finish; +#endif } /* At this point we don't need the 'i' value; so we'll recycle it for version parsing */ - /* - * At this point: line_end points to the first eol char (\r or \n); - * last_whitespace points to the last whitespace char in the URL. - * We know we have a full buffer here! - */ - if (last_whitespace == -1) { - maj = 0; - min = 9; - hmsg->u_end = line_end - 1; - assert(hmsg->u_end >= hmsg->u_start); - } else { - /* Find the first non-whitespace after last_whitespace */ - /* XXX why <= vs < ? I do need to really re-audit all of this ..*/ - for (i = last_whitespace; i <= hmsg->req_end && xisspace(hmsg->buf[i]); i++); - if (i > hmsg->req_end) { - retcode = 0; - goto finish; - } - - /* is it http/ ? if so, we try parsing. If not, the URL is the whole line; version is 0.9 */ - if (i + 5 >= hmsg->req_end || (strncasecmp(&hmsg->buf[i], "HTTP/", 5) != 0)) { - maj = 0; - min = 9; - hmsg->u_end = line_end - 1; - assert(hmsg->u_end >= hmsg->u_start); - } else { - /* Ok, lets try parsing! Yes, this needs refactoring! */ - hmsg->v_start = i; - i += 5; - - /* next should be 1 or more digits */ - maj = 0; - for (; i < hmsg->req_end && (isdigit(hmsg->buf[i])) && maj < 65536; i++) { - maj = maj * 10; - maj = maj + (hmsg->buf[i]) - '0'; - } - if (maj >= 65536) { - retcode = -1; - goto finish; - } - if (i >= hmsg->req_end) { - retcode = 0; - goto finish; - } - - /* next should be .; we -have- to have this as we have a whole line.. */ - if (hmsg->buf[i] != '.') { - retcode = 0; - goto finish; - } - if (i + 1 >= hmsg->req_end) { - retcode = 0; - goto finish; - } - - /* next should be one or more digits */ - i++; - min = 0; - for (; i < hmsg->req_end && (isdigit(hmsg->buf[i])) && min < 65536; i++) { - min = min * 10; - min = min + (hmsg->buf[i]) - '0'; - } - - if (min >= 65536) { - retcode = -1; - goto finish; - } - - /* Find whitespace, end of version */ - hmsg->v_end = i; - hmsg->u_end = last_whitespace - 1; - } - } + i = hmsg->v_start + sizeof("HTTP/") -1; + + /* next should be 1 or more digits */ + if (!isdigit(hmsg->buf[i])) { + hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED; + retcode = -1; + goto finish; + } + maj = 0; + for (; i <= line_end && (isdigit(hmsg->buf[i])) && maj < 65536; i++) { + maj = maj * 10; + maj = maj + (hmsg->buf[i]) - '0'; + } + // catch too-big values or missing remainders + if (maj >= 65536 || i > line_end) { + hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED; + retcode = -1; + goto finish; + } + hmsg->v_maj = maj; + + /* next should be .; we -have- to have this as we have a whole line.. */ + if (hmsg->buf[i] != '.') { + hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED; + retcode = -1; + goto finish; + } + // catch missing minor part + if (++i > line_end) { + hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED; + retcode = -1; + goto finish; + } + + /* next should be one or more digits */ + if (!isdigit(hmsg->buf[i])) { + hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED; + retcode = -1; + goto finish; + } + min = 0; + for (; i <= line_end && (isdigit(hmsg->buf[i])) && min < 65536; i++) { + min = min * 10; + min = min + (hmsg->buf[i]) - '0'; + } + // catch too-big values or trailing garbage + if (min >= 65536 || i < line_end) { + hmsg->request_parse_status = HTTP_HTTP_VERSION_NOT_SUPPORTED; + retcode = -1; + goto finish; + } + hmsg->v_min = min; /* * Rightio - we have all the schtuff. Return true; we've got enough. */ retcode = 1; + hmsg->request_parse_status = HTTP_OK; finish: - hmsg->v_maj = maj; - hmsg->v_min = min; PROF_stop(HttpParserParseReqLine); debugs(74, 5, "Parser: retval " << retcode << ": from " << hmsg->req_start << "->" << hmsg->req_end << ": method " << hmsg->m_start << "->" << hmsg->m_end << "; url " << hmsg->u_start << "->" << hmsg->u_end << - "; version " << hmsg->v_start << "->" << hmsg->v_end << " (" << maj << - "/" << min << ")"); + "; version " << hmsg->v_start << "->" << hmsg->v_end << " (" << hmsg->v_maj << + "/" << hmsg->v_min << ")"); return retcode; } -