# Regular-expression building blocks for HTTP/1.1 message parsing, transcribed
# from the ABNF in RFC 7230.
#
# We use native strings for all the re patterns, to take advantage of string
# formatting, and then convert to bytestrings when compiling the final re
# objects.
#
# Each composite pattern is assembled with ``.format(**globals())``, so every
# fragment must be defined *before* the pattern that interpolates it, and
# literal regex braces inside a formatted template must be doubled (``{{``).
#
# NOTE(review): the ``(?P<name>...)`` group names below are part of the
# contract with whatever code reads ``match.groupdict()`` -- confirm them
# against the consumers of these patterns before renaming anything.

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#whitespace
#  OWS            = *( SP / HTAB )
#                 ; optional whitespace
OWS = r"[ \t]*"

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#rule.token.separators
#   token          = 1*tchar
#
#   tchar          = "!" / "#" / "$" / "%" / "&" / "'" / "*"
#                  / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
#                  / DIGIT / ALPHA
#                  ; any VCHAR, except delimiters
token = r"[-!#$%&'*+.^_`|~0-9a-zA-Z]+"

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#header.fields
#  field-name     = token
field_name = token

# The standard says:
#
#  field-value    = *( field-content / obs-fold )
#  field-content  = field-vchar [ 1*( SP / HTAB ) field-vchar ]
#  field-vchar    = VCHAR / obs-text
#  obs-fold       = CRLF 1*( SP / HTAB )
#                 ; obsolete line folding
#                 ; see Section 3.2.4
#
# https://tools.ietf.org/html/rfc5234#appendix-B.1
#
#   VCHAR          =  %x21-7E
#                  ; visible (printing) characters
#
# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#rule.quoted-string
#   obs-text       = %x80-FF
#
# However, the standard definition of field-content is WRONG! It disallows
# fields containing a single visible character surrounded by whitespace,
# e.g. "foo a bar".
#
# See: https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
#
# So our definition of field_content attempts to fix it up...
#
# Also, we allow lots of control characters, because apparently people assume
# that they're legal in practice (e.g., google analytics makes cookies with
# \x01 in them!):
#   https://github.com/python-hyper/h11/issues/57
# We still don't allow NUL or whitespace, because those are often treated as
# meta-characters and letting them through can lead to nasty issues like SSRF.
vchar = r"[\x21-\x7e]"
vchar_or_obs_text = r"[^\x00\s]"
field_vchar = vchar_or_obs_text
field_content = r"{field_vchar}+(?:[ \t]+{field_vchar}+)*".format(**globals())

# We handle obs-fold at a different level, and our fixed-up field_content
# already grows to swallow the whole value, so ? instead of *
field_value = r"({field_content})?".format(**globals())

#  header-field   = field-name ":" OWS field-value OWS
header_field = (
    r"(?P<field_name>{field_name})"
    r":"
    r"{OWS}"
    r"(?P<field_value>{field_value})"
    r"{OWS}".format(**globals())
)

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#request.line
#
#   request-line   = method SP request-target SP HTTP-version CRLF
#   method         = token
#   HTTP-version   = HTTP-name "/" DIGIT "." DIGIT
#   HTTP-name      = %x48.54.54.50 ; "HTTP", case-sensitive
#
# request-target is complicated (see RFC 7230 sec 5.3) -- could be path, full
# URL, host+port (for connect), or even "*", but in any case we are guaranteed
# that it consists of the visible printing characters.
method = token
request_target = r"{vchar}+".format(**globals())
http_version = r"HTTP/(?P<http_version>[0-9]\.[0-9])"
request_line = (
    r"(?P<method>{method})"
    r" "
    r"(?P<target>{request_target})"
    r" "
    r"{http_version}".format(**globals())
)

# https://svn.tools.ietf.org/svn/wg/httpbis/specs/rfc7230.html#status.line
#
#   status-line = HTTP-version SP status-code SP reason-phrase CRLF
#   status-code    = 3DIGIT
#   reason-phrase  = *( HTAB / SP / VCHAR / obs-text )
#
# status_code is never passed through .format() itself, so its {3} quantifier
# is written with single braces; it is only ever *substituted into* templates.
status_code = r"[0-9]{3}"
reason_phrase = r"([ \t]|{vchar_or_obs_text})*".format(**globals())
status_line = (
    r"{http_version}"
    r" "
    r"(?P<status_code>{status_code})"
    # However, there are apparently a few too many servers out there that just
    # leave out the reason phrase:
    #   https://github.com/scrapy/scrapy/issues/345#issuecomment-281756036
    #   https://github.com/seanmonstar/httparse/issues/29
    # so make it optional. ?: is a non-capturing group.
    r"(?: (?P<reason>{reason_phrase}))?".format(**globals())
)

HEXDIG = r"[0-9A-Fa-f]"
# Actually
#
#      chunk-size     = 1*HEXDIG
#
# but we impose an upper-limit to avoid ridiculosity. len(str(2**64)) == 20
# The doubled braces survive .format() as the literal {1,20} quantifier.
chunk_size = r"({HEXDIG}){{1,20}}".format(**globals())
# Actually
#
#     chunk-ext      = *( ";" chunk-ext-name [ "=" chunk-ext-val ] )
#
# but we aren't parsing the things so we don't really care.
chunk_ext = r";.*"
chunk_header = (
    r"(?P<chunk_size>{chunk_size})"
    r"(?P<chunk_ext>{chunk_ext})?"
    r"\r\n".format(**globals())
)