import re

from ._abnf import field_name, field_value
from ._util import bytesify, LocalProtocolError, validate

# Facts
# -----
#
# Headers are:
#   keys: case-insensitive ascii
#   values: mixture of ascii and raw bytes
#
# "Historically, HTTP has allowed field content with text in the ISO-8859-1
# charset [ISO-8859-1], supporting other charsets only through use of
# [RFC2047] encoding. In practice, most HTTP header field values use only a
# subset of the US-ASCII charset [USASCII]. Newly defined header fields SHOULD
# limit their field values to US-ASCII octets. A recipient SHOULD treat other
# octets in field content (obs-text) as opaque data."
# And it deprecates all non-ascii values
#
# Leading/trailing whitespace in header names is forbidden
#
# Values get leading/trailing whitespace stripped
#
# Content-Disposition actually needs to contain unicode semantically; to
# accomplish this it has a terrifically weird way of encoding the filename
# itself as ascii (and even this still has lots of cross-browser
# incompatibilities)
#
# Order is important:
# "a proxy MUST NOT change the order of these field values when forwarding a
# message"
# (and there are several headers where the order indicates a preference)
#
# Multiple occurrences of the same header:
# "A sender MUST NOT generate multiple header fields with the same field name
# in a message unless either the entire field value for that header field is
# defined as a comma-separated list [or the header is Set-Cookie which gets a
# special exception]" - RFC 7230. (cookies are in RFC 6265)
#
# So every header aside from Set-Cookie can be merged by b", ".join if it
# occurs repeatedly. But, of course, they can't necessarily be split by
# .split(b","), because quoting.
#
# Given all this mess (case insensitive, duplicates allowed, order is
# important, ...), there doesn't appear to be any standard way to handle
# headers in Python -- they're almost like dicts, but... actually just
# aren't.
# For now we punt and just use a super simple representation: headers
# are a list of pairs
#
#   [(name1, value1), (name2, value2), ...]
#
# where all entries are bytestrings, names are lowercase and have no
# leading/trailing whitespace, and values are bytestrings with no
# leading/trailing whitespace. Searching and updating are done via naive O(n)
# methods.
#
# Maybe a dict-of-lists would be better?

_content_length_re = re.compile(br"[0-9]+")
_field_name_re = re.compile(field_name.encode("ascii"))
_field_value_re = re.compile(field_value.encode("ascii"))


class Headers:
    """
    A list-like interface that allows iterating over headers as byte-pairs
    of (lowercased-name, value).

    Internally we actually store the representation as three-tuples,
    including both the raw original casing, in order to preserve casing
    over-the-wire, and the lowercased name, for case-insensitive comparisons.

    r = Request(
        method="GET",
        target="/",
        headers=[("Host", "example.org"), ("Connection", "keep-alive")],
        http_version="1.1",
    )
    assert r.headers == [
        (b"host", b"example.org"),
        (b"connection", b"keep-alive")
    ]
    assert r.headers.raw_items() == [
        (b"Host", b"example.org"),
        (b"Connection", b"keep-alive")
    ]
    """

    __slots__ = "_full_items"

    def __init__(self, full_items):
        # full_items: list of (raw_name, lowercased_name, value) three-tuples.
        # The list is stored as-is (not copied).
        self._full_items = full_items

    def __iter__(self):
        """Yield (lowercased_name, value) pairs in stored order."""
        for _, name, value in self._full_items:
            yield name, value

    def __bool__(self):
        """Return True when at least one header is stored."""
        return bool(self._full_items)

    def __eq__(self, other):
        """Compare as a list of (lowercased_name, value) pairs.

        `other` may be any iterable of pairs, e.g. a plain list of tuples.
        """
        return list(self) == list(other)

    def __len__(self):
        """Return the number of stored header entries."""
        return len(self._full_items)

    def __repr__(self):
        # The template needs a %s placeholder: formatting an argument into an
        # empty template raises "TypeError: not all arguments converted
        # during string formatting".
        return "<Headers(%s)>" % repr(list(self))

    def __getitem__(self, idx):
        """Return the (lowercased_name, value) pair at integer index `idx`."""
        _, name, value = self._full_items[idx]
        return (name, value)

    def raw_items(self):
        """Return a list of (raw_name, value) pairs, keeping original casing."""
        return [(raw_name, value) for raw_name, _, value in self._full_items]


def normalize_and_validate(headers, _parsed=False):
    """Build a `Headers` object from an iterable of (name, value) pairs.

    Each name is lowercased for lookup while its original casing is kept as
    the raw name. Unless `_parsed` is true, names and values are first passed
    through `bytesify` and checked against the field-name / field-value
    regexes via `validate`.

    Special handling:

    * Content-Length: the value is split on commas and stripped; all parts
      must be identical and match `[0-9]+`. A repeated Content-Length header
      with the same value is dropped; a different value is an error.
    * Transfer-Encoding: may appear at most once, is lowercased, and must be
      exactly b"chunked".

    Raises `LocalProtocolError` directly for conflicting Content-Length
    values and for multiple or unsupported Transfer-Encoding headers (the
    latter two with `error_status_hint=501`).
    NOTE(review): `validate` presumably also raises `LocalProtocolError` on a
    regex mismatch -- confirm against `._util`.
    """
    new_headers = []
    seen_content_length = None
    saw_transfer_encoding = False
    for name, value in headers:
        # For headers coming out of the parser, we can safely skip some steps,
        # because it always returns bytes and has already run these regexes
        # over the data:
        if not _parsed:
            name = bytesify(name)
            value = bytesify(value)
            validate(_field_name_re, name, "Illegal header name {!r}", name)
            validate(_field_value_re, value, "Illegal header value {!r}", value)
        raw_name = name
        name = name.lower()
        if name == b"content-length":
            # Collapse "Content-Length: 10, 10" style values: every
            # comma-separated part must be the same.
            lengths = {length.strip() for length in value.split(b",")}
            if len(lengths) != 1:
                raise LocalProtocolError("conflicting Content-Length headers")
            value = lengths.pop()
            validate(_content_length_re, value, "bad Content-Length")
            if seen_content_length is None:
                seen_content_length = value
                new_headers.append((raw_name, name, value))
            elif seen_content_length != value:
                raise LocalProtocolError("conflicting Content-Length headers")
            # An identical repeated Content-Length is silently dropped.
        elif name == b"transfer-encoding":
            # "A server that receives a request message with a transfer coding
            # it does not understand SHOULD respond with 501 (Not
            # Implemented)."
            # https://tools.ietf.org/html/rfc7230#section-3.3.1
            if saw_transfer_encoding:
                raise LocalProtocolError(
                    "multiple Transfer-Encoding headers", error_status_hint=501
                )
            # "All transfer-coding names are case-insensitive"
            # -- https://tools.ietf.org/html/rfc7230#section-4
            value = value.lower()
            if value != b"chunked":
                raise LocalProtocolError(
                    "Only Transfer-Encoding: chunked is supported",
                    error_status_hint=501,
                )
            saw_transfer_encoding = True
            new_headers.append((raw_name, name, value))
        else:
            new_headers.append((raw_name, name, value))
    return Headers(new_headers)


def get_comma_header(headers, name):
    """Return the lowercased, comma-split values of header `name`.

    Every stored header whose lowercased name equals `name` contributes its
    value, lowercased, split on b",", stripped, with empty pieces discarded.
    The result is a flat list of bytes in stored order; it is empty when the
    header is absent.
    """
    # Should only be used for headers whose value is a list of
    # comma-separated, case-insensitive values.
    #
    # The header name `name` is expected to be lower-case bytes.
    #
    # Connection: meets these criteria (including case insensitivity).
    #
    # Content-Length: technically is just a single value (1*DIGIT), but the
    # standard makes reference to implementations that do multiple values, and
    # using this doesn't hurt. Ditto, case insensitivity doesn't change things
    # either way.
    #
    # Transfer-Encoding: is more complex (allows for quoted strings), so
    # splitting on , is actually wrong. For example, this is legal:
    #
    #    Transfer-Encoding: foo; options="1,2", chunked
    #
    # and should be parsed as
    #
    #    foo; options="1,2"
    #    chunked
    #
    # but this naive function will parse it as
    #
    #    foo; options="1
    #    2"
    #    chunked
    #
    # However, this is okay because the only thing we are going to do with
    # any Transfer-Encoding is reject ones that aren't just "chunked", so
    # both of these will be treated the same anyway.
    #
    # Expect: the only legal value is the literal string
    # "100-continue". Splitting on commas is harmless. Case insensitive.
    #
    out = []
    for _, found_name, found_raw_value in headers._full_items:
        if found_name == name:
            found_raw_value = found_raw_value.lower()
            for found_split_value in found_raw_value.split(b","):
                found_split_value = found_split_value.strip()
                if found_split_value:
                    out.append(found_split_value)
    return out


def set_comma_header(headers, name, new_values):
    """Return new `Headers` with every `name` header replaced by `new_values`.

    All existing entries whose lowercased name equals `name` are removed;
    the remaining entries keep their raw casing and order. One entry per item
    in `new_values` is then appended, using `name.title()` as the raw name.
    The combined list is re-run through `normalize_and_validate`, so the
    input `headers` object is not modified.
    """
    # The header name `name` is expected to be lower-case bytes.
    #
    # Note that when we store the header we use title casing for the header
    # names, in order to match the conventional HTTP header style.
    #
    # Simply calling `.title()` is a blunt approach, but it's correct
    # here given the cases where we're using `set_comma_header`...
    #
    # Connection, Content-Length, Transfer-Encoding.
    new_headers = []
    for found_raw_name, found_name, found_raw_value in headers._full_items:
        if found_name != name:
            new_headers.append((found_raw_name, found_raw_value))
    for new_value in new_values:
        new_headers.append((name.title(), new_value))
    return normalize_and_validate(new_headers)


def has_expect_100_continue(request):
    """Return True if `request` carries an honoured "Expect: 100-continue".

    Reads `request.http_version` and `request.headers`. Requests whose
    version compares below b"1.1" always yield False.
    """
    # https://tools.ietf.org/html/rfc7231#section-5.1.1
    # "A server that receives a 100-continue expectation in an HTTP/1.0 request
    # MUST ignore that expectation."
    if request.http_version < b"1.1":
        return False
    expect = get_comma_header(request.headers, b"expect")
    return b"100-continue" in expect