}. Call C{withAttribute} with a series of attribute names and values. Specify the list of filter attributes names and values as: - keyword arguments, as in C{(align="right")}, or - as an explicit dict with C{**} operator, when an attribute name is also a Python reserved word, as in C{**{"class":"Customer", "align":"right"}} - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") ) For attribute names with a namespace prefix, you must use the second form. Attribute names are matched insensitive to upper/lower case. If just testing for C{class} (with or without a namespace), use C{L{withClass}}. To verify that the attribute exists, but without specifying a value, pass C{withAttribute.ANY_VALUE} as the value. """ if args: attrs = args[:] else: attrs = attrDict.items() attrs = [(k,v) for k,v in attrs] def pa(s,l,tokens): for attrName,attrValue in attrs: if attrName not in tokens: raise ParseException(s,l,"no matching attribute " + attrName) if attrValue != withAttribute.ANY_VALUE and tokens[attrName] != attrValue: raise ParseException(s,l,"attribute '%s' has value '%s', must be '%s'" % (attrName, tokens[attrName], attrValue)) return pa withAttribute.ANY_VALUE = object() def withClass(classname, namespace=''): """Simplified version of C{L{withAttribute}} when matching on a div class - made difficult because C{class} is a reserved word in Python. """ classattr = "%s:class" % namespace if namespace else "class" return withAttribute(**{classattr : classname}) opAssoc = _Constants() opAssoc.LEFT = object() opAssoc.RIGHT = object() def infixNotation( baseExpr, opList, lpar=Suppress('('), rpar=Suppress(')') ): """Helper method for constructing grammars of expressions made up of operators working in a precedence hierarchy. Operators may be unary or binary, left- or right-associative. Parse actions can also be attached to operator expressions. Parameters: - baseExpr - expression representing the most basic element for the nested - opList - list of tuples, one for each operator precedence level in the expression grammar; each tuple is of the form (opExpr, numTerms, rightLeftAssoc, parseAction), where: - opExpr is the pyparsing expression for the operator; may also be a string, which will be converted to a Literal; if numTerms is 3, opExpr is a tuple of two expressions, for the two operators separating the 3 terms - numTerms is the number of terms for this operator (must be 1, 2, or 3) - rightLeftAssoc is the indicator whether the operator is right or left associative, using the pyparsing-defined constants C{opAssoc.RIGHT} and C{opAssoc.LEFT}. - parseAction is the parse action to be associated with expressions matching this operator expression (the parse action tuple member may be omitted) - lpar - expression for matching left-parentheses (default=Suppress('(')) - rpar - expression for matching right-parentheses (default=Suppress(')')) """ ret = Forward() lastExpr = baseExpr | ( lpar + ret + rpar ) for i,operDef in enumerate(opList): opExpr,arity,rightLeftAssoc,pa = (operDef + (None,))[:4] termName = "%s term" % opExpr if arity < 3 else "%s%s term" % opExpr if arity == 3: if opExpr is None or len(opExpr) != 2: raise ValueError("if numterms=3, opExpr must be a tuple or list of two expressions") opExpr1, opExpr2 = opExpr thisExpr = Forward().setName(termName) if rightLeftAssoc == opAssoc.LEFT: if arity == 1: matchExpr = FollowedBy(lastExpr + opExpr) + Group( lastExpr + OneOrMore( opExpr ) ) elif arity == 2: if opExpr is not None: matchExpr = FollowedBy(lastExpr + opExpr + lastExpr) + Group( lastExpr + OneOrMore( opExpr + lastExpr ) ) else: matchExpr = FollowedBy(lastExpr+lastExpr) + Group( lastExpr + OneOrMore(lastExpr) ) elif arity == 3: matchExpr = FollowedBy(lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr) + \ Group( lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr ) else: raise ValueError("operator must be unary (1), binary (2), or ternary (3)") elif rightLeftAssoc == opAssoc.RIGHT: if arity == 1: # try to avoid LR with this extra test if not isinstance(opExpr, Optional): opExpr = Optional(opExpr) matchExpr = FollowedBy(opExpr.expr + thisExpr) + Group( opExpr + thisExpr ) elif arity == 2: if opExpr is not None: matchExpr = FollowedBy(lastExpr + opExpr + thisExpr) + Group( lastExpr + OneOrMore( opExpr + thisExpr ) ) else: matchExpr = FollowedBy(lastExpr + thisExpr) + Group( lastExpr + OneOrMore( thisExpr ) ) elif arity == 3: matchExpr = FollowedBy(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr) + \ Group( lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr ) else: raise ValueError("operator must be unary (1), binary (2), or ternary (3)") else: raise ValueError("operator must indicate right or left associativity") if pa: matchExpr.setParseAction( pa ) thisExpr <<= ( matchExpr.setName(termName) | lastExpr ) lastExpr = thisExpr ret <<= lastExpr return ret operatorPrecedence = infixNotation """(Deprecated) Former name of C{L{infixNotation}}, will be dropped in a future release.""" dblQuotedString = Combine(Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*')+'"').setName("string enclosed in double quotes") sglQuotedString = Combine(Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*")+"'").setName("string enclosed in single quotes") quotedString = Combine(Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*')+'"'| Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*")+"'").setName("quotedString using single or double quotes") unicodeString = Combine(_L('u') + quotedString.copy()).setName("unicode string literal") def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.copy()): """Helper method for defining nested lists enclosed in opening and closing delimiters ("(" and ")" are the default). Parameters: - opener - opening character for a nested list (default="("); can also be a pyparsing expression - closer - closing character for a nested list (default=")"); can also be a pyparsing expression - content - expression for items within the nested lists (default=None) - ignoreExpr - expression for ignoring opening and closing delimiters (default=quotedString) If an expression is not provided for the content argument, the nested expression will capture all whitespace-delimited content between delimiters as a list of separate values. Use the C{ignoreExpr} argument to define expressions that may contain opening or closing characters that should not be treated as opening or closing characters for nesting, such as quotedString or a comment expression. Specify multiple expressions using an C{L{Or}} or C{L{MatchFirst}}. The default is L{quotedString}, but if no expressions are to be ignored, then pass C{None} for this argument. """ if opener == closer: raise ValueError("opening and closing strings cannot be the same") if content is None: if isinstance(opener,basestring) and isinstance(closer,basestring): if len(opener) == 1 and len(closer)==1: if ignoreExpr is not None: content = (Combine(OneOrMore(~ignoreExpr + CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS,exact=1)) ).setParseAction(lambda t:t[0].strip())) else: content = (empty.copy()+CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS ).setParseAction(lambda t:t[0].strip())) else: if ignoreExpr is not None: content = (Combine(OneOrMore(~ignoreExpr + ~Literal(opener) + ~Literal(closer) + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) ).setParseAction(lambda t:t[0].strip())) else: content = (Combine(OneOrMore(~Literal(opener) + ~Literal(closer) + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) ).setParseAction(lambda t:t[0].strip())) else: raise ValueError("opening and closing arguments must be strings if no content expression is given") ret = Forward() if ignoreExpr is not None: ret <<= Group( Suppress(opener) + ZeroOrMore( ignoreExpr | ret | content ) + Suppress(closer) ) else: ret <<= Group( Suppress(opener) + ZeroOrMore( ret | content ) + Suppress(closer) ) ret.setName('nested %s%s expression' % (opener,closer)) return ret def indentedBlock(blockStatementExpr, indentStack, indent=True): """Helper method for defining space-delimited indentation blocks, such as those used to define block statements in Python source code. Parameters: - blockStatementExpr - expression defining syntax of statement that is repeated within the indented block - indentStack - list created by caller to manage indentation stack (multiple statementWithIndentedBlock expressions within a single grammar should share a common indentStack) - indent - boolean indicating whether block must be indented beyond the the current level; set to False for block of left-most statements (default=True) A valid block must contain at least one C{blockStatement}. """ def checkPeerIndent(s,l,t): if l >= len(s): return curCol = col(l,s) if curCol != indentStack[-1]: if curCol > indentStack[-1]: raise ParseFatalException(s,l,"illegal nesting") raise ParseException(s,l,"not a peer entry") def checkSubIndent(s,l,t): curCol = col(l,s) if curCol > indentStack[-1]: indentStack.append( curCol ) else: raise ParseException(s,l,"not a subentry") def checkUnindent(s,l,t): if l >= len(s): return curCol = col(l,s) if not(indentStack and curCol < indentStack[-1] and curCol <= indentStack[-2]): raise ParseException(s,l,"not an unindent") indentStack.pop() NL = OneOrMore(LineEnd().setWhitespaceChars("\t ").suppress()) INDENT = (Empty() + Empty().setParseAction(checkSubIndent)).setName('INDENT') PEER = Empty().setParseAction(checkPeerIndent).setName('') UNDENT = Empty().setParseAction(checkUnindent).setName('UNINDENT') if indent: smExpr = Group( Optional(NL) + #~ FollowedBy(blockStatementExpr) + INDENT + (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) + UNDENT) else: smExpr = Group( Optional(NL) + (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) ) blockStatementExpr.ignore(_bslash + LineEnd()) return smExpr.setName('indented block') alphas8bit = srange(r"[\0xc0-\0xd6\0xd8-\0xf6\0xf8-\0xff]") punc8bit = srange(r"[\0xa1-\0xbf\0xd7\0xf7]") anyOpenTag,anyCloseTag = makeHTMLTags(Word(alphas,alphanums+"_:").setName('any tag')) _htmlEntityMap = dict(zip("gt lt amp nbsp quot apos".split(),'><& "\'')) commonHTMLEntity = Regex('&(?P' + '|'.join(_htmlEntityMap.keys()) +");").setName("common HTML entity") def replaceHTMLEntity(t): """Helper parser action to replace common HTML entities with their special characters""" return _htmlEntityMap.get(t.entity) # it's easy to get these comment structures wrong - they're very common, so may as well make them available cStyleComment = Combine(Regex(r"/\*(?:[^*]|\*(?!/))*") + '*/').setName("C style comment") "Comment of the form C{/* ... */}" htmlComment = Regex(r"").setName("HTML comment") "Comment of the form C{}" restOfLine = Regex(r".*").leaveWhitespace().setName("rest of line") dblSlashComment = Regex(r"//(?:\\\n|[^\n])*").setName("// comment") "Comment of the form C{// ... (to end of line)}" cppStyleComment = Combine(Regex(r"/\*(?:[^*]|\*(?!/))*") + '*/'| dblSlashComment).setName("C++ style comment") "Comment of either form C{L{cStyleComment}} or C{L{dblSlashComment}}" javaStyleComment = cppStyleComment "Same as C{L{cppStyleComment}}" pythonStyleComment = Regex(r"#.*").setName("Python style comment") "Comment of the form C{# ... (to end of line)}" _commasepitem = Combine(OneOrMore(Word(printables, excludeChars=',') + Optional( Word(" \t") + ~Literal(",") + ~LineEnd() ) ) ).streamline().setName("commaItem") commaSeparatedList = delimitedList( Optional( quotedString.copy() | _commasepitem, default="") ).setName("commaSeparatedList") """Predefined expression of 1 or more printable words or quoted strings, separated by commas.""" # some other useful expressions - using lower-case class name since we are really using this as a namespace class pyparsing_common: """ Here are some common low-level expressions that may be useful in jump-starting parser development: - numeric forms (L{integers}, L{reals}, L{scientific notation}) - common L{programming identifiers} - network addresses (L{MAC}, L{IPv4}, L{IPv6}) - ISO8601 L{dates} and L{datetime} - L{UUID} Parse actions: - C{L{convertToInteger}} - C{L{convertToFloat}} - C{L{convertToDate}} - C{L{convertToDatetime}} - C{L{stripHTMLTags}} """ convertToInteger = tokenMap(int) """ Parse action for converting parsed integers to Python int """ convertToFloat = tokenMap(float) """ Parse action for converting parsed numbers to Python float """ integer = Word(nums).setName("integer").setParseAction(convertToInteger) """expression that parses an unsigned integer, returns an int""" hex_integer = Word(hexnums).setName("hex integer").setParseAction(tokenMap(int,16)) """expression that parses a hexadecimal integer, returns an int""" signedInteger = Regex(r'[+-]?\d+').setName("signed integer").setParseAction(convertToInteger) """expression that parses an integer with optional leading sign, returns an int""" fraction = (signedInteger.addParseAction(convertToFloat) + '/' + signedInteger.addParseAction(convertToFloat)).setName("fraction") """fractional expression of an integer divided by an integer, returns a float""" fraction.addParseAction(lambda t: t[0]/t[-1]) mixed_integer = (fraction | integer + Optional(Optional('-').suppress() + fraction)).setName("fraction or mixed integer-fraction") """mixed integer of the form 'integer - fraction', with optional leading integer, returns float""" mixed_integer.addParseAction(sum) real = Regex(r'[+-]?\d+\.\d*').setName("real number").setParseAction(convertToFloat) """expression that parses a floating point number and returns a float""" sciReal = Regex(r'[+-]?\d+([eE][+-]?\d+|\.\d*([eE][+-]?\d+)?)').setName("real number with scientific notation").setParseAction(convertToFloat) """expression that parses a floating point number with optional scientific notation and returns a float""" # streamlining this expression makes the docs nicer-looking numeric = (sciReal | real | signedInteger).streamline() """any numeric expression, returns the corresponding Python type""" number = Regex(r'[+-]?\d+\.?\d*([eE][+-]?\d+)?').setName("number").setParseAction(convertToFloat) """any int or real number, returned as float""" identifier = Word(alphas+'_', alphanums+'_').setName("identifier") """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')""" ipv4_address = Regex(r'(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}').setName("IPv4 address") "IPv4 address (C{0.0.0.0 - 255.255.255.255})" _ipv6_part = Regex(r'[0-9a-fA-F]{1,4}').setName("hex_integer") _full_ipv6_address = (_ipv6_part + (':' + _ipv6_part)*7).setName("full IPv6 address") _short_ipv6_address = (Optional(_ipv6_part + (':' + _ipv6_part)*(0,6)) + "::" + Optional(_ipv6_part + (':' + _ipv6_part)*(0,6))).setName("short IPv6 address") _short_ipv6_address.addCondition(lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8) _mixed_ipv6_address = ("::ffff:" + ipv4_address).setName("mixed IPv6 address") ipv6_address = Combine((_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).setName("IPv6 address")).setName("IPv6 address") "IPv6 address (long, short, or mixed form)" mac_address = Regex(r'[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}').setName("MAC address") "MAC address xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)" @staticmethod def convertToDate(fmt="%Y-%m-%d"): """ Helper to create a parse action for converting parsed date string to Python datetime.date Params - - fmt - format to be passed to datetime.strptime (default=C{"%Y-%m-%d"}) """ return lambda s,l,t: datetime.strptime(t[0], fmt).date() @staticmethod def convertToDatetime(fmt="%Y-%m-%dT%H:%M:%S.%f"): """ Helper to create a parse action for converting parsed datetime string to Python datetime.datetime Params - - fmt - format to be passed to datetime.strptime (default=C{"%Y-%m-%dT%H:%M:%S.%f"}) """ return lambda s,l,t: datetime.strptime(t[0], fmt) iso8601_date = Regex(r'(?P\d{4})(?:-(?P\d\d)(?:-(?P\d\d))?)?').setName("ISO8601 date") "ISO8601 date (C{yyyy-mm-dd})" iso8601_datetime = Regex(r'(?P\d{4})-(?P\d\d)-(?P\d\d)[T ](?P\d\d):(?P\d\d)(:(?P\d\d(\.\d*)?)?)?(?PZ|[+-]\d\d:?\d\d)?').setName("ISO8601 datetime") "ISO8601 datetime (C{yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)}) - trailing seconds, milliseconds, and timezone optional; accepts separating C{'T'} or C{' '}" uuid = Regex(r'[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}').setName("UUID") "UUID (C{xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx})" _html_stripper = anyOpenTag.suppress() | anyCloseTag.suppress() @staticmethod def stripHTMLTags(s, l, tokens): """Parse action to remove HTML tags from web page HTML source""" return pyparsing_common._html_stripper.transformString(tokens[0]) if __name__ == "__main__": selectToken = CaselessLiteral("select") fromToken = CaselessLiteral("from") ident = Word(alphas, alphanums + "_$") columnName = delimitedList(ident, ".", combine=True).setParseAction(upcaseTokens) columnNameList = Group(delimitedList(columnName)).setName("columns") columnSpec = ('*' | columnNameList) tableName = delimitedList(ident, ".", combine=True).setParseAction(upcaseTokens) tableNameList = Group(delimitedList(tableName)).setName("tables") simpleSQL = selectToken("command") + columnSpec("columns") + fromToken + tableNameList("tables") # demo runTests method, including embedded comments in test string simpleSQL.runTests(""" # '*' as column list and dotted table name select * from SYS.XYZZY # caseless match on "SELECT", and casts back to "select" SELECT * from XYZZY, ABC # list of column names, and mixed case SELECT keyword Select AA,BB,CC from Sys.dual # multiple tables Select A, B, C from Sys.dual, Table2 # invalid SELECT keyword - should fail Xelect A, B, C from Sys.dual # incomplete command - should fail Select # invalid column name - should fail Select ^^^ frox Sys.dual """) pyparsing_common.numeric.runTests(""" 100 -100 +100 3.14159 6.02e23 1e-12 """) # any int or real number, returned as float pyparsing_common.number.runTests(""" 100 -100 +100 3.14159 6.02e23 1e-12 """) pyparsing_common.hex_integer.runTests(""" 100 FF """) import uuid pyparsing_common.uuid.setParseAction(tokenMap(uuid.UUID)) pyparsing_common.uuid.runTests(""" 12345678-1234-5678-1234-567812345678 """)