# URL extractor
# Copyright 2004, Paul McGuire
from pyparsing import makeHTMLTags
from contextlib import closing
import urllib.request
import pprint

# Define the pyparsing grammar for a URL, that is:
#    URLlink ::= <a href= URL>linkText</a>
#    URL ::= doubleQuotedString | alphanumericWordPath
# Note that whitespace may appear just about anywhere in the link. Note also
# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
# pyparsing skips over whitespace between tokens.
linkOpenTag, linkCloseTag = makeHTMLTags("a")
link = linkOpenTag + linkOpenTag.tag_body("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.
with closing(urllib.request.urlopen("https://www.cnn.com/")) as serverListPage:
    htmlText = serverListPage.read().decode("UTF-8")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and the start and end locations (for this application, we are
# not interested in the start and end values).
for toks, strt, end in link.scanString(htmlText):
    print(toks.startA.href, "->", toks.body)

# Create a dictionary with a dict comprehension, assembled from each pair of tokens
# returned from a matched URL.
pprint.pprint(
    {toks.body: toks.startA.href for toks, strt, end in link.scanString(htmlText)}
)
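
# A minimal sketch applying the same "link" grammar to a small inline HTML
# snippet, so the extraction can be tried without fetching a live page.
# The sample HTML below is made up purely for illustration.
sampleHtml = """
<html><body>
<a href="https://www.python.org">Python home</a>
<a href="https://pypi.org">PyPI</a>
</body></html>
"""

for toks, strt, end in link.scanString(sampleHtml):
    print(toks.startA.href, "->", toks.body)
# Expected output, roughly:
#   https://www.python.org -> Python home
#   https://pypi.org -> PyPI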