#!/usr/bin/env python
# -*- coding: utf8 -*-
# :Copyright: © 2011 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
#    Copying and distribution of this file, with or without modification,
#    are permitted in any medium without royalty provided the copyright
#    notice and this notice are preserved.
#    This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
# :Id: $Id: punctuation_chars.py 7401 2012-05-01 09:50:02Z grubert $

import sys, re
import unicodedata

# punctuation characters around inline markup
# ===========================================
#
# This module provides the lists of characters for the implementation of
# the `inline markup recognition rules`_ in the reStructuredText parser
# (states.py).
#
# .. _inline markup recognition rules:
#    ../../../docs/ref/rst/restructuredtext.html#inline-markup

# Docutils punctuation category sample strings
# --------------------------------------------
#
# The sample strings are generated by punctuation_samples() and put here
# literally to avoid the time-consuming generation with every Docutils run.
# Running this file as a standalone module checks the definitions below
# against a re-calculation.

openers = r"""\"\'\(\<\[\{༺༼᚛⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「«‘“‹⸂⸄⸉⸌⸜⸠‚„»’”›⸃⸅⸊⸍⸝⸡‛‟"""
closers = r"""\"\'\)\>\]\}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」»’”›⸃⸅⸊⸍⸝⸡‛‟«‘“‹⸂⸄⸉⸌⸜⸠‚„"""
delimiters = r"\-\/\:֊־᐀᠆‐‑‒–—―⸗⸚〜〰゠︱︲﹘﹣-¡·¿;·՚՛՜՝՞՟։׀׃׆׳״؉؊،؍؛؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾।॥॰෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒྅࿐࿑࿒࿓࿔၊။၌၍၎၏჻፡።፣፤፥፦፧፨᙭᙮᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠇᠈᠉᠊᥄᥅᧞᧟᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᰻᰼᰽᰾᰿᱾᱿᳓‖‗†‡•‣․‥…‧‰‱′″‴‵‶‷‸※‼‽‾⁁⁂⁃⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⳹⳺⳻⳼⳾⳿⸀⸁⸆⸇⸈⸋⸎⸏⸐⸑⸒⸓⸔⸕⸖⸘⸙⸛⸞⸟⸪⸫⸬⸭⸮⸰⸱、。〃〽・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꯫︐︑︒︓︔︕︖︙︰﹅﹆﹉﹊﹋﹌﹐﹑﹒﹔﹕﹖﹗﹟﹠﹡﹨﹪﹫!"#%&'*,./:;?@\。、・𐄀𐄁𐎟𐏐𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐬹𐬺𐬻𐬼𐬽𐬾𐬿𑂻𑂼𑂾𑂿𑃀𑃁𒑰𒑱𒑲𒑳"
closing_delimiters = r"\.\,\;\!\?"


# Unicode punctuation character categories
# ----------------------------------------

unicode_punctuation_categories = {
    # 'Pc': 'Connector', # not used in Docutils inline markup recognition
    'Pd': 'Dash',
    'Ps': 'Open',
    'Pe': 'Close',
    'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
    'Pf': 'Final quote',   # may behave like Ps or Pe depending on usage
    'Po': 'Other'
    }
"""Unicode character categories for punctuation"""


# generate character pattern strings
# ==================================

def unicode_charlists(categories, cp_min=0, cp_max=None):
    """Return dictionary of Unicode character lists.

    For each of the `categories`, an item contains a list with all Unicode
    characters with `cp_min` <= code-point <= `cp_max` that belong to the
    category.  (The default values check every code-point supported by
    Python.)
    """
    # Determine the highest code point with one of the given categories
    # (may shorten the search time considerably if there are many
    # categories with only low code points):
    if cp_max is None:
        # python 2.3: list comprehension instead of generator required
        cp_max = max([x for x in range(sys.maxunicode + 1)
                      if unicodedata.category(chr(x)) in categories])
        # print cp_max # => 74867 for unicode_punctuation_categories
    charlists = {}
    for cat in categories:
        charlists[cat] = [chr(x) for x in range(cp_min, cp_max+1)
                          if unicodedata.category(chr(x)) == cat]
    return charlists
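
# Usage example (illustrative, added): restricting the code-point range
# keeps the scan cheap.  The 'Pd' list below holds the six dash characters
# HYPHEN (U+2010) .. HORIZONTAL BAR (U+2015):
#
#     dashes = unicode_charlists(['Pd'], cp_min=0x2010, cp_max=0x2015)['Pd']
#     assert dashes == [chr(cp) for cp in range(0x2010, 0x2016)]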

# Character categories in Docutils
# --------------------------------

def punctuation_samples():
    """Docutils punctuation category sample strings.

    Return a list of sample strings for the categories "Open", "Close",
    "Delimiters" and "Closing-Delimiters" used in the `inline markup
    recognition rules`_.
    """

    # Lists with characters in Unicode punctuation character categories
    cp_min = 160 # ASCII chars have special rules for backwards compatibility
    ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)

    # match opening/closing characters
    # --------------------------------
    # Rearrange the lists to ensure matching characters at the same
    # index position.

    # low quotation marks are also used as closers (e.g. in Greek),
    # move them to category Pi:
    ucharlists['Ps'].remove('‚') # 201A SINGLE LOW-9 QUOTATION MARK
    ucharlists['Ps'].remove('„') # 201E DOUBLE LOW-9 QUOTATION MARK
    ucharlists['Pi'] += ['‚', '„']

    ucharlists['Pi'].remove('‛') # 201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
    ucharlists['Pi'].remove('‟') # 201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    ucharlists['Pf'] += ['‛', '‟']

    # 301F LOW DOUBLE PRIME QUOTATION MARK has no opening counterpart,
    # so pair it with a second copy of 301D REVERSED DOUBLE PRIME
    # QUOTATION MARK:
    ucharlists['Ps'].insert(ucharlists['Pe'].index('\u301f'), '\u301d')

    # print(''.join(ucharlists['Ps']))
    # print(''.join(ucharlists['Pe']))
    # print(''.join(ucharlists['Pi']))
    # print(''.join(ucharlists['Pf']))

    # The Docutils character categories
    # ---------------------------------
    #
    # The categorization of ASCII chars is non-standard to reduce both
    # false positives and the need for escaping (see `inline markup
    # recognition rules`_).

    # matching, allowed before markup
    openers = [re.escape('"\'(<[{')]
    for cat in ('Ps', 'Pi', 'Pf'):
        openers.extend(ucharlists[cat])

    # matching, allowed after markup
    closers = [re.escape('"\')>]}')]
    for cat in ('Pe', 'Pf', 'Pi'):
        closers.extend(ucharlists[cat])

    # non-matching, allowed on both sides
    delimiters = [re.escape('-/:')]
    for cat in ('Pd', 'Po'):
        delimiters.extend(ucharlists[cat])

    # non-matching, after markup
    closing_delimiters = [re.escape('.,;!?')]

    # # Test open/close matching:
    # for i in range(min(len(openers), len(closers))):
    #     print('%4d %s %s' % (i, openers[i], closers[i]))

    return [''.join(chars) for chars in (openers, closers,
                                         delimiters, closing_delimiters)]
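
# Usage example (illustrative, added): regenerate the sample strings at run
# time instead of using the literals defined above (slow, because the whole
# Unicode code-point range is scanned):
#
#     o, c, d, cd = punctuation_samples()
#     # `o`, `c`, `d` and `cd` should reproduce the module-level `openers`,
#     # `closers`, `delimiters` and `closing_delimiters` (up to differences
#     # in the Unicode database and re.escape() behaviour of the running
#     # Python version).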

# Matching open/close quotes
# --------------------------

# Rule (5) of the `inline markup recognition rules`_ requires matching
# open/close pairs.  However, the pairing of open and close quotes is
# ambiguous due to different typographic conventions in different languages.

quote_pairs = {'\xbb': '\xbb',           # Swedish
               '\u2018': '\u201a',       # Greek
               '\u2019': '\u2019',       # Swedish
               '\u201a': '\u2018\u2019', # German, Polish
               '\u201c': '\u201e',       # German
               '\u201e': '\u201c\u201d',
               '\u201d': '\u201d',       # Swedish
               '\u203a': '\u203a',       # Swedish
              }
"""Additional open/close quote pairs accepted by `match_chars()`."""

def match_chars(c1, c2):
    """Test whether `c1` and `c2` form a matching open/close pair."""
    try:
        i = openers.index(c1)
    except ValueError:  # c1 not in openers
        return False
    return c2 == closers[i] or c2 in quote_pairs.get(c1, '')


# print results
# =============

if __name__ == '__main__':

    # (re)create the samples and compare them with the definitions above:
    (o, c, d, cd) = punctuation_samples()
    if o != openers:
        print('- openers = r"""%s"""' % openers)
        print('+ openers = r"""%s"""' % o)
    if c != closers:
        print('- closers = r"""%s"""' % closers)
        print('+ closers = r"""%s"""' % c)
    if d != delimiters:
        print('- delimiters = r"%s"' % delimiters)
        print('+ delimiters = r"%s"' % d)
    if cd != closing_delimiters:
        print('- closing_delimiters = r"%s"' % closing_delimiters)
        print('+ closing_delimiters = r"%s"' % cd)

    # # test prints
    # print('openers = ', repr(openers))
    # print('closers = ', repr(closers))
    # print('delimiters = ', repr(delimiters))
    # print('closing_delimiters = ', repr(closing_delimiters))

    # ucharlists = unicode_charlists(unicode_punctuation_categories)
    # for cat, chars in ucharlists.items():
    #     # print(cat, chars)
    #     # compact output (visible with a comprehensive font):
    #     print(":%s: %s" % (cat, ''.join(chars)))
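
    # Illustrative sanity checks for match_chars() (added example; the
    # character pairs below are assumptions based on the tables above,
    # not an exhaustive test):
    assert match_chars('(', ')')            # ASCII pair at the same index
    assert match_chars('\xab', '\xbb')      # « » matched by index position
    assert match_chars('\u201e', '\u201c')  # „ “ matched via quote_pairs
    assert not match_chars('(', ']')        # mismatched bracket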