#
# pyzmail/utils.py
# (c) Alain Spineux
# http://www.magiksys.net/pyzmail
# Released under LGPL

"""
Various functions used by other modules
@var invalid_chars_in_filename: a mix of characters not permitted in most used filesystems
@var invalid_windows_name: a list of unauthorized filenames under Windows
"""

import sys

# Control characters (0x00-0x1f) plus punctuation rejected by common
# filesystems.  The backslash is listed twice; this is harmless for a
# deletion set and keeps the value of the constant unchanged.
invalid_chars_in_filename=b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' \
                          b'\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f' \
                          b'<>:"/\\|?*\\%\''

# Device names reserved by Windows, compared case-insensitively below.
invalid_windows_name=[b'CON', b'PRN', b'AUX', b'NUL', b'COM1', b'COM2', b'COM3',
                      b'COM4', b'COM5', b'COM6', b'COM7', b'COM8', b'COM9',
                      b'LPT1', b'LPT2', b'LPT3', b'LPT4', b'LPT5', b'LPT6', b'LPT7',
                      b'LPT8', b'LPT9' ]

def sanitize_filename(filename, alt_name, alt_ext):
    """
    Convert the given filename into a name that should work on all
    platforms. Remove non us-ascii characters and characters listed in
    C{invalid_chars_in_filename}, and rename reserved Windows names.
    Use the I{alternative} filename if nothing usable is left.

    @type filename: unicode or None
    @param filename: the original filename or None. Can be unicode.
        Byte strings are accepted too; their non us-ascii bytes are dropped.
    @type alt_name: str
    @param alt_name: the alternative filename if filename is None or useless
    @type alt_ext: str
    @param alt_ext: the alternative filename extension (including the '.')

    @rtype: str
    @returns: a valid filename.

    >>> sanitize_filename('document.txt', 'file', '.txt')
    'document.txt'
    >>> sanitize_filename('number1.txt', 'file', '.txt')
    'number1.txt'
    >>> sanitize_filename(None, 'file', '.txt')
    'file.txt'
    >>> sanitize_filename('R\\xe9pertoir.txt', 'file', '.txt')
    'Rpertoir.txt'
    >>> # the '\\xe9' has been removed
    >>> sanitize_filename('\\xe9\\xe6.html', 'file', '.txt')
    'file.html'
    >>> # all non us-ascii characters have been removed, the alternative name
    >>> # has been used to replace the empty string. The original extension
    >>> # is still valid
    >>> sanitize_filename('\\xe9\\xe6', 'file', '.txt')
    'file.txt'
    >>> # nothing usable is left, the alternative name and extension are used
    >>> sanitize_filename('..', 'file', '.txt')
    'file.txt'
    >>> # a name made of dots only is not a usable filename
    >>> sanitize_filename('COM1.txt', 'file', '.txt')
    'COM1A.txt'
    >>> # if name match an invalid name or assimilated then a A is added
    """
    if not filename:
        return alt_name+alt_ext

    # Work on a pure us-ascii byte string whatever the input type is, so the
    # byte constants above can be used for filtering and comparison.
    if isinstance(filename, bytes):
        filename=filename.decode('ascii', 'ignore').encode('ascii')
    else:
        filename=filename.encode('ascii', 'ignore')

    filename=filename.translate(None, invalid_chars_in_filename).strip()

    # Reserved device names are invalid on Windows with or without an
    # extension: insert an 'A' right after the reserved part.
    upper=filename.upper()
    for name in invalid_windows_name:
        if upper==name:
            filename=filename+b'A'
            break
        if upper.startswith(name+b'.'):
            filename=filename[:len(name)]+b'A'+filename[len(name):]
            break

    if sys.version_info>=(3, 0):
        # back to string
        filename=filename.decode('us-ascii')

    # Nothing left, or only dots ('.' and '..' designate directories):
    # the name is useless, use the alternative one.
    if not filename.strip('.'):
        return alt_name+alt_ext

    # Only an extension is left (the name starts with its single dot):
    # keep the extension and prepend the alternative name.
    if filename.rfind('.')==0:
        filename=alt_name+filename

    return filename

def handle_filename_collision(filename, filenames):
    """
    Avoid filename collision, add a sequence number to the name when required.
    'file.txt' will be renamed into 'file-01.txt' then 'file-02.txt' ...
    until there is no more collision. The file is not added to the list.

    Windows doesn't make the difference between lower and upper case. To avoid
    "case" collision, the function compares C{filename.lower()} to the list.
    If you provide a list in lower case only, then any collisions will be avoided.

    @type filename: str
    @param filename: the filename
    @type filenames: list or set
    @param filenames: a list of filenames.

    @rtype: str
    @returns: the I{filename} or the appropriately I{indexed} I{filename}

    >>> handle_filename_collision('file.txt', [ ])
    'file.txt'
    >>> handle_filename_collision('file.txt', [ 'file.txt' ])
    'file-01.txt'
    >>> handle_filename_collision('file.txt', [ 'file.txt', 'file-01.txt',])
    'file-02.txt'
    >>> handle_filename_collision('foo', [ 'foo',])
    'foo-01'
    >>> handle_filename_collision('foo', [ 'foo', 'foo-01',])
    'foo-02'
    >>> handle_filename_collision('FOO', [ 'foo', 'foo-01',])
    'FOO-02'
    """
    if filename.lower() not in filenames:
        return filename

    # Split on the last dot only, the sequence number goes before the extension.
    if '.' in filename:
        basename, ext=filename.rsplit('.', 1)
        ext='.'+ext
    else:
        basename, ext=filename, ''

    i=1
    while True:
        candidate='%s-%02d%s' % (basename, i, ext)
        if candidate.lower() not in filenames:
            return candidate
        i+=1

def is_usascii(value):
    """
    Test if string contains us-ascii characters only.

    @type value: unicode or str
    @param value: the text (or byte string) to check
    @rtype: bool
    @returns: True if every character (or byte) of I{value} is us-ascii

    >>> is_usascii('foo')
    True
    >>> is_usascii(b'foo')
    True
    >>> is_usascii('Fran\\xe7ais')
    False
    >>> is_usascii('bad\\x81')
    False
    """
    try:
        # byte strings have to be decoded, text has to be encoded; both
        # raise a UnicodeError subclass on the first non us-ascii item
        if isinstance(value, bytes):
            value.decode('us-ascii')
        else:
            value.encode('us-ascii')
    except UnicodeError:
        return False
    return True