1 # Copyright (C) 2001-2007 Python Software Foundation
   2 # Author: Ben Gertzfield, Barry Warsaw
   3 # Contact: email-sig@python.org
   4 
# Public names exported by a star-import of this module.
__all__ = [
    'Charset',
    'add_alias',
    'add_charset',
    'add_codec',
    ]
  11 
  12 from functools import partial
  13 
  14 import email.base64mime
  15 import email.quoprimime
  16 
  17 from email import errors
  18 from email.encoders import encode_7or8bit
  19 
  20 
  21 
  22 # Flags for types of header encodings
  23 QP          = 1 # Quoted-Printable
  24 BASE64      = 2 # Base64
  25 SHORTEST    = 3 # the shorter of QP and base64, but only for headers
  26 
  27 # In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
  28 RFC2047_CHROME_LEN = 7
  29 
  30 DEFAULT_CHARSET = 'us-ascii'
  31 UNKNOWN8BIT = 'unknown-8bit'
  32 EMPTYSTRING = ''
  33 
  34 
  35 
  36 # Defaults
  37 CHARSETS = {
  38     # input        header enc  body enc output conv
  39     'iso-8859-1':  (QP,        QP,      None),
  40     'iso-8859-2':  (QP,        QP,      None),
  41     'iso-8859-3':  (QP,        QP,      None),
  42     'iso-8859-4':  (QP,        QP,      None),
  43     # iso-8859-5 is Cyrillic, and not especially used
  44     # iso-8859-6 is Arabic, also not particularly used
  45     # iso-8859-7 is Greek, QP will not make it readable
  46     # iso-8859-8 is Hebrew, QP will not make it readable
  47     'iso-8859-9':  (QP,        QP,      None),
  48     'iso-8859-10': (QP,        QP,      None),
  49     # iso-8859-11 is Thai, QP will not make it readable
  50     'iso-8859-13': (QP,        QP,      None),
  51     'iso-8859-14': (QP,        QP,      None),
  52     'iso-8859-15': (QP,        QP,      None),
  53     'iso-8859-16': (QP,        QP,      None),
  54     'windows-1252':(QP,        QP,      None),
  55     'viscii':      (QP,        QP,      None),
  56     'us-ascii':    (None,      None,    None),
  57     'big5':        (BASE64,    BASE64,  None),
  58     'gb2312':      (BASE64,    BASE64,  None),
  59     'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
  60     'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
  61     'iso-2022-jp': (BASE64,    None,    None),
  62     'koi8-r':      (BASE64,    BASE64,  None),
  63     'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
  64     }
  65 
  66 # Aliases for other commonly-used names for character sets.  Map
  67 # them to the real ones used in email.
  68 ALIASES = {
  69     'latin_1': 'iso-8859-1',
  70     'latin-1': 'iso-8859-1',
  71     'latin_2': 'iso-8859-2',
  72     'latin-2': 'iso-8859-2',
  73     'latin_3': 'iso-8859-3',
  74     'latin-3': 'iso-8859-3',
  75     'latin_4': 'iso-8859-4',
  76     'latin-4': 'iso-8859-4',
  77     'latin_5': 'iso-8859-9',
  78     'latin-5': 'iso-8859-9',
  79     'latin_6': 'iso-8859-10',
  80     'latin-6': 'iso-8859-10',
  81     'latin_7': 'iso-8859-13',
  82     'latin-7': 'iso-8859-13',
  83     'latin_8': 'iso-8859-14',
  84     'latin-8': 'iso-8859-14',
  85     'latin_9': 'iso-8859-15',
  86     'latin-9': 'iso-8859-15',
  87     'latin_10':'iso-8859-16',
  88     'latin-10':'iso-8859-16',
  89     'cp949':   'ks_c_5601-1987',
  90     'euc_jp':  'euc-jp',
  91     'euc_kr':  'euc-kr',
  92     'ascii':   'us-ascii',
  93     }
  94 
  95 
# Map charsets to their Unicode codec strings.  A charset with no entry here
# is looked up under its own name as the codec name.
CODEC_MAP = {
    # NOTE(review): these two codec names look like legacy CJK codec
    # spellings -- confirm that codecs.lookup() resolves them on the target
    # Python before relying on gb2312/big5 conversion.
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
    }
 105 
 106 
 107 
 108 # Convenience functions for extending the above mappings
 109 def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
 110     """Add character set properties to the global registry.
 111 
 112     charset is the input character set, and must be the canonical name of a
 113     character set.
 114 
 115     Optional header_enc and body_enc is either Charset.QP for
 116     quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
 117     the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
 118     is only valid for header_enc.  It describes how message headers and
 119     message bodies in the input charset are to be encoded.  Default is no
 120     encoding.
 121 
 122     Optional output_charset is the character set that the output should be
 123     in.  Conversions will proceed from input charset, to Unicode, to the
 124     output charset when the method Charset.convert() is called.  The default
 125     is to output in the same character set as the input.
 126 
 127     Both input_charset and output_charset must have Unicode codec entries in
 128     the module's charset-to-codec mapping; use add_codec(charset, codecname)
 129     to add codecs the module does not know about.  See the codecs module's
 130     documentation for more information.
 131     """
 132     if body_enc == SHORTEST:
 133         raise ValueError('SHORTEST not allowed for body_enc')
 134     CHARSETS[charset] = (header_enc, body_enc, output_charset)
 135 
 136 
 137 def add_alias(alias, canonical):
 138     """Add a character set alias.
 139 
 140     alias is the alias name, e.g. latin-1
 141     canonical is the character set's canonical name, e.g. iso-8859-1
 142     """
 143     ALIASES[alias] = canonical
 144 
 145 
 146 def add_codec(charset, codecname):
 147     """Add a codec that map characters in the given charset to/from Unicode.
 148 
 149     charset is the canonical name of a character set.  codecname is the name
 150     of a Python codec, as appropriate for the second argument to the unicode()
 151     built-in, or to the encode() method of a Unicode string.
 152     """
 153     CODEC_MAP[charset] = codecname
 154 
 155 
 156 
 157 # Convenience function for encoding strings, taking into account
 158 # that they might be unknown-8bit (ie: have surrogate-escaped bytes)
 159 def _encode(string, codec):
 160     if codec == UNKNOWN8BIT:
 161         return string.encode('ascii', 'surrogateescape')
 162     else:
 163         return string.encode(codec)
 164 
 165 
 166 
 167 class Charset:
 168     """Map character sets to their email properties.
 169 
 170     This class provides information about the requirements imposed on email
 171     for a specific character set.  It also provides convenience routines for
 172     converting between character sets, given the availability of the
 173     applicable codecs.  Given a character set, it will do its best to provide
 174     information on how to use that character set in an email in an
 175     RFC-compliant way.
 176 
 177     Certain character sets must be encoded with quoted-printable or base64
 178     when used in email headers or bodies.  Certain character sets must be
 179     converted outright, and are not allowed in email.  Instances of this
 180     module expose the following information about a character set:
 181 
 182     input_charset: The initial character set specified.  Common aliases
 183                    are converted to their `official' email names (e.g. latin_1
 184                    is converted to iso-8859-1).  Defaults to 7-bit us-ascii.
 185 
 186     header_encoding: If the character set must be encoded before it can be
 187                      used in an email header, this attribute will be set to
 188                      Charset.QP (for quoted-printable), Charset.BASE64 (for
 189                      base64 encoding), or Charset.SHORTEST for the shortest of
 190                      QP or BASE64 encoding.  Otherwise, it will be None.
 191 
 192     body_encoding: Same as header_encoding, but describes the encoding for the
 193                    mail message's body, which indeed may be different than the
 194                    header encoding.  Charset.SHORTEST is not allowed for
 195                    body_encoding.
 196 
 197     output_charset: Some character sets must be converted before they can be
 198                     used in email headers or bodies.  If the input_charset is
 199                     one of them, this attribute will contain the name of the
 200                     charset output will be converted to.  Otherwise, it will
 201                     be None.
 202 
 203     input_codec: The name of the Python codec used to convert the
 204                  input_charset to Unicode.  If no conversion codec is
 205                  necessary, this attribute will be None.
 206 
 207     output_codec: The name of the Python codec used to convert Unicode
 208                   to the output_charset.  If no conversion codec is necessary,
 209                   this attribute will have the same value as the input_codec.
 210     """
 211     def __init__(self, input_charset=DEFAULT_CHARSET):
 212         # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
 213         # unicode because its .lower() is locale insensitive.  If the argument
 214         # is already a unicode, we leave it at that, but ensure that the
 215         # charset is ASCII, as the standard (RFC XXX) requires.
 216         try:
 217             if isinstance(input_charset, str):
 218                 input_charset.encode('ascii')
 219             else:
 220                 input_charset = str(input_charset, 'ascii')
 221         except UnicodeError:
 222             raise errors.CharsetError(input_charset)
 223         input_charset = input_charset.lower()
 224         # Set the input charset after filtering through the aliases
 225         self.input_charset = ALIASES.get(input_charset, input_charset)
 226         # We can try to guess which encoding and conversion to use by the
 227         # charset_map dictionary.  Try that first, but let the user override
 228         # it.
 229         henc, benc, conv = CHARSETS.get(self.input_charset,
 230                                         (SHORTEST, BASE64, None))
 231         if not conv:
 232             conv = self.input_charset
 233         # Set the attributes, allowing the arguments to override the default.
 234         self.header_encoding = henc
 235         self.body_encoding = benc
 236         self.output_charset = ALIASES.get(conv, conv)
 237         # Now set the codecs.  If one isn't defined for input_charset,
 238         # guess and try a Unicode codec with the same name as input_codec.
 239         self.input_codec = CODEC_MAP.get(self.input_charset,
 240                                          self.input_charset)
 241         self.output_codec = CODEC_MAP.get(self.output_charset,
 242                                           self.output_charset)
 243 
 244     def __str__(self):
 245         return self.input_charset.lower()
 246 
 247     __repr__ = __str__
 248 
 249     def __eq__(self, other):
 250         return str(self) == str(other).lower()
 251 
 252     def __ne__(self, other):
 253         return not self.__eq__(other)
 254 
 255     def get_body_encoding(self):
 256         """Return the content-transfer-encoding used for body encoding.
 257 
 258         This is either the string `quoted-printable' or `base64' depending on
 259         the encoding used, or it is a function in which case you should call
 260         the function with a single argument, the Message object being
 261         encoded.  The function should then set the Content-Transfer-Encoding
 262         header itself to whatever is appropriate.
 263 
 264         Returns "quoted-printable" if self.body_encoding is QP.
 265         Returns "base64" if self.body_encoding is BASE64.
 266         Returns conversion function otherwise.
 267         """
 268         assert self.body_encoding != SHORTEST
 269         if self.body_encoding == QP:
 270             return 'quoted-printable'
 271         elif self.body_encoding == BASE64:
 272             return 'base64'
 273         else:
 274             return encode_7or8bit
 275 
 276     def get_output_charset(self):
 277         """Return the output character set.
 278 
 279         This is self.output_charset if that is not None, otherwise it is
 280         self.input_charset.
 281         """
 282         return self.output_charset or self.input_charset
 283 
 284     def header_encode(self, string):
 285         """Header-encode a string by converting it first to bytes.
 286 
 287         The type of encoding (base64 or quoted-printable) will be based on
 288         this charset's `header_encoding`.
 289 
 290         :param string: A unicode string for the header.  It must be possible
 291             to encode this string to bytes using the character set's
 292             output codec.
 293         :return: The encoded string, with RFC 2047 chrome.
 294         """
 295         codec = self.output_codec or 'us-ascii'
 296         header_bytes = _encode(string, codec)
 297         # 7bit/8bit encodings return the string unchanged (modulo conversions)
 298         encoder_module = self._get_encoder(header_bytes)
 299         if encoder_module is None:
 300             return string
 301         return encoder_module.header_encode(header_bytes, codec)
 302 
 303     def header_encode_lines(self, string, maxlengths):
 304         """Header-encode a string by converting it first to bytes.
 305 
 306         This is similar to `header_encode()` except that the string is fit
 307         into maximum line lengths as given by the argument.
 308 
 309         :param string: A unicode string for the header.  It must be possible
 310             to encode this string to bytes using the character set's
 311             output codec.
 312         :param maxlengths: Maximum line length iterator.  Each element
 313             returned from this iterator will provide the next maximum line
 314             length.  This parameter is used as an argument to built-in next()
 315             and should never be exhausted.  The maximum line lengths should
 316             not count the RFC 2047 chrome.  These line lengths are only a
 317             hint; the splitter does the best it can.
 318         :return: Lines of encoded strings, each with RFC 2047 chrome.
 319         """
 320         # See which encoding we should use.
 321         codec = self.output_codec or 'us-ascii'
 322         header_bytes = _encode(string, codec)
 323         encoder_module = self._get_encoder(header_bytes)
 324         encoder = partial(encoder_module.header_encode, charset=codec)
 325         # Calculate the number of characters that the RFC 2047 chrome will
 326         # contribute to each line.
 327         charset = self.get_output_charset()
 328         extra = len(charset) + RFC2047_CHROME_LEN
 329         # Now comes the hard part.  We must encode bytes but we can't split on
 330         # bytes because some character sets are variable length and each
 331         # encoded word must stand on its own.  So the problem is you have to
 332         # encode to bytes to figure out this word's length, but you must split
 333         # on characters.  This causes two problems: first, we don't know how
 334         # many octets a specific substring of unicode characters will get
 335         # encoded to, and second, we don't know how many ASCII characters
 336         # those octets will get encoded to.  Unless we try it.  Which seems
 337         # inefficient.  In the interest of being correct rather than fast (and
 338         # in the hope that there will be few encoded headers in any such
 339         # message), brute force it. :(
 340         lines = []
 341         current_line = []
 342         maxlen = next(maxlengths) - extra
 343         for character in string:
 344             current_line.append(character)
 345             this_line = EMPTYSTRING.join(current_line)
 346             length = encoder_module.header_length(_encode(this_line, charset))
 347             if length > maxlen:
 348                 # This last character doesn't fit so pop it off.
 349                 current_line.pop()
 350                 # Does nothing fit on the first line?
 351                 if not lines and not current_line:
 352                     lines.append(None)
 353                 else:
 354                     separator = (' ' if lines else '')
 355                     joined_line = EMPTYSTRING.join(current_line)
 356                     header_bytes = _encode(joined_line, codec)
 357                     lines.append(encoder(header_bytes))
 358                 current_line = [character]
 359                 maxlen = next(maxlengths) - extra
 360         joined_line = EMPTYSTRING.join(current_line)
 361         header_bytes = _encode(joined_line, codec)
 362         lines.append(encoder(header_bytes))
 363         return lines
 364 
 365     def _get_encoder(self, header_bytes):
 366         if self.header_encoding == BASE64:
 367             return email.base64mime
 368         elif self.header_encoding == QP:
 369             return email.quoprimime
 370         elif self.header_encoding == SHORTEST:
 371             len64 = email.base64mime.header_length(header_bytes)
 372             lenqp = email.quoprimime.header_length(header_bytes)
 373             if len64 < lenqp:
 374                 return email.base64mime
 375             else:
 376                 return email.quoprimime
 377         else:
 378             return None
 379 
 380     def body_encode(self, string):
 381         """Body-encode a string by converting it first to bytes.
 382 
 383         The type of encoding (base64 or quoted-printable) will be based on
 384         self.body_encoding.  If body_encoding is None, we assume the
 385         output charset is a 7bit encoding, so re-encoding the decoded
 386         string using the ascii codec produces the correct string version
 387         of the content.
 388         """
 389         # 7bit/8bit encodings return the string unchanged (module conversions)
 390         if self.body_encoding is BASE64:
 391             if isinstance(string, str):
 392                 string = string.encode(self.output_charset)
 393             return email.base64mime.body_encode(string)
 394         elif self.body_encoding is QP:
 395             # quopromime.body_encode takes a string, but operates on it as if
 396             # it were a list of byte codes.  For a (minimal) history on why
 397             # this is so, see changeset 0cf700464177.  To correctly encode a
 398             # character set, then, we must turn it into pseudo bytes via the
 399             # latin1 charset, which will encode any byte as a single code point
 400             # between 0 and 255, which is what body_encode is expecting.
 401             #
 402             # Note that this clause doesn't handle the case of a _payload that
 403             # is already bytes.  It never did, and the semantics of _payload
 404             # being bytes has never been nailed down, so fixing that is a
 405             # longer term TODO.
 406             if isinstance(string, str):
 407                 string = string.encode(self.output_charset).decode('latin1')
 408             return email.quoprimime.body_encode(string)
 409         else:
 410             if isinstance(string, str):
 411                 string = string.encode(self.output_charset).decode('ascii')
 412             return string