Commit 1510ac32 authored by echel0n's avatar echel0n

Merge branch 'release/v3.2.1'

parents c86aadca 360c3afa
# -*- coding: utf-8 -*-
"""
ftfy: fixes text for you
This is a module for making text less broken. See the `fix_text` function
for more information.
"""
from __future__ import unicode_literals
# See the docstring for ftfy.bad_codecs to see what we're doing here.
import ftfy.bad_codecs
ftfy.bad_codecs.ok()
from ftfy import fixes
from ftfy.fixes import fix_text_encoding
from ftfy.compatibility import PYTHON34_OR_LATER, is_printable
import unicodedata
import warnings
def fix_text(text,
             remove_unsafe_private_use=(not PYTHON34_OR_LATER),
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True,
             max_decode_length=2**16):
    r"""
    Given Unicode text as input, make its representation consistent and
    possibly less broken.

    Let's start with some examples:

        >>> print(fix_text('ünicode'))
        ünicode

        >>> print(fix_text('Broken text… it’s flubberific!'))
        Broken text... it's flubberific!

        >>> print(fix_text('HTML entities &lt;3'))
        HTML entities <3

        >>> print(fix_text('<em>HTML entities &lt;3</em>'))
        <em>HTML entities &lt;3</em>

        >>> print(fix_text('\001\033[36;44mI&#x92;m blue, da ba dee da ba '
        ...               'doo&#133;\033[0m'))
        I'm blue, da ba dee da ba doo...

        >>> # This example string starts with a byte-order mark, even if
        >>> # you can't see it on the Web.
        >>> print(fix_text('\ufeffParty like\nit&rsquo;s 1999!'))
        Party like
        it's 1999!

        >>> len(fix_text('ﬁ' * 100000))
        200000

        >>> len(fix_text(''))
        0

    Based on the options you provide, ftfy applies these steps in order:

    - If `remove_unsafe_private_use` is True, it removes a range of private-use
      characters that could trigger a Python bug. The bug is fixed in
      the most recent versions of Python, so this will default to False
      starting on Python 3.4.
    - If `fix_entities` is True, replace HTML entities with their equivalent
      characters. If it's "auto" (the default), then consider replacing HTML
      entities, but don't do so in text where you have seen a pair of actual
      angle brackets (that's probably actually HTML and you shouldn't mess
      with the entities).
    - If `remove_terminal_escapes` is True, remove sequences of bytes that are
      instructions for Unix terminals, such as the codes that make text appear
      in different colors.
    - If `fix_encoding` is True, look for common mistakes that come from
      encoding or decoding Unicode text incorrectly, and fix them if they are
      reasonably fixable. See `fix_text_encoding` for details.
    - If `normalization` is not None, apply the specified form of Unicode
      normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
      The default, 'NFKC', applies the following relevant transformations:

      - C: Combine characters and diacritics that are written using separate
        code points, such as converting "e" plus an acute accent modifier
        into "é", or converting "ka" (か) plus a dakuten into the
        single character "ga" (が).
      - K: Replace characters that are functionally equivalent with the most
        common form. For example, half-width katakana will be replaced with
        full-width versions, full-width Roman characters will be replaced with
        ASCII characters, ellipsis characters will be replaced with three
        periods, and the ligature 'ﬁ' will be replaced with 'fi'.

    - If `uncurl_quotes` is True, replace various curly quotation marks with
      plain-ASCII straight quotes.
    - If `fix_line_breaks` is true, convert all line breaks to Unix style
      (CRLF and CR line breaks become LF line breaks).
    - If `remove_control_chars` is true, remove all C0 control characters
      except the common useful ones: TAB, CR, LF, and FF. (CR characters
      may have already been removed by the `fix_line_breaks` step.)
    - If `remove_bom` is True, remove the Byte-Order Mark if it exists.
    - If anything was changed, repeat all the steps, so that the function is
      idempotent. "&amp;amp;" will become "&", for example, not "&amp;".

    `fix_text` will work one line at a time, with the possibility that some
    lines are in different encodings. When it encounters lines longer than
    `max_decode_length`, it will not run the `fix_encoding` step, to avoid
    unbounded slowdowns.

    If you are certain your entire text is in the same encoding (though that
    encoding is possibly flawed), and do not mind performing operations on
    the whole text at once, use `fix_text_segment`.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    out = []
    pos = 0
    while pos < len(text):
        # Work one line at a time. `textbreak` is one past the next newline,
        # or 0 if there are no more newlines.
        textbreak = text.find('\n', pos) + 1
        fix_encoding_this_time = fix_encoding
        if textbreak == 0:
            textbreak = len(text)
        if (textbreak - pos) > max_decode_length:
            # This line is too long; skip the expensive encoding-repair step
            # so we don't get an unbounded slowdown.
            fix_encoding_this_time = False

        substring = text[pos:textbreak]

        if fix_entities == 'auto' and '<' in substring and '>' in substring:
            # we see angle brackets together; this could be HTML, so don't
            # touch the entities anywhere in this text
            fix_entities = False

        out.append(
            fix_text_segment(
                substring,
                remove_unsafe_private_use=remove_unsafe_private_use,
                fix_entities=fix_entities,
                remove_terminal_escapes=remove_terminal_escapes,
                fix_encoding=fix_encoding_this_time,
                normalization=normalization,
                uncurl_quotes=uncurl_quotes,
                fix_line_breaks=fix_line_breaks,
                remove_control_chars=remove_control_chars,
                remove_bom=remove_bom
            )
        )
        pos = textbreak

    return ''.join(out)

ftfy = fix_text
def fix_file(input_file,
             remove_unsafe_private_use=True,
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=True,
             normalization='NFKC',
             uncurl_quotes=True,
             fix_line_breaks=True,
             remove_control_chars=True,
             remove_bom=True):
    """
    Fix text that is found in a file, yielding one fixed line at a time.

    If the file is being read as Unicode text, use that. If it's being read as
    bytes, then unfortunately, we have to guess what encoding it is. We'll try
    a few common encodings, but we make no promises. See the `guess_bytes`
    function for how this is done.

    The output is a stream of fixed lines of text.
    """
    entities = fix_entities
    for line in input_file:
        if isinstance(line, bytes):
            # We got raw bytes; decode them with our best guess at the
            # encoding (the guessed encoding name itself is not needed here).
            line, _encoding = guess_bytes(line)
        if fix_entities == 'auto' and '<' in line and '>' in line:
            # Paired angle brackets suggest HTML: stop unescaping entities
            # for the rest of the file.
            entities = False
        yield fix_text_segment(
            line,
            remove_unsafe_private_use=remove_unsafe_private_use,
            fix_entities=entities,
            remove_terminal_escapes=remove_terminal_escapes,
            fix_encoding=fix_encoding,
            normalization=normalization,
            uncurl_quotes=uncurl_quotes,
            fix_line_breaks=fix_line_breaks,
            remove_control_chars=remove_control_chars,
            remove_bom=remove_bom
        )
def fix_text_segment(text,
                     remove_unsafe_private_use=True,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        # Paired angle brackets suggest real HTML; leave its entities alone.
        fix_entities = False

    # Run every enabled fixer, in order, until a full pass leaves the text
    # unchanged. This makes the function idempotent.
    previous = None
    while text != previous:
        previous = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
    return text
def guess_bytes(bstring):
    """
    If you have some bytes in an unknown encoding, here's a reasonable
    strategy for decoding them, by trying a few common encodings that
    can be distinguished from each other.

    This is not a magic bullet. If the bytes are coming from some MySQL
    database with the "character set" set to ISO Elbonian, this won't figure
    it out. Perhaps more relevantly, this currently doesn't try East Asian
    encodings.

    The encodings we try are:

    - UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
      like nothing else
    - UTF-8, because it's the global de facto standard
    - "utf-8-variants", because it's what people actually implement when they
      think they're doing UTF-8
    - MacRoman, because Microsoft Office thinks it's still a thing, and it
      can be distinguished by its line breaks. (If there are no line breaks in
      the string, though, you're out of luck.)
    - "sloppy-windows-1252", the Latin-1-like encoding that is the most common
      single-byte encoding

    Returns a `(decoded_text, encoding_name)` tuple.
    """
    # A UTF-16 byte order mark (either endianness) is unmistakable.
    if bstring.startswith((b'\xfe\xff', b'\xff\xfe')):
        return bstring.decode('utf-16'), 'utf-16'

    byte_ed, byte_c0, byte_cr, byte_lf = b'\xed\xc0\r\n'
    present = set(bytes(bstring))
    try:
        if byte_ed in present or byte_c0 in present:
            # Byte 0xed can be used to encode a range of codepoints that
            # are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
            # so when we see 0xed, it's very likely we're being asked to
            # decode CESU-8, the variant that encodes UTF-16 surrogates
            # instead of the original characters themselves.
            #
            # This will occasionally trigger on standard UTF-8, as there
            # are some Korean characters that also use byte 0xed, but that's
            # not harmful.
            #
            # Byte 0xc0 is impossible because, numerically, it would only
            # encode characters lower than U+0040. Those already have
            # single-byte representations, and UTF-8 requires using the
            # shortest possible representation. However, Java hides the null
            # codepoint, U+0000, in a non-standard longer representation -- it
            # encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
            # will never appear in the encoded bytes.
            #
            # The 'utf-8-variants' decoder can handle both of these cases, as
            # well as standard UTF-8, at the cost of a bit of speed.
            return bstring.decode('utf-8-variants'), 'utf-8-variants'
        return bstring.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        pass

    # Carriage returns without line feeds are the MacRoman tell.
    if byte_cr in bstring and byte_lf not in bstring:
        return bstring.decode('macroman'), 'macroman'
    return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'
def explain_unicode(text):
    """
    A utility method that's useful for debugging mysterious Unicode.

    It breaks down a string, showing you for each codepoint its number in
    hexadecimal, its glyph, its category in the Unicode standard, and its name
    in the Unicode standard.

        >>> explain_unicode('(╯°□°)╯︵ ┻━┻')
        U+0028 ( [Ps] LEFT PARENTHESIS
        U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+00B0 ° [So] DEGREE SIGN
        U+25A1 □ [So] WHITE SQUARE
        U+00B0 ° [So] DEGREE SIGN
        U+0029 ) [Pe] RIGHT PARENTHESIS
        U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
        U+FE35 ︵ [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
        U+0020   [Zs] SPACE
        U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
        U+2501 ━ [So] BOX DRAWINGS HEAVY HORIZONTAL
        U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
    """
    for char in text:
        # Show unprintable characters as backslash escapes instead of as
        # themselves, so the output stays readable.
        if is_printable(char):
            display = char
        else:
            display = char.encode('unicode-escape').decode('ascii')
        print('U+{0:04X} {1:<7} [{2}] {3}'.format(
            ord(char),
            display,
            unicodedata.category(char),
            unicodedata.name(char, '<unknown>')
        ))
def fix_bad_encoding(text):
    """
    Deprecated alias for `fix_text_encoding`, kept for compatibility with
    previous versions of ftfy. Emits a DeprecationWarning when called.
    """
    warnings.warn(
        'fix_bad_encoding is now known as fix_text_encoding',
        DeprecationWarning
    )
    return fix_text_encoding(text)
# coding: utf-8
r"""
Give Python the ability to decode some common, flawed encodings.
Python does not want you to be sloppy with your text. Its encoders and decoders
("codecs") follow the relevant standards whenever possible, which means that
when you get text that *doesn't* follow those standards, you'll probably fail
to decode it. Or you might succeed at decoding it for implementation-specific
reasons, which is perhaps worse.
There are some encodings out there that Python wishes didn't exist, which are
widely used outside of Python:
- "utf-8-variants", a family of not-quite-UTF-8 encodings, including the
ever-popular CESU-8 and "Java modified UTF-8".
- "Sloppy" versions of character map encodings, where bytes that don't map to
anything will instead map to the Unicode character with the same number.
Simply importing this module, or in fact any part of the `ftfy` package, will
make these new "bad codecs" available to Python through the standard Codecs
API. You never have to actually call any functions inside `ftfy.bad_codecs`.
However, if you want to call something because your code checker insists on it,
you can call ``ftfy.bad_codecs.ok()``.
A quick example of decoding text that's encoded in CESU-8:
>>> import ftfy.bad_codecs
>>> print(b'\xed\xa0\xbd\xed\xb8\x8d'.decode('utf-8-variants'))
😍
"""
from __future__ import unicode_literals
from encodings import normalize_encoding
import codecs
# Cache of already-resolved codecs, keyed by the encoding name exactly as the
# caller spelled it (see `search_function` below).
_CACHE = {}
# Define some aliases for 'utf-8-variants'. All hyphens get turned into
# underscores, because of `normalize_encoding`.
UTF8_VAR_NAMES = (
'utf_8_variants', 'utf8_variants',
'utf_8_variant', 'utf8_variant',
'utf_8_var', 'utf8_var',
'cesu_8', 'cesu8',
'java_utf_8', 'java_utf8'
)
def search_function(encoding):
    """
    Register our "bad codecs" with Python's codecs API. This involves adding
    a search function that takes in an encoding name, and returns a codec
    for that encoding if it knows one, or None if it doesn't.

    The encodings this will match are:

    - Encodings of the form 'sloppy-windows-NNNN' or 'sloppy-iso-8859-N',
      where the non-sloppy version is an encoding that leaves some bytes
      unmapped to characters.
    - The 'utf-8-variants' encoding, which has the several aliases seen
      above.
    """
    # Fast path: we've already resolved this exact name.
    if encoding in _CACHE:
        return _CACHE[encoding]

    normalized = normalize_encoding(encoding)
    found = None
    if normalized in UTF8_VAR_NAMES:
        # Imported lazily so the submodule is only loaded when needed.
        from ftfy.bad_codecs.utf8_variants import CODEC_INFO
        found = CODEC_INFO
    elif normalized.startswith('sloppy_'):
        from ftfy.bad_codecs.sloppy import CODECS
        found = CODECS.get(normalized)

    if found is not None:
        _CACHE[encoding] = found
    return found
def ok():
    """
    A feel-good function that gives you something to call after importing
    this package.

    Why is this here? Pyflakes. Pyflakes gets upset when you import a module
    and appear not to use it. It doesn't know that you're using it when
    you use the ``unicode.encode`` and ``bytes.decode`` methods with certain
    encodings.
    """
    # Intentionally a no-op.
codecs.register(search_function)
# coding: utf-8
r"""
Decodes single-byte encodings, filling their "holes" in the same messy way that
everyone else does.
A single-byte encoding maps each byte to a Unicode character, except that some
bytes are left unmapped. In the commonly-used Windows-1252 encoding, for
example, bytes 0x81 and 0x8D, among others, have no meaning.
Python, wanting to preserve some sense of decorum, will handle these bytes
as errors. But Windows knows that 0x81 and 0x8D are possible bytes and they're
different from each other. It just hasn't defined what they are in terms of
Unicode.
Software that has to interoperate with Windows-1252 and Unicode -- such as all
the common Web browsers -- will pick some Unicode characters for them to map
to, and the characters they pick are the Unicode characters with the same
numbers: U+0081 and U+008D. This is the same as what Latin-1 does, and the
resulting characters tend to fall into a range of Unicode that's set aside for
obsolete Latin-1 control characters anyway.
These sloppy codecs let Python do the same thing, thus interoperating with
other software that works this way. It defines a sloppy version of many
single-byte encodings with holes. (There is no need for a sloppy version of
an encoding without holes: for example, there is no such thing as
sloppy-iso-8859-2 or sloppy-macroman.)
The following encodings will become defined:
- sloppy-windows-1250 (Central European, sort of based on ISO-8859-2)
- sloppy-windows-1251 (Cyrillic)
- sloppy-windows-1252 (Western European, based on Latin-1)
- sloppy-windows-1253 (Greek, sort of based on ISO-8859-7)
- sloppy-windows-1254 (Turkish, based on ISO-8859-9)
- sloppy-windows-1255 (Hebrew, based on ISO-8859-8)
- sloppy-windows-1256 (Arabic)
- sloppy-windows-1257 (Baltic, based on ISO-8859-13)
- sloppy-windows-1258 (Vietnamese)
- sloppy-cp874 (Thai, based on ISO-8859-11)
- sloppy-iso-8859-3 (Maltese and Esperanto, I guess)
- sloppy-iso-8859-6 (different Arabic)
- sloppy-iso-8859-7 (Greek)
- sloppy-iso-8859-8 (Hebrew)
- sloppy-iso-8859-11 (Thai)
Aliases such as "sloppy-cp1252" for "sloppy-windows-1252" will also be
defined.
Only sloppy-windows-1251 and sloppy-windows-1252 are used by the rest of ftfy;
the rest are rather uncommon.
Here are some examples, using `ftfy.explain_unicode` to illustrate how
sloppy-windows-1252 merges Windows-1252 with Latin-1:
>>> from ftfy import explain_unicode
>>> some_bytes = b'\x80\x81\x82'
>>> explain_unicode(some_bytes.decode('latin-1'))
U+0080 \x80 [Cc] <unknown>
U+0081 \x81 [Cc] <unknown>
U+0082 \x82 [Cc] <unknown>
>>> explain_unicode(some_bytes.decode('windows-1252', 'replace'))
U+20AC € [Sc] EURO SIGN
U+FFFD � [So] REPLACEMENT CHARACTER
U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
>>> explain_unicode(some_bytes.decode('sloppy-windows-1252'))
U+20AC € [Sc] EURO SIGN
U+0081 \x81 [Cc] <unknown>
U+201A ‚ [Ps] SINGLE LOW-9 QUOTATION MARK
"""
from __future__ import unicode_literals
import codecs
from encodings import normalize_encoding
REPLACEMENT_CHAR = '\ufffd'
def make_sloppy_codec(encoding):
"""
Take a codec name, and return a 'sloppy' version of that codec that can
encode and decode the unassigned bytes in that encoding.
Single-byte encodings in the standard library are defined using some
boilerplate classes surrounding the functions that do the actual work,
`codecs.charmap_decode` and `charmap_encode`. This function, given an
encoding name, *defines* those boilerplate classes.
"""
# Make an array of all 256 possible bytes.
all_bytes = bytearray(range(256))
# Get a list of what they would decode to in Latin-1.
sloppy_chars = list(all_bytes.decode('latin-1'))
# Get a list of what they decode to in the given encoding. Use the
# replacement character for unassigned bytes.
decoded_chars = all_bytes.decode(encoding, 'replace')
# Update the sloppy_chars list. Each byte that was successfully decoded
# gets its decoded value in the list. The unassigned bytes are left as
# they are, which gives their decoding in Latin-1.
for i, char in enumerate(decoded_chars):
if char != REPLACEMENT_CHAR:
sloppy_chars[i] = char
# Create the data structures that tell the charmap methods how to encode
# and decode in this sloppy encoding.
decoding_table = ''.join(sloppy_chars)
encoding_table = codecs.charmap_build(decoding_table)
# Now produce all the class boilerplate. Look at the Python source for
# `encodings.cp1252` for comparison; this is almost exactly the same,
# except I made it follow pep8.
class Codec(codecs.Codec):
def encode(self, input, errors='strict'):
return codecs.charmap_encode(input, errors, encoding_table)
def decode(self, input, errors='strict'):
return codecs.charmap_decode(input, errors, decoding_table)
class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.charmap_encode(input, self.errors, encoding_table)[0]
class IncrementalDecoder(codecs.IncrementalDecoder):
def decode(self, input, final=False):
return codecs.charmap_decode(input, self.errors, decoding_table)[0]
class StreamWriter(Codec, codecs.StreamWriter):
pass
class StreamReader(Codec, codecs.StreamReader):
pass
return codecs.CodecInfo(
name='sloppy-' + encoding,
encode=Codec().encode,
decode=Codec().decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)
# Define a codec for each incomplete encoding. The resulting CODECS dictionary
# can be used by the main module of ftfy.bad_codecs.
CODECS = {}
INCOMPLETE_ENCODINGS = (
    ['windows-%s' % num for num in range(1250, 1259)] +
    ['iso-8859-%s' % num for num in (3, 6, 7, 8, 11)] +
    ['cp%s' % num for num in range(1250, 1259)] + ['cp874']
)
for _encoding in INCOMPLETE_ENCODINGS:
    # Keys are the normalized form of 'sloppy-<name>' (hyphens become
    # underscores), matching what `search_function` looks up.
    CODECS[normalize_encoding('sloppy-' + _encoding)] = make_sloppy_codec(_encoding)
r"""
This file defines a codec called "utf-8-variants" (or "utf-8-var"), which can
decode text that's been encoded with a popular non-standard version of UTF-8.
This includes CESU-8, the accidental encoding made by layering UTF-8 on top of
UTF-16, as well as Java's twist on CESU-8 that contains a two-byte encoding for
codepoint 0.
This is particularly relevant in Python 3, which provides no other way of
decoding CESU-8 or Java's encoding. [1]
The easiest way to use the codec is to simply import `ftfy.bad_codecs`:
>>> import ftfy.bad_codecs
>>> result = b'here comes a null! \xc0\x80'.decode('utf-8-var')
>>> print(repr(result).lstrip('u'))
'here comes a null! \x00'
The codec does not at all enforce "correct" CESU-8. For example, the Unicode
Consortium's not-quite-standard describing CESU-8 requires that there is only
one possible encoding of any character, so it does not allow mixing of valid