[Python-3000-checkins] r66951 - in python/branches/py3k: Lib/test/test_pep3120.py Misc/NEWS Parser/tokenizer.c Parser/tokenizer.h Python/ast.c

brett.cannon

2008-10-17 03:38:51 UTC

Author: brett.cannon
Date: Fri Oct 17 05:38:50 2008
New Revision: 66951

Log:
Latin-1 source code was not being properly decoded when passed through
compile(). This was due to special-casing left over from before UTF-8 became
the default source encoding.

Closes issue #3574. Thanks to Victor Stinner for help with the patch.

Modified:
python/branches/py3k/Lib/test/test_pep3120.py
python/branches/py3k/Misc/NEWS
python/branches/py3k/Parser/tokenizer.c
python/branches/py3k/Parser/tokenizer.h
python/branches/py3k/Python/ast.c

Modified: python/branches/py3k/Lib/test/test_pep3120.py
==============================================================================
Binary files. No diff available.

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS (original)
+++ python/branches/py3k/Misc/NEWS Fri Oct 17 05:38:50 2008
@@ -15,6 +15,8 @@
Core and Builtins
-----------------

+- Issue #3574: compile() incorrectly handled source code encoded as Latin-1.
+
- Issues #2384 and #3975: Tracebacks were not correctly printed when the
source file contains a ``coding:`` header: the wrong line was displayed, and
the encoding was not respected.

Modified: python/branches/py3k/Parser/tokenizer.c
==============================================================================
--- python/branches/py3k/Parser/tokenizer.c (original)
+++ python/branches/py3k/Parser/tokenizer.c Fri Oct 17 05:38:50 2008
@@ -135,6 +135,7 @@
tok->decoding_state = STATE_INIT;
tok->decoding_erred = 0;
tok->read_coding_spec = 0;
+ tok->enc = NULL;
tok->encoding = NULL;
tok->cont_line = 0;
#ifndef PGEN
@@ -274,8 +275,7 @@
tok->read_coding_spec = 1;
if (tok->encoding == NULL) {
assert(tok->decoding_state == STATE_RAW);
- if (strcmp(cs, "utf-8") == 0 ||
- strcmp(cs, "iso-8859-1") == 0) {
+ if (strcmp(cs, "utf-8") == 0) {
tok->encoding = cs;
} else {
r = set_readline(tok, cs);

Modified: python/branches/py3k/Parser/tokenizer.h
==============================================================================
--- python/branches/py3k/Parser/tokenizer.h (original)
+++ python/branches/py3k/Parser/tokenizer.h Fri Oct 17 05:38:50 2008
@@ -49,14 +49,14 @@
enum decoding_state decoding_state;
int decoding_erred; /* whether erred in decoding */
int read_coding_spec; /* whether 'coding:...' has been read */
- char *encoding;
+ char *encoding; /* Source encoding. */
int cont_line; /* whether we are in a continuation line. */
const char* line_start; /* pointer to start of current line */
#ifndef PGEN
PyObject *decoding_readline; /* codecs.open(...).readline */
PyObject *decoding_buffer;
#endif
- const char* enc;
+ const char* enc; /* Encoding for the current str. */
const char* str;
};

Modified: python/branches/py3k/Python/ast.c
==============================================================================
--- python/branches/py3k/Python/ast.c (original)
+++ python/branches/py3k/Python/ast.c Fri Oct 17 05:38:50 2008
@@ -3160,9 +3160,6 @@
if (encoding == NULL) {
buf = (char *)s;
u = NULL;
- } else if (strcmp(encoding, "iso-8859-1") == 0) {
- buf = (char *)s;
- u = NULL;
} else {
/* check for integer overflow */
if (len > PY_SIZE_MAX / 4)
@@ -3275,8 +3272,7 @@
}
}
need_encoding = (!*bytesmode && c->c_encoding != NULL &&
- strcmp(c->c_encoding, "utf-8") != 0 &&
- strcmp(c->c_encoding, "iso-8859-1") != 0);
+ strcmp(c->c_encoding, "utf-8") != 0);
if (rawmode || strchr(s, '\\') == NULL) {
if (need_encoding) {
PyObject *v, *u = PyUnicode_DecodeUTF8(s, len, NULL);