Discussion:
[Python-3000-checkins] r67310 - in python/branches/py3k: Lib/dbm/dumb.py Lib/test/test_dbm_dumb.py Misc/NEWS
brett.cannon
2008-11-21 00:17:54 UTC
Permalink
Author: brett.cannon
Date: Fri Nov 21 01:17:53 2008
New Revision: 67310

Log:
Make dbm.dumb encode strings as UTF-8. Also fix it so it accepts bytes and
strings.

Closes issue #3799.


Modified:
python/branches/py3k/Lib/dbm/dumb.py
python/branches/py3k/Lib/test/test_dbm_dumb.py
python/branches/py3k/Misc/NEWS

Modified: python/branches/py3k/Lib/dbm/dumb.py
==============================================================================
--- python/branches/py3k/Lib/dbm/dumb.py (original)
+++ python/branches/py3k/Lib/dbm/dumb.py Fri Nov 21 01:17:53 2008
@@ -84,6 +84,7 @@
for line in f:
line = line.rstrip()
key, pos_and_siz_pair = eval(line)
+ key = key.encode('Latin-1')
self._index[key] = pos_and_siz_pair
f.close()

@@ -110,13 +111,16 @@
f = self._io.open(self._dirfile, 'w')
self._chmod(self._dirfile)
for key, pos_and_siz_pair in self._index.items():
- f.write("%r, %r\n" % (key, pos_and_siz_pair))
+ # Use Latin-1 since it has no qualms with any value in any
+ # position; UTF-8, though, does care sometimes.
+ f.write("%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair))
f.close()

sync = _commit

def __getitem__(self, key):
- key = key.decode("latin-1")
+ if isinstance(key, str):
+ key = key.encode('utf-8')
pos, siz = self._index[key] # may raise KeyError
f = _io.open(self._datfile, 'rb')
f.seek(pos)
@@ -161,11 +165,12 @@
f.close()

def __setitem__(self, key, val):
- if not isinstance(key, bytes):
- raise TypeError("keys must be bytes")
- key = key.decode("latin-1") # hashable bytes
+ if isinstance(key, str):
+ key = key.encode('utf-8')
+ elif not isinstance(key, (bytes, bytearray)):
+ raise TypeError("keys must be bytes or strings")
if not isinstance(val, (bytes, bytearray)):
- raise TypeError("values must be byte strings")
+ raise TypeError("values must be bytes")
if key not in self._index:
self._addkey(key, self._addval(val))
else:
@@ -191,7 +196,8 @@
# (so that _commit() never gets called).

def __delitem__(self, key):
- key = key.decode("latin-1")
+ if isinstance(key, str):
+ key = key.encode('utf-8')
# The blocks used by the associated value are lost.
del self._index[key]
# XXX It's unclear why we do a _commit() here (the code always
@@ -201,14 +207,14 @@
self._commit()

def keys(self):
- return [key.encode("latin-1") for key in self._index.keys()]
+ return list(self._index.keys())

def items(self):
- return [(key.encode("latin-1"), self[key.encode("latin-1")])
- for key in self._index.keys()]
+ return [(key, self[key]) for key in self._index.keys()]

def __contains__(self, key):
- key = key.decode("latin-1")
+ if isinstance(key, str):
+ key = key.encode('utf-8')
return key in self._index

def iterkeys(self):

Modified: python/branches/py3k/Lib/test/test_dbm_dumb.py
==============================================================================
--- python/branches/py3k/Lib/test/test_dbm_dumb.py (original)
+++ python/branches/py3k/Lib/test/test_dbm_dumb.py Fri Nov 21 01:17:53 2008
@@ -19,13 +19,14 @@
pass

class DumbDBMTestCase(unittest.TestCase):
- _dict = {'0': b'',
- 'a': b'Python:',
- 'b': b'Programming',
- 'c': b'the',
- 'd': b'way',
- 'f': b'Guido',
- 'g': b'intended',
+ _dict = {b'0': b'',
+ b'a': b'Python:',
+ b'b': b'Programming',
+ b'c': b'the',
+ b'd': b'way',
+ b'f': b'Guido',
+ b'g': b'intended',
+ '\u00fc'.encode('utf-8') : b'!',
}

def __init__(self, *args):
@@ -35,7 +36,7 @@
f = dumbdbm.open(_fname, 'c')
self.assertEqual(list(f.keys()), [])
for key in self._dict:
- f[key.encode("ascii")] = self._dict[key]
+ f[key] = self._dict[key]
self.read_helper(f)
f.close()

@@ -73,7 +74,7 @@
def test_dumbdbm_modification(self):
self.init_db()
f = dumbdbm.open(_fname, 'w')
- self._dict['g'] = f[b'g'] = b"indented"
+ self._dict[b'g'] = f[b'g'] = b"indented"
self.read_helper(f)
f.close()

@@ -105,6 +106,21 @@
self.assertEqual(f[b'1'], b'hello2')
f.close()

+ def test_str_read(self):
+ self.init_db()
+ f = dumbdbm.open(_fname, 'r')
+ self.assertEqual(f['\u00fc'], self._dict['\u00fc'.encode('utf-8')])
+
+ def test_str_write_contains(self):
+ self.init_db()
+ f = dumbdbm.open(_fname)
+ f['\u00fc'] = b'!'
+ f.close()
+ f = dumbdbm.open(_fname, 'r')
+ self.assert_('\u00fc' in f)
+ self.assertEqual(f['\u00fc'.encode('utf-8')],
+ self._dict['\u00fc'.encode('utf-8')])
+
def test_line_endings(self):
# test for bug #1172763: dumbdbm would die if the line endings
# weren't what was expected.
@@ -129,16 +145,16 @@
def read_helper(self, f):
keys = self.keys_helper(f)
for key in self._dict:
- self.assertEqual(self._dict[key], f[key.encode("ascii")])
+ self.assertEqual(self._dict[key], f[key])

def init_db(self):
f = dumbdbm.open(_fname, 'w')
for k in self._dict:
- f[k.encode("ascii")] = self._dict[k]
+ f[k] = self._dict[k]
f.close()

def keys_helper(self, f):
- keys = sorted(k.decode("ascii") for k in f.keys())
+ keys = sorted(f.keys())
dkeys = sorted(self._dict.keys())
self.assertEqual(keys, dkeys)
return keys
@@ -155,12 +171,12 @@
if random.random() < 0.2:
if k in d:
del d[k]
- del f[k.encode("ascii")]
+ del f[k]
else:
v = random.choice((b'a', b'b', b'c')) * random.randrange(10000)
d[k] = v
- f[k.encode("ascii")] = v
- self.assertEqual(f[k.encode("ascii")], v)
+ f[k] = v
+ self.assertEqual(f[k], v)
f.close()

f = dumbdbm.open(_fname)

Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS (original)
+++ python/branches/py3k/Misc/NEWS Fri Nov 21 01:17:53 2008
@@ -19,7 +19,7 @@
- Issue #3327: Don't overallocate in the modules_by_index list.

- Issue #1721812: Binary set operations and copy() returned the input type
- instead of the appropriate base type. This was incorrect because set
+ instead of the appropriate base type. This was incorrect because set
subclasses would be created without their __init__() method being called.
The corrected behavior brings sets into line with lists and dicts.

@@ -33,6 +33,9 @@
Library
-------

+- Issue #3799: Fix dbm.dumb to accept strings as well as bytes for keys. String
+ keys are now written out in UTF-8.
+
- Issue #4338: Fix distutils upload command.

- Issue #4354: Fix distutils register command.
skip
2008-11-21 13:12:45 UTC
Permalink
Author: brett.cannon
Date: Fri Nov 21 01:17:53 2008
New Revision: 67310

Log:
Make dbm.dumb encode strings as UTF-8. Also fix it so it accepts bytes and
strings.

Closes issue #3799.

I'm not online right now so I can't verify this by reviewing the ticket, but
I thought Guido was of the opinion that the 3.0 version should be able to
read dumb dbms written by earlier Python versions. That would mean that if
trying to read utf-8 content fails you need to fall back to latin-1.

Skip
skip
2008-11-21 15:36:09 UTC
Permalink
me> ... I thought Guido was of the opinion that the 3.0 version should
me> be able to read dumb dbms written by earlier Python versions....

And write them. From msg72963:

(1) Be able to read databases written by Python 2.x.

(1a) Write databases readable by Python 2.x.

Ah, but wait a minute. I see your comment in msg76080:

If you look at the 2.7 code all it requires of keys and values in
__setitem__ is that they are strings; there is nothing about Latin-1 in
terms of specific encoding (must be a 3.0 addition to make the
str/unicode transition the easiest).

The acid test. I executed the attached mydb2write.py using Python 2.5 then
executed the attached mydb3read.py using Python 3.0. The output:

% python2.5 mydb2write.py
1 abc
2 [4, {4.2999999999999998: 12}]
3 <__main__.C instance at 0x34bb70>
% python3.0 mydb3read.py
1 b'abc'
2 [4, {4.2999999999999998: 12}]
Traceback (most recent call last):
File "mydb3read.py", line 13, in <module>
print(3, pickle.loads(db['3']))
File "/Users/skip/local/lib/python3.0/pickle.py", line 1329, in loads
return Unpickler(file, encoding=encoding, errors=errors).load()
_pickle.UnpicklingError: bad pickle data

so if the ability to read Python 2.x dumbdbm files is still a requirement I
think there's a little more work to do.

cc'ing report at bugs.python.org to preserve the scripts with the ticket.

Skip

-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/octet-stream
Size: 492 bytes
Desc: dumbdbm write (Python 2.5)
URL: <http://mail.python.org/pipermail/python-3000-checkins/attachments/20081121/34e0f3d9/attachment.obj>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/octet-stream
Size: 237 bytes
Desc: dbm.dumb read script (Python 3.0)
URL: <http://mail.python.org/pipermail/python-3000-checkins/attachments/20081121/34e0f3d9/attachment-0001.obj>
Guido van Rossum
2008-11-21 15:44:54 UTC
Permalink
I think the ability to read old files is essential. The ability to
write them is a mere nice-to-have.
Post by skip
me> ... I thought Guido was of the opinion that the 3.0 version should
me> be able to read dumb dbms written by earlier Python versions....
(1) Be able to read databases written by Python 2.x.
(1a) Write databases readable by Python 2.x.
If you look at the 2.7 code all it requires of keys and values in
__setitem__ is that they are strings; there is nothing about Latin-1 in
terms of specific encoding (must be a 3.0 addition to make the
str/unicode transition the easiest).
The acid test. I executed the attached mydb2write.py using Python 2.5 then
% python2.5 mydb2write.py
1 abc
2 [4, {4.2999999999999998: 12}]
3 <__main__.C instance at 0x34bb70>
% python3.0 mydb3read.py
1 b'abc'
2 [4, {4.2999999999999998: 12}]
File "mydb3read.py", line 13, in <module>
print(3, pickle.loads(db['3']))
File "/Users/skip/local/lib/python3.0/pickle.py", line 1329, in loads
return Unpickler(file, encoding=encoding, errors=errors).load()
_pickle.UnpicklingError: bad pickle data
so if the ability to read Python 2.x dumbdbm files is still a requirement I
think there's a little more work to do.
cc'ing report at bugs.python.org to preserve the scripts with the ticket.
Skip
_______________________________________________
Python-3000-checkins mailing list
Python-3000-checkins at python.org
http://mail.python.org/mailman/listinfo/python-3000-checkins
--
--Guido van Rossum (home page: http://www.python.org/~guido/)
Brett Cannon
2008-11-21 18:27:20 UTC
Permalink
I have taken this over to the issue tracker (http://bugs.python.org/issue3799).
Post by brett.cannon
Author: brett.cannon
Date: Fri Nov 21 01:17:53 2008
New Revision: 67310
Make dbm.dumb encode strings as UTF-8. Also fix it so it accepts bytes and
strings.
Closes issue #3799.
I'm not online right now so I can't verify this by reviewing the ticket, but
I thought Guido was of the opinion that the 3.0 version should be able to
read dumb dbms written by earlier Python versions. That would mean that if
trying to read utf-8 content fails you need to fall back to latin-1.
Skip
_______________________________________________
Python-3000-checkins mailing list
Python-3000-checkins at python.org
http://mail.python.org/mailman/listinfo/python-3000-checkins
Loading...