jeremy.hylton
2008-07-18 20:59:45 UTC
Author: jeremy.hylton
Date: Fri Jul 18 22:59:44 2008
New Revision: 65118
Log:
Bug 3347: robotparser failed because it didn't convert bytes to str.
The solution is to decode the bytes read from the server as UTF-8. I'm
not entirely sure this is safe, but robots.txt appears to be expected to
be ASCII, and ASCII is a subset of UTF-8, so such files decode unchanged.
Modified:
python/branches/py3k/Lib/test/test_robotparser.py
python/branches/py3k/Lib/urllib/robotparser.py
Modified: python/branches/py3k/Lib/test/test_robotparser.py
==============================================================================
--- python/branches/py3k/Lib/test/test_robotparser.py (original)
+++ python/branches/py3k/Lib/test/test_robotparser.py Fri Jul 18 22:59:44 2008
@@ -136,8 +136,9 @@
RobotTest(7, doc, good, bad)
-class TestCase(unittest.TestCase):
- def runTest(self):
+class NetworkTestCase(unittest.TestCase):
+
+ def testPasswordProtectedSite(self):
support.requires('network')
# whole site is password-protected.
url = 'http://mueblesmoraleda.com'
@@ -146,9 +147,17 @@
parser.read()
self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
+ def testPythonOrg(self):
+ support.requires('network')
+ parser = urllib.robotparser.RobotFileParser(
+ "http://www.python.org/robots.txt")
+ parser.read()
+ self.assertTrue(parser.can_fetch("*",
+ "http://www.python.org/robots.txt"))
+
def test_main():
+ support.run_unittest(NetworkTestCase)
support.run_unittest(tests)
- TestCase().run()
if __name__=='__main__':
support.Verbose = 1
Modified: python/branches/py3k/Lib/urllib/robotparser.py
==============================================================================
--- python/branches/py3k/Lib/urllib/robotparser.py (original)
+++ python/branches/py3k/Lib/urllib/robotparser.py Fri Jul 18 22:59:44 2008
@@ -60,7 +60,8 @@
elif err.code >= 400:
self.allow_all = True
else:
- self.parse(f.read().splitlines())
+ raw = f.read()
+ self.parse(raw.decode("utf-8").splitlines())
def _add_entry(self, entry):
if "*" in entry.useragents:
@@ -123,7 +124,10 @@
return True
# search for given user agent matches
# the first match counts
- url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
+ url = urllib.parse.quote(
+ urllib.parse.urlparse(urllib.parse.unquote(url))[2])
+ if not url:
+ url = "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
Date: Fri Jul 18 22:59:44 2008
New Revision: 65118
Log:
Bug 3347: robotparser failed because it didn't convert bytes to str.
The solution is to decode the bytes read from the server as UTF-8. I'm
not entirely sure this is safe, but robots.txt appears to be expected to
be ASCII, and ASCII is a subset of UTF-8, so such files decode unchanged.
Modified:
python/branches/py3k/Lib/test/test_robotparser.py
python/branches/py3k/Lib/urllib/robotparser.py
Modified: python/branches/py3k/Lib/test/test_robotparser.py
==============================================================================
--- python/branches/py3k/Lib/test/test_robotparser.py (original)
+++ python/branches/py3k/Lib/test/test_robotparser.py Fri Jul 18 22:59:44 2008
@@ -136,8 +136,9 @@
RobotTest(7, doc, good, bad)
-class TestCase(unittest.TestCase):
- def runTest(self):
+class NetworkTestCase(unittest.TestCase):
+
+ def testPasswordProtectedSite(self):
support.requires('network')
# whole site is password-protected.
url = 'http://mueblesmoraleda.com'
@@ -146,9 +147,17 @@
parser.read()
self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
+ def testPythonOrg(self):
+ support.requires('network')
+ parser = urllib.robotparser.RobotFileParser(
+ "http://www.python.org/robots.txt")
+ parser.read()
+ self.assertTrue(parser.can_fetch("*",
+ "http://www.python.org/robots.txt"))
+
def test_main():
+ support.run_unittest(NetworkTestCase)
support.run_unittest(tests)
- TestCase().run()
if __name__=='__main__':
support.Verbose = 1
Modified: python/branches/py3k/Lib/urllib/robotparser.py
==============================================================================
--- python/branches/py3k/Lib/urllib/robotparser.py (original)
+++ python/branches/py3k/Lib/urllib/robotparser.py Fri Jul 18 22:59:44 2008
@@ -60,7 +60,8 @@
elif err.code >= 400:
self.allow_all = True
else:
- self.parse(f.read().splitlines())
+ raw = f.read()
+ self.parse(raw.decode("utf-8").splitlines())
def _add_entry(self, entry):
if "*" in entry.useragents:
@@ -123,7 +124,10 @@
return True
# search for given user agent matches
# the first match counts
- url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
+ url = urllib.parse.quote(
+ urllib.parse.urlparse(urllib.parse.unquote(url))[2])
+ if not url:
+ url = "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)