From 57977e4d0b0c19babd563fedcbd35efde7f2a12e Mon Sep 17 00:00:00 2001 From: John Lee Date: Mon, 18 Apr 2011 23:40:10 +0100 Subject: [PATCH 1/7] Moved some functions into mechanize-build-tools repository --- release.py | 88 +++++++++++++----------------------------------------- 1 file changed, 21 insertions(+), 67 deletions(-) diff --git a/release.py b/release.py index 4f686d0..48928dd 100644 --- a/release.py +++ b/release.py @@ -25,7 +25,8 @@ # git://github.com/jjlee/mechanize-build-tools.git # TODO - +# * Tag mechanize-build-tools repository when releasing so that builds are +# reproducible # * 0install package? # * test in a Windows VM @@ -34,7 +35,6 @@ import os import re import shutil -import smtplib import subprocess import sys import tempfile @@ -45,8 +45,6 @@ # available or not running under Python >= 2.6. AttributeError occurs if run # with Python < 2.6, due to lack of collections.namedtuple try: - import email.mime.text - import action_tree import build_log import cmd_env @@ -116,57 +114,12 @@ def run_performance_tests(path): return result -def send_email(from_address, to_address, subject, body): - msg = email.mime.text.MIMEText(body) - msg['Subject'] = subject - msg['From'] = from_address - msg['To'] = to_address - # print "from_address %r" % from_address - # print "to_address %r" % to_address - # print "msg.as_string():\n%s" % msg.as_string() - s = smtplib.SMTP() - s.connect() - s.sendmail(from_address, [to_address], msg.as_string()) - s.quit() - - -def is_git_repository(path): - return os.path.exists(os.path.join(path, ".git")) - - -def ensure_unmodified(env, path): - # raise if working tree differs from HEAD - release.CwdEnv(env, path).cmd(["git", "diff", "--exit-code", "HEAD"]) - - -def add_to_path_cmd(value): - set_path_script = """\ -if [ -n "$PATH" ] - then - export PATH="$PATH":%(value)s - else - export PATH=%(value)s -fi -exec "$@" -""" % dict(value=value) - return ["sh", "-c", set_path_script, "inline_script"] - - def clean_environ_env(env): 
return cmd_env.PrefixCmdEnv( ["sh", "-c", 'env -i HOME="$HOME" PATH="$PATH" "$@"', "clean_environ_env"], env) -def ensure_trailing_slash(path): - return path.rstrip("/") + "/" - - -def clean_dir(env, path): - env.cmd(release.rm_rf_cmd(path)) - env.cmd(["mkdir", "-p", path]) - - def check_version_equals(env, version, python): try: output = release.get_cmd_stdout( @@ -212,7 +165,7 @@ def __init__(self, env, install_dir, project_name, [("PYTHONPATH", self._install_dir)], env) def easy_install(self, log): - clean_dir(self._env, self._install_dir) + release.clean_dir(self._env, self._install_dir) check_not_installed(self._install_dir_on_pythonpath, self._python) output = release.get_cmd_stdout( self._install_dir_on_pythonpath, @@ -295,7 +248,7 @@ def __init__(self, env, git_repository_path, release_area, mirror_path, self._release_dir = release_dir = os.path.join(release_area, "release") self._opt_dir = os.path.join(release_dir, "opt") self._bin_dir = os.path.join(self._opt_dir, "bin") - AddToPathEnv = release.make_env_maker(add_to_path_cmd) + AddToPathEnv = release.make_env_maker(release.add_to_path_cmd) self._env = AddToPathEnv(release.GitPagerWrapper(env), self._bin_dir) self._source_repo_path = git_repository_path self._in_source_repo = release.CwdEnv(self._env, @@ -400,7 +353,7 @@ def _ensure_installed(self, package_name, ppa): def install_css_validator_in_release_area(self, log): jar_dir = os.path.join(self._release_area, self._css_validator_path) - clean_dir(self._env, jar_dir) + release.clean_dir(self._env, jar_dir) in_jar_dir = release.CwdEnv(self._env, jar_dir) in_jar_dir.cmd([ "wget", @@ -471,7 +424,7 @@ def copy_test_dependencies(self, log): # automatically on sys.path def copy_in(src): self._env.cmd(["cp", "-r", src, self._test_deps_dir]) - clean_dir(self._env, self._test_deps_dir) + release.clean_dir(self._env, self._test_deps_dir) copy_in(os.path.join(self._repo_path, "test.py")) copy_in(os.path.join(self._repo_path, "test")) 
copy_in(os.path.join(self._repo_path, "test-tools")) @@ -628,7 +581,7 @@ def pandoc(filename, source_filename): pandoc(filename, source_filename) self._in_repo.cmd(["cp", "-r", "ChangeLog", "docs/html/ChangeLog.txt"]) if self._build_tools_path is not None: - styles = ensure_trailing_slash( + styles = release.ensure_trailing_slash( os.path.join(self._website_source_path, "styles")) self._env.cmd(["rsync", "-a", styles, os.path.join(self._docs_dir, "styles")]) @@ -680,8 +633,8 @@ def _stage(self, path, dest_dir, dest_basename=None, def ensure_unmodified(self, log): if self._build_tools_path: - ensure_unmodified(self._env, self._website_source_path) - ensure_unmodified(self._env, self._mirror_path) + release.ensure_unmodified(self._env, self._website_source_path) + release.ensure_unmodified(self._env, self._mirror_path) def _stage_flat_dir(self, path, dest): self._env.cmd(["mkdir", "-p", os.path.join(self._mirror_path, dest)]) @@ -830,7 +783,7 @@ def validate_css(self, log): raise CSSValidationError(path, output) def fetch_zope_testbrowser(self, log): - clean_dir(self._env, self._zope_testbrowser_dir) + release.clean_dir(self._env, self._zope_testbrowser_dir) in_testbrowser = release.CwdEnv(self._env, self._zope_testbrowser_dir) in_testbrowser.cmd(["easy_install", "--editable", "--build-directory", ".", @@ -873,7 +826,7 @@ def sync_to_sf(self, log): assert os.path.isdir( os.path.join(self._mirror_path, "htdocs/mechanize")) self._env.cmd(["rsync", "-rlptvuz", "--exclude", "*~", "--delete", - ensure_trailing_slash(self._mirror_path), + release.ensure_trailing_slash(self._mirror_path), "jjlee,wwwsearch@web.sourceforge.net:"]) @action_tree.action_node @@ -895,11 +848,11 @@ def upload(self): return r def clean(self, log): - clean_dir(self._env, self._release_area) + release.clean_dir(self._env, self._release_area) def clean_most(self, log): # not dependencies installed in release area (css validator) - clean_dir(self._env, self._release_dir) + release.clean_dir(self._env, 
self._release_dir) def write_email(self, log): log = release.get_cmd_stdout(self._in_repo, @@ -976,10 +929,11 @@ def send_email(self, log): subject, sep, body = text.partition("\n") body = body.lstrip() assert len(body) > 0, body - send_email(from_address="John J Lee ", - to_address="wwwsearch-general@lists.sourceforge.net", - subject=subject, - body=body) + release.send_email( + from_address="John J Lee ", + to_address="wwwsearch-general@lists.sourceforge.net", + subject=subject, + body=body) @action_tree.action_node def build(self): @@ -1098,14 +1052,14 @@ def parse_options(args): parser.error("Expected at least 1 argument, got %d" % nr_args) if options.git_repository_path is None: options.git_repository_path = os.getcwd() - if not is_git_repository(options.git_repository_path): + if not release.is_git_repository(options.git_repository_path): parser.error("incorrect git repository path") if options.build_tools_repository is not None and \ - not is_git_repository(options.build_tools_repository): + not release.is_git_repository(options.build_tools_repository): parser.error("incorrect mechanize-build-tools repository path") mirror_path = options.mirror_path if mirror_path is not None: - if not is_git_repository(options.mirror_path): + if not release.is_git_repository(options.mirror_path): parser.error("mirror path is not a git reporsitory") mirror_path = os.path.join(mirror_path, "mirror") if not os.path.isdir(mirror_path): From f394dbea6545ec98caf3c520e5abe1a6e9e5135f Mon Sep 17 00:00:00 2001 From: John Lee Date: Wed, 5 Oct 2011 19:28:29 +0100 Subject: [PATCH 2/7] Test that adding the Host header works --- mechanize/_testcase.py | 2 ++ test/test_browser.py | 50 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/mechanize/_testcase.py b/mechanize/_testcase.py index f372760..443f36d 100644 --- a/mechanize/_testcase.py +++ b/mechanize/_testcase.py @@ -4,6 +4,8 @@ import tempfile import unittest +print 
unittest.__file__ + class SetupStack(object): diff --git a/test/test_browser.py b/test/test_browser.py index 9b874ca..3329dca 100644 --- a/test/test_browser.py +++ b/test/test_browser.py @@ -1,12 +1,18 @@ #!/usr/bin/env python """Tests for mechanize.Browser.""" -import StringIO from unittest import TestCase +import StringIO +import httplib +import mimetools import re -import mechanize from mechanize._response import test_html_response +import mechanize +import mechanize._response +import mechanize._testcase + + FACTORY_CLASSES = [mechanize.DefaultFactory, mechanize.RobustFactory] @@ -158,7 +164,7 @@ def test_referer(self): def test_encoding(self): import mechanize from StringIO import StringIO - import urllib, mimetools + import urllib # always take first encoding, since that's the one from the real HTTP # headers, rather than from HTTP-EQUIV b = mechanize.Browser() @@ -737,7 +743,6 @@ def test_set_response(self): self.assertEqual(list(br.links())[0].url, "eggs") def test_str(self): - import mimetools from mechanize import _response br = TestBrowser() @@ -769,6 +774,43 @@ def test_str(self): >""") +class HttplibTests(mechanize._testcase.TestCase): + + def make_browser(self): + class TestBrowser(mechanize.Browser): + default_features = [] + default_schemes = ["http"] + return TestBrowser() + + def monkey_patch_httplib(self, putheader): + def do_nothing(*args, **kwds): + return + def getresponse(self_): + class Response(object): + msg = mimetools.Message(StringIO.StringIO("")) + status = 200 + reason = "OK" + def read(self__): + return "" + return Response() + self.monkey_patch(httplib.HTTPConnection, "putheader", putheader) + self.monkey_patch(httplib.HTTPConnection, "connect", do_nothing) + self.monkey_patch(httplib.HTTPConnection, "send", do_nothing) + self.monkey_patch(httplib.HTTPConnection, "close", do_nothing) + self.monkey_patch(httplib.HTTPConnection, "getresponse", getresponse) + + def test_add_host_header(self): + headers = [] + def putheader(self_, 
header, value): + headers.append((header, value)) + self.monkey_patch_httplib(putheader) + browser = self.make_browser() + request = mechanize.Request("http://example.com/") + request.add_header("Host", "myway.example.com") + browser.open(request) + self.assertIn(("Host", "myway.example.com"), headers) + + if __name__ == "__main__": import unittest unittest.main() From 965e258874d76cba8be3dc07b24e87d0685aea4c Mon Sep 17 00:00:00 2001 From: John Lee Date: Wed, 5 Oct 2011 19:57:29 +0100 Subject: [PATCH 3/7] Oops, remove print statement --- mechanize/_testcase.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mechanize/_testcase.py b/mechanize/_testcase.py index 443f36d..f372760 100644 --- a/mechanize/_testcase.py +++ b/mechanize/_testcase.py @@ -4,8 +4,6 @@ import tempfile import unittest -print unittest.__file__ - class SetupStack(object): From 9704b217d4c1985dd3b6f1f8de4924826df158b6 Mon Sep 17 00:00:00 2001 From: John Lee Date: Sun, 12 Feb 2012 14:43:00 +0000 Subject: [PATCH 4/7] Automated update of Opera cookie test URIs for release 0.2.6 --- test/opera_cookie_test_uris | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 test/opera_cookie_test_uris diff --git a/test/opera_cookie_test_uris b/test/opera_cookie_test_uris new file mode 100644 index 0000000..8cc8fd8 --- /dev/null +++ b/test/opera_cookie_test_uris @@ -0,0 +1,10 @@ +http://testsuites.opera.com/cookies/001.php +http://testsuites.opera.com/cookies/003.php +http://testsuites.opera.com/cookies/006.php +http://testsuites.opera.com/cookies/007.php +http://testsuites.opera.com/cookies/008.php +http://testsuites.opera.com/cookies/011.php +http://testsuites.opera.com/cookies/015/015.php +http://testsuites.opera.com/cookies/016.php +http://testsuites.opera.com/cookies/311.php +http://testsuites.opera.com/cookies/312.php From 068fc642ae705aa0c71732db4b77d342e5b23628 Mon Sep 17 00:00:00 2001 From: John Lee Date: Sun, 12 Feb 2012 14:47:05 +0000 Subject: [PATCH 5/7] Add some code to run some 
cookie tests from the opera website --- mechanize/_util.py | 8 +++ release.py | 21 +++++++ test/skiptest_opera.py | 122 ++++++++++++++++++++++++++++++++++++++++ test/test_functional.py | 10 +--- 4 files changed, 152 insertions(+), 9 deletions(-) create mode 100644 test/skiptest_opera.py diff --git a/mechanize/_util.py b/mechanize/_util.py index 0a5ebb1..3a2ccbc 100644 --- a/mechanize/_util.py +++ b/mechanize/_util.py @@ -30,6 +30,14 @@ def reset_deprecations(): warnings.filterwarnings("default", category=DeprecationWarning) +def read_file(filename): + fh = open(filename) + try: + return fh.read() + finally: + fh.close() + + def write_file(filename, data): f = open(filename, "wb") try: diff --git a/release.py b/release.py index 48928dd..e309082 100644 --- a/release.py +++ b/release.py @@ -452,6 +452,24 @@ def _make_test_cmd(self, python_version, test_cmd.extend(["--uri", uri]) return test_cmd + def update_opera_test_uris(self, log): + import test.test_opera + test.test_opera.OperaCookieTests.write_test_uris() + uris_path = os.path.join( + "test", + test.test_opera.OperaCookieTests.OPERA_COOKIE_TEST_URIS_FILENAME) + self._in_repo.cmd(["git", "add", uris_path]) + try: + release.ensure_unmodified(self._env, self._repo_path) + except cmd_env.CommandFailedError: + self._in_repo.cmd( + ["git", "commit", "-m", + "Automated update of Opera cookie test URIs for release %s" % + self._release_version]) + + def opera_tests(self, log): + self._in_repo.cmd(["python", "test.py", "skiptest_opera"]) + def performance_test(self, log): result = run_performance_tests(self._repo_path) if not result.wasSuccessful(): @@ -537,6 +555,7 @@ def test(self): self._make_source_dist_easy_install_test_step( self._in_repo, python_version=(2, 4)))) r.append(self.performance_test) + r.append(self.opera_tests) return r def make_coverage_html(self, log): @@ -945,6 +964,7 @@ def build(self): self.print_next_tag, self.clone, self.checks, + self.update_opera_test_uris, # self.clean_coverage, 
self.copy_test_dependencies, self.test, @@ -999,6 +1019,7 @@ def update_staging_website(self): @action_tree.action_node def tell_the_world(self): return [ + # TODO: push master too self.push_tag, self.upload, ("easy_install_test_internet", diff --git a/test/skiptest_opera.py b/test/skiptest_opera.py new file mode 100644 index 0000000..8f3621f --- /dev/null +++ b/test/skiptest_opera.py @@ -0,0 +1,122 @@ +"""Some cookie tests from the testsuites.opera.com website. + +These are skipped by test.py since they access the internet even when the --uri +option is not passed to the test runner. + +TODO: get the source code for these tests and run them locally if feasible +""" + +import os +import posixpath + +import mechanize +from mechanize._util import read_file, write_file + +from test.test_functional import TestCase + + +def ensure_trailing_newline(text): + if not text.endswith("\n"): + return text + "\n" + return text + + +class OperaCookieTests(TestCase): + + OPERA_COOKIE_TEST_URIS_FILENAME = "opera_cookie_test_uris" + + OPERA_COOKIE_TEST_URIS_PATH = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + OPERA_COOKIE_TEST_URIS_FILENAME) + + @classmethod + def fetch_test_uris(cls): + browser = mechanize.Browser() + test_page = "http://testsuites.opera.com/cookies/" + browser.open(test_page) + # TODO: These exclusions are all failing. Uncomment the subset of + # these tests that aren't failing due to lack of JS support or similar. 
+ exclusions = set([ + "http://testsuites.opera.com/cookies/002.php", + "http://testsuites.opera.com/cookies/004/004.php", + # uses JS + "http://testsuites.opera.com/cookies/009.php", + # uses JS + "http://testsuites.opera.com/cookies/010.php", + "http://testsuites.opera.com/cookies/012.php", + # the test_page URI comments that this "Needs restart" + "http://testsuites.opera.com/cookies/013.php", + # traceback + "http://testsuites.opera.com/cookies/014/014.php", + # uses JS + "http://testsuites.opera.com/cookies/017.php", + "http://testsuites.opera.com/cookies/201.php", + "http://testsuites.opera.com/cookies/202.php", + "http://testsuites.opera.com/cookies/203.php", + "http://testsuites.opera.com/cookies/204.php", + "http://testsuites.opera.com/cookies/205.php", + "http://testsuites.opera.com/cookies/206.php", + # traceback; the test_page URI comments that this "Needs + # restart" + "http://testsuites.opera.com/cookies/301.php", + "http://testsuites.opera.com/cookies/302/302.php", + "http://testsuites.opera.com/cookies/303.php", + "http://testsuites.opera.com/cookies/304.php", + "http://testsuites.opera.com/cookies/305.php", + "http://testsuites.opera.com/cookies/306.php", + "http://testsuites.opera.com/cookies/307.php", + # traceback + "http://testsuites.opera.com/cookies/308.php", + "http://testsuites.opera.com/cookies/308b.php", + "http://testsuites.opera.com/cookies/309.php", + "http://testsuites.opera.com/cookies/309b.php", + # the test_page URI comments that this "Needs ... 
deletion of + # all cookies first" + "http://testsuites.opera.com/cookies/310.php", + # the test_page URI comments that this "Might need deletion of + # all cookies first" + "http://testsuites.opera.com/cookies/313.php", + # the test_page URI comments "Needs UI" re these two + "http://testsuites.opera.com/cookies/501.php", + "http://testsuites.opera.com/cookies/502.php", + ]) + uris = [] + for link in browser.links(): + uri = link.absolute_url + if uri not in exclusions: + uris.append(uri) + return uris + + @classmethod + def write_test_uris(cls): + uris = cls.fetch_test_uris() + write_file(cls.OPERA_COOKIE_TEST_URIS_PATH, + ensure_trailing_newline("\n".join(uris))) + + @classmethod + def make_test(cls, uri): + def test(self): + browser = self.make_browser() + browser.open(uri) + self.assertIn("

PASS

", browser.response().get_data()) + scheme, authority, path, query, fragment = \ + mechanize._rfc3986.urlsplit(uri) + name = posixpath.splitext(posixpath.basename(path))[0] + method_name = "test_%s" % name + test.__name__ = method_name + return test + + @classmethod + def add_test(cls, uri): + test = cls.make_test(uri) + setattr(cls, test.__name__, test) + + @classmethod + def add_tests(cls): + if not os.path.exists(cls.OPERA_COOKIE_TEST_URIS_PATH): + return + + for uri in read_file(cls.OPERA_COOKIE_TEST_URIS_PATH).splitlines(): + cls.add_test(uri) + +OperaCookieTests.add_tests() diff --git a/test/test_functional.py b/test/test_functional.py index 90dcaae..0036f32 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -22,7 +22,7 @@ HTTPRedirectDebugProcessor, HTTPResponseDebugProcessor from mechanize._rfc3986 import urljoin from mechanize._util import hide_experimental_warnings, \ - reset_experimental_warnings, write_file + reset_experimental_warnings, read_file, write_file import mechanize._opener import mechanize._rfc3986 import mechanize._sockettimeout @@ -92,14 +92,6 @@ def sanepathname2url(path): return urlpath -def read_file(filename): - fh = open(filename) - try: - return fh.read() - finally: - fh.close() - - class FtpTestCase(TestCase): def test_ftp(self): From 81bb333d8000c2e5cbad6a1003bea234d1a10006 Mon Sep 17 00:00:00 2001 From: John Lee Date: Sat, 18 Mar 2017 17:26:00 +0000 Subject: [PATCH 6/7] Remove source Project has moved to https://github.com/python-mechanize/mechanize --- .gitignore | 7 - COPYING.txt | 101 - ChangeLog | 553 --- INSTALL.txt | 19 - MANIFEST.in | 12 - README.txt | 8 +- docs/development.txt | 47 - docs/doc.txt | 524 --- docs/documentation.txt | 132 - docs/download.txt.in | 55 - docs/faq.txt | 368 -- docs/forms.txt.in | 100 - docs/hints.txt | 154 - docs/html.template | 50 - docs/index.txt | 180 - docs/support.txt | 26 - examples/forms/data.dat | 1 - examples/forms/data.txt | 3 - examples/forms/echo.cgi | 23 - 
examples/forms/example.html | 54 - examples/forms/example.py | 193 - examples/forms/simple.py | 21 - examples/hack21.py | 60 - examples/pypi.py | 33 - ez_setup.py | 284 -- mechanize/__init__.py | 211 - mechanize/_auth.py | 68 - mechanize/_beautifulsoup.py | 1077 ----- mechanize/_clientcookie.py | 1725 -------- mechanize/_debug.py | 28 - mechanize/_firefox3cookiejar.py | 248 -- mechanize/_form.py | 3280 -------------- mechanize/_gzip.py | 105 - mechanize/_headersutil.py | 241 -- mechanize/_html.py | 629 --- mechanize/_http.py | 447 -- mechanize/_lwpcookiejar.py | 185 - mechanize/_markupbase.py | 393 -- mechanize/_mechanize.py | 669 --- mechanize/_mozillacookiejar.py | 161 - mechanize/_msiecookiejar.py | 388 -- mechanize/_opener.py | 442 -- mechanize/_pullparser.py | 391 -- mechanize/_request.py | 40 - mechanize/_response.py | 525 --- mechanize/_rfc3986.py | 245 -- mechanize/_sgmllib_copy.py | 559 --- mechanize/_sockettimeout.py | 6 - mechanize/_testcase.py | 162 - mechanize/_urllib2.py | 50 - mechanize/_urllib2_fork.py | 1414 ------ mechanize/_useragent.py | 367 -- mechanize/_util.py | 313 -- mechanize/_version.py | 2 - release.py | 1107 ----- setup.cfg | 3 - setup.py | 92 - test-tools/cookietest.cgi | 61 - test-tools/doctest.py | 2695 ------------ test-tools/functools_copy.py | 59 - test-tools/linecache_copy.py | 132 - test-tools/testprogram.py | 469 -- test-tools/twisted-ftpserver.py | 86 - test-tools/twisted-localserver.py | 294 -- test-tools/unittest/__init__.py | 63 - test-tools/unittest/__main__.py | 8 - test-tools/unittest/case.py | 921 ---- test-tools/unittest/loader.py | 387 -- test-tools/unittest/main.py | 178 - test-tools/unittest/result.py | 113 - test-tools/unittest/runner.py | 174 - test-tools/unittest/suite.py | 72 - test-tools/unittest/util.py | 44 - test.py | 48 - test/__init__.py | 0 .../FormsExamplesTests.test_example/output | 43 - .../FormsExamplesTests.test_simple/output | 19 - test/opera_cookie_test_uris | 10 - test/skiptest_opera.py | 122 - 
test/test_api.py | 13 - test/test_browser.doctest | 295 -- test/test_browser.py | 816 ---- test/test_cookie.py | 60 - test/test_cookies.py | 1892 -------- test/test_date.py | 104 - test/test_form.py | 3525 --------------- test/test_form_data/Auth.html | 79 - test/test_form_data/FullSearch.html | 114 - test/test_form_data/GeneralSearch.html | 178 - test/test_form_data/MarkedRecords.html | 152 - test/test_form_data/MarkedResults.html | 97 - test/test_form_data/Results.html | 94 - test/test_form_data/SearchType.html | 55 - test/test_form_mutation.py | 23 - test/test_forms.doctest | 59 - test/test_functional.py | 767 ---- test/test_headers.py | 146 - test/test_history.doctest | 12 - test/test_html.doctest | 259 -- test/test_html.py | 151 - test/test_import.py | 15 - test/test_opener.doctest | 58 - test/test_opener.py | 283 -- test/test_password_manager.special_doctest | 148 - test/test_performance.py | 104 - test/test_pickle.py | 37 - test/test_pullparser.py | 320 -- test/test_request.doctest | 71 - test/test_response.doctest | 229 - test/test_response.py | 213 - test/test_rfc3986.doctest | 168 - test/test_robotfileparser.doctest | 8 - test/test_unittest.py | 3785 ----------------- test/test_urllib2.py | 1680 -------- test/test_urllib2_localnet.py | 525 --- test/test_useragent.py | 76 - 116 files changed, 3 insertions(+), 41187 deletions(-) delete mode 100644 .gitignore delete mode 100644 COPYING.txt delete mode 100644 ChangeLog delete mode 100644 INSTALL.txt delete mode 100644 MANIFEST.in delete mode 100644 docs/development.txt delete mode 100644 docs/doc.txt delete mode 100644 docs/documentation.txt delete mode 100644 docs/download.txt.in delete mode 100644 docs/faq.txt delete mode 100644 docs/forms.txt.in delete mode 100644 docs/hints.txt delete mode 100644 docs/html.template delete mode 100644 docs/index.txt delete mode 100644 docs/support.txt delete mode 100644 examples/forms/data.dat delete mode 100644 examples/forms/data.txt delete mode 100755 
examples/forms/echo.cgi delete mode 100644 examples/forms/example.html delete mode 100755 examples/forms/example.py delete mode 100755 examples/forms/simple.py delete mode 100644 examples/hack21.py delete mode 100644 examples/pypi.py delete mode 100644 ez_setup.py delete mode 100644 mechanize/__init__.py delete mode 100644 mechanize/_auth.py delete mode 100644 mechanize/_beautifulsoup.py delete mode 100644 mechanize/_clientcookie.py delete mode 100644 mechanize/_debug.py delete mode 100644 mechanize/_firefox3cookiejar.py delete mode 100644 mechanize/_form.py delete mode 100644 mechanize/_gzip.py delete mode 100644 mechanize/_headersutil.py delete mode 100644 mechanize/_html.py delete mode 100644 mechanize/_http.py delete mode 100644 mechanize/_lwpcookiejar.py delete mode 100644 mechanize/_markupbase.py delete mode 100644 mechanize/_mechanize.py delete mode 100644 mechanize/_mozillacookiejar.py delete mode 100644 mechanize/_msiecookiejar.py delete mode 100644 mechanize/_opener.py delete mode 100644 mechanize/_pullparser.py delete mode 100644 mechanize/_request.py delete mode 100644 mechanize/_response.py delete mode 100644 mechanize/_rfc3986.py delete mode 100644 mechanize/_sgmllib_copy.py delete mode 100644 mechanize/_sockettimeout.py delete mode 100644 mechanize/_testcase.py delete mode 100644 mechanize/_urllib2.py delete mode 100644 mechanize/_urllib2_fork.py delete mode 100644 mechanize/_useragent.py delete mode 100644 mechanize/_util.py delete mode 100644 mechanize/_version.py delete mode 100644 release.py delete mode 100644 setup.cfg delete mode 100755 setup.py delete mode 100755 test-tools/cookietest.cgi delete mode 100644 test-tools/doctest.py delete mode 100644 test-tools/functools_copy.py delete mode 100644 test-tools/linecache_copy.py delete mode 100644 test-tools/testprogram.py delete mode 100644 test-tools/twisted-ftpserver.py delete mode 100644 test-tools/twisted-localserver.py delete mode 100644 test-tools/unittest/__init__.py delete mode 100644 
test-tools/unittest/__main__.py delete mode 100644 test-tools/unittest/case.py delete mode 100644 test-tools/unittest/loader.py delete mode 100644 test-tools/unittest/main.py delete mode 100644 test-tools/unittest/result.py delete mode 100644 test-tools/unittest/runner.py delete mode 100644 test-tools/unittest/suite.py delete mode 100644 test-tools/unittest/util.py delete mode 100755 test.py delete mode 100644 test/__init__.py delete mode 100644 test/functional_tests_golden/FormsExamplesTests.test_example/output delete mode 100644 test/functional_tests_golden/FormsExamplesTests.test_simple/output delete mode 100644 test/opera_cookie_test_uris delete mode 100644 test/skiptest_opera.py delete mode 100644 test/test_api.py delete mode 100644 test/test_browser.doctest delete mode 100644 test/test_browser.py delete mode 100644 test/test_cookie.py delete mode 100644 test/test_cookies.py delete mode 100644 test/test_date.py delete mode 100644 test/test_form.py delete mode 100644 test/test_form_data/Auth.html delete mode 100644 test/test_form_data/FullSearch.html delete mode 100644 test/test_form_data/GeneralSearch.html delete mode 100644 test/test_form_data/MarkedRecords.html delete mode 100644 test/test_form_data/MarkedResults.html delete mode 100644 test/test_form_data/Results.html delete mode 100644 test/test_form_data/SearchType.html delete mode 100644 test/test_form_mutation.py delete mode 100644 test/test_forms.doctest delete mode 100644 test/test_functional.py delete mode 100644 test/test_headers.py delete mode 100644 test/test_history.doctest delete mode 100644 test/test_html.doctest delete mode 100644 test/test_html.py delete mode 100644 test/test_import.py delete mode 100644 test/test_opener.doctest delete mode 100644 test/test_opener.py delete mode 100644 test/test_password_manager.special_doctest delete mode 100644 test/test_performance.py delete mode 100644 test/test_pickle.py delete mode 100644 test/test_pullparser.py delete mode 100644 
test/test_request.doctest delete mode 100644 test/test_response.doctest delete mode 100644 test/test_response.py delete mode 100644 test/test_rfc3986.doctest delete mode 100644 test/test_robotfileparser.doctest delete mode 100644 test/test_unittest.py delete mode 100644 test/test_urllib2.py delete mode 100644 test/test_urllib2_localnet.py delete mode 100644 test/test_useragent.py diff --git a/.gitignore b/.gitignore deleted file mode 100644 index e9c743c..0000000 --- a/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -/build/ -/dist/ -/docs/download.txt -/docs/forms.txt -/docs/html -/docs/styles -*.egg-info/ diff --git a/COPYING.txt b/COPYING.txt deleted file mode 100644 index 6a8bf1b..0000000 --- a/COPYING.txt +++ /dev/null @@ -1,101 +0,0 @@ -All the code with the exception of _gzip.py is covered under either -the BSD-style license immediately below, or (at your choice) the ZPL -2.1. The code in _gzip.py is taken from the effbot.org library, and -falls under the effbot.org license (also BSD-style) that appears at -the end of this file. - - -Copyright (c) 2002-2010 John J. Lee -Copyright (c) 1997-1999 Gisle Aas -Copyright (c) 1997-1999 Johnny Lee -Copyright (c) 2003 Andy Lester - - -BSD-style License -================== - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -Neither the name of the contributors nor the names of their employers -may be used to endorse or promote products derived from this software -without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - - - - -ZPL 2.1 -================== - -Zope Public License (ZPL) Version 2.1 - -A copyright notice accompanies this license document that identifies the copyright holders. - -This license has been certified as open source. It has also been designated as GPL compatible by the Free Software Foundation (FSF). - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - 1. Redistributions in source code must retain the accompanying copyright notice, this list of conditions, and the following disclaimer. - 2. Redistributions in binary form must reproduce the accompanying copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Names of the copyright holders must not be used to endorse or promote products derived from this software without prior written permission from the copyright holders. - 4. The right to distribute this software or to use it for any purpose does not give you the right to use Servicemarks (sm) or Trademarks (tm) of the copyright holders. Use of them is covered by separate agreement with the copyright holders. 
- 5. If any files are modified, you must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. - -Disclaimer - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - - - - --------------------------------------------------------------------- -The effbot.org Library is - -Copyright (c) 1999-2004 by Secret Labs AB -Copyright (c) 1999-2004 by Fredrik Lundh - -By obtaining, using, and/or copying this software and/or its -associated documentation, you agree that you have read, understood, -and will comply with the following terms and conditions: - -Permission to use, copy, modify, and distribute this software and its -associated documentation for any purpose and without fee is hereby -granted, provided that the above copyright notice appears in all -copies, and that both that copyright notice and this permission notice -appear in supporting documentation, and that the name of Secret Labs -AB or the author not be used in advertising or publicity pertaining to -distribution of the software without specific, written prior -permission. - -SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO -THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR -ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT -OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. --------------------------------------------------------------------- diff --git a/ChangeLog b/ChangeLog deleted file mode 100644 index d41559f..0000000 --- a/ChangeLog +++ /dev/null @@ -1,553 +0,0 @@ -This isn't really in proper GNU ChangeLog format, it just happens to -look that way. - -2011-03-31 John J Lee - * 0.2.5 release. - * This is essentially a no-changes release to fix easy_install - breakage caused by a SourceForge issue - * Sourceforge is returning invalid HTTP responses, make download - links point to PyPI instead - * Include cookietest.cgi in source distribution - * Note new IETF cookie standardisation effort - -2010-10-28 John J Lee - * 0.2.4 release. - * Fix IndexError on empty Content-type header value. (GH-18) - * Fall back to another encoding if an unknown one is declared. - Fixes traceback on unknown encoding in Content-type header. - (GH-30) - -2010-10-16 John J Lee - * 0.2.3 release. - * Fix str(ParseError()) traceback. (GH-25) - * Add equality methods to mechanize.Cookie . (GH-29) - -2010-07-17 John J Lee - * 0.2.2 release. - * Officially support Python 2.7 (no changes were required) - * Fix TypeError on .open()ing ftp: URL (only affects Python 2.4 - and 2.5) - * Don't include HTTPSHandler in __all__ if it's not available - -2010-05-16 John J Lee - * 0.2.1 release. - * API change: Change argument order of - HTTPRedirectHandler.redirect_request() to match urllib2. - * Fix failure to use bundled BeautifulSoup for forms. (GH-15) - * Fix default cookie path where request path has query containing - / character. (http://bugs.python.org/issue3704) - * Fix failure to raise on click for nonexistent label.
(GH-16) - * Documentation fixes. - -2010-04-22 John J Lee - * 0.2.0 release. - * Behaviour change: merged upstream urllib2 change (allegedly a - "bug fix") to return a response for all 2** HTTP responses (e.g. - 206 Partial Content). Previously, only 200 caused a response - object to be returned. All other HTTP response codes resulted - in a response object being raised as an exception. - * Behaviour change: Use of mechanize classes with `urllib2` (and - vice-versa) is no longer supported. However, existing classes - implementing the urllib2 Handler interface are likely to work - unchanged with mechanize. Removed RequestUpgradeProcessor, - ResponseUpgradeProcessor, SeekableProcessor. - * ClientForm has been merged into mechanize. This means that - mechanize has no dependencies other than Python itself. The - ClientForm API is still available -- to switch from ClientForm to - mechanize, just s/ClientForm/mechanize in your source code, and - ensure any use of the module logging logger named "ClientForm" is - updated to use the new logger name "mechanize.forms". I probably - won't do further standalone releases of ClientForm. - * Stop monkey-patching Python stdlib. - * Merge fixes from urllib2 trunk - * Close file objects on .read() failure in .retrieve() - * Fix a python 2.4 bug due to buggy urllib.splithost - * Fix Python 2.4 syntax error in _firefox3cookiejar - * Fix __init__.py typo that hid mechanize.seek_wrapped_response and - mechanize.str2time. Fixes - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=465206 - * Fix an obvious bug with experimental firefox 3 cookiejar support. - It's still experimental and not properly tested. - * Change documentation to not require a .port attribute on request - objects, since that's unused. - * Doc fixes - * Added mechanize.urljoin (RFC 3986 compliant function for joining - a base URI with a URI reference) - * Merge of ClientForm (see above). 
- * Moved to git (from SVN) http://github.com/jjlee/mechanize - * Created an issue tracker http://github.com/jjlee/mechanize/issues - * Docs are now in markdown format (thanks John Gabriele). - * Website rearranged. The old website has been archived at - http://wwwsearch.sourceforge.net/old/ . The new website is - essentially just the mechanize pages, rearranged and cleaned up a - bit. - * Source code rearranged for easier merging with upstream urllib2 - * Fully automated release process. - * New test runner. Single test suite; tests create their own HTTP - server fixtures (server fixtures are cached where possible for - speed). - -2009-02-07 John J Lee - * 0.1.11 release. - * Fix quadratic performance in number of .read() calls (and add an - automated performance test). - -2008-12-03 John J Lee - * 0.1.10 release. - * Add support for Python 2.6: Raise URLError on file: URL errors, - not IOError (port of upstream urllib2 fix). Add support for - Python 2.6's per-connection timeouts: Add timeout arguments to - urlopen(), Request constructor, .open(), and .open_novisit(). - * Drop support for Python 2.3 - * Add Content-length header to Request object (httplib bug that - prevented doing that was fixed in Python 2.4). There's no - change in what is actually sent over the wire here, just in what - headers get added to the Request object. - * Fix AttributeError on .retrieve() with a Request (as opposed to - URL string) argument - * Don't change CookieJar state in .make_cookies(). - * Fix AttributeError in case where .make_cookies() or - .cookies_for_request() is called before other methods like - .extract_cookies() or .make_cookie_header() - * Fixes affecting version cookie-attribute - (http://bugs.python.org/issue3924). - * Silence module logging's "no handlers could be found for logger - mechanize" warning in a way that doesn't clobber attempts to set - log level sometimes - * Don't use private attribute of request in request upgrade - handler (what was I thinking??)
- * Don't call setup() on import of setup.py - * Add new public function effective_request_host - * Add .get_policy() method to CookieJar - * Add method CookieJar.cookies_for_request() - * Fix documented interface required of requests and responses (and - add some tests for this!) - * Allow either .is_unverifiable() or .unverifiable on request - objects (preferring the former) - * Note that there's a new functional test for digest auth, which - fails when run against the sourceforge site (which is the - default). It looks like this reflects the fact that digest auth - has been fairly broken since it was introduced in urllib2. I - don't plan to fix this myself. - -2008-09-24 John J Lee - * 0.1.9 release. - * Fix ImportError if sqlite3 not available - * Fix a couple of functional tests not to wait 5 seconds each - -2008-09-13 John J Lee - * 0.1.8 release. - * Close sockets. This only affects Python 2.5 (and later) - - earlier versions of Python were unaffected. See - http://bugs.python.org/issue1627441 - * Make title parsing follow Firefox behaviour wrt child - elements (previously the behaviour differed between Factory and - RobustFactory). - * Fix BeautifulSoup RobustLinksFactory (hence RobustFactory) link - text parsing for case of link text containing tags (Titus Brown) - * Fix issue where more tags after caused default parser to - raise an exception - * Handle missing cookie max-age value. Previously, a warning was - emitted in this case. - * Fix thoroughly broken digest auth (still need functional - test!) (trebor74hr@...) - * Handle cookies containing embedded tabs in mozilla format files - * Remove an assertion about mozilla format cookies file - contents (raise LoadError instead) - * Fix MechanizeRobotFileParser.set_opener() - * Fix selection of global form using .select_form() (Titus Brown) - * Log skipped Refreshes - * Stop tests from clobbering files that happen to be lying around - in cwd (!) - * Use SO_REUSEADDR for local test server. 
- * Raise exception if local test server fails to start. - * Tests no longer (accidentally) depend on third-party coverage - module - * The usual docs and test fixes. - * Add convenience method Browser.open_local_file(filename) - * Add experimental support for Firefox 3 cookie jars - ("cookies.sqlite"). Requires Python 2.5 - * Fix a _gzip.py NameError (gzip support is experimental) - -2007-05-31 John J Lee <jjl@pobox.com> - * 0.1.7b release. - * Sub-requests should not usually be visiting, so make it so. In - fact the visible behaviour wasn't really broken here, since - .back() skips over None responses (which is odd in itself, but - won't be changed until after stable release is branched). - However, this patch does change visible behaviour in that it - creates new Request objects for sub-requests (e.g. basic auth - retries) where previously we just mutated the existing Request - object. - * Changes to sort out abuse of by SeekableProcessor and - ResponseUpgradeProcessor (latter shouldn't have been public in - the first place) and resulting confusing / unclear / broken - behaviour. Deprecate SeekableProcessor and - ResponseUpgradeProcessor. Add SeekableResponseOpener. Remove - SeekableProcessor and ResponseUpgradeProcessor from Browser. - Move UserAgentBase.add_referer_header() to Browser (it was on by - default, breaking UserAgent, and should never really have been - there). - * Fix HTTP proxy support: r29110 meant that Request.get_selector() - didn't take into account the change to .__r_host - (Thanks tgates@...). - * Redirected robots.txt fetch no longer results in another - attempted robots.txt fetch to check the redirection is allowed! - * Fix exception raised by RFC 3986 implementation with - urljoin(base, '/..') - * Fix two multiple-response-wrapping bugs. - * Add missing import in tests (caused failure on Windows). - * Set svn:eol-style to native for all text files in SVN. - * Add some tests for upgrade_response(). 
- * Add a functional test for 302 + 404 case. - * Add an -l option to run the functional tests against a local - twisted.web2-based server (you need Twisted installed for this - to work). This is much faster than running against - wwwsearch.sourceforge.net - * Add -u switch to skip unittests (and only run the doctests). - -2007-01-07 John J Lee <jjl@pobox.com> - - * 0.1.6b release - * Add mechanize.ParseError class, document it as part of the - mechanize.Factory interface, and raise it from all Factory - implementations. This is backwards-compatible, since the new - exception derives from the old exceptions. - * Bug fix: Truncation when there is no full .read() before - navigating to the next page, and an old response is read after - navigation. This happened e.g. with r = br.open(); - r.readline(); br.open(url); r.read(); br.back() . - * Bug fix: when .back() caused a reload, it was returning the old - response, not the .reload()ed one. - * Bug fix: .back() was not returning a copy of the response, which - presumably would cause seek position problems. - * Bug fix: base tag without href attribute would override document - URL with a None value, causing a crash (thanks Nathan Eror). - * Fix .set_response() to close current response first. - * Fix non-idempotent behaviour of Factory.forms() / .links() . - Previously, if for example you got a ParseError during execution - of .forms(), you could call it again and have it not raise an - exception, because it started out where it left off! - * Add a missing copy.copy() to RobustFactory . - * Fix redirection to 'URIs' that contain characters that are not - allowed in URIs (thanks Riko Wichmann). Also, Request - constructor now logs a module logging warning about any such bad - URIs. - * Add .global_form() method to Browser to support form controls - whose HTML elements are not descendants of any FORM element. - * Add a new method .visit_response() . 
This creates a new history - entry from a response object, rather than just changing the - current visited response. This is useful e.g. when you want to - use Browser features in a handler. - * Misc minor bug fixes. - -2006-10-25 John J Lee <jjl@pobox.com> - - * 0.1.5b release: Update setuptools dependencies to depend on - ClientForm>=0.2.5 (for an important bug fix affecting fragments - in URLs). There are no other changes in this release -- this - release was done purely so that people upgrading to the latest - version of mechanize will get the latest ClientForm too. - -2006-10-14 John J Lee <jjl@pobox.com> - * 0.1.4b release: (skipped a version deliberately for obscure - reasons) - * Improved auth & proxies support. - * Follow RFC 3986. - * Add a .set_cookie() method to Browser . - * Add Browser.open_novisit() and Request.visit to allow fetching - files without affecting Browser state. - * UserAgent and Browser are now subclasses of UserAgentBase. - UserAgent's only role in life above what UserAgentBase does is - to provide the .set_seekable_responses() method (it lives there - because Browser depends on seekable responses, because that's - how browser history is implemented). - * Bundle BeautifulSoup 2.1.1. No more dependency pain! Note that - BeautifulSoup is, and always was, optional, and that mechanize - will eventually switch to BeautifulSoup version 3, at which - point it may well stop bundling BeautifulSoup. Note also that - the module is only used internally, and is not available as a - public attribute of the package. If you dare, you can import it - ("from mechanize import _beautifulsoup"), but beware that it - will go away later, and that the API of BeautifulSoup will - change when the upgrade to 3 happens. Also, BeautifulSoup - support (mainly RobustFactory) is still a little experimental - and buggy. - * Fix HTTP-EQUIV with no content attribute case (thanks Pratik - Dam). - * Fix bug with quoted META Refresh URL (thanks Nilton Volpato). 
- * Fix crash with </base> tag (yajdbgr02@...). - * Somebody found a server that (incorrectly) depends on HTTP - header case, so follow the Title-Case convention. Note that the - Request headers interface(s), which were (somewhat oddly -- this - is an inheritance from urllib2 that should really be fixed in a - better way than it is currently) always case-sensitive still - are; the only thing that changed is what actually eventually - gets sent over the wire. - * Use mechanize (not urllib) to open robots.txt. Don't consult - RobotFileParser instance about non-HTTP URLs. - * Fix OpenerDirector.retrieve(), which was very broken (thanks - Duncan Booth). - * Crash in a much more obvious way if trying to use OpenerDirector - after .close() . - * .reload() on .back() if necessary (necessary iff response was - not fully .read() on first .open()ing ) * Strip fragments before - retrieving URLs (fixed Request.get_selector() to strip fragment) - * Fix catching HTTPError subclasses while still preserving all - their response behaviour - * Correct over-enthusiastic documented guarantees of - closeable_response . - * Fix assumption that httplib.HTTPMessage treats dict-style - __setitem__ as append rather than set (where on earth did I get - that from?). - * Expose History in mechanize/__init__.py (though interface is - still experimental). - * Lots of other "internals" bugs fixed (thanks to reports / - patches from Benji York especially, also Titus Brown, Duncan - Booth, and me ;-), where I'm not 100% sure exactly when they - were introduced, so not listing them here in detail. - * Numerous other minor fixes. - * Some code cleanup. - -2006-05-21 John J Lee <jjl@pobox.com> - * 0.1.2b release: - * mechanize now exports the whole urllib2 interface. - * Pull in bugfixed auth/proxy support code from Python 2.5. - * Bugfix: strip leading and trailing whitespace from link URLs - * Fix .any_response() / .any_request() methods to have ordering. 
- consistent with rest of handlers rather than coming before all - of them. - * Tell cookie-handling code about new TLDs. - * Remove Browser.set_seekable_responses() (they always are - anyway). - * Show in web page examples how to munge responses and how to do - proxy/auth. - * Rename 0.1.* changes document 0.1.0-changes.txt --> - 0.1-changes.txt. - * In 0.1 changes document, note change of logger name from - "ClientCookie" to "mechanize" - * Add something about response objects to changes document - * Improve Browser.__str__ - * Accept regexp strings as well as regexp objects when finding - links. - * Add crappy gzip transfer encoding support. This is off by - default and warns if you turn it on (hopefully will get better - later :-). - * A bit of internal cleanup following merge with pullparser / - ClientCookie. - -2006-05-06 John J Lee <jjl@pobox.com> - * 0.1.1a release: - * Merge ClientCookie and pullparser with mechanize. - * Response object fixes. - * Remove accidental dependency on BeautifulSoup introduced in - 0.1.0a (the BeautifulSoup support is still here, but - BeautifulSoup is not required to use mechanize). - -2006-05-03 John J Lee <jjl@pobox.com> - * 0.1.0a release: - * Stop trying to record precise dates in changelog, since that's - silly ;-) - * A fair number of interface changes: see 0.1.0-changes.txt. - * Depend on recent ClientCookie with copy.copy()able response - objects. - * Don't do broken XHTML handling by default (need to review code - before switching this back on, e.g. should use a real XML parser - for first-try at parsing). To get the old behaviour, pass - i_want_broken_xhtml_support=True to mechanize.DefaultFactory / - .RobustFactory constructor. - * Numerous small bug fixes. - * Documentation & setup.py fixes. 
- * Don't use cookielib, to avoid having to work around Python 2.4 - RFC 2109 bug, and to avoid my braindead thread synchronisation - code in cookielib :-((((( (I haven't encountered specific - breakage due to latter, but since it's braindead I may as well - avoid it). - -2005-11-30 John J Lee <jjl@pobox.com> - * Fixed setuptools support. - * Release 0.0.11a. - -2005-11-19 John J Lee <jjl@pobox.com> - * Release 0.0.10a. - -2005-11-17 John J Lee <jjl@pobox.com> - * Fix set_handle_referer. - -2005-11-12 John J Lee <jjl@pobox.com> - * Fix history (Gary Poster). - * Close responses on reload (Gary Poster). - * Don't depend on SSL support (Gary Poster). - -2005-10-31 John J Lee <jjl@pobox.com> - * Add setuptools support. - -2005-10-30 John J Lee <jjl@pobox.com> - * Don't mask AttributeError exception messages from ClientForm. - * Document intent of .links() vs. .get_links_iter(); Rename - LinksFactory method. - * Remove pullparser import dependency. - * Remove Browser.urltags (now an argument to LinksFactory). - * Document Browser constructor as taking keyword args only (and - change positional arg spec). - * Cleanup of lazy parsing (may fix bugs, not sure...). - -2005-10-28 John J Lee <jjl@pobox.com> - * Support ClientForm backwards_compat switch. - -2005-08-28 John J Lee <jjl@pobox.com> - * Apply optimisation patch (Stephan Richter). - -2005-08-15 John J Lee <jjl@pobox.com> - * Close responses (ie. close the file handles but leave response - still .read()able &c., thanks to the response objects we're - using) (aurel@nexedi.com). - -2005-08-14 John J Lee <jjl@pobox.com> - * Add missing argument to UserAgent's _add_referer_header stub. - * Doc and comment improvements. - -2005-06-28 John J Lee <jjl@pobox.com> - * Allow specifying parser class for equiv handling. - * Ensure correct default constructor args are passed to - HTTPRefererProcessor. - * Allow configuring details of Refresh handling. - * Switch to tolerant parser. 
- -2005-06-11 John J Lee <jjl@pobox.com> - * Do .seek(0) after link parsing in a finally block. - * Regard text/xhtml as HTML. - * Fix 2.4-compatibility bugs. - * Fix spelling of '_equiv' feature string. - -2005-05-30 John J Lee <jjl@pobox.com> - * Turn on Referer, Refresh and HTTP-Equiv handling by default. - -2005-05-08 John J Lee <jjl@pobox.com> - * Fix .reload() to not update history (thanks to Titus Brown). - * Use cookielib where available - -2005-03-01 John J Lee <jjl@pobox.com> - * Fix referer bugs: Don't send URL fragments; Don't add in Referer - header in redirected request unless original request had a - Referer header. - -2005-02-19 John J Lee <jjl@pobox.com> - * Allow supplying own mechanize.FormsFactory, so eg. can use - ClientForm.XHTMLFormParser. Also allow supplying own Request - class, and use sensible defaults for this. Now depends on - ClientForm 0.1.17. Side effect is that, since we use the - correct Request class by default, there's (I hope) no need for - using RequestUpgradeProcessor in Browser._add_referer_header() - :-) - -2005-01-30 John J Lee <jjl@pobox.com> - * Released 0.0.9a. - -2005-01-05 John J Lee <jjl@pobox.com> - * Fix examples (scraped sites have changed). - * Fix .set_*() method boolean arguments. - * The .response attribute is now a method, .response() - * Don't depend on BaseProcessor (no longer exists). - -2004-05-18 John J Lee <jjl@pobox.com> - * Released 0.0.8a: - * Added robots.txt observance, controlled by - * BASE element has attribute 'href', not 'uri'! (patch from Jochen - Knuth) - * Fixed several bugs in handling of Referer header. - * Link.__eq__ now returns False instead of raising AttributeError - on comparison with non-Link (patch from Jim Jewett) - * Removed dependencies on HTTPS support in Python and on - ClientCookie.HTTPRobotRulesProcessor - -2004-01-18 John J Lee <jjl@pobox.com> - * Added robots.txt observance, controlled by - UserAgent.set_handle_robots(). This is now on by default. 
- * Removed set_persistent_headers() method -- just use .addheaders, - as in base class. - -2004-01-09 John J Lee <jjl@pobox.com> - * Removed unnecessary dependence on SSL support in Python. Thanks - to Krzysztof Kowalczyk for bug report. - * Released 0.0.7a. - -2004-01-06 John J Lee <jjl@pobox.com> - * Link instances may now be passed to .click_link() and - .follow_link(). - * Added a new example program, pypi.py. - -2004-01-05 John J Lee <jjl@pobox.com> - * Released 0.0.5a. - * If <title> tag was missing, links and forms would not be parsed. - Also, base element (giving base URI) was ignored. Now parse - title lazily, and get base URI while parsing links. Also, fixed - ClientForm to take note of base element. Thanks to Phillip J. - Eby for bug report. - * Released 0.0.6a. - -2004-01-04 John J Lee <jjl@pobox.com> - * Fixed _useragent._replace_handler() to update self.handlers - correctly. - * Updated required pullparser version check. - * Visiting a URL now deselects form (sets self.form to None). - * Only first Content-Type header is now checked by - ._viewing_html(), if there are more than one. - * Stopped using getheaders from ClientCookie -- don't need it, - since depend on Python 2.2, which has .getheaders() method on - responses. Improved comments. - * .open() now resets .response to None. Also rearranged .open() a - bit so instance remains in consistent state on failure. - * .geturl() now checks for non-None .response, and raises Browser. - * .back() now checks for non-None .response, and doesn't attempt - to parse if it's None. - * .reload() no longer adds new history item. - * Documented tag argument to .find_link(). - * Fixed a few places where non-keyword arguments for .find_link() - were silently ignored. Now raises ValueError. - -2004-01-02 John J Lee <jjl@pobox.com> - * Use response_seek_wrapper instead of seek_wrapper, which broke - use of responses after they're closed. - * (Fixed response_seek_wrapper in ClientCookie.)
- * Fixed adding of Referer header. Thanks to Per Cederqvist for - bug report. - * Released 0.0.4a. - * Updated required ClientCookie version check. - -2003-12-30 John J Lee <jjl@pobox.com> - * Added support for character encodings (for matching link text). - * Released 0.0.3a. - -2003-12-28 John J Lee <jjl@pobox.com> - * Attribute lookups are no longer forwarded to .response -- - you have to do it explicitly. - * Added .geturl() method, which just delegates to .response. - * Big rehash of UserAgent, which was broken. Added a test. - * Discovered that zip() doesn't raise an exception when its - arguments are of different length, so several tests could pass - when they should have failed. Fixed. - * Fixed <A/> case in ._parse_html(). - * Released 0.0.2a. - -2003-12-27 John J Lee <jjl@pobox.com> - * Added and improved docstrings. - * Browser.form is now a public attribute. Also documented - Browser's public attributes. - * Added base_url and absolute_url attributes to Link. - * Tidied up .open(). Relative URL Request objects are no longer - converted to absolute URLs -- they should probably be absolute - in the first place anyway. - * Added proper Referer handling (the handler in ClientCookie is a - hack that only covers a restricted case). - * Added click_link method, for symmetry with .click() / .submit() - methods (which latter apply to forms). Of these methods, - .click/.click_link() returns a request, and .submit/ - .follow_link() actually .open()s the request. - * Updated broken example code. - -2003-12-24 John J Lee <jjl@pobox.com> - * Modified setup.py so can easily register with PyPI. - -2003-12-22 John J Lee <jjl@pobox.com> - * Released 0.0.1a. diff --git a/INSTALL.txt b/INSTALL.txt deleted file mode 100644 index e0d839d..0000000 --- a/INSTALL.txt +++ /dev/null @@ -1,19 +0,0 @@ -To install mechanize: - -See the web page for the version of Python required (included here as -docs/html/index.html). 
- -To install the package, run the following command: - - python setup.py install - - -Alternatively, just copy the whole mechanize directory into a directory on -your Python path (e.g. unix: /usr/local/lib/python2.7/site-packages, -Windows: C:\Python27\Lib\site-packages). Only copy the mechanize directory -that's inside the distributed tarball / zip archive, not the entire -mechanize-x.x.x directory! - - -John J. Lee <jjl@pobox.com> -July 2010 diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 0f1edb6..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,12 +0,0 @@ -include COPYING.txt -include INSTALL.txt -include MANIFEST.in -include README.txt -include *.py -recursive-include examples *.py -recursive-include examples/forms *.dat *.txt *.html *.cgi *.py -recursive-include test/functional_tests_golden output -recursive-include test/test_form_data *.html -recursive-include test *.py *.doctest *.special_doctest -recursive-include test-tools *.py *.cgi -recursive-include docs *.txt *.html *.css *.js diff --git a/README.txt b/README.txt index 70a0647..c5bcf7e 100644 --- a/README.txt +++ b/README.txt @@ -1,7 +1,5 @@ -See INSTALL.txt for installation instructions. +This project has moved to https://github.com/python-mechanize -See docs/html/index.html and docstrings for documentation. +It's now being maintained by other people, principally Kovid Goyal -If you have a git working tree rather than a release, you'll only have -the markdown source, e.g. mechanize/index.txt; release.py is used to -build the HTML docs. +-- John Lee, March 2017 diff --git a/docs/development.txt b/docs/development.txt deleted file mode 100644 index 92c8f96..0000000 --- a/docs/development.txt +++ /dev/null @@ -1,47 +0,0 @@ -% mechanize -- Development - -git repository --------------- - -The [git](http://git-scm.com/) repository is -[here](http://github.com/jjlee/mechanize). 
To check it out: - - `git clone git://github.com/jjlee/mechanize.git` - -There is also [another -repository](http://github.com/jjlee/mechanize-build-tools), which is only -useful for making mechanize releases: - - `git clone git://github.com/jjlee/mechanize-build-tools.git` - - -Old repository --------------- - -The [old SVN repository](http://codespeak.net/svn/wwwsearch/) may be useful for -viewing ClientForm history. ClientForm used to be a dependency of mechanize, -but has been merged into mechanize as of release 0.2.0; the history wasn't -imported. To check out: - - `svn co http://codespeak.net/svn/wwwsearch/` - - -Bug tracker ------------ - -The bug tracker is [here on github](http://github.com/jjlee/mechanize/issues). -It's equally acceptable to file bugs on the tracker or post about them to the -[mailing list](http://lists.sourceforge.net/lists/listinfo/wwwsearch-general). -Feel free to send patches too! - - -Mailing list ------------- - -There is a [mailing -list](http://lists.sourceforge.net/lists/listinfo/wwwsearch-general). - - -<!-- Local Variables: --> -<!-- fill-column:79 --> -<!-- End: --> diff --git a/docs/doc.txt b/docs/doc.txt deleted file mode 100644 index 93c40ad..0000000 --- a/docs/doc.txt +++ /dev/null @@ -1,524 +0,0 @@ -% mechanize -- Documentation - -<span class="docwarning">This documentation is in need of -reorganisation!</span> - -This page is the old ClientCookie documentation. It deals with operation on -the level of `urllib2 Handler` objects, and also with adding headers, -debugging, and cookie handling. See the [front page](./) for more typical use. - - -Examples --------- - -~~~~{.python} -import mechanize -response = mechanize.urlopen("http://example.com/") -~~~~ - -This function behaves identically to `urllib2.urlopen()`, except that it deals -with cookies automatically. 
- -Here is a more complicated example, involving `Request` objects (useful if you -want to pass `Request`s around, add headers to them, etc.): - -~~~~{.python} -import mechanize -request = mechanize.Request("http://example.com/") -# note we're using the urlopen from mechanize, not urllib2 -response = mechanize.urlopen(request) -# let's say this next request requires a cookie that was set -# in response -request2 = mechanize.Request("http://example.com/spam.html") -response2 = mechanize.urlopen(request2) - -print response2.geturl() -print response2.info() # headers -print response2.read() # body (readline and readlines work too) -~~~~ - -In these examples, the workings are hidden inside the `mechanize.urlopen()` -function, which is an extension of `urllib2.urlopen()`. Redirects, proxies and -cookies are handled automatically by this function (note that you may need a -bit of configuration to get your proxies correctly set up: see `urllib2` -documentation). - -There is also a `urlretrieve()` function, which works like -`urllib.urlretrieve()`. - -An example at a slightly lower level shows how the module processes cookies -more clearly: - -~~~~{.python} -# Don't copy this blindly! You probably want to follow the examples -# above, not this one. -import mechanize - -# Build an opener that *doesn't* automatically call .add_cookie_header() -# and .extract_cookies(), so we can do it manually without interference. 
-class NullCookieProcessor(mechanize.HTTPCookieProcessor): - def http_request(self, request): return request - def http_response(self, request, response): return response -opener = mechanize.build_opener(NullCookieProcessor) - -request = mechanize.Request("http://example.com/") -response = mechanize.urlopen(request) -cj = mechanize.CookieJar() -cj.extract_cookies(response, request) -# let's say this next request requires a cookie that was set in response -request2 = mechanize.Request("http://example.com/spam.html") -cj.add_cookie_header(request2) -response2 = mechanize.urlopen(request2) -~~~~ - -The `CookieJar` class does all the work. There are essentially two operations: -`.extract_cookies()` extracts HTTP cookies from `Set-Cookie` (the original -[Netscape cookie standard](http://curl.haxx.se/rfc/cookie_spec.html)) and -`Set-Cookie2` ([RFC 2965](http://www.ietf.org/rfc/rfc2965.txt)) headers from a -response if and only if they should be set given the request, and -`.add_cookie_header()` adds `Cookie` headers if and only if they are -appropriate for a particular HTTP request. Incoming cookies are checked for -acceptability based on the host name, etc. Cookies are only set on outgoing -requests if they match the request's host name, path, etc. - -**Note that if you're using `mechanize.urlopen()` (or if you're using -`mechanize.HTTPCookieProcessor` by some other means), you don't need to call -`.extract_cookies()` or `.add_cookie_header()` yourself**. If, on the other -hand, you want to use mechanize to provide cookie handling for an HTTP client -other than mechanize itself, you will need to use this pair of methods. You -can make your own `request` and `response` objects, which must support the -interfaces described in the docstrings of `.extract_cookies()` and -`.add_cookie_header()`. - -There are also some `CookieJar` subclasses which can store cookies in files and -databases. 
`FileCookieJar` is the abstract class for `CookieJar`s that can -store cookies in disk files. `LWPCookieJar` saves cookies in a format -compatible with the libwww-perl library. This class is convenient if you want -to store cookies in a human-readable file: - -~~~~{.python} -import mechanize -cj = mechanize.LWPCookieJar() -cj.revert("cookie3.txt") -opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)) -r = opener.open("http://foobar.com/") -cj.save("cookie3.txt") -~~~~ - -The `.revert()` method discards all existing cookies held by the `CookieJar` -(it won't lose any existing cookies if the load fails). The `.load()` method, -on the other hand, adds the loaded cookies to existing cookies held in the -`CookieJar` (old cookies are kept unless overwritten by newly loaded ones). - -`MozillaCookieJar` can load and save to the Mozilla/Netscape/lynx-compatible -`'cookies.txt'` format. This format loses some information (unusual and -nonstandard cookie attributes such as comment, and also information specific to -RFC 2965 cookies). The subclass `MSIECookieJar` can load (but not save) from -Microsoft Internet Explorer's cookie files on Windows. - - -Important note --------------- - -Only use names you can import directly from the `mechanize` package, and that -don't start with a single underscore. Everything else is subject to change or -disappearance without notice. - - -Cooperating with Browsers -------------------------- - -**Firefox since version 3 persists cookies in an sqlite database, which is not -supported by MozillaCookieJar.** - -The subclass `MozillaCookieJar` differs from `CookieJar` only in storing -cookies using a different, Firefox 2/Mozilla/Netscape-compatible, file format -known as "cookies.txt". The lynx browser also uses this format. This file -format can't store RFC 2965 cookies, so they are downgraded to Netscape cookies -on saving. `LWPCookieJar` itself uses a libwww-perl specific format -(\`Set-Cookie3') -- see the example above. 
Python and your browser should be -able to share a cookies file (note that the file location here will differ on -non-unix OSes): - -**WARNING:** you may want to back up your browser's cookies file if you use -`MozillaCookieJar` to save cookies. I *think* it works, but there have been -bugs in the past! - -~~~~{.python} -import os, mechanize -cookies = mechanize.MozillaCookieJar() -cookies.load(os.path.join(os.environ["HOME"], ".netscape/cookies.txt")) -# see also the save and revert methods -~~~~ - -Note that cookies saved while Mozilla is running will get clobbered by Mozilla --- see `MozillaCookieJar.__doc__`. - -`MSIECookieJar` does the same for Microsoft Internet Explorer (MSIE) 5.x and -6.x on Windows, but does not allow saving cookies in this format. In future, -the Windows API calls might be used to load and save (though the index has to -be read directly, since there is no API for that, AFAIK; there's also an -unfinished `MSIEDBCookieJar`, which uses (reads and writes) the Windows MSIE -cookie database directly, rather than storing copies of cookies as -`MSIECookieJar` does). - -~~~~{.python} -import mechanize -cj = mechanize.MSIECookieJar(delayload=True) -cj.load_from_registry() # finds cookie index file from registry -~~~~ - -A true `delayload` argument speeds things up. - -On Windows 9x (win 95, win 98, win ME), you need to supply a username to the -`.load_from_registry()` method: - -~~~~{.python} -cj.load_from_registry(username="jbloggs") -~~~~ - -Konqueror/Safari and Opera use different file formats, which aren't yet -supported. - - -Saving cookies in a file ------------------------- - -If you have no need to co-operate with a browser, the most convenient way to -save cookies on disk between sessions in human-readable form is to use -`LWPCookieJar`. This class uses a libwww-perl specific format -(\`Set-Cookie3'). Unlike `MozillaCookieJar`, this file format doesn't lose -information. 
- - -Supplying a CookieJar ---------------------- - -You might want to do this to [use your browser's -cookies](#cooperating-with-browsers), to customize `CookieJar`'s behaviour by -passing constructor arguments, or to be able to get at the cookies it will hold -(for example, for saving cookies between sessions and for debugging). - -If you're using the higher-level `urllib2`-like interface (`urlopen()`, etc), -you'll have to let it know what `CookieJar` it should use: - -~~~~{.python} -import mechanize -cookies = mechanize.CookieJar() -# build_opener() adds standard handlers (such as HTTPHandler and -# HTTPCookieProcessor) by default. The cookie processor we supply -# will replace the default one. -opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies)) - -r = opener.open("http://example.com/") # GET -r = opener.open("http://example.com/", data) # POST -~~~~ - -The `urlopen()` function uses a global `OpenerDirector` instance to do its -work, so if you want to use `urlopen()` with your own `CookieJar`, install the -`OpenerDirector` you built with `build_opener()` using the -`mechanize.install_opener()` function, then proceed as usual: - -~~~~{.python} -mechanize.install_opener(opener) -r = mechanize.urlopen("http://example.com/") -~~~~ - -Of course, everyone using `urlopen` is using the same global `CookieJar` -instance! - -<a name="policy" /> - -You can set a policy object (must satisfy the interface defined by -`mechanize.CookiePolicy`), which determines which cookies are allowed to be set -and returned. Use the `policy` argument to the `CookieJar` constructor, or use -the `.set_policy()` method. 
The default implementation has some useful -switches: - -~~~~{.python} -from mechanize import CookieJar, DefaultCookiePolicy as Policy -cookies = CookieJar() -# turn on RFC 2965 cookies, be more strict about domains when setting and -# returning Netscape cookies, and block some domains from setting cookies -# or having them returned (read the DefaultCookiePolicy docstring for the -# domain matching rules here) -policy = Policy(rfc2965=True, strict_ns_domain=Policy.DomainStrict, - blocked_domains=["ads.net", ".ads.net"]) -cookies.set_policy(policy) -~~~~ - - -Additional Handlers -------------------- - -The following handlers are provided in addition to those provided by `urllib2`: - -`HTTPRobotRulesProcessor` -: WWW Robots (also called wanderers or spiders) are programs that traverse many - pages in the World Wide Web by recursively retrieving linked pages. This - kind of program can place significant loads on web servers, so there is a - [standard](http://www.robotstxt.org/wc/norobots.html) for a `robots.txt` - file by which web site operators can request robots to keep out of their - site, or out of particular areas of it. This handler uses the standard - Python library's `robotparser` module. It raises - `mechanize.RobotExclusionError` (subclass of `mechanize.HTTPError`) if an - attempt is made to open a URL prohibited by `robots.txt`. - -`HTTPEquivProcessor` -: The `<META HTTP-EQUIV>` tag is a way of including data in HTML to be treated - as if it were part of the HTTP headers. mechanize can automatically read - these tags and add the `HTTP-EQUIV` headers to the response object's real - HTTP headers. The HTML is left unchanged. - -`HTTPRefreshProcessor` -: The `Refresh` HTTP header is a non-standard header which is widely used. It - requests that the user-agent follow a URL after a specified time delay. - mechanize can treat these headers (which may have been set in `<META - HTTP-EQUIV>` tags) as if they were 302 redirections. 
Exactly when and how - `Refresh` headers are handled is configurable using the constructor - arguments. - -`HTTPRefererProcessor` -: The `Referer` HTTP header lets the server know which URL you've just visited. - Some servers use this header as state information, and don't like it if - this is not present. It's a chore to add this header by hand every time - you make a request. This adds it automatically. **NOTE**: this only makes - sense if you use each handler for a single chain of HTTP requests (so, for - example, if you use a single HTTPRefererProcessor to fetch a series of URLs - extracted from a single page, **this will break**). - [mechanize.Browser](../mechanize/) does this properly. - -Example: - -~~~~{.python} -import mechanize -cookies = mechanize.CookieJar() - -opener = mechanize.build_opener(mechanize.HTTPRefererProcessor, - mechanize.HTTPEquivProcessor, - mechanize.HTTPRefreshProcessor, - ) -opener.open("http://www.rhubarb.com/") -~~~~ - - -Seekable responses ------------------- - -Response objects returned from (or raised as exceptions by) -`mechanize.SeekableResponseOpener`, `mechanize.UserAgent` (if -`.set_seekable_responses(True)` has been called) and `mechanize.Browser()` have -`.seek()`, `.get_data()` and `.set_data()` methods: - -~~~~{.python} -import mechanize -opener = mechanize.OpenerFactory(mechanize.SeekableResponseOpener).build_opener() -response = opener.open("http://example.com/") -# same return value as .read(), but without affecting seek position -total_nr_bytes = len(response.get_data()) -assert len(response.read()) == total_nr_bytes -assert len(response.read()) == 0 # we've already read the data -response.seek(0) -assert len(response.read()) == total_nr_bytes -response.set_data("blah\n") -assert response.get_data() == "blah\n" -... -~~~~ - -This caching behaviour can be avoided by using `mechanize.OpenerDirector`. It -can also be avoided with `mechanize.UserAgent`. 
Note that `HTTPEquivProcessor` -and `HTTPResponseDebugProcessor` require seekable responses and so are not -compatible with `mechanize.OpenerDirector` and `mechanize.UserAgent`. - -~~~~{.python} -import mechanize -ua = mechanize.UserAgent() -ua.set_seekable_responses(False) -ua.set_handle_equiv(False) -ua.set_debug_responses(False) -~~~~ - -Note that if you turn on features that use seekable responses (currently: -HTTP-EQUIV handling and response body debug printing), returned responses *may* -be seekable as a side-effect of these features. However, this is not -guaranteed (currently, in these cases, returned response objects are seekable, -but raised response objects — `mechanize.HTTPError` instances — are not -seekable). This applies regardless of whether you use `mechanize.UserAgent` or -`mechanize.OpenerDirector`. If you explicitly request seekable responses by -calling `.set_seekable_responses(True)` on a `mechanize.UserAgent` instance, or -by using `mechanize.Browser` or `mechanize.SeekableResponseOpener`, which -always return seekable responses, then both returned and raised responses are -guaranteed to be seekable. - -Handlers should call `response = mechanize.seek_wrapped_response(response)` if -they require the `.seek()`, `.get_data()` or `.set_data()` methods. - - -Request object lifetime ------------------------ - -Note that handlers may create new `Request` instances (for example when -performing redirects) rather than adding headers to existing `Request` objects. - - -Adding headers --------------- - -Adding headers is done like so: - -~~~~{.python} -import mechanize -req = mechanize.Request("http://foobar.com/") -req.add_header("Referer", "http://wwwsearch.sourceforge.net/mechanize/") -r = mechanize.urlopen(req) -~~~~ - -You can also use the `headers` argument to the `mechanize.Request` constructor. - -mechanize adds some headers to `Request` objects automatically -- see the next -section for details. 
- - -Automatically-added headers ---------------------------- - -`OpenerDirector` automatically adds a `User-Agent` header to every `Request`. - -To change this and/or add similar headers, use your own `OpenerDirector`: - -~~~~{.python} -import mechanize -cookies = mechanize.CookieJar() -opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies)) -opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible; MyProgram/0.1)"), - ("From", "responsible.person@example.com")] -~~~~ - -Again, to use `urlopen()`, install your `OpenerDirector` globally: - -~~~~{.python} -mechanize.install_opener(opener) -r = mechanize.urlopen("http://example.com/") -~~~~ - -Also, a few standard headers (`Content-Length`, `Content-Type` and `Host`) are -added when the `Request` is passed to `urlopen()` (or `OpenerDirector.open()`). -You shouldn't need to change these headers, but since this is done by -`AbstractHTTPHandler`, you can change the way it works by passing a subclass of -that handler to `build_opener()` (or, as always, by constructing an opener -yourself and calling `.add_handler()`). - - - -Initiating unverifiable transactions ------------------------------------- - -This section is only of interest for correct handling of third-party HTTP -cookies. See [below](#note-about-cookie-standards) for an explanation of -'third-party'. - -First, some terminology. - -An *unverifiable request* (defined fully by [RFC -2965](http://www.ietf.org/rfc/rfc2965.txt)) is one whose URL the user did not -have the option to approve. For example, a transaction is unverifiable if the -request is for an image in an HTML document, and the user had no option to -approve the fetching of the image from a particular URL. - -The *request-host of the origin transaction* (defined fully by RFC 2965) is the -host name or IP address of the original request that was initiated by the user. 
-For example, if the request is for an image in an HTML document, this is the -request-host of the request for the page containing the image. - -**mechanize knows that redirected transactions are unverifiable, and will -handle that on its own (ie. you don't need to think about the origin -request-host or verifiability yourself).** - -If you want to initiate an unverifiable transaction yourself (which you should -if, for example, you're downloading the images from a page, and 'the user' -hasn't explicitly OKed those URLs): - -~~~~{.python} -request = Request(origin_req_host="www.example.com", unverifiable=True) -~~~~ - - -RFC 2965 support ----------------- - -Support for the RFC 2965 protocol is switched off by default, because few -browsers implement it, so the RFC 2965 protocol is essentially never seen on -the internet. To switch it on, see [here](#policy). - - -Parsing HTTP dates ------------------- - -A function named `str2time` is provided by the package, which may be useful for -parsing dates in HTTP headers. `str2time` is intended to be liberal, since -HTTP date/time formats are poorly standardised in practice. There is no need -to use this function in normal operations: `CookieJar` instances keep track of -cookie lifetimes automatically. This function will stay around in some form, -though the supported date/time formats may change. - - -Dealing with bad HTML ---------------------- - -XXX Intro - -XXX Test me - - -Note about cookie standards ---------------------------- - -There are several standards relevant to HTTP cookies. - -The Netscape protocol is the only standard supported by most web browsers -(including Internet Explorer and Firefox). This is a *de facto* standard -defined by the behaviour of popular browsers, and neither the -[cookie\_spec.html](http://curl.haxx.se/rfc/cookie_spec.html) document that was -published by Netscape, nor the RFCs that were published later, describe the -Netscape protocol accurately or completely. 
Netscape protocol cookies are also -known as V0 cookies, to distinguish them from RFC 2109 or RFC 2965 cookies, -which have a version cookie-attribute with a value of 1. - -[RFC 2109](http://www.ietf.org/rfc/rfc2109.txt) was introduced to fix some -problems identified with the Netscape protocol, while still keeping the same -HTTP headers (`Cookie` and `Set-Cookie`). The most prominent of these problems -is the 'third-party' cookie issue, which was an accidental feature of the -Netscape protocol. Some features defined by RFC2109 (such as the port and -max-age cookie attributes) are now part of the de facto Netscape protocol, but -the RFC was never implemented fully by browsers, because of differences in -behaviour between the Netscape and Internet Explorer browsers of the time. - -[RFC 2965](http://www.ietf.org/rfc/rfc2965.txt) attempted to fix the -compatibility problem by introducing two new headers, `Set-Cookie2` and -`Cookie2`. Unlike the `Cookie` header, `Cookie2` does *not* carry cookies to -the server -- rather, it simply advertises to the server that RFC 2965 is -understood. `Set-Cookie2` *does* carry cookies, from server to client: the new -header means that both IE and Netscape ignore these cookies. This preserves -backwards compatibility, but popular browsers did not implement the RFC, so it -was never widely adopted. One confusing point to note about RFC 2965 is that -it uses the same value (1) of the Version attribute in HTTP headers as does RFC -2109. See also [RFC 2964](http://www.ietf.org/rfc/rfc2964.txt), which -discusses use of the protocol. - -Because Netscape cookies are so poorly specified, the general philosophy of the -module's Netscape protocol implementation is to start with RFC 2965 and open -holes where required for Netscape protocol-compatibility. RFC 2965 cookies are -*always* treated as RFC 2965 requires, of course. 
- -There is more information about the history of HTTP cookies in [this paper by -David Kristol](http://arxiv.org/abs/cs.SE/0105018). - -Recently (2011), [an IETF effort has -started](http://tools.ietf.org/html/draft-ietf-httpstate-cookie) to specify the -syntax and semantics of the `Cookie` and `Set-Cookie` headers as they are -actually used on the internet. - - -<!-- Local Variables: --> -<!-- fill-column:79 --> -<!-- End: --> diff --git a/docs/documentation.txt b/docs/documentation.txt deleted file mode 100644 index 1e0793d..0000000 --- a/docs/documentation.txt +++ /dev/null @@ -1,132 +0,0 @@ -% mechanize -- Documentation - -Full API documentation is in the docstrings and the documentation of -[`urllib2`](http://docs.python.org/release/2.6/library/urllib2.html). The -documentation in these web pages is in need of reorganisation at the moment, -after the merge of ClientCookie and ClientForm into mechanize. - - -Tests and examples ------------------- - -### Examples ### - -The [front page](./) has some introductory examples. - -The `examples` directory in the source packages contains a couple of silly, -but working, scripts to demonstrate basic use of the module. - -See also the [forms examples](./forms.html) (these examples use the forms API -independently of `mechanize.Browser`). - - -### Tests ### - -To run the tests: - - python test.py - -There are some tests that try to fetch URLs from the internet. To include -those in the test run: - - python test.py discover --tag internet - - -The `urllib2` interface ------------------------ - -mechanize exports the complete interface of `urllib2`. See the [`urllib2` -documentation](http://docs.python.org/release/2.6/library/urllib2.html). 
For -example: - -~~~~{.python} -import mechanize -response = mechanize.urlopen("http://www.example.com/") -print response.read() -~~~~ - - -Compatibility -------------- - -These notes explain the relationship between mechanize, ClientCookie, -ClientForm, `cookielib` and `urllib2`, and which to use when. If you're just -using mechanize, and not any of those other libraries, you can ignore this -section. - - #. mechanize works with Python 2.4, Python 2.5, Python 2.6, and Python 2.7. - - #. When using mechanize, anything you would normally import from `urllib2` - should be imported from `mechanize` instead. - - #. Use of mechanize classes with `urllib2` (and vice-versa) is no longer - supported. However, existing classes implementing the `urllib2 Handler` - interface are likely to work unchanged with mechanize. - - #. mechanize now only imports `urllib2.URLError` and `urllib2.HTTPError` from - `urllib2`. The rest is forked. I intend to merge fixes from Python trunk - frequently. - - #. ClientForm is no longer maintained as a separate package. The code is - now part of mechanize, and its interface is now exported through module - mechanize (since mechanize 0.2.0). Old code can simply be changed to - `import mechanize as ClientForm` and should continue to work. - - #. ClientCookie is no longer maintained as a separate package. The code is - now part of mechanize, and its interface is now exported through module - mechanize (since mechanize 0.1.0). Old code can simply be changed to - `import mechanize as ClientCookie` and should continue to work. - - #. The cookie handling parts of mechanize are in Python 2.4 standard library - as module `cookielib` and extensions to module `urllib2`. mechanize does - not currently use `cookielib`, due to the presence of thread - synchronisation code in `cookielib` that is not present in the mechanize - fork of `cookielib`. - -API differences between mechanize and `urllib2`: - - #. mechanize provides additional features. - - #. 
`mechanize.urlopen` differs in its behaviour: it handles cookies, whereas - `urllib2.urlopen` does not. To make a `urlopen` function with the - `urllib2` behaviour: - -~~~~{.python} -import mechanize -handler_classes = [mechanize.ProxyHandler, - mechanize.UnknownHandler, - mechanize.HTTPHandler, - mechanize.HTTPDefaultErrorHandler, - mechanize.HTTPRedirectHandler, - mechanize.FTPHandler, - mechanize.FileHandler, - mechanize.HTTPErrorProcessor] -opener = mechanize.OpenerDirector() -for handler_class in handler_classes: - opener.add_handler(handler_class()) -urlopen = opener.open -~~~~ - - #. Since Python 2.6, `urllib2` uses a `.timeout` attribute on `Request` - objects internally. However, `urllib2.Request` has no timeout constructor - argument, and `urllib2.urlopen()` ignores this parameter. - `mechanize.Request` has a `timeout` constructor argument which is used to - set the attribute of the same name, and `mechanize.urlopen()` does not - ignore the timeout attribute. - - -UserAgent vs UserAgentBase --------------------------- - -`mechanize.UserAgent` is a trivial subclass of `mechanize.UserAgentBase`, -adding just one method, `.set_seekable_responses()` (see the [documentation on -seekable responses](./doc.html#seekable-responses)). - -The reason for the extra class is that `mechanize.Browser` depends on seekable -response objects (because response objects are used to implement the browser -history). - - -<!-- Local Variables: --> -<!-- fill-column:79 --> -<!-- End: --> diff --git a/docs/download.txt.in b/docs/download.txt.in deleted file mode 100644 index 2e7528a..0000000 --- a/docs/download.txt.in +++ /dev/null @@ -1,55 +0,0 @@ -% mechanize -- Download - -There is more than one way to obtain mechanize: - -_Note re Windows and Mac support: currently the tests are only routinely run on -[Ubuntu](http://www.ubuntu.com/) 10.10 ("maverick"). However, as far as I know, -mechanize works fine on Windows and Mac platforms._ - - -easy_install ------------- - - #. 
Install [EasyInstall](http://peak.telecommunity.com/DevCenter/EasyInstall) - - #. `easy_install mechanize` - -Easy install will automatically download the latest source code release and -install it. - - -Source code release -------------------- - - #. Download the source from one of the links below - - #. Unpack the source distribution and change directory to the resulting -top-level directory. - - #. `python setup.py install` - - -This is a stable release. - - * [`mechanize-@(version).tar.gz`](http://pypi.python.org/packages/source/m/mechanize/mechanize-@(version).tar.gz) - - * [`mechanize-@(version).zip`](http://pypi.python.org/packages/source/m/mechanize/mechanize-@(version).zip) - - * [Older versions.](./src/) (Note: these are hosted on sourceforge, which at the time of writing (2011-03-31) is returning invalid HTTP responses -- you can also find old releases on [PyPI](http://pypi.python.org/)) - -All the documentation (these web pages, docstrings, and [the -changelog](./ChangeLog.txt)) is included in the distribution. - - -git repository --------------- - -The [git](http://git-scm.com/) repository is -[here](http://github.com/jjlee/mechanize). To check it out: - - #. <p>`git clone git://github.com/jjlee/mechanize.git`</p> - - -<!-- Local Variables: --> -<!-- fill-column:79 --> -<!-- End: --> diff --git a/docs/faq.txt b/docs/faq.txt deleted file mode 100644 index fdfab47..0000000 --- a/docs/faq.txt +++ /dev/null @@ -1,368 +0,0 @@ -% mechanize -- FAQ - -<div class="expanded"> - - - * <span class="q">Which version of Python do I need?</span> - - Python 2.4, 2.5, 2.6, or 2.7. Python 3 is not yet supported. - - * <span class="q">Does mechanize depend on BeautifulSoup?</span> - - No. mechanize offers a few classes that make use of BeautifulSoup, but -these classes are not required to use mechanize. mechanize bundles -BeautifulSoup version 2, so that module is no longer required. 
A future -version of mechanize will support BeautifulSoup version 3, at which point -mechanize will likely no longer bundle the module. - - * <span class="q">Does mechanize depend on ClientForm?</span> - - No, ClientForm is now part of mechanize. - - * <span class="q">Which license?</span> - - mechanize is dual-licensed: you may pick either the [BSD -license](http://www.opensource.org/licenses/bsd-license.php), or the [ZPL -2.1](http://www.zope.org/Resources/ZPL) (both are included in the -distribution). - - -Usage ------ - - * <span class="q">I'm not getting the HTML page I expected to see.</span> - - [Debugging tips](hints.html) - - * <span class="q">`Browser` doesn't have all of the forms/links I see in the -HTML. Why not?</span> - - Perhaps the default parser can't cope with invalid HTML. Try using the -included BeautifulSoup 2 parser instead: - -~~~~{.python} -import mechanize - -browser = mechanize.Browser(factory=mechanize.RobustFactory()) -browser.open("http://example.com/") -print browser.forms -~~~~ - - Alternatively, you can process the HTML (and headers) arbitrarily: - -~~~~{.python} -browser = mechanize.Browser() -browser.open("http://example.com/") -html = browser.response().get_data().replace("<br/>", "<br />") -response = mechanize.make_response( - html, [("Content-Type", "text/html")], - "http://example.com/", 200, "OK") -browser.set_response(response) -~~~~ - - * <span class="q">Is JavaScript supported?</span> - - No, sorry. See [FAQs](#change-value) [below](#script). - - * <span class="q">My HTTP response data is truncated.</span> - - `mechanize.Browser`'s response objects support the `.seek()` method, and -can still be used after `.close()` has been called. Response data is not -fetched until it is needed, so navigation away from a URL before fetching all -of the response will truncate it. Call `response.get_data()` before navigation -if you don't want that to happen. 
- - * <a name="xhtml" /><span class="q">I'm *sure* this page is HTML, why does `mechanize.Browser` -think otherwise?</span> - -~~~~{.python} -b = mechanize.Browser( - # mechanize's XHTML support needs work, so is currently switched off. If - # we want to get our work done, we have to turn it on by supplying a - # mechanize.Factory (with XHTML support turned on): - factory=mechanize.DefaultFactory(i_want_broken_xhtml_support=True) - ) -~~~~ - - * <span class="q">Why don't timeouts work for me?</span> - - Timeouts are ignored with versions of Python earlier than 2.6. -Timeouts do not apply to DNS lookups. - - * <span class="q">Is there any example code?</span> - - Look in the `examples/` directory. Note that the examples on the [forms - page](./forms.html) are executable as-is. Contributions of example code - would be very welcome! - - -Cookies -------- - - * <span class="q">Doesn't the standard Python library module, `Cookie`, do - this?</span> - - No: module `Cookie` does the server end of the job. It doesn't know when - to accept cookies from a server or when to send them back. Part of - mechanize has been contributed back to the standard library as module - `cookielib` (there are a few differences, notably that `cookielib` contains - thread synchronization code; mechanize does not use `cookielib`). - - * <span class="q">Which HTTP cookie protocols does mechanize support?</span> - - Netscape and [RFC 2965](http://www.ietf.org/rfc/rfc2965.txt). RFC 2965 - handling is switched off by default. - - * <span class="q">What about RFC 2109?</span> - - RFC 2109 cookies are currently parsed as Netscape cookies, and treated - by default as RFC 2965 cookies thereafter if RFC 2965 handling is enabled, - or as Netscape cookies otherwise. - - - * <span class="q">Why don't I have any cookies?</span> - - See [here](hints.html#cookies). 
- - * <span class="q">My response claims to be empty, but I know it's not!</span> - - Did you call `response.read()` (e.g., in a debug statement), then forget - that all the data has already been read? In that case, you may want to use - `mechanize.response_seek_wrapper`. `mechanize.Browser` always returns - [seekable responses](doc.html#seekable-responses), so it's not necessary to - use this explicitly in that case. - - * <span class="q">What's the difference between the `.load()` and `.revert()` - methods of `CookieJar`?</span> - - `.load()` *appends* cookies from a file. `.revert()` discards all - existing cookies held by the `CookieJar` first (but it won't lose any - existing cookies if the loading fails). - - * <span class="q">Is it threadsafe?</span> - - No. As far as I know, you can use mechanize in threaded code, but it - provides no synchronisation: you have to provide that yourself. - - * <span class="q">How do I do <X\></span> - - Refer to the API documentation in docstrings. - - -Forms ------ - - * <span class="q">Doesn't the standard Python library module, `cgi`, do this?</span> - - No: the `cgi` module does the server end of the job. It doesn't know - how to parse or fill in a form or how to send it back to the server. - - * <span class="q">How do I figure out what control names and values to use?</span> - - `print form` is usually all you need. In your code, things like the - `HTMLForm.items` attribute of `HTMLForm` instances can be useful to inspect - forms at runtime. Note that it's possible to use item labels instead of - item names, which can be useful — use the `by_label` arguments to the - various methods, and the `.get_value_by_label()` / `.set_value_by_label()` - methods on `ListControl`. - - * <span class="q">What do those `'*'` characters mean in the string - representations of list controls?</span> - - A `*` next to an item means that item is selected. 
- - * <span class="q">What do those parentheses (round brackets) mean in the string - representations of list controls?</span> - - Parentheses `(foo)` around an item mean that item is disabled. - - * <span class="q">Why doesn't <some control\> turn up in the data returned by - `.click*()` when that control has non-`None` value?</span> - - Either the control is disabled, or it is not successful for some other - reason. 'Successful' (see [HTML 4 - specification](http://www.w3.org/TR/REC-html40/interact/forms.html#h-17.13.2)) - means that the control will cause data to get sent to the server. - - * <span class="q">Why does mechanize not follow the HTML 4.0 / RFC 1866 - standards for `RADIO` and multiple-selection `SELECT` controls?</span> - - Because by default, it follows browser behaviour when setting the - initially-selected items in list controls that have no items explicitly - selected in the HTML. Use the `select_default` argument to `ParseResponse` - if you want to follow the RFC 1866 rules instead. Note that browser - behaviour violates the HTML 4.01 specification in the case of `RADIO` - controls. - - * <span class="q">Why does `.click()`ing on a button not work for me?</span> - - * Clicking on a `RESET` button doesn't do anything, by design - this is a - library for web automation, not an interactive browser. Even in an - interactive browser, clicking on `RESET` sends nothing to the server, - so there is little point in having `.click()` do anything special here. - - * Clicking on a `BUTTON TYPE=BUTTON` doesn't do anything either, also by - design. This time, the reason is that that `BUTTON` is only in the - HTML standard so that one can attach JavaScript callbacks to its - events. Their execution may result in information getting sent back to - the server. mechanize, however, knows nothing about these callbacks, - so it can't do anything useful with a click on a `BUTTON` whose type is - `BUTTON`. 
- - * Generally, JavaScript may be messing things up in all kinds of ways. - See the answer to the next question. - - * <a name="change-value" /><span class="q">How do I change `INPUT -TYPE=HIDDEN` field values (for example, to emulate the effect of JavaScript -code)?</span> - - As with any control, set the control's `readonly` attribute false. - -~~~~{.python} -form.find_control("foo").readonly = False # allow changing .value of control foo -form.set_all_readonly(False) # allow changing the .value of all controls -~~~~ - - * <span class="q">I'm having trouble debugging my code.</span> - - See [here](hints.html) for a few relevant tips. - - * <span class="q">I have a control containing a list of integers. How do I - select the one whose value is nearest to the one I want?</span> - -~~~~{.python} -import bisect -def closest_int_value(form, ctrl_name, value): - values = map(int, [item.name for item in form.find_control(ctrl_name).items]) - return str(values[bisect.bisect(values, value) - 1]) - -form["distance"] = [closest_int_value(form, "distance", 23)] -~~~~ - - -General -------- - - * <a name="sniffing" /><span class="q">I want to see what my web browser is - doing, but standard network sniffers like - [wireshark](http://www.wireshark.org/) or netcat (nc) don't work for HTTPS. - How do I sniff HTTPS traffic?</span> - - Three good options: - - * Mozilla plugin: [LiveHTTPHeaders](http://livehttpheaders.mozdev.org/). - - * [ieHTTPHeaders](http://www.blunck.info/iehttpheaders.html) does - the same for MSIE. - - * Use [`lynx`](http://lynx.browser.org/) `-trace`, and filter out - the junk with a script. - - * <a name="script" /><span class="q">JavaScript is messing up my - web-scraping. What do I do?</span> - - JavaScript is used in web pages for many purposes -- for example: creating - content that was not present in the page at load time, submitting or - filling in parts of forms in response to user actions, setting cookies, - etc.
mechanize does not provide any support for JavaScript. - - If you come across this in a page you want to automate, you have four - options. Here they are, roughly in order of simplicity. - - * Figure out what the JavaScript is doing and emulate it in your Python - code: for example, by manually adding cookies to your `CookieJar` - instance, calling methods on `HTMLForm`s, calling `urlopen`, etc. See - [above](#change-value) re forms. - - * Use Java's [HtmlUnit](http://htmlunit.sourceforge.net/) or - [HttpUnit](http://httpunit.sourceforge.net) from Jython, since they - know some JavaScript. - - * Instead of using mechanize, automate a browser instead. For example - use MS Internet Explorer via its COM automation interfaces, using the - [Python for Windows - extensions](http://starship.python.net/crew/mhammond/), aka pywin32, - aka win32all (e.g. [simple - function](http://vsbabu.org/mt/archives/2003/06/13/ie_automation.html), - [pamie](http://pamie.sourceforge.net/); [pywin32 chapter from the - O'Reilly - book](http://www.oreilly.com/catalog/pythonwin32/chapter/ch12.html)) or - [ctypes](http://python.net/crew/theller/ctypes/) - ([example](http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/305273)). - [This](http://www.brunningonline.net/simon/blog/archives/winGuiAuto.py.html) - kind of thing may also come in useful on Windows for cases where the - automation API is lacking. For Firefox, there is - [PyXPCOM](https://developer.mozilla.org/en/PyXPCOM). - - * Get ambitious and automatically delegate the work to an appropriate - interpreter (Mozilla's JavaScript interpreter, for instance). This is - what HtmlUnit and httpunit do. I did a spike along these lines some - years ago, but I think it would (still) be quite a lot of work to do - well. 
- - * <span class="q">Misc links</span> - - * <a name="parsing" />The following libraries can be useful for dealing - with bad HTML: [lxml.html](http://codespeak.net/lxml/lxmlhtml.html), - [html5lib](http://code.google.com/p/html5lib/), [BeautifulSoup - 3](http://www.crummy.com/software/BeautifulSoup/CHANGELOG.html), - [mxTidy](http://www.egenix.com/files/python/mxTidy.html) and - [mu-Tidylib](http://utidylib.berlios.de/). - - * [Selenium](http://www.openqa.org/selenium/): In-browser web functional - testing. If you need to test websites against real browsers, this is a - standard way to do it. - - * O'Reilly book: [Spidering - Hacks](http://oreilly.com/catalog/9780596005771). Very Perl-oriented. - - * Standard extensions for web development with Firefox, which are also - handy if you're scraping the web: [Web - Developer](http://chrispederick.com/work/webdeveloper/) (amongst other - things, this can display HTML form information), - [Firebug](http://getfirebug.com/). - - * Similar functionality for IE6 and IE7: [Internet Explorer Developer - Toolbar](http://www.google.co.uk/search?q=internet+explorer+developer+toolbar&btnI=I'm+Feeling+Lucky) - (IE8 comes with something equivalent built-in, as does Google Chrome). - - * [Open source functional testing - tools](http://www.opensourcetesting.org/functional.php). - - * [A HOWTO on web - scraping](http://www.rexx.com/~dkuhlman/quixote_htmlscraping.html) from - Dave Kuhlman. - - * <span class="q">Will any of this code make its way into the Python standard - library?</span> - - The request / response processing extensions to `urllib2` from mechanize - have been merged into `urllib2` for Python 2.4. The cookie processing has - been added, as module `cookielib`. There are other features that would be - appropriate additions to `urllib2`, but since Python 2 is heading into - bugfix-only mode, and I'm not using Python 3, they're unlikely to be added. 
- - * <span class="q">Where can I find out about the relevant standards?</span> - - * [HTML 4.01 Specification](http://www.w3.org/TR/html401/) - - * [Draft HTML 5 Specification](http://dev.w3.org/html5/spec/) - - * [RFC 1866](http://www.ietf.org/rfc/rfc1866.txt) - the HTML 2.0 - standard (you don't want to read this) - - * [RFC 1867](http://www.ietf.org/rfc/rfc1867.txt) - Form-based file - upload - - * [RFC 2616](http://www.ietf.org/rfc/rfc2616.txt) - HTTP 1.1 - Specification - - * [RFC 3986](http://www.ietf.org/rfc/rfc3986.txt) - URIs - - * [RFC 3987](http://www.ietf.org/rfc/rfc3987.txt) - IRIs - -</div> - -<!-- Local Variables: --> -<!-- fill-column:79 --> -<!-- End: --> diff --git a/docs/forms.txt.in b/docs/forms.txt.in deleted file mode 100644 index a63c8be..0000000 --- a/docs/forms.txt.in +++ /dev/null @@ -1,100 +0,0 @@ -% mechanize -- Forms - -<span class="docwarning">This documentation is in need of reorganisation!</span> - -This page is the old ClientForm documentation. ClientForm is now part of -mechanize, but the documentation hasn't been fully updated to reflect that: -what's here is correct, but not well-integrated with the rest of the -documentation. This page deals with HTML form handling: parsing HTML forms, -filling them in and returning the completed forms to the server. See the -[front page](./) for how to obtain form objects from a `mechanize.Browser`. 
- -Simple working example (`examples/forms/simple.py` in the source distribution): - -~~~~{.python} -@("".join(open("../examples/forms/simple.py").readlines()[2:])) -~~~~ - -A more complicated working example (from `examples/forms/example.py` in the -source distribution): - -~~~~{.python} -@("".join(open("../examples/forms/example.py").readlines()[2:])) -~~~~ - -All of the standard control types are supported: `TEXT`, `PASSWORD`, `HIDDEN`, -`TEXTAREA`, `ISINDEX`, `RESET`, `BUTTON` (`INPUT TYPE=BUTTON` and the various -`BUTTON` types), `SUBMIT`, `IMAGE`, `RADIO`, `CHECKBOX`, `SELECT`/`OPTION` and -`FILE` (for file upload). Both standard form encodings -(`application/x-www-form-urlencoded` and `multipart/form-data`) are supported. - -The module is designed for testing and automation of web interfaces, not for -implementing interactive user agents. - -***Security note*: Remember that any passwords you store in `HTMLForm` -instances will be saved to disk in the clear if, for example, you -[pickle](http://docs.python.org/library/pickle.html) them.** - - -Parsers -------- - -There are two parsers. - -TODO: more! - -See also the FAQ entries on [XHTML](faq.html#xhtml) and [parsing bad -HTML](./faq.html#parsing). - - -Backwards-compatibility mode ----------------------------- - -mechanize (and ClientForm 0.2) includes three minor backwards-incompatible -interface changes from ClientForm version 0.1. - -To make upgrading from ClientForm 0.1 easier, and to allow me to stop -supporting version ClientForm 0.1 sooner, there is support for operating in a -backwards-compatible mode, under which code written for ClientForm 0.1 should -work without modification. This is done on a per-`HTMLForm` basis via the -`.backwards_compat` attribute, but for convenience the `ParseResponse()` and -`ParseFile()` factory functions accept `backwards_compat` arguments. These -backwards-compatibility features will be removed soon. The default is to -operate in backwards-compatible mode. 
To run with backwards compatible mode -turned ***OFF*** (**strongly recommended**): - -~~~~{.python} -from mechanize import ParseResponse, urlopen -forms = ParseResponse(urlopen("http://example.com/"), backwards_compat=False) -# ... -~~~~ - -The backwards-incompatible changes are: - - * Ambiguous specification of controls or items now results in AmbiguityError. - If you want the old behaviour, explicitly pass `nr=0` to indicate you want - the first matching control or item. - - * Item label matching is now done by substring, not by strict string-equality - (but note leading and trailing space is always stripped). (Control label - matching is always done by substring.) - - * Handling of disabled list items has changed. First, note that handling of - disabled list items in ClientForm 0.1 (and in ClientForm 0.2's - backwards-compatibility mode!) is buggy: disabled items are successful - (ie. disabled item names are sent back to the server). As a result, there - was no distinction to be made between successful items and selected items. - In ClientForm 0.2, the bug is fixed, so this is no longer the case, and it - is important to note that list controls' `.value` attribute contains only - the *successful* item names; items that are *selected* but not successful - (because disabled) are not included in `.value`. Second, disabled list - items may no longer be deselected: AttributeError is raised in ClientForm - 0.2, whereas deselection was allowed in ClientForm 0.1. The bug in - ClientForm 0.1 and in ClientForm 0.2's backwards-compatibility mode will - not be fixed, to preserve compatibility and to encourage people to upgrade - to the new ClientForm 0.2 `backwards_compat=False` behaviour. 
- - -<!-- Local Variables: --> -<!-- fill-column:79 --> -<!-- End: --> diff --git a/docs/hints.txt b/docs/hints.txt deleted file mode 100644 index 35e1db0..0000000 --- a/docs/hints.txt +++ /dev/null @@ -1,154 +0,0 @@ -% mechanize -- Hints - -Hints for debugging programs that use mechanize. - -Cookies -------- - -A common mistake is to use `mechanize.urlopen()`, *and* the -`.extract_cookies()` and `.add_cookie_header()` methods on a cookie object -themselves. If you use `mechanize.urlopen()` (or `OpenerDirector.open()`), the -module handles extraction and adding of cookies by itself, so you should not -call `.extract_cookies()` or `.add_cookie_header()`. - -Are you sure the server is sending you any cookies in the first place? Maybe -the server is keeping track of state in some other way (`HIDDEN` HTML form -entries (possibly in a separate page referenced by a frame), URL-encoded -session keys, IP address, HTTP `Referer` headers)? Perhaps some embedded -script in the HTML is setting cookies (see below)? Turn on -[logging](#logging). - -When you `.save()` to or `.load()`/`.revert()` from a file, single-session -cookies will expire unless you explicitly request otherwise with the -`ignore_discard` argument. This may be your problem if you find cookies are -going away after saving and loading. - -~~~~{.python} -import mechanize -cj = mechanize.LWPCookieJar() -opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)) -mechanize.install_opener(opener) -r = mechanize.urlopen("http://foobar.com/") -cj.save("/some/file", ignore_discard=True, ignore_expires=True) -~~~~ - -JavaScript code can set cookies; mechanize does not support this. See [the -FAQ](faq.html#script). - - -General -------- - -Enable [logging](#logging). - -Sometimes, a server wants particular HTTP headers set to the values it expects. -For example, the `User-Agent` header may need to be [set](./doc.html#headers) -to a value like that of a popular browser. 
- -Check that the browser is able to do manually what you're trying to achieve -programmatically. Make sure that what you do manually is *exactly* the same as -what you're trying to do from Python -- you may simply be hitting a server bug -that only gets revealed if you view pages in a particular order, for example. - -Try comparing the headers and data that your program sends with those that a -browser sends. Often this will give you the clue you need. There are [browser -addons](faq.html#sniffing) available that allow you to see what the browser -sends and receives even if HTTPS is in use. - -If nothing is obviously wrong with the requests your program is sending and -you're out of ideas, you can reliably locate the problem by copying the headers -that a browser sends, and then changing headers until your program stops -working again. Temporarily switch to explicitly sending individual HTTP -headers (by calling `.add_header()`, or by using `httplib` directly). Start by -sending exactly the headers that Firefox or IE send. You may need to make sure -that a valid session ID is sent -- the one you got from your browser may no -longer be valid. If that works, you can begin the tedious process of changing -your headers and data until they match what your original code was sending. -You should end up with a minimal set of changes. If you think that reveals a -bug in mechanize, please [report it](support.html). - - -Logging -------- - -To enable logging to stdout: - -~~~~{.python} -import sys, logging -logger = logging.getLogger("mechanize") -logger.addHandler(logging.StreamHandler(sys.stdout)) -logger.setLevel(logging.DEBUG) -~~~~ - -You can reduce the amount of information shown by setting the level to -`logging.INFO` instead of `logging.DEBUG`, or by only enabling logging for one -of the following logger names instead of `"mechanize"`: - - * `"mechanize"`: Everything.
- - * `"mechanize.cookies"`: Why particular cookies are accepted or rejected and why -they are or are not returned. Requires logging enabled at the `DEBUG` level. - - * `"mechanize.http_responses"`: HTTP response body data. - - * `"mechanize.http_redirects"`: HTTP redirect information. - - -HTTP headers ------------- - -An example showing how to enable printing of HTTP headers to stdout, logging of -HTTP response bodies, and logging of information about redirections: - -~~~~{.python} -import sys, logging -import mechanize - -logger = logging.getLogger("mechanize") -logger.addHandler(logging.StreamHandler(sys.stdout)) -logger.setLevel(logging.DEBUG) - -browser = mechanize.Browser() -browser.set_debug_http(True) -browser.set_debug_responses(True) -browser.set_debug_redirects(True) -response = browser.open("http://python.org/") -~~~~ - -Alternatively, you can examine request and response objects to see what's going -on. Note that requests may involve "sub-requests" in cases such as -redirection, in which case you will not see everything that's going on just by -examining the original request and final response. It's often useful to [use -the `.get_data()` method](./doc.html#seekable-responses) on responses during -debugging. - -### Handlers ### - -**This section is not relevant if you use `mechanize.Browser`.** - -An example showing how to enable printing of HTTP headers to stdout, at the -`HTTPHandler` level: - -~~~~{.python} -import mechanize -hh = mechanize.HTTPHandler() # you might want HTTPSHandler, too -hh.set_http_debuglevel(1) -opener = mechanize.build_opener(hh) -response = opener.open(url) -~~~~ - -The following handlers are available: - -**NOTE**: as well as having these handlers in your `OpenerDirector` (for -example, by passing them to `build_opener()`) you have to [turn on -logging](#logging) at the `INFO` level or lower in order to see any output. 
- -`HTTPRedirectDebugProcessor`: logs information about redirections - -`HTTPResponseDebugProcessor`: logs HTTP response bodies (including those that -are read during redirections) - - -<!-- Local Variables: --> -<!-- fill-column:79 --> -<!-- End: --> diff --git a/docs/html.template b/docs/html.template deleted file mode 100644 index 7fd2206..0000000 --- a/docs/html.template +++ /dev/null @@ -1,50 +0,0 @@ -<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" - "http://www.w3.org/TR/html4/strict.dtd"> -<html> -<!--This file was generated by pandoc: do not edit--> -<head> - <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> - <meta name="author" content="John J. Lee <jjl@pobox.com>"> - <meta name="date" content="$last_modified_iso$"> - <meta name="keywords" content="Python,HTML,HTTP,browser,stateful,web,client,client-side,mechanize,cookie,form,META,HTTP-EQUIV,Refresh,ClientForm,ClientCookie,pullparser,WWW::Mechanize"> - <meta name="keywords" content="cookie,HTTP,Python,web,client,client-side,HTML,META,HTTP-EQUIV,Refresh"> - <style type="text/css" media="screen">@import "../styles/style.css";</style> - <!--breaks resizing text in IE6,7,8 (the lack of it also breaks baseline grid a bit in IE8 - can't win)--> - <!--[if !IE]>--><style type="text/css" media="screen">body{font-size:14px;}</style><!--<![endif]--> - <!--max-width--> - <!--[if IE 6]><script type="text/javascript" src="../styles/ie6.js"></script><![endif]--> - <title>$if(pagetitle)$$pagetitle$$endif$ - - -
-SourceForge.net. Fast, secure and Free Open Source software downloads -
- -
- -$nav$ - -
- -$if(title)$ -

$title$

-$endif$ - -$subnav$ - -$toc$ - -$body$ - -

I prefer questions and comments to be sent to -the mailing -list rather than direct to me.

- -

John J. Lee, $last_modified_month_year$. - -


- -
-
- - diff --git a/docs/index.txt b/docs/index.txt deleted file mode 100644 index a258d13..0000000 --- a/docs/index.txt +++ /dev/null @@ -1,180 +0,0 @@ -% mechanize - -Stateful programmatic web browsing in Python, after Andy Lester's Perl -module [`WWW::Mechanize`](http://search.cpan.org/dist/WWW-Mechanize/). - - * `mechanize.Browser` and `mechanize.UserAgentBase` implement the - interface of `urllib2.OpenerDirector`, so: - - * any URL can be opened, not just `http:` - - * `mechanize.UserAgentBase` offers easy dynamic configuration of - user-agent features like protocol, cookie, redirection and - `robots.txt` handling, without having to make a new - `OpenerDirector` each time, e.g. by calling `build_opener()`. - - * Easy HTML form filling. - - * Convenient link parsing and following. - - * Browser history (`.back()` and `.reload()` methods). - - * The `Referer` HTTP header is added properly (optional). - - * Automatic observance of - [`robots.txt`](http://www.robotstxt.org/wc/norobots.html). - - * Automatic handling of HTTP-Equiv and Refresh. - - -Examples --------- - -The examples below are written for a website that does not exist -(`example.com`), so cannot be run. There are also some [working -examples](documentation.html#examples) that you can run. - -~~~~{.python} -import re -import mechanize - -br = mechanize.Browser() -br.open("http://www.example.com/") -# follow second link with element text matching regular expression -response1 = br.follow_link(text_regex=r"cheese\s*shop", nr=1) -assert br.viewing_html() -print br.title() -print response1.geturl() -print response1.info() # headers -print response1.read() # body - -br.select_form(name="order") -# Browser passes through unknown attributes (including methods) -# to the selected HTMLForm. -br["cheeses"] = ["mozzarella", "caerphilly"] # (the method here is __setitem__) -# Submit current form. 
Browser calls .close() on the current response on -# navigation, so this closes response1 -response2 = br.submit() - -# print currently selected form (don't call .submit() on this, use br.submit()) -print br.form - -response3 = br.back() # back to cheese shop (same data as response1) -# the history mechanism returns cached response objects -# we can still use the response, even though it was .close()d -response3.get_data() # like .seek(0) followed by .read() -response4 = br.reload() # fetches from server - -for form in br.forms(): - print form -# .links() optionally accepts the keyword args of .follow_/.find_link() -for link in br.links(url_regex="python.org"): - print link - br.follow_link(link) # takes EITHER Link instance OR keyword args - br.back() -~~~~ - -You may control the browser's policy by using the methods of -`mechanize.Browser`'s base class, `mechanize.UserAgent`. For example: - -~~~~{.python} -br = mechanize.Browser() -# Explicitly configure proxies (Browser will attempt to set good defaults). -# Note the userinfo ("joe:password@") and port number (":3128") are optional. -br.set_proxies({"http": "joe:password@myproxy.example.com:3128", - "ftp": "proxy.example.com", - }) -# Add HTTP Basic/Digest auth username and password for HTTP proxy access. -# (equivalent to using "joe:password@..." form above) -br.add_proxy_password("joe", "password") -# Add HTTP Basic/Digest auth username and password for website access. -br.add_password("http://example.com/protected/", "joe", "password") -# Don't handle HTTP-EQUIV headers (HTTP headers embedded in HTML). -br.set_handle_equiv(False) -# Ignore robots.txt. Do not do this without thought and consideration. 
-br.set_handle_robots(False) -# Don't add Referer (sic) header -br.set_handle_referer(False) -# Don't handle Refresh redirections -br.set_handle_refresh(False) -# Don't handle cookies -br.set_cookiejar() -# Supply your own mechanize.CookieJar (NOTE: cookie handling is ON by -# default: no need to do this unless you have some reason to use a -# particular cookiejar) -br.set_cookiejar(cj) -# Log information about HTTP redirects and Refreshes. -br.set_debug_redirects(True) -# Log HTTP response bodies (ie. the HTML, most of the time). -br.set_debug_responses(True) -# Print HTTP headers. -br.set_debug_http(True) - -# To make sure you're seeing all debug output: -logger = logging.getLogger("mechanize") -logger.addHandler(logging.StreamHandler(sys.stdout)) -logger.setLevel(logging.INFO) - -# Sometimes it's useful to process bad headers or bad HTML: -response = br.response() # this is a copy of response -headers = response.info() # currently, this is a mimetools.Message -headers["Content-type"] = "text/html; charset=utf-8" -response.set_data(response.get_data().replace(" - - diff --git a/docs/support.txt b/docs/support.txt deleted file mode 100644 index 41670de..0000000 --- a/docs/support.txt +++ /dev/null @@ -1,26 +0,0 @@ -% mechanize -- Support - -Documentation -------------- - -See links at right. [Start here](documentation.html). - - -Bug tracker ------------ - -The bug tracker is [here on github](http://github.com/jjlee/mechanize/issues). -It's equally acceptable to file bugs on the tracker or post about them to the -mailing list. - - -Contact -------- - -There is a [mailing -list](http://lists.sourceforge.net/lists/listinfo/wwwsearch-general). - - - - - diff --git a/examples/forms/data.dat b/examples/forms/data.dat deleted file mode 100644 index d9ca6b0..0000000 --- a/examples/forms/data.dat +++ /dev/null @@ -1 +0,0 @@ -Let's pretend this is a binary file. 
diff --git a/examples/forms/data.txt b/examples/forms/data.txt deleted file mode 100644 index cfc5b73..0000000 --- a/examples/forms/data.txt +++ /dev/null @@ -1,3 +0,0 @@ -Text, text, text. - -Blah. diff --git a/examples/forms/echo.cgi b/examples/forms/echo.cgi deleted file mode 100755 index 2cbfb3f..0000000 --- a/examples/forms/echo.cgi +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/python -# -*-python-*- - -print "Content-Type: text/html\n" -import sys, os, string, cgi - -from types import ListType - -print "Form submission parameters" -form = cgi.FieldStorage() -print "

Received parameters:

" -print "
"
-for k in form.keys():
-    v = form[k]
-    if isinstance(v, ListType):
-        vs = []
-        for item in v:
-            vs.append(item.value)
-        text = string.join(vs, ", ")
-    else:
-        text = v.value
-    print "%s: %s" % (cgi.escape(k), cgi.escape(text))
-print "
" diff --git a/examples/forms/example.html b/examples/forms/example.html deleted file mode 100644 index c1878f3..0000000 --- a/examples/forms/example.html +++ /dev/null @@ -1,54 +0,0 @@ - - -Example - - - - -
- - - - - - - - - - - - - - - - - - - -