diff --git a/Documentation.pdf b/Documentation.pdf new file mode 100644 index 0000000000..9aa0b245a8 Binary files /dev/null and b/Documentation.pdf differ diff --git a/Project Progress Report.pdf b/Project Progress Report.pdf new file mode 100644 index 0000000000..c76c70fde2 Binary files /dev/null and b/Project Progress Report.pdf differ diff --git a/Project Proposal.pdf b/Project Proposal.pdf new file mode 100644 index 0000000000..f369792d51 Binary files /dev/null and b/Project Proposal.pdf differ diff --git a/README.md b/README.md index a7b40d2cc8..310313eac8 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,15 @@ -# CourseProject +# Enhancing and Extending Metapy versioning and functionality -Please fork this repository and paste the github link of your fork on Microsoft CMT. Detailed instructions are on Coursera under Week 1: Course Project Overview/Week 9 Activities. +Where are respective files placed? + +Project Progress Report: This can be found in the root directory of the project within a file labeled: Project Progress Report.pdf + +Source Code: + The source code for the core functionality can be found at: /metapy/src/nltk_additions.py + The respective test code for the core functionality can be found at: /metapy/src/nltk_test.py + +Documentation: Detailed documentation can be found in the root directory of the project within a file labeled: Documentation.pdf + +Software Tutorial Link: + +Software Tutorial Presentation Link: \ No newline at end of file diff --git a/metapy/CMakeLists.txt b/metapy/CMakeLists.txt new file mode 100644 index 0000000000..9a8fef42fa --- /dev/null +++ b/metapy/CMakeLists.txt @@ -0,0 +1,70 @@ +cmake_minimum_required(VERSION 3.2.0) + +project(metapy) + +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +set(CMAKE_EXPORT_COMPILE_COMMANDS 1) + +set(METAPY_PYTHON_VERSION "" CACHE STRING + "Python version to use for compiling the extension") + 
+add_subdirectory(deps/meta EXCLUDE_FROM_ALL) + +if (NOT PYTHON_INCLUDE_DIRS) + if (NOT ${METAPY_PYTHON_VERSION} STREQUAL "") + list(APPEND Python_ADDITIONAL_VERSIONS ${METAPY_PYTHON_VERSION}) + find_package(PythonLibs ${METAPY_PYTHON_VERSION} EXACT) + if (NOT PythonLibs_FOUND) + find_package(PythonLibs ${METAPY_PYTHON_VERSION} REQUIRED) + endif() + else() + find_package(PythonLibs REQUIRED) + endif() +else() + message("-- Using manual Python include dirs: ${PYTHON_INCLUDE_DIRS}") +endif() + +include_directories(${PYTHON_INCLUDE_DIRS}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/deps/pybind11/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + +add_library(metapy SHARED src/metapy_analyzers.cpp + src/metapy_classify.cpp + src/metapy_embeddings.cpp + src/metapy_index.cpp + src/metapy_learn.cpp + src/metapy_sequence.cpp + src/metapy_stats.cpp + src/metapy_parser.cpp + src/metapy_topics.cpp + src/metapy.cpp) +target_link_libraries(metapy meta-index meta-classify meta-ranker + meta-sequence meta-sequence-analyzers meta-greedy-tagger meta-parser + meta-parser-analyzers meta-embeddings meta-topics) + +# don't add a "lib" prefix to the metapy shared library +set_target_properties(metapy PROPERTIES PREFIX "") + +if (APPLE) + # OS X stupid fixes + # (see http://pybind11.readthedocs.org/en/latest/cmake.html) + set_target_properties(metapy PROPERTIES + MACOSX_RPATH "." 
+ LINK_FLAGS "-undefined dynamic_lookup " + SUFFIX ".so") +endif() + +if (WIN32) + set_target_properties(metapy PROPERTIES SUFFIX ".pyd") + target_link_libraries(metapy ${PYTHON_LIBRARY}) + target_compile_definitions(metapy PUBLIC -DMS_WIN64) + + # fix for std::_hypot has not been declared + target_compile_definitions(metapy PUBLIC -D_hypot=hypot) +endif() + +install(TARGETS metapy DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/dist/metapy) diff --git a/metapy/LICENSE.mit b/metapy/LICENSE.mit new file mode 100644 index 0000000000..a4742a0159 --- /dev/null +++ b/metapy/LICENSE.mit @@ -0,0 +1,18 @@ +Copyright (c) 2016 Sean Massung, Chase Geigle + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/metapy/LICENSE.ncsa b/metapy/LICENSE.ncsa new file mode 100644 index 0000000000..5a1e3c5324 --- /dev/null +++ b/metapy/LICENSE.ncsa @@ -0,0 +1,29 @@ +Copyright (c) 2016 Sean Massung, Chase Geigle +All rights reserved. 
+ +Developed by: MeTA Team + University of Illinois at Urbana-Champaign + https://meta-toolkit.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimers. Redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimers in the documentation and/or other materials provided with +the distribution. Neither the names of MeTA, University of Illinois, nor the +names of its contributors may be used to endorse or promote products derived +from this Software without specific prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. 
diff --git a/metapy/MANIFEST.in b/metapy/MANIFEST.in new file mode 100644 index 0000000000..7e520961df --- /dev/null +++ b/metapy/MANIFEST.in @@ -0,0 +1,16 @@ +# Licenses +include LICENSE.mit +include LICENSE.ncsa + +# Build system +include CMakeLists.txt + +# Dependencies +recursive-include deps ** + +# metapy source code +recursive-include src ** +recursive-include include ** + +# metapy package +recursive-include dist/metapy ** diff --git a/metapy/README.md b/metapy/README.md new file mode 100644 index 0000000000..e2436d3120 --- /dev/null +++ b/metapy/README.md @@ -0,0 +1,61 @@ +# metapy: (experimental) Python bindings for [MeTA][meta] + +[![Build Status](https://travis-ci.org/meta-toolkit/metapy.svg?branch=master)](https://travis-ci.org/meta-toolkit/metapy) + +[![Windows Build Status](https://ci.appveyor.com//api/projects/status/github/meta-toolkit/metapy?svg=true&branch=master)](https://ci.appveyor.com/project/skystrife/metapy) + +This project provides Python (2.7 and 3.x are supported) bindings for the +MeTA toolkit. They are still very much under construction, but the goal is +to make it seamless to use MeTA's components within any Python application +(e.g., a Django or Flask web app). + +This project is made possible by the excellent [pybind11][pybind11] +library. + +## Getting Started (the easy way) + +```bash +# Ensure your pip is up to date +pip install --upgrade pip + +# install metapy! +pip install metapy +``` + +This should work on Linux, OS X, and Windows with pretty much any recent +Python version >= 2.7. On Linux, make sure to update your `pip` to version +8.1 so you can install from a binary package---this will save you a lot of +time. + +## Getting Started (the hard way) + +You will, of course, need Python installed. You will also need its headers +to be installed as well, so look for a `python-dev` or similar package for +your system. Beyond that, you'll of course need to satisfy the requirements +for [building MeTA itself][build-guide]. 
+ +This repository should have everything you need to get started. You should +ensure that you've fetched all of the submodules first, though: + +```bash +git submodule update --init --recursive +``` + +Once that's done, you should be able to build the library like so: + +```bash +mkdir build +cd build +cmake .. -DCMAKE_BUILD_TYPE=Release +make +``` + +You can force building against a specific version of Python if you happen +to have multiple versions installed by specifying +`-DMETAPY_PYTHON_VERSION=x.y` when invoking `cmake`. + +The module should be written to `metapy.so` in the build directory. + +[meta]: https://meta-toolkit.org +[pybind11]: https://github.com/pybind/pybind11 +[build-guide]: https://meta-toolkit.org/setup-guide.html diff --git a/metapy/examples/query_runner.py b/metapy/examples/query_runner.py new file mode 100644 index 0000000000..ca2b421f23 --- /dev/null +++ b/metapy/examples/query_runner.py @@ -0,0 +1,55 @@ +""" +Mimics MeTA's query-runner program. +""" + +import math +import sys +import time + +import metapy + +class PL2Ranker(metapy.index.RankingFunction): + """ + Create a new ranking function in Python that can be used in MeTA + """ + def __init__(self, c_param=0.5): + self.c = c_param + super(PL2Ranker, self).__init__() + + def score_one(self, sd): + lda = sd.num_docs / sd.corpus_term_count + tfn = sd.doc_term_count * math.log2(1.0 + self.c * sd.avg_dl / + sd.doc_size) + if lda < 1 or tfn <= 0: + return 0.0 + numerator = tfn * math.log2(tfn * lda) \ + + math.log2(math.e) * (1.0 / lda - tfn) \ + + 0.5 * math.log2(2.0 * math.pi * tfn) + return sd.query_term_weight * numerator / (tfn + 1.0) + +if __name__ == '__main__': + + if len(sys.argv) != 4: + print("Usage: {} config.toml queries.txt start_query".format(sys.argv[0])) + sys.exit(1) + + cfg = sys.argv[1] + idx = metapy.index.make_inverted_index(cfg) + + query_path = sys.argv[2] + query_num = int(sys.argv[3]) + start_time = time.time() + with open(query_path) as query_file: + pl2 = 
PL2Ranker() + for line in query_file: + query = metapy.index.Document() + query.content(line.strip()) + res_num = 1 + for doc in pl2.score(idx, query, 1000): + docno = idx.metadata(doc[0]).get('name') + print("{}\t_\t{}\t{}\t{}\tMeTA".format( query_num, docno, + res_num, doc[1])) + res_num += 1 + query_num += 1 + + print("Elapsed: {} seconds".format(round(time.time() - start_time, 4))) diff --git a/metapy/get-release.py b/metapy/get-release.py new file mode 100755 index 0000000000..454277d707 --- /dev/null +++ b/metapy/get-release.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +from __future__ import print_function + +from clint.textui import progress +import requests +import sys + +if len(sys.argv) != 2: + print("Usage: {} release-tag".format(sys.argv[0])) + sys.exit(1) + +baseurl = 'https://api.github.com/repos/meta-toolkit/metapy/releases/tags' + +r = requests.get('{}/{}'.format(baseurl, sys.argv[1])) + +if r.status_code != 200: + print("Error: {}".format(r.status_code)) + print(r.text) + sys.exit(1) + +json = r.json() + +print("Found release {} tagged by {}".format(json['tag_name'], + json['author']['login'])) + +for asset in json['assets']: + url = asset['browser_download_url'] + name = asset['name'] + print("Fetching {}...".format(name)) + + r = requests.get(url, stream=True) + if r.status_code != 200: + print("Error fetching {}: {}".format(name, r.status_code)) + print(r.text) + sys.exit(1) + + with open('dist/{}'.format(name), 'wb') as f: + total_length = int(r.headers.get('content-length')) + for chunk in progress.bar(r.iter_content(chunk_size = 4096), + expected_size = total_length / 4096 + 1): + if chunk: + f.write(chunk) + f.flush() + +print("Done!") diff --git a/metapy/include/metapy_analyzers.h b/metapy/include/metapy_analyzers.h new file mode 100644 index 0000000000..e06405b704 --- /dev/null +++ b/metapy/include/metapy_analyzers.h @@ -0,0 +1,13 @@ +/** + * @file metapy_analyzers.h + * @author Chase Geigle + */ + +#ifndef METAPY_ANALYZERS_H_ +#define 
METAPY_ANALYZERS_H_ + +#include +#include + +void metapy_bind_analyzers(pybind11::module& m); +#endif diff --git a/metapy/include/metapy_classify.h b/metapy/include/metapy_classify.h new file mode 100644 index 0000000000..32a19f0377 --- /dev/null +++ b/metapy/include/metapy_classify.h @@ -0,0 +1,14 @@ +/** + * @file metapy_classify.h + * @author Chase Geigle + */ + +#ifndef METAPY_CLASSIFY_H_ +#define METAPY_CLASSIFY_H_ + +#include +#include + +void metapy_bind_classify(pybind11::module& m); + +#endif diff --git a/metapy/include/metapy_embeddings.h b/metapy/include/metapy_embeddings.h new file mode 100644 index 0000000000..d03fa31d03 --- /dev/null +++ b/metapy/include/metapy_embeddings.h @@ -0,0 +1,14 @@ +/** + * @file metapy_embeddings.h + * @author Chase Geigle + */ + +#ifndef METAPY_EMBEDDINGS_H_ +#define METAPY_EMBEDDINGS_H_ + +#include +#include + +void metapy_bind_embeddings(pybind11::module& m); + +#endif diff --git a/metapy/include/metapy_identifiers.h b/metapy/include/metapy_identifiers.h new file mode 100644 index 0000000000..3c53d602eb --- /dev/null +++ b/metapy/include/metapy_identifiers.h @@ -0,0 +1,59 @@ +/** + * @file metapy_identifiers.h + * @author Chase Geigle + * + * Provides a caster to/from meta::util::identifier for pybind11. 
+ */ + +#ifndef METAPY_IDENTIFIERS_H_ +#define METAPY_IDENTIFIERS_H_ + +#include +#include + +#include "meta/meta.h" + +namespace pybind11 +{ +namespace detail +{ + +template +struct identifier_caster +{ + using underlying_type = typename Type::underlying_type; + using type_conv = make_caster; + + PYBIND11_TYPE_CASTER(Type, _("id[") + type_conv::name() + _("]")); + + bool load(handle src, bool convert) + { + type_conv conv; + if (!conv.load(src, convert)) + return false; + value = Type{(underlying_type)conv}; + return true; + } + + static handle cast(const Type& src, return_value_policy policy, + handle parent) + { + return type_conv::cast(static_cast(src), policy, + parent); + } +}; + +template +struct type_caster> + : identifier_caster> +{ +}; + +template +struct type_caster> + : identifier_caster> +{ +}; +} +} +#endif diff --git a/metapy/include/metapy_index.h b/metapy/include/metapy_index.h new file mode 100644 index 0000000000..09ec959dfe --- /dev/null +++ b/metapy/include/metapy_index.h @@ -0,0 +1,91 @@ +/** + * @file metapy_index.h + * @author Chase Geigle + * + */ + +#ifndef METAPY_INDEX_H_ +#define METAPY_INDEX_H_ + +#include +#include + +#include "meta/index/ranker/ranker.h" +#include "meta/util/optional.h" +#include "metapy_identifiers.h" + +namespace pybind11 +{ +namespace detail +{ +// add conversion for meta::index::search_result +// see: pybind11/cast.h +template <> +struct type_caster +{ + using type = std::pair; + + // Python -> C++ + bool load(handle src, bool convert) + { + value = meta::util::nullopt; + make_caster conv; + if (!conv.load(src, convert)) + return false; + + auto pr = static_cast(conv); + value = meta::index::search_result{pr.first, pr.second}; + return true; + } + + // C++ -> Python + static handle cast(const meta::index::search_result& sr, + return_value_policy policy, handle parent) + { + auto o1 = reinterpret_steal( + make_caster::cast(sr.d_id, policy, parent)); + auto o2 = reinterpret_steal( + make_caster::cast(sr.score, 
policy, parent)); + + if (!o1 || !o2) + return handle(); + + tuple result(2); + PyTuple_SET_ITEM(result.ptr(), 0, o1.release().ptr()); + PyTuple_SET_ITEM(result.ptr(), 1, o2.release().ptr()); + return result.release(); + } + + static PYBIND11_DESCR name() + { + return type_descr(_("SearchResult")); + } + + static handle cast(const meta::index::search_result* sr, + return_value_policy policy, handle parent) + { + return cast(*sr, policy, parent); + } + + operator meta::index::search_result*() + { + return &*value; + } + + operator meta::index::search_result&() + { + return *value; + } + + template + using cast_op_type = pybind11::detail::cast_op_type; + + protected: + meta::util::optional value; +}; +} +} + +void metapy_bind_index(pybind11::module& m); + +#endif diff --git a/metapy/include/metapy_learn.h b/metapy/include/metapy_learn.h new file mode 100644 index 0000000000..8bcbff345b --- /dev/null +++ b/metapy/include/metapy_learn.h @@ -0,0 +1,33 @@ +/** + * @file metapy_learn.h + * @author Chase Geigle + */ + +#ifndef METAPY_LEARN_H_ +#define METAPY_LEARN_H_ + +#include +#include + +template +DatasetView make_sliced_dataset_view(const DatasetView& dv, + pybind11::slice slice) +{ + std::size_t start, stop, step, slicelength; + if (!slice.compute(dv.size(), &start, &stop, &step, &slicelength)) + throw pybind11::error_already_set{}; + + std::vector indices(slicelength); + auto it = dv.begin() + start; + for (std::size_t i = 0; i < slicelength; ++i) + { + indices[i] = it->id; + it += step; + } + + return DatasetView{dv, std::move(indices)}; +} + +void metapy_bind_learn(pybind11::module& m); + +#endif diff --git a/metapy/include/metapy_parser.h b/metapy/include/metapy_parser.h new file mode 100644 index 0000000000..7304ddbfe4 --- /dev/null +++ b/metapy/include/metapy_parser.h @@ -0,0 +1,14 @@ +/** + * @file metapy_parser.h + * @author Chase Geigle + */ + +#ifndef METAPY_PARSER_H_ +#define METAPY_PARSER_H_ + +#include +#include + +void 
metapy_bind_parser(pybind11::module& m); + +#endif diff --git a/metapy/include/metapy_probe_map.h b/metapy/include/metapy_probe_map.h new file mode 100644 index 0000000000..bf3bf3e8c3 --- /dev/null +++ b/metapy/include/metapy_probe_map.h @@ -0,0 +1,81 @@ +/** + * @file metapy_probe_map.h + * @author Chase Geigle + * + * Provides a caster from hashing::probe_map for pybind11. + */ + +#ifndef METAPY_PROBE_MAP_H_ +#define METAPY_PROBE_MAP_H_ + +#include +#include + +#include "meta/hashing/probe_map.h" + +namespace pybind11 +{ +namespace detail +{ + +// add conversion for hashing::probe_map +// @see pybind11/stl.h +template +struct probe_map_caster +{ + using type = Type; + using key_conv = type_caster::type>; + using value_conv = type_caster::type>; + + bool load(handle src, bool convert) + { + dict d{src, true}; + if (!d.check()) + return false; + make_caster kconv; + make_caster vconv; + value.clear(); + for (auto it : d) + { + if (!kconv.load(it.first.ptr(), convert) + || !vconv.load(it.second.ptr(), convert)) + return false; + value.emplace((Key)kconv, (Value)vconv); + } + return true; + } + + static handle cast(const type& src, return_value_policy policy, + handle parent) + { + dict d; + for (const auto& kv : src) + { + auto key = reinterpret_steal( + make_caster::cast(kv.key(), policy, parent)); + auto value = reinterpret_steal( + make_caster::cast(kv.value(), policy, parent)); + if (!key || !value) + return handle{}; + d[key] = value; + } + return d.release(); + } + + PYBIND11_TYPE_CASTER(type, _("dict<") + key_conv::name() + _(", ") + + value_conv::name() + _(">")); +}; + +template +struct type_caster> + : probe_map_caster, + Key, Value> +{ +}; +} +} + +#endif diff --git a/metapy/include/metapy_sequence.h b/metapy/include/metapy_sequence.h new file mode 100644 index 0000000000..3689e5a520 --- /dev/null +++ b/metapy/include/metapy_sequence.h @@ -0,0 +1,14 @@ +/** + * @file metapy_sequence.h + * @author Chase Geigle + */ + +#ifndef METAPY_SEQUENCE_H_ +#define 
METAPY_SEQUENCE_H_ + +#include +#include + +void metapy_bind_sequence(pybind11::module& m); + +#endif diff --git a/metapy/include/metapy_stats.h b/metapy/include/metapy_stats.h new file mode 100644 index 0000000000..efffafe124 --- /dev/null +++ b/metapy/include/metapy_stats.h @@ -0,0 +1,145 @@ +/** + * @file metapy_stats.h + * @author Chase Geigle + */ + +#ifndef METAPY_STATS_H_ +#define METAPY_STATS_H_ + +#include +#include + +#include "meta/stats/multinomial.h" + +/** + * Wrapper class for stats::multinomial for Python. This makes it so we + * don't have to bind stats::multinomial multiple times for each T we want + * to use. Instead, we just need to convert it to a py_multinomial at the + * return site in the python binding function. + */ +class py_multinomial +{ + public: + template + py_multinomial(const meta::stats::multinomial& dist) + : concept_{meta::make_unique>(dist)} + { + // nothing + } + + void increment(pybind11::object obj, double count) + { + concept_->increment(obj, count); + } + + void decrement(pybind11::object obj, double count) + { + concept_->decrement(obj, count); + } + + double counts(pybind11::object obj) const + { + return concept_->counts(obj); + } + + double counts() const + { + return concept_->counts(); + } + + uint64_t unique_events() const + { + return concept_->unique_events(); + } + + void each_seen_event(std::function fun) const + { + concept_->each_seen_event(fun); + } + + void clear() + { + concept_->clear(); + } + + double probability(pybind11::object obj) const + { + return concept_->probability(obj); + } + + private: + class multinomial_concept + { + public: + virtual ~multinomial_concept() = default; + virtual void increment(pybind11::object obj, double count) = 0; + virtual void decrement(pybind11::object obj, double count) = 0; + virtual double counts(pybind11::object obj) const = 0; + virtual double counts() const = 0; + virtual uint64_t unique_events() const = 0; + virtual void each_seen_event( + std::function fun) 
const = 0; + virtual void clear() = 0; + virtual double probability(pybind11::object obj) const = 0; + }; + + template + class multinomial_impl : public multinomial_concept + { + public: + multinomial_impl(const meta::stats::multinomial& dist) : dist_{dist} + { + // nothing + } + + void increment(pybind11::object obj, double count) override + { + dist_.increment(obj.cast(), count); + } + + void decrement(pybind11::object obj, double count) override + { + dist_.decrement(obj.cast(), count); + } + + double counts(pybind11::object obj) const override + { + return dist_.counts(obj.cast()); + } + + double counts() const override + { + return dist_.counts(); + } + + uint64_t unique_events() const override + { + return dist_.unique_events(); + } + + void each_seen_event( + std::function fun) const override + { + dist_.each_seen_event( + [&](const T& event) { fun(pybind11::cast(event)); }); + } + + void clear() override + { + dist_.clear(); + } + + double probability(pybind11::object obj) const override + { + return dist_.probability(obj.cast()); + } + + private: + meta::stats::multinomial dist_; + }; + + std::unique_ptr concept_; +}; + +void metapy_bind_stats(pybind11::module& m); +#endif diff --git a/metapy/include/metapy_topics.h b/metapy/include/metapy_topics.h new file mode 100644 index 0000000000..0aa9a1e653 --- /dev/null +++ b/metapy/include/metapy_topics.h @@ -0,0 +1,83 @@ +/** + * @file metapy_topics.h + * @author Sean Massung + */ + +#ifndef METAPY_TOPICS_H_ +#define METAPY_TOPICS_H_ + +#include +#include + +#include "meta/topics/topic_model.h" +#include "metapy_identifiers.h" + +namespace pybind11 +{ +namespace detail +{ +namespace metapy +{ +template +struct prob_caster +{ + PYBIND11_TYPE_CASTER(TupleType, _("Probability")); + + bool load(handle src, bool convert) + { + if (!isinstance(src)) + return false; + + const auto seq = reinterpret_borrow(src); + if (seq.size() != 2) + return false; + + if (!first.load(seq[0], convert) || !second.load(seq[1], convert)) 
+ return false; + value.tid = (Identifier)first; + value.probability = (double)second; + return true; + } + + static handle cast(const TupleType& src, return_value_policy& policy, + handle& parent) + { + auto id = reinterpret_steal( + make_caster::cast(src.tid, policy, parent)); + auto prob = reinterpret_steal( + make_caster::cast(src.probability, policy, parent)); + + if (!id || !prob) + return handle(); + + tuple result(2); + PyTuple_SET_ITEM(result.ptr(), 0, id.release().ptr()); + PyTuple_SET_ITEM(result.ptr(), 1, prob.release().ptr()); + return result.release(); + } + + protected: + make_caster first; + make_caster second; +}; +} + +// add conversion for meta::topics::term_prob +template <> +struct type_caster + : metapy::prob_caster +{ +}; + +// add conversion for meta::topics::topic_prob +template <> +struct type_caster + : metapy::prob_caster +{ +}; +} +} + +void metapy_bind_topics(pybind11::module& m); + +#endif diff --git a/metapy/make-release.sh b/metapy/make-release.sh new file mode 100755 index 0000000000..4113092559 --- /dev/null +++ b/metapy/make-release.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -eo pipefail + +version=$(git describe --tags) + +confirm() { + read -r -p "${1:-Are you sure? [y/N]} " response + case $response in + [yY][eE][sS]|[yY]) + true + ;; + *) + false + ;; + esac +} + +echo "Releasing metapy-${version}..." +confirm || exit 1 + +echo "Creating source distribution..." +python setup.py sdist --formats=gztar + +echo "Fetching wheels from GitHub release..." +python get-release.py ${version} + +echo "Uploading to PyPI..." +twine upload -s dist/*.{gz,whl} + +echo "Done!" 
diff --git a/metapy/setup.py b/metapy/setup.py new file mode 100644 index 0000000000..2e6f6e85e8 --- /dev/null +++ b/metapy/setup.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python +from __future__ import print_function + +import contextlib +import os +import platform +import shutil +import subprocess +import sys +import tempfile + +from setuptools import setup, Extension +from setuptools.command import build_ext, bdist_egg, develop +from distutils.spawn import find_executable +from distutils import log, sysconfig +from distutils.command import build + +VERSION = "0.2.13" + +# See: +# http://stackoverflow.com/questions/3223604/how-to-create-a-temporary-directory-and-get-the-path-file-name-in-python +@contextlib.contextmanager +def cd(newdir, cleanup=lambda: True): + previdr = os.getcwd() + os.chdir(os.path.expanduser(newdir)) + try: + yield + finally: + os.chdir(previdr) + cleanup() + +@contextlib.contextmanager +def tempdir(): + dirpath = tempfile.mkdtemp() + def cleanup(): + # see http://stackoverflow.com/questions/2656322/shutil-rmtree-fails-on-windows-with-access-is-denied + def onerror(func, path, exc_info): + import stat + if not os.access(path, os.W_OK): + os.chmod(path, stat.S_IWUSR) + func(path) + else: + raise + + shutil.rmtree(dirpath, onerror=onerror) + with cd(dirpath, cleanup): + yield dirpath + +# Based on https://github.com/symengine/symengine.py/blob/master/setup.py +class CMakeBuildExt(build.build): + user_options = build.build.user_options + \ + [('icu-root=', None, "Path to ICU root"), + ('generator=', None, "CMake build generator")] + + def initialize_options(self): + build.build.initialize_options(self) + self.icu_root = None + self.generator = 'MSYS Makefiles' if platform.system() == 'Windows' else None + + def cmake_build(self): + src_dir = os.path.dirname(os.path.realpath(__file__)) + + cmake_exe = find_executable("cmake") + if not cmake_exe: + raise EnvironmentError("Could not find cmake executable") + + py_version = 
"{}.{}".format(sys.version_info[0], sys.version_info[1]) + cmake_cmd = [cmake_exe, src_dir, "-DCMAKE_BUILD_TYPE=Release", + "-DMETA_STATIC_UTF=On", "-DBUILD_STATIC_ICU=On"] + + cmake_cmd.append("-DPYTHON_INCLUDE_DIRS={}".format(sysconfig.get_python_inc())) + if platform.system() == 'Windows': + libpython = "libpython{}{}.a".format(sys.version_info[0], + sys.version_info[1]) + libpython_path = os.path.join(sysconfig.get_python_inc(), + '..', 'libs', libpython) + cmake_cmd.append("-DPYTHON_LIBRARY={}".format(libpython_path)) + + if self.icu_root: + cmake_cmd.extend(["-DICU_ROOT={}".format(self.icu_root)]) + + if self.generator: + cmake_cmd.extend(["-G{}".format(self.generator)]) + + with tempdir() as dirpath: + print("Build directory: {}".format(os.getcwd())) + if subprocess.call(cmake_cmd) != 0: + raise EnvironmentError("CMake invocation failed") + + if subprocess.call([cmake_exe, "--build", "."]) != 0: + raise EnvironmentError("CMake build failed") + + if subprocess.call([cmake_exe, "--build", ".", "--target", + "install"]) != 0: + raise EnvironmentError("CMake install failed") + + # Make dummy __init__.py + initpy = os.path.join(src_dir, "dist", "metapy", "__init__.py") + + with open(initpy, "w") as f: + f.write("from .metapy import *\n") + f.write('__version__ = "{}"\n'.format(VERSION)) + + # Copy over extra DLLs on Windows + if platform.system() == 'Windows': + dlls = ['libwinpthread-1.dll', 'libgcc_s_seh-1.dll', 'libstdc++-6.dll', 'zlib1.dll'] + for dll in dlls: + shutil.copyfile(os.path.join("c:", os.sep, "msys64", "mingw64", "bin", dll), + os.path.join(src_dir, "dist", "metapy", dll)) + + def run(self): + self.cmake_build() + return build.build.run(self) + +class DummyBuildExt(build_ext.build_ext): + def __init__(self, *args, **kwargs): + build_ext.build_ext.__init__(self, *args, **kwargs) + + def run(self): + # do nothing; cmake already built the extension + pass + +class DummyBDistEgg(bdist_egg.bdist_egg): + def __init__(self, *args, **kwargs): + 
bdist_egg.bdist_egg.__init__(self, *args, **kwargs) + + def run(self): + self.run_command("build") + return bdist_egg.bdist_egg.run(self) + +class DummyDevelop(develop.develop): + def __init__(self, *args, **kwargs): + develop.develop.__init__(self, *args, **kwargs) + + def run(self): + self.run_command("build") + return develop.develop.run(self) + +def clean_dist(): + src_dir = os.path.dirname(os.path.realpath(__file__)) + + dist_dir = os.path.join(src_dir, "dist", "metapy") + if os.path.exists(dist_dir): + log.info("Deleting distribution directory {}".format(dist_dir)) + shutil.rmtree(dist_dir) + + os.makedirs(dist_dir) + +clean_dist() + +setup(name = 'metapy', + version = VERSION, + description = 'Python bindings for MeTA', + author = 'Chase Geigle', + author_email = 'geigle1@illinois.edu', + url = 'https://github.com/meta-toolkit/metapy', + license = 'MIT', + packages = ['metapy'], + package_dir = { '': 'dist' }, + include_package_data = True, + cmdclass = { + 'build': CMakeBuildExt, + 'build_ext': DummyBuildExt, + 'bdist_egg': DummyBDistEgg, + 'develop': DummyDevelop + }, + zip_safe = False, + ext_modules = [Extension('metapy', [])], + ext_package='metapy', + classifiers = [ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'License :: OSI Approved :: University of Illinois/NCSA Open Source License', + 'Operating System :: POSIX :: Linux', + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: Microsoft :: Windows', + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Scientific/Engineering :: Information Analysis', + 'Topic :: Text Processing', + 'Topic :: Text Processing :: Filters', + 'Topic :: Text Processing :: General', + 'Topic :: Text Processing :: Indexing', + 'Topic :: Text Processing :: Linguistic', + ], + keywords = [ + 'NLP', 'natural language processing', + 'IR', 
'information retrieval', + 'CL', 'computational lingusitics', + 'parsing', 'tagging', 'tokenizing', 'syntax', 'lingustics', + 'natural language', 'text mining', 'text analysis' + ]) diff --git a/metapy/src/metapy.cpp b/metapy/src/metapy.cpp new file mode 100644 index 0000000000..64c23378d2 --- /dev/null +++ b/metapy/src/metapy.cpp @@ -0,0 +1,76 @@ +/** + * @file metapy.cpp + * @author Chase Geigle + * + * This file defines the metapy module and bindings for the MeTA API. It + * does not attempt to be completely comprehensive at this time (though + * that is an eventual goal), but it aims to provide at least enough of an + * API surface so that interactive web demos can be made. + */ + +#include "metapy_analyzers.h" +#include "metapy_classify.h" +#include "metapy_embeddings.h" +#include "metapy_index.h" +#include "metapy_learn.h" +#include "metapy_parser.h" +#include "metapy_sequence.h" +#include "metapy_stats.h" +#include "metapy_topics.h" + +#include "meta/logging/logger.h" +#include "meta/parser/analyzers/tree_analyzer.h" +#include "meta/sequence/analyzers/ngram_pos_analyzer.h" + +namespace py = pybind11; + +PYBIND11_PLUGIN(metapy) +{ + py::module m{"metapy", "MeTA toolkit python bindings"}; + + meta::sequence::register_analyzers(); + meta::parser::register_analyzers(); + + metapy_bind_index(m); + metapy_bind_analyzers(m); + metapy_bind_learn(m); + metapy_bind_classify(m); + metapy_bind_sequence(m); + metapy_bind_parser(m); + metapy_bind_embeddings(m); + metapy_bind_stats(m); + metapy_bind_topics(m); + + // printing::progress makes this really difficult to reason about. + // Progress updating occurs from a separate thread. This is fine, + // except that we need to use the Python stderr here instead of the + // usual std::cerr. In order to do that, we need the GIL. We run into + // problems when the current thread holds the GIL and then the progress + // thread attempts to acquire it. 
So, **any function that uses progress + // reporting must release the GIL before being invoked**! + m.def("log_to_stderr", []() { + // separate logging for progress output + meta::logging::add_sink( + {[](const std::string& line) { + py::gil_scoped_acquire gil; + py::module::import("sys").attr("stderr").attr("write")(line); + }, + []() {}, + [](const meta::logging::logger::log_line& ll) { + return ll.severity() + == meta::logging::logger::severity_level::progress; + }, + [](const meta::logging::logger::log_line& ll) { + return " " + ll.str(); + }}); + + meta::logging::add_sink( + {[](const std::string& line) { + py::gil_scoped_acquire gil; + py::module::import("sys").attr("stderr").attr("write")(line); + }, + []() {}, meta::logging::logger::severity_level::trace}); + }); + + return m.ptr(); +} diff --git a/metapy/src/metapy_analyzers.cpp b/metapy/src/metapy_analyzers.cpp new file mode 100644 index 0000000000..dbe060a18e --- /dev/null +++ b/metapy/src/metapy_analyzers.cpp @@ -0,0 +1,477 @@ +/** + * @file metapy_analyzers.cpp + * @author Chase Geigle + * + * This file defines the metapy.analyzers submodule and creates bindings for + * that part of the MeTA API. 
+ */ + +#include +#include +#include + +#include +#include + +#include "metapy_analyzers.h" +#include "metapy_identifiers.h" +#include "metapy_probe_map.h" + +#include "cpptoml.h" +#include "meta/analyzers/all.h" +#include "meta/analyzers/filters/all.h" +#include "meta/analyzers/token_stream.h" +#include "meta/analyzers/tokenizers/character_tokenizer.h" +#include "meta/analyzers/tokenizers/icu_tokenizer.h" +#include "meta/corpus/document.h" +#include "meta/parallel/thread_pool.h" +#include "meta/parser/analyzers/featurizers/all.h" +#include "meta/parser/analyzers/tree_analyzer.h" +#include "meta/sequence/analyzers/ngram_pos_analyzer.h" +#include "meta/util/algorithm.h" + +namespace py = pybind11; +using namespace meta; + +/** + * This class is a "trampoline" class to bounce functions back to Python + * if they are overloaded there rather than in C++ directly. + */ +class py_token_stream + : public util::clonable +{ + public: + virtual std::string next() override + { + PYBIND11_OVERLOAD_PURE(std::string, analyzers::token_stream, next, ); + return ""; + } + + /** + * Determines whether there are more tokens available in the + * stream. + */ + virtual operator bool() const override + { +#if PY_MAJOR_VERSION < 3 + PYBIND11_OVERLOAD_PURE_NAME(bool, analyzers::token_stream, + "__nonzero__", operator bool,); +#else + PYBIND11_OVERLOAD_PURE_NAME(bool, analyzers::token_stream, + "__bool__", operator bool,); +#endif + return false; + } + + /** + * Sets the content for the stream. + * @param content The string content to set + */ + virtual void set_content(std::string&& content) override + { + PYBIND11_OVERLOAD_PURE(void, analyzers::token_stream, set_content, + std::move(content)); + } +}; + +/** + * This class holds a token_stream that was defined in Python, but was + * created using C++. + * + * This is where stuff gets weird. We want our tokenizer_factory to return + * std::unique_ptr when invoked with an id and a config + * group. 
The problem is that we don't have a good way to get a unique_ptr + * out of Python code if the token stream is defined there, since that + * basically entails having Python relinquish ownership of something, which + * it isn't apt to do. + * + * Instead, what we do is have a separate class that can wrap an object + * created by invoking Python code directly from C++. This doesn't give us + * a unique_ptr, but we can enforce that ourselves directly. The object + * will down convert nicely to a token_stream through pybind11's casting + * utilities, since it is still a token_stream derivative at heart. We just + * can't get at its unique_ptr. We store a py::object to keep the reference + * count > 0, and then a token_stream* that we do all of the actual work + * with. We have to override all of the virtual functions again, but that + * isn't too much work. + * + * Since we're not using the PYBIND11_OVERLOAD functions anymore (since + * this object isn't the one registered with Python), we have to be careful + * with the GIL. Each function here acquires the GIL immediately before + * doing anything else so that hitting Python is safely behind the lock. + * This means things are going to be a lot slower, of course, but it's the + * only way I can think of for doing this safely for now. + * + * We also want to be able to clone token_streams, since that's how we set + * up the pipeline replicas across all of the threads. We can do that by + * providing a copy constructor that calls into Python by invoking + * `copy.deepcopy(obj)` to copy our current Python object. + * + * Finally, our destructor is weird since we want to decrement the object's + * reference count while still inside the GIL. 
+ */ +class cpp_created_py_token_stream + : public util::clonable +{ + public: + cpp_created_py_token_stream(py::object obj) + : obj_{obj}, stream_{obj_.cast()} + { + // nothing + } + + cpp_created_py_token_stream(const cpp_created_py_token_stream& other) + { + py::gil_scoped_acquire acq; + auto deepcopy = py::module::import("copy").attr("deepcopy"); + obj_ = deepcopy.cast()(other.obj_); + stream_ = obj_.cast(); + } + + virtual std::string next() override + { + py::gil_scoped_acquire acq; + return stream_->next(); + } + + virtual operator bool() const override + { + py::gil_scoped_acquire acq; + return *stream_; + } + + virtual void set_content(std::string&& content) override + { + py::gil_scoped_acquire acq; + stream_->set_content(std::move(content)); + } + + ~cpp_created_py_token_stream() + { + py::gil_scoped_acquire acq; + obj_.release().dec_ref(); + } + + private: + py::object obj_; + token_stream* stream_; +}; + +/** + * Registers a Python object with a factory. + * + * There are two major assumptions here. First, we assume that the *Class* + * object passed here has an "id" property, just like we assume the MeTA + * classes do in C++. + * + * We have to do a bit of trickery here, though. The factories typically + * map util::string_view to creation functions. This is fine in the C++ + * code, since every id ends up being a static C string somewhere in the + * data segment. But here, the ids are in Python and are dynamically + * allocated. To save ourselves headache and prevent UB, we have a static + * function level cache here to store the strings we've added to the + * factories so that the util::string_view will be valid. + * + * There's probably a better way of doing this, but this currently works. 
+ */ +template +void py_factory_register(py::object cls, FactoryType& factory, + CreationFunction&& c_fun) +{ + static std::vector ids; + static std::mutex mut; + util::string_view id; + { + std::lock_guard lock{mut}; + ids.push_back(cls.attr("id").cast()); + id = ids.back(); + } + std::cerr << "filter_factory adding " << id << std::endl; + factory.add(id, c_fun); +} + +class py_analyzer : public util::clonable +{ + virtual void tokenize(const corpus::document& doc, + analyzers::featurizer& counts) override + { + PYBIND11_OVERLOAD_PURE(void, analyzers::analyzer, tokenize, doc, + counts); + } +}; + +template +void make_token_stream(TokenStream& next, const analyzers::token_stream& prev, + Args... args) +{ + new (&next) TokenStream(prev.clone(), args...); +} + +template +py::object ngram_analyze(NGramAnalyzer& ana, const corpus::document& doc) +{ + if (ana.n_value() == 1) + return py::cast(ana.template analyze(doc)); + + auto ngrams = ana.template analyze(doc); + + py::dict ret; + for (const auto& kv : ngrams) + { + const auto& key = kv.key(); + + using iterator = decltype(key.begin()); + + py::tuple newkey{ana.n_value()}; + uint64_t idx = 0; + util::for_each_token(key.begin(), key.end(), "_", + [&](iterator first, iterator last) { + if (first != last) + newkey[idx++] = py::str({first, last}); + }); + ret[newkey] = py::cast(kv.value()); + } + + return ret; +} + +/** + * A visitor class for converting a TOML configuration group to a Python + * dictionary. We use this to convert TOML tables to keyword arguments for + * token_streams defined in Python. 
+ */ +class py_toml_visitor +{ + public: + template + void visit(const cpptoml::value& v, py::object& obj) + { + obj = py::cast(v.get()); + } + + void visit(const cpptoml::table& table, py::object& obj) + { + obj = py::dict(); + auto dict = obj.cast(); + + for (const auto& pr : table) + { + auto key = py::cast(pr.first); + py::object value; + pr.second->accept(*this, value); + dict[key] = value; + } + } + + void visit(const cpptoml::array& arr, py::object& obj) + { + obj = py::list(); + auto lst = obj.cast(); + for (const auto& val : arr) + { + py::object value; + val->accept(*this, value); + lst.append(value); + } + } + + void visit(const cpptoml::table_array& tarr, py::object& obj) + { + obj = py::list(); + auto lst = obj.cast(); + for (const auto& table : tarr) + { + py::object value; + table->accept(*this, value); + lst.append(value); + } + } +}; + +class py_token_stream_iterator +{ + analyzers::token_stream& stream_; + py::object ref_; + + public: + py_token_stream_iterator(analyzers::token_stream& stream, py::object ref) + : stream_(stream), ref_(ref) + { + // nothing + } + + std::string next() + { + if (!stream_) + throw py::stop_iteration(); + return stream_.next(); + } +}; + +void metapy_bind_analyzers(py::module& m) +{ + using namespace analyzers; + + auto m_ana = m.def_submodule("analyzers"); + + py::class_ ts_base{m_ana, "TokenStream"}; + ts_base.def(py::init<>()) + .def("next", + [](token_stream& ts) { + if (!ts) + throw py::stop_iteration(); + return ts.next(); + }) + .def("set_content", + [](token_stream& ts, std::string str) { + ts.set_content(std::move(str)); + }) + .def("__bool__", [](token_stream& ts) { return static_cast(ts); }) + .def("__iter__", + [](py::object ts) { + return py_token_stream_iterator(ts.cast(), ts); + }) + .def("__deepcopy__", + [](token_stream& ts, py::dict&) { return ts.clone(); }); + + py::class_(ts_base, "Iterator") + .def("__iter__", + [](py_token_stream_iterator& it) -> py_token_stream_iterator& { + return it; + }) + 
.def("__next__", &py_token_stream_iterator::next); + + // tokenizers + py::class_{m_ana, "CharacterTokenizer", + ts_base} + .def(py::init<>()); + + py::class_{m_ana, "ICUTokenizer", ts_base}.def( + py::init(), + "Creates a tokenizer using the UTF text segmentation standard", + // hack around g++ 4.8 ambiguous overloaded operator= + py::arg_t{"suppress_tags", false}); + // py::arg("suppress_tags") = false); + + // filters + py::class_{m_ana, "AlphaFilter", ts_base}.def( + "__init__", &make_token_stream); + + py::class_{m_ana, "EmptySentenceFilter", + ts_base} + .def("__init__", &make_token_stream); + + py::class_{m_ana, "EnglishNormalizer", ts_base} + .def("__init__", &make_token_stream); + + py::class_{m_ana, "ICUFilter", ts_base}.def( + "__init__", + &make_token_stream); + + py::class_{m_ana, "LengthFilter", ts_base}.def( + "__init__", + &make_token_stream, + py::arg("source"), py::arg("min"), py::arg("max")); + + py::class_ list_filter{m_ana, "ListFilter", ts_base}; + py::enum_{list_filter, "Type"} + .value("Accept", filters::list_filter::type::ACCEPT) + .value("Reject", filters::list_filter::type::REJECT); + list_filter.def("__init__", + &make_token_stream); + + py::class_{m_ana, "LowercaseFilter", ts_base} + .def("__init__", &make_token_stream); + + py::class_{m_ana, "Porter2Filter", ts_base}.def( + "__init__", &make_token_stream); + + py::class_{m_ana, "PennTreebankNormalizer", + ts_base} + .def("__init__", &make_token_stream); + + py::class_{m_ana, "SentenceBoundaryAdder", + ts_base} + .def("__init__", &make_token_stream); + + // analyzers + py::class_ analyzer_base{m_ana, + "Analyzer"}; + analyzer_base.def(py::init<>()) + .def("analyze", &analyzer::analyze) + .def("featurize", &analyzer::analyze); + + py::class_{m_ana, "NGramWordAnalyzer", analyzer_base} + .def("__init__", + [](ngram_word_analyzer& ana, uint16_t n, const token_stream& ts) { + new (&ana) ngram_word_analyzer(n, ts.clone()); + }) + .def("analyze", &ngram_analyze) + .def("featurize", 
&ngram_analyze); + + py::class_{m_ana, "NGramPOSAnalyzer", analyzer_base} + .def("__init__", + [](ngram_pos_analyzer& ana, uint16_t n, const token_stream& ts, + const std::string& crf_prefix) { + py::gil_scoped_release rel; + new (&ana) ngram_pos_analyzer(n, ts.clone(), crf_prefix); + }) + .def("analyze", &ngram_analyze) + .def("featurize", &ngram_analyze); + + py::class_ py_tree_feat{m_ana, "TreeFeaturizer"}; + py_tree_feat.def("tree_tokenize", &tree_featurizer::tree_tokenize); + + py::class_{m_ana, "BranchFeaturizer", py_tree_feat}.def( + py::init<>()); + py::class_{m_ana, "DepthFeaturizer", py_tree_feat}.def( + py::init<>()); + py::class_{m_ana, "SemiSkeletonFeaturizer", + py_tree_feat} + .def(py::init<>()); + py::class_{m_ana, "SkeletonFeaturizer", py_tree_feat} + .def(py::init<>()); + py::class_{m_ana, "SubtreeFeaturizer", py_tree_feat} + .def(py::init<>()); + py::class_{m_ana, "TagFeaturizer", py_tree_feat}.def( + py::init<>()); + + py::class_{m_ana, "TreeAnalyzer", analyzer_base} + .def("__init__", + [](tree_analyzer& ana, const token_stream& ts, + const std::string& tagger_prefix, + const std::string& parser_prefix) { + py::gil_scoped_release rel; + new (&ana) + tree_analyzer(ts.clone(), tagger_prefix, parser_prefix); + }) + .def("add", [](tree_analyzer& ana, const tree_featurizer& featurizer) { + ana.add(featurizer.clone()); + }); + + py::class_{m_ana, "MultiAnalyzer", analyzer_base}; + + m_ana.def("load", [](const std::string& filename) { + py::gil_scoped_release rel; + auto config = cpptoml::parse_file(filename); + return analyzers::load(*config); + }); + + m_ana.def("register_filter", [](py::object cls) { + py_factory_register(cls, filter_factory::get(), + [=](std::unique_ptr source, + const cpptoml::table& cfg) { + py::gil_scoped_acquire acq; + + py::dict kwargs; + py_toml_visitor vtor; + cfg.accept(vtor, kwargs); + PyDict_DelItemString(kwargs.ptr(), "type"); + + return make_unique( + cls(source->clone(), **kwargs)); + }); + }); +} diff --git 
a/metapy/src/metapy_classify.cpp b/metapy/src/metapy_classify.cpp new file mode 100644 index 0000000000..0f13b08433 --- /dev/null +++ b/metapy/src/metapy_classify.cpp @@ -0,0 +1,602 @@ +/** + * @file metapy_classify.cpp + * @author Chase Geigle + */ + +#include +#include +#include +#include + +#include "cpptoml.h" +#include "meta/classify/binary_dataset_view.h" +#include "meta/classify/classifier/all.h" +#include "meta/classify/kernel/all.h" +#include "meta/index/ranker/ranker_factory.h" +#include "meta/learn/dataset.h" +#include "meta/learn/loss/loss_function_factory.h" +#include "meta/logging/logger.h" +#include "meta/util/iterator.h" +#include "metapy_classify.h" +#include "metapy_identifiers.h" +#include "metapy_learn.h" + +namespace py = pybind11; +using namespace meta; + +template +class py_binary_classifier : public ClassifierBase +{ + public: + double predict(const learn::feature_vector& instance) const override + { + PYBIND11_OVERLOAD_PURE(double, ClassifierBase, predict, instance); + return 0; + } + + void save(std::ostream& /* os */) const override + { + throw std::runtime_error{ + "cannot serialize python-defined binary classifiers"}; + } +}; + +class py_online_binary_classifier + : public py_binary_classifier +{ + public: + void train(dataset_view_type docs) override + { + PYBIND11_OVERLOAD_PURE(void, classify::online_binary_classifier, train, + docs); + } + + void train_one(const feature_vector& doc, bool label) override + { + PYBIND11_OVERLOAD_PURE(void, classify::online_binary_classifier, + train_one, doc, label); + } +}; + +template +class py_classifier : public ClassifierBase +{ + public: + class_label classify(const learn::feature_vector& instance) const override + { + PYBIND11_OVERLOAD_PURE(class_label, ClassifierBase, classify, instance); + return "[none]"_cl; + } + + void save(std::ostream& /* os */) const override + { + throw std::runtime_error{ + "cannot serialize python-defined multiclass classifiers"}; + } +}; + +class py_online_classifier 
: public py_classifier +{ + public: + void train(dataset_view_type docs) override + { + PYBIND11_OVERLOAD_PURE(void, classify::online_classifier, train, docs); + } + + void train_one(const feature_vector& doc, const class_label& lbl) override + { + PYBIND11_OVERLOAD_PURE(void, classify::online_classifier, train_one, + doc, lbl); + } +}; + +class py_kernel : public classify::kernel::kernel +{ + public: + double operator()(const learn::feature_vector& first, + const learn::feature_vector& second) const override + { + PYBIND11_OVERLOAD_PURE_NAME(double, classify::kernel::kernel, + "__call__", operator(), first, second); + return 0; + } + + void save(std::ostream& /* os */) const override + { + throw std::runtime_error{"cannot serialize python-defined kernels"}; + } +}; + +/** + * This class holds a binary_classifier that was created by invoking + * Python code. + * + * We need to be able to supply a function to the ensemble methods (e.g. + * one_vs_all) that creates a std::unique_ptr from a + * binary_dataset_view. We can't get std::unique_ptrs from Python code + * directly. Instead, we grab a reference to the py::object that Python + * created for us, and make a unique_ptr to this class that contains it and + * just forwards the calls to the classifier by converting that py::object + * to a binary_classifier reference. 
+ */ +class cpp_created_py_binary_classifier + : public classify::online_binary_classifier +{ + public: + cpp_created_py_binary_classifier(py::object cls) : cls_{cls} + { + // nothing + } + + double predict(const learn::feature_vector& instance) const override + { + return cls_.cast().predict(instance); + } + + void save(std::ostream& os) const override + { + cls_.cast().save(os); + } + + void train(classify::binary_dataset_view bdv) override + { + cls_.cast().train(bdv); + } + + void train_one(const learn::feature_vector& instance, bool label) override + { + cls_.cast().train_one(instance, + label); + } + + private: + py::object cls_; +}; + +void metapy_bind_classify(py::module& m) +{ + auto pydset = (py::object)m.attr("learn").attr("Dataset"); + auto pydset_view = (py::object)m.attr("learn").attr("DatasetView"); + auto m_classify = m.def_submodule("classify"); + + // binary datasets/views + py::class_{m_classify, "BinaryDataset", pydset} + .def("__init__", + [](classify::binary_dataset& dset, + const std::shared_ptr& fidx, + std::function labeler) { + py::gil_scoped_release release; + new (&dset) classify::binary_dataset(fidx, labeler); + }) + .def("__init__", + [](classify::binary_dataset& dset, + const std::shared_ptr& fidx, + const std::vector& docs, + std::function labeler) { + py::gil_scoped_release release; + new (&dset) classify::binary_dataset(fidx, docs, labeler); + }) + .def("__init__", + [](classify::binary_dataset& dset, py::list& data, + std::size_t total_features, py::function& featurizer, + py::function& labeler) { + new (&dset) classify::binary_dataset( + data.begin(), data.end(), total_features, + [&](py::handle obj) { + return py::cast( + featurizer(obj)); + }, + [&](py::handle obj) { + return py::cast(labeler(obj)); + }); + }) + .def("label", &classify::binary_dataset::label) + .def("__getitem__", + [](const classify::binary_dataset& dset, int64_t offset) { + std::size_t idx = offset >= 0 + ? 
static_cast(offset) + : dset.size() + offset; + if (idx >= dset.size()) + throw py::index_error(); + return *(dset.begin() + idx); + }) + .def("__getitem__", + [](const classify::binary_dataset& bdset, py::slice slice) { + classify::binary_dataset_view bdv{bdset}; + return make_sliced_dataset_view(bdv, slice); + }, + py::keep_alive<0, 1>()); + + py::class_{m_classify, "BinaryDatasetView", + pydset_view} + .def(py::init(), + py::keep_alive<0, 1>()) + .def("__getitem__", + [](const classify::binary_dataset_view& dv, int64_t offset) { + std::size_t idx = offset >= 0 + ? static_cast(offset) + : dv.size() + offset; + if (idx >= dv.size()) + throw py::index_error(); + return *(dv.begin() + idx); + }) + .def("__getitem__", + [](const classify::binary_dataset_view& bdv, py::slice slice) { + return make_sliced_dataset_view(bdv, slice); + }, + py::keep_alive<0, 1>()); + + py::implicitly_convertible(); + + // multiclass datasets/views + py::class_{m_classify, "MulticlassDataset", + pydset} + .def("__init__", + [](classify::multiclass_dataset& dset, + const std::shared_ptr& fidx) { + py::gil_scoped_release release; + new (&dset) classify::multiclass_dataset(fidx); + }) + .def("__init__", + [](classify::multiclass_dataset& dset, + const std::shared_ptr& fidx, + const std::vector& docs) { + py::gil_scoped_release release; + new (&dset) classify::multiclass_dataset(fidx, docs); + }) + .def("__init__", + [](classify::multiclass_dataset& dset, py::list& data, + std::size_t total_features, py::function& featurizer, + py::function& labeler) { + new (&dset) classify::multiclass_dataset( + data.begin(), data.end(), total_features, + [&](py::handle obj) { + return py::cast( + featurizer(obj)); + }, + [&](py::handle obj) { + return py::cast(labeler(obj)); + }); + }) + .def("label", + [](const classify::multiclass_dataset& dset, + const learn::instance& inst) { return dset.label(inst); }) + .def("total_labels", &classify::multiclass_dataset::total_labels) + .def("label_id_for", 
&classify::multiclass_dataset::label_id_for) + .def("label_for", &classify::multiclass_dataset::label_for) + .def("__getitem__", + [](const classify::multiclass_dataset& dset, int64_t offset) { + std::size_t idx = offset >= 0 + ? static_cast(offset) + : dset.size() + offset; + if (idx >= dset.size()) + throw py::index_error(); + return *(dset.begin() + idx); + }) + .def("__getitem__", + [](const classify::multiclass_dataset& dset, py::slice slice) { + classify::multiclass_dataset_view mdv{dset}; + return make_sliced_dataset_view(mdv, slice); + }, + py::keep_alive<0, 1>()); + + py::class_{ + m_classify, "MulticlassDatasetView", pydset_view} + .def(py::init(), + py::keep_alive<0, 1>()) + .def("__getitem__", + [](const classify::multiclass_dataset_view& dv, int64_t offset) { + std::size_t idx = offset >= 0 + ? static_cast(offset) + : dv.size() + offset; + if (idx >= dv.size()) + throw py::index_error(); + return *(dv.begin() + idx); + }) + .def("__getitem__", + [](const classify::multiclass_dataset_view& mdv, py::slice slice) { + return make_sliced_dataset_view(mdv, slice); + }, + py::keep_alive<0, 1>()) + .def("total_labels", &classify::multiclass_dataset_view::total_labels) + .def("label", &classify::multiclass_dataset_view::label) + .def("labels", + [](const classify::multiclass_dataset_view& self) { + return py::make_iterator(self.labels_begin(), + self.labels_end()); + }, + py::keep_alive<0, 1>()) + .def("create_even_split", + [](const classify::multiclass_dataset_view& mdv) { + return mdv.create_even_split(); + }, + py::keep_alive<0, 1>()); + + py::implicitly_convertible(); + + // confusion matrix + py::class_{m_classify, "ConfusionMatrix"} + .def(py::init<>()) + .def("add", &classify::confusion_matrix::add, py::arg("predicted"), + py::arg("actual"), py::arg("num_times") = 1) + .def("add_fold_accuracy", + &classify::confusion_matrix::add_fold_accuracy) + .def("fold_accuracy", &classify::confusion_matrix::fold_accuracy) + .def("print_stats", + [](const 
classify::confusion_matrix& matrix) { + std::stringstream ss; + matrix.print_stats(ss); + py::print(ss.str()); + }) + .def("__str__", + [](const classify::confusion_matrix& matrix) { + std::stringstream ss; + matrix.print(ss); + return ss.str(); + }) + .def("print", + [](py::object self) { py::print(self.attr("__str__")()); }) + .def("print_result_pairs", + [](const classify::confusion_matrix& matrix) { + std::stringstream ss; + matrix.print_result_pairs(ss); + py::print(ss.str()); + }) + .def("predictions", &classify::confusion_matrix::predictions) + .def("accuracy", &classify::confusion_matrix::accuracy) + .def("f1_score", + [](const classify::confusion_matrix& matrix) { + return matrix.f1_score(); + }) + .def("f1_score", + [](const classify::confusion_matrix& matrix, + const class_label& lbl) { return matrix.f1_score(lbl); }) + .def("precision", + [](const classify::confusion_matrix& matrix) { + return matrix.precision(); + }) + .def("precision", + [](const classify::confusion_matrix& matrix, + const class_label& lbl) { return matrix.precision(lbl); }) + .def("recall", + [](const classify::confusion_matrix& matrix) { + return matrix.recall(); + }) + .def("recall", + [](const classify::confusion_matrix& matrix, + const class_label& lbl) { return matrix.recall(lbl); }) + .def(py::self + py::self) + .def(py::self += py::self) + .def_static("mcnemar_significant", + &classify::confusion_matrix::mcnemar_significant); + + // kernels + auto m_kernel = m_classify.def_submodule("kernel"); + py::class_ pykernel{m_classify, + "Kernel"}; + pykernel.def("__call__", &classify::kernel::kernel::operator()); + + py::class_{m_kernel, "Polynomial", pykernel} + .def(py::init(), + py::arg("power") = classify::kernel::polynomial::default_power, + py::arg("c") = classify::kernel::polynomial::default_c) + .def_property_readonly_static( + "id", + [](py::object /* self */) { + return classify::kernel::polynomial::id.to_string(); + }) + .def_readonly_static("default_power", + 
&classify::kernel::polynomial::default_power) + .def_readonly_static("default_c", + &classify::kernel::polynomial::default_c); + + py::class_{m_kernel, "RadialBasis", + pykernel} + .def(py::init(), py::arg("gamma")) + .def_property_readonly_static("id", [](py::object /* self */) { + return classify::kernel::radial_basis::id.to_string(); + }); + + py::class_{m_kernel, "Sigmoid", pykernel} + .def(py::init(), py::arg("alpha"), py::arg("c")) + .def_property_readonly_static("id", [](py::object /* self */) { + return classify::kernel::sigmoid::id.to_string(); + }); + + // binary classifiers + py::class_> pybincls{ + m_classify, "BinaryClassifier"}; + pybincls.def("classify", &classify::binary_classifier::classify) + .def("predict", &classify::binary_classifier::predict); + + py::class_ + py_online_bincls{m_classify, "OnlineBinaryClassifier", pybincls}; + py_online_bincls.def("train", &classify::online_binary_classifier::train) + .def("train_one", &classify::online_binary_classifier::train_one); + + py::class_{m_classify, "SGD", py_online_bincls} + .def_property_readonly_static( + "id", + [](py::object /* self */) { return classify::sgd::id.to_string(); }) + .def_readonly_static("default_gamma", &classify::sgd::default_gamma) + .def_readonly_static("default_max_iter", + &classify::sgd::default_max_iter) + .def("__init__", + [](classify::sgd& cls, classify::binary_dataset_view training, + const std::string& loss_id, + learn::sgd_model::options_type options, double gamma, + std::size_t max_iter, bool calibrate) { + // release the GIL before training the classifier; this + // allows other threads inside an ensemble method to train + // simultaneiously + py::gil_scoped_release rel; + new (&cls) classify::sgd( + training, learn::loss::make_loss_function(loss_id), + options, gamma, max_iter, calibrate); + }, + py::arg("training"), py::arg("loss_id"), + py::arg("options") = learn::sgd_model::options_type{}, + py::arg("gamma") = classify::sgd::default_gamma, + py::arg("max_iter") 
= classify::sgd::default_max_iter, + py::arg("calibrate") = true); + + // multiclass classifiers + py::class_> pycls{m_classify, + "Classifier"}; + pycls.def("classify", &classify::classifier::classify) + .def("test", &classify::classifier::test); + + py::class_ py_online_cls{ + m_classify, "OnlineClassifier", pycls}; + py_online_cls.def("train", &classify::online_classifier::train) + .def("train_one", &classify::online_classifier::train_one); + + py::class_{m_classify, "DualPerceptron", pycls} + .def("__init__", + [](classify::dual_perceptron& cls, + classify::multiclass_dataset_view training, + const classify::kernel::kernel& kernel, double alpha, + double gamma, double bias, uint64_t max_iter) { + std::stringstream ss; + kernel.save(ss); + + new (&cls) classify::dual_perceptron( + std::move(training), classify::kernel::load_kernel(ss), + alpha, gamma, bias, max_iter); + }, + py::arg("training"), py::arg("kernel"), + py::arg("alpha") = classify::dual_perceptron::default_alpha, + py::arg("gamma") = classify::dual_perceptron::default_gamma, + py::arg("bias") = classify::dual_perceptron::default_bias, + py::arg("max_iter") = classify::dual_perceptron::default_max_iter) + .def_readonly_static("default_alpha", + &classify::dual_perceptron::default_alpha) + .def_readonly_static("default_gamma", + &classify::dual_perceptron::default_gamma) + .def_readonly_static("default_bias", + &classify::dual_perceptron::default_bias) + .def_readonly_static("default_max_iter", + &classify::dual_perceptron::default_max_iter); + + py::class_{m_classify, "KNN", pycls}.def( + "__init__", + [](classify::knn& cls, classify::multiclass_dataset_view training, + std::shared_ptr idx, uint16_t k, + const index::ranker& ranker, bool weighted) { + std::stringstream ss; + ranker.save(ss); + + new (&cls) classify::knn(std::move(training), std::move(idx), k, + index::load_ranker(ss), weighted); + }, + py::arg("training"), py::arg("inv_idx"), py::arg("k"), + py::arg("ranker"), py::arg("weighted") = 
false); + + py::class_{m_classify, "LogisticRegression", + pycls} + .def(py::init(), + py::arg("training"), + py::arg("options") = learn::sgd_model::options_type{}, + py::arg("gamma") = classify::sgd::default_gamma, + py::arg("max_iter") = classify::sgd::default_max_iter) + .def("predict", &classify::logistic_regression::predict); + + py::class_{m_classify, "NaiveBayes", pycls} + .def(py::init(), + py::arg("training"), + py::arg("alpha") = classify::naive_bayes::default_alpha, + py::arg("beta") = classify::naive_bayes::default_beta) + .def_readonly_static("default_alpha", + &classify::naive_bayes::default_alpha) + .def_readonly_static("default_beta", + &classify::naive_bayes::default_beta); + + py::class_{m_classify, "NearestCentroid", pycls} + .def(py::init>(), + py::arg("training"), py::arg("inv_idx")); + + py::class_{m_classify, "OneVsAll", py_online_cls}.def( + "__init__", + [](classify::one_vs_all& ova, classify::multiclass_dataset_view mdv, + py::object cls, py::kwargs kwargs) { + + auto creator = [=](const classify::binary_dataset_view& bdv) { + // must acquire the GIL before calling back into Python + // code to construct the classifier + py::gil_scoped_acquire acq; + return make_unique( + cls(bdv, **kwargs)); + }; + + // release the GIL so that it can be re-acquired in the threads + // that are spawned to create the sub-classifiers + py::gil_scoped_release rel; + new (&ova) classify::one_vs_all(std::move(mdv), std::move(creator)); + }); + + py::class_{m_classify, "OneVsOne", py_online_cls}.def( + "__init__", + [](classify::one_vs_one& ovo, classify::multiclass_dataset_view mdv, + py::object cls, py::kwargs kwargs) { + + auto creator = [=](const classify::binary_dataset_view& bdv) { + // must acquire the GIL before calling back into Python + // code to construct the classifier + py::gil_scoped_acquire acq; + return make_unique( + cls(bdv, **kwargs)); + }; + + // release the GIL so that it can be re-acquired in the threads + // that are spawned to create the 
sub-classifiers + py::gil_scoped_release rel; + new (&ovo) classify::one_vs_one(std::move(mdv), std::move(creator)); + }); + + py::class_{m_classify, "Winnow", pycls} + .def(py::init(), + py::arg("training"), py::arg("m") = classify::winnow::default_m, + py::arg("gamma") = classify::winnow::default_gamma, + py::arg("max_iter") = classify::winnow::default_max_iter) + .def_readonly_static("default_m", &classify::winnow::default_m) + .def_readonly_static("default_gamma", &classify::winnow::default_gamma) + .def_readonly_static("default_max_iter", + &classify::winnow::default_max_iter); + + // utility functions + m_classify.def( + "cross_validate", + [](std::function creator, + classify::multiclass_dataset_view mdv, std::size_t k, + bool even_split) { + struct creator_type + { + py::object cls_; + std::function& + creator_; + + creator_type(std::function& creator) + : creator_(creator) + { + // nothing + } + + classify::classifier* + operator()(const classify::multiclass_dataset_view& mdv) + { + cls_ = creator_(mdv); + return cls_.cast(); + } + } maker(creator); + + return classify::cross_validate(maker, mdv, k, even_split); + }, + py::arg("creator"), py::arg("mdv"), py::arg("k"), + py::arg("even_split") = false); +} diff --git a/metapy/src/metapy_embeddings.cpp b/metapy/src/metapy_embeddings.cpp new file mode 100644 index 0000000000..457542323c --- /dev/null +++ b/metapy/src/metapy_embeddings.cpp @@ -0,0 +1,71 @@ +/** + * @file metapy_parser.cpp + * @author Chase Geigle + * + * This file defines the metapy.parser submodule and creates bindings for + * that part of the MeTA API. 
+ */ + +#include +#include +#include +#include + +#include "cpptoml.h" +#include "meta/embeddings/word_embeddings.h" +#include "metapy_embeddings.h" +#include "metapy_identifiers.h" + +namespace py = pybind11; +using namespace meta; + +void metapy_bind_embeddings(py::module& m) +{ + auto m_emb = m.def_submodule("embeddings"); + + using namespace py::literals; + + py::class_{m_emb, "WordEmbeddings"} + .def("at", + [](embeddings::word_embeddings& self, const std::string& term) { + auto emb = self.at(term); + + return py::make_tuple(emb.tid, + py::array(emb.v.size(), emb.v.begin())); + }) + .def("term", [](embeddings::word_embeddings& self, + std::size_t tid) { return self.term(tid).to_string(); }) + .def("top_k", + [](embeddings::word_embeddings& self, + py::array_t + query, + std::size_t k) { + util::array_view avquery{query.data(), + query.size()}; + auto scores = self.top_k(avquery, k); + + std::vector result; + result.reserve(scores.size()); + + std::transform( + scores.begin(), scores.end(), std::back_inserter(result), + [](const embeddings::scored_embedding& se) { + return py::make_tuple( + se.e.tid, py::array(se.e.v.size(), se.e.v.begin()), + se.score); + }); + return result; + }, + "query"_a, "k"_a = 100) + .def("vector_size", &embeddings::word_embeddings::vector_size); + + m_emb.def("load_embeddings", [](const std::string& filename) { + auto config = cpptoml::parse_file(filename); + auto embed_cfg = config->get_table("embeddings"); + if (!embed_cfg) + throw embeddings::word_embeddings_exception{ + "missing [embeddings] configuration in " + filename}; + + return embeddings::load_embeddings(*embed_cfg); + }); +} diff --git a/metapy/src/metapy_index.cpp b/metapy/src/metapy_index.cpp new file mode 100644 index 0000000000..193e1ec4b5 --- /dev/null +++ b/metapy/src/metapy_index.cpp @@ -0,0 +1,408 @@ +/** + * @file metapy_index.cpp + * @author Chase Geigle + * + * This file defines the metapy.index submodule and creates bindings for + * that part of the MeTA API. 
+ */ + +#include + +#include +#include +#include + +#include "metapy_identifiers.h" +#include "metapy_index.h" + +#include "cpptoml.h" +#include "meta/index/eval/ir_eval.h" +#include "meta/index/forward_index.h" +#include "meta/index/inverted_index.h" +#include "meta/index/make_index.h" +#include "meta/index/ranker/all.h" +#include "meta/index/ranker/ranker_factory.h" +#include "meta/index/score_data.h" + +namespace py = pybind11; + +using namespace meta; + +class py_ranking_function : public index::ranking_function +{ + public: + using index::ranking_function::ranking_function; + + float score_one(const meta::index::score_data& sd) override + { + PYBIND11_OVERLOAD_PURE(float, index::ranking_function, score_one, sd); + return 0.0f; + } + + void save(std::ostream&) const override + { + throw std::runtime_error{"cannot serialize python-defined rankers"}; + } +}; + +class py_lm_ranker : public index::language_model_ranker +{ + public: + using index::language_model_ranker::language_model_ranker; + + float smoothed_prob(const index::score_data& sd) const override + { + PYBIND11_OVERLOAD_PURE(float, index::language_model_ranker, + smoothed_prob, sd); + return 0.0f; + } + + float doc_constant(const index::score_data& sd) const override + { + PYBIND11_OVERLOAD_PURE(float, index::language_model_ranker, + doc_constant, sd); + return 0.0f; + } + + void save(std::ostream&) const override + { + throw std::runtime_error{"cannot serialize python-defined rankers"}; + } +}; + +void metapy_bind_index(py::module& m) +{ + py::module m_idx = m.def_submodule("index"); + + py::class_{m_idx, "Document"} + .def(py::init(), + py::arg("d_id") = doc_id{0}, + py::arg("label") = class_label{"[NONE]"}) + .def("label", [](const corpus::document& doc) { return doc.label(); }, + "Gets the label for the document") + .def("label", [](corpus::document& doc, + const class_label& label) { doc.label(label); }, + "Sets the label for the document") + .def("content", + [](const corpus::document& doc) { 
return doc.content(); }, + "Gets the content of the document") + .def( + "content", + [](corpus::document& doc, const std::string& content, + const std::string& encoding) { doc.content(content, encoding); }, + "Sets the content of the document", py::arg("content"), + py::arg("encoding") = std::string{"utf-8"}) + .def("encoding", + [](const corpus::document& doc) { return doc.encoding(); }, + "Gets the encoding for the document's content") + .def("encoding", + [](corpus::document& doc, const std::string& encoding) { + doc.encoding(encoding); + }, + "Sets the encoding for the document's content") + .def("id", &corpus::document::id) + .def("contains_content", &corpus::document::contains_content); + + py::class_{m_idx, "Metadata"}.def( + "get", + [](corpus::metadata& md, const std::string& name) -> py::object { + using field_type = corpus::metadata::field_type; + + py::object ret; + const auto& schema = md.schema(); + + // find the entry for this field name if it exists + for (uint64_t i = 0; i < schema.size(); ++i) + { + if (schema[i].name == name) + { + switch (schema[i].type) + { + case field_type::SIGNED_INT: + { + auto val = md.get(name); + if (val) + return py::cast(*val); + break; + } + + case field_type::UNSIGNED_INT: + { + auto val = md.get(name); + if (val) + return py::cast(*val); + break; + } + + case field_type::DOUBLE: + { + auto val = md.get(name); + if (val) + return py::cast(*val); + break; + } + + case field_type::STRING: + { + auto val = md.get(name); + if (val) + return py::cast(*val); + break; + } + } + + return py::cast(nullptr); + } + } + + return py::cast(nullptr); + }, + "Returns the metadata value for a given field name"); + + py::class_>{ + m_idx, "DiskIndex"} + .def("index_name", &index::disk_index::index_name) + .def("num_docs", &index::disk_index::num_docs) + .def("docs", &index::disk_index::docs) + .def("doc_size", &index::disk_index::doc_size) + .def("label", &index::disk_index::label) + .def("lbl_id", &index::disk_index::lbl_id) + 
.def("class_label_from_id", &index::disk_index::class_label_from_id) + .def("num_labels", &index::disk_index::num_labels) + .def("class_labels", &index::disk_index::class_labels) + .def("metadata", [](index::disk_index& idx, + doc_id d_id) { return idx.metadata(d_id); }, + "Extract the metadata for a document", py::keep_alive<0, 1>()) + .def("unique_terms", + [](const index::disk_index& idx) { return idx.unique_terms(); }) + .def("unique_terms", [](const index::disk_index& idx, + doc_id did) { return idx.unique_terms(did); }) + .def("get_term_id", &index::disk_index::get_term_id) + .def("term_text", &index::disk_index::term_text); + + py::class_>{m_idx, "InvertedIndex"} + .def("tokenize", &index::inverted_index::tokenize) + .def("doc_freq", &index::inverted_index::doc_freq) + .def("term_freq", &index::inverted_index::term_freq) + .def("total_corpus_terms", &index::inverted_index::total_corpus_terms) + .def("total_num_occurences", + &index::inverted_index::total_num_occurences) + .def("avg_doc_length", &index::inverted_index::avg_doc_length); + + m_idx.def("make_inverted_index", + [](const std::string& filename) { + py::gil_scoped_release rel; + auto config = cpptoml::parse_file(filename); + return index::make_index(*config); + }, + "Builds or loads an inverted index from disk"); + + py::class_>{m_idx, "ForwardIndex"} + .def("liblinear_data", &index::forward_index::liblinear_data) + .def("tokenize", &index::forward_index::tokenize); + + m_idx.def("make_forward_index", [](const std::string& filename) { + py::gil_scoped_release rel; + auto config = cpptoml::parse_file(filename); + return index::make_index(*config); + }); + + py::class_ rank_base{m_idx, "Ranker"}; + rank_base + .def("score", + [](index::ranker& ranker, index::inverted_index& idx, + const corpus::document& query, uint64_t num_results, + const index::ranker::filter_function_type& filter) { + return ranker.score(idx, query, num_results, filter); + }, + "Scores the documents in the inverted index with 
respect to the " + "query using this ranker", + py::arg("idx"), py::arg("query"), py::arg("num_results") = 10, + py::arg("filter") + = std::function([](doc_id) { return true; })) + .def("score", + [](index::ranker& ranker, index::inverted_index& idx, + std::unordered_map& query, + uint64_t num_results, + const index::ranker::filter_function_type& filter) { + return ranker.score(idx, query.begin(), query.end(), + num_results, filter); + + }, + py::arg("idx"), py::arg("query"), py::arg("num_results") = 10, + py::arg("filter") + = std::function([](doc_id) { return true; })) + .def("score", + [](index::ranker& ranker, index::inverted_index& idx, + std::vector>& query, + uint64_t num_results, + const index::ranker::filter_function_type& filter) { + return ranker.score(idx, query.begin(), query.end(), + num_results, filter); + }, + py::arg("idx"), py::arg("query"), py::arg("num_results") = 10, + py::arg("filter") + = std::function([](doc_id) { return true; })); + + py::class_{m_idx, "ScoreData"} + .def(py::init()) + .def_property_readonly( + "idx", + [](index::score_data& sd) -> index::inverted_index& { + return sd.idx; + }) + .def_readwrite("avg_dl", &index::score_data::avg_dl) + .def_readwrite("num_docs", &index::score_data::num_docs) + .def_readwrite("total_terms", &index::score_data::total_terms) + .def_readwrite("query_length", &index::score_data::query_length) + .def_readwrite("t_id", &index::score_data::t_id) + .def_readwrite("query_term_weight", + &index::score_data::query_term_weight) + .def_readwrite("doc_count", &index::score_data::doc_count) + .def_readwrite("corpus_term_count", + &index::score_data::corpus_term_count) + .def_readwrite("d_id", &index::score_data::d_id) + .def_readwrite("doc_term_count", &index::score_data::doc_term_count) + .def_readwrite("doc_size", &index::score_data::doc_size) + .def_readwrite("doc_unique_terms", + &index::score_data::doc_unique_terms); + + py::class_ rf_base{ + m_idx, "RankingFunction", rank_base}; + + 
rf_base.def(py::init<>()) + .def("score_one", &index::ranking_function::score_one); + + py::class_ lm_rank_base{ + m_idx, "LanguageModelRanker", rf_base}; + lm_rank_base.def(py::init<>()); + + py::class_{m_idx, "AbsoluteDiscount", + lm_rank_base} + .def(py::init(), + py::arg("delta") = index::absolute_discount::default_delta); + + py::class_{m_idx, "DirichletPrior", lm_rank_base} + .def(py::init(), + py::arg("mu") = index::dirichlet_prior::default_mu); + + py::class_{m_idx, "JelinekMercer", lm_rank_base}.def( + py::init(), + py::arg("lambda") = index::jelinek_mercer::default_lambda); + + py::class_{m_idx, "PivotedLength", rf_base}.def( + py::init(), py::arg("s") = index::pivoted_length::default_s); + + py::class_{m_idx, "OkapiBM25", rf_base}.def( + py::init(), + py::arg("k1") = index::okapi_bm25::default_k1, + py::arg("b") = index::okapi_bm25::default_b, + py::arg("k3") = index::okapi_bm25::default_k3); + + py::class_{m_idx, "KLDivergencePRF", rank_base} + .def(py::init>()) + .def("__init__", + [](index::kl_divergence_prf& kl_div, + std::shared_ptr fwd, + index::language_model_ranker& lm_ranker, float alpha, + float lambda, uint64_t k, uint64_t max_terms) { + // + // TODO: make this less of an absolute hack; will need API + // changes in MeTA + // + // Ideally make ranker subclass util::clonable + // + std::stringstream ss; + lm_ranker.save(ss); + auto lm_ranker_clone = index::load_lm_ranker(ss); + + new (&kl_div) + index::kl_divergence_prf(fwd, std::move(lm_ranker_clone), + alpha, lambda, k, max_terms); + }, + py::arg("fwd"), py::arg("lm_ranker"), + py::arg("alpha") = index::kl_divergence_prf::default_alpha, + py::arg("lambda") = index::kl_divergence_prf::default_lambda, + py::arg("k") = index::kl_divergence_prf::default_k, + py::arg("max_terms") + = index::kl_divergence_prf::default_max_terms); + + py::class_{m_idx, "Rocchio", rank_base} + .def(py::init>()) + .def("__init__", + [](index::rocchio& rocchio, + std::shared_ptr fwd, + index::ranker& initial_ranker, 
float alpha, float beta, + uint64_t k, uint64_t max_terms) { + // + // TODO: make this less of an absolute hack; will need API + // changes in MeTA + // + // Ideally make ranker subclass util::clonable + // + std::stringstream ss; + initial_ranker.save(ss); + auto ranker_clone = index::load_ranker(ss); + + new (&rocchio) index::rocchio(fwd, std::move(ranker_clone), + alpha, beta, k, max_terms); + }, + py::arg("fwd"), py::arg("initial_ranker"), + py::arg("alpha") = index::rocchio::default_alpha, + py::arg("beta") = index::rocchio::default_beta, + py::arg("k") = index::rocchio::default_k, + py::arg("max_terms") = index::rocchio::default_max_terms); + + py::class_{m_idx, "IREval"} + .def("__init__", + [](index::ir_eval& ev, const std::string& cfg_path) { + new (&ev) index::ir_eval(*cpptoml::parse_file(cfg_path)); + }) + .def("precision", + [](const index::ir_eval& ev, + const index::ir_eval::result_type& results, query_id q_id, + uint64_t num_docs) { + return ev.precision(results, q_id, num_docs); + }, + "Return precision = (#relevant_retrieved / #retrieved)", + py::arg("results"), py::arg("q_id"), + py::arg("num_docs") = std::numeric_limits::max()) + .def("recall", + [](const index::ir_eval& ev, + const index::ir_eval::result_type& results, query_id q_id, + uint64_t num_docs) { + return ev.recall(results, q_id, num_docs); + }, + "Return recall = (#relevant_retrieved / #relevant)", + py::arg("results"), py::arg("q_id"), + py::arg("num_docs") = std::numeric_limits::max()) + .def("f1", + [](const index::ir_eval& ev, + const index::ir_eval::result_type& results, query_id q_id, + uint64_t num_docs, + double beta) { return ev.f1(results, q_id, num_docs, beta); }, + "Return F1 score, a balance between precision and recall", + py::arg("results"), py::arg("q_id"), + py::arg("num_docs") = std::numeric_limits::max(), + py::arg("beta") = 1.0) + .def("ndcg", + [](const index::ir_eval& ev, + const index::ir_eval::result_type& results, query_id q_id, + uint64_t num_docs) { return 
ev.ndcg(results, q_id, num_docs); }, + "Return normalized discounted cumulative gain score", + py::arg("results"), py::arg("q_id"), + py::arg("num_docs") = std::numeric_limits::max()) + .def("avg_p", + [](index::ir_eval& ev, const index::ir_eval::result_type& results, + query_id q_id, uint64_t num_docs) { + return ev.avg_p(results, q_id, num_docs); + }, + "Return average precision", py::arg("results"), py::arg("q_id"), + py::arg("num_docs") = std::numeric_limits::max()) + .def("map", &index::ir_eval::map) + .def("gmap", &index::ir_eval::gmap) + .def("reset_stats", &index::ir_eval::reset_stats); +} diff --git a/metapy/src/metapy_learn.cpp b/metapy/src/metapy_learn.cpp new file mode 100644 index 0000000000..86f1e72386 --- /dev/null +++ b/metapy/src/metapy_learn.cpp @@ -0,0 +1,246 @@ +/** + * @file metapy_learn.cpp + * @author Chase Geigle + */ + +#include +#include +#include + +#include "cpptoml.h" +#include "meta/learn/dataset.h" +#include "meta/learn/dataset_view.h" +#include "meta/learn/loss/all.h" +#include "meta/learn/sgd.h" +#include "meta/learn/transform.h" +#include "meta/util/iterator.h" +#include "metapy_identifiers.h" +#include "metapy_learn.h" + +namespace py = pybind11; +using namespace meta; + +template +void bind_loss_function(py::module& m, const char* name, Base& base) +{ + py::class_{m, name, base} + .def(py::init<>()) + .def_property_readonly_static("id", [](py::object /* self */) { + return LossFunction::id.to_string(); + }); +} + +struct py_loss_function : public learn::loss::loss_function +{ + double loss(double prediction, double expected) const override + { + PYBIND11_OVERLOAD_PURE(double, learn::loss::loss_function, loss, + prediction, expected); + return 0; + } + + double derivative(double prediction, double expected) const override + { + PYBIND11_OVERLOAD_PURE(double, learn::loss::loss_function, derivative, + prediction, expected); + return 0; + } + + virtual void save(std::ostream& /* os */) const override + { + throw std::runtime_error{ + 
"cannot serialize python-defined loss functions"}; + } +}; + +void metapy_bind_learn(py::module& m) +{ + auto m_learn = m.def_submodule("learn"); + + py::class_{m_learn, "FeatureVector"} + .def(py::init<>()) + .def(py::init()) + .def(py::init()) + .def("__init__", + [](learn::feature_vector& fv, py::iterable& iter) { + using pair_type = learn::feature_vector::pair_type; + auto cast_fn + = [](py::handle h) { return h.cast(); }; + new (&fv) learn::feature_vector( + util::make_transform_iterator(iter.begin(), cast_fn), + util::make_transform_iterator(iter.end(), cast_fn)); + }) + .def("__len__", &learn::feature_vector::size) + .def("__iter__", + [](learn::feature_vector& fv) { + return py::make_iterator(fv.begin(), fv.end()); + }, + py::keep_alive<0, 1>()) + .def("__getitem__", [](const learn::feature_vector& fv, + learn::feature_id fid) { return fv.at(fid); }) + .def("__setitem__", [](learn::feature_vector& fv, learn::feature_id fid, + double val) { fv[fid] = val; }) + .def("clear", &learn::feature_vector::clear) + .def("shrink_to_fit", &learn::feature_vector::shrink_to_fit) + .def("condense", &learn::feature_vector::condense) + .def("dot", + [](const learn::feature_vector& self, + const learn::feature_vector& other) { + return util::dot_product(self, other); + }) + .def("cosine", + [](const learn::feature_vector& self, + const learn::feature_vector& other) { + return util::cosine_sim(self, other); + }) + .def("l2norm", + [](const learn::feature_vector& self) { + return util::l2norm(self); + }) + .def("__str__", [](const learn::feature_vector& fv) { + std::stringstream ss; + util::string_view padding = ""; + ss << '['; + for (const auto& pr : fv) + { + ss << padding << '(' << pr.first << ", " << pr.second << ')'; + padding = ", "; + } + ss << ']'; + return ss.str(); + }); + + m_learn.def("dot", &util::dot_product); + m_learn.def("cosine", &util::cosine_sim); + m_learn.def("l2norm", [](const learn::feature_vector& vec) { + return util::l2norm(vec); + }); + + 
py::class_{m_learn, "Instance"} + .def(py::init()) + .def(py::init()) + .def_readonly("id", &learn::instance::id) + .def_readwrite("weights", &learn::instance::weights); + + py::class_ pydset{m_learn, "Dataset"}; + pydset + .def("__init__", + [](learn::dataset& dset, + const std::shared_ptr& fidx) { + py::gil_scoped_release release; + new (&dset) learn::dataset(fidx); + }) + .def("__init__", + [](learn::dataset& dset, + const std::shared_ptr& fidx, + const std::vector& docs) { + py::gil_scoped_release release; + new (&dset) learn::dataset(fidx, docs); + }) + .def("__init__", + [](learn::dataset& dset, py::list& data, + std::size_t total_features, py::function& featurizer) { + new (&dset) + learn::dataset(data.begin(), data.end(), total_features, + [&](py::handle obj) { + return py::cast( + featurizer(obj)); + }); + }) + .def("__getitem__", + [](learn::dataset& dset, int64_t offset) -> learn::instance& { + std::size_t idx = offset >= 0 + ? static_cast(offset) + : dset.size() + offset; + if (idx >= dset.size()) + throw py::index_error(); + return *(dset.begin() + idx); + }, + py::return_value_policy::reference_internal) + .def("__getitem__", + [](learn::dataset& dset, py::slice slice) { + learn::dataset_view dv{dset}; + return make_sliced_dataset_view(dv, slice); + }, + py::keep_alive<0, 1>()) + .def("__len__", &learn::dataset::size) + .def("__iter__", + [](const learn::dataset& dset) { + return py::make_iterator(dset.begin(), dset.end()); + }, + py::keep_alive<0, 1>()) + .def("total_features", &learn::dataset::total_features); + + py::class_{m_learn, "DatasetView"} + .def(py::init(), py::keep_alive<0, 1>()) + .def("shuffle", &learn::dataset_view::shuffle) + .def("rotate", &learn::dataset_view::rotate) + .def("total_features", &learn::dataset_view::total_features) + .def("__getitem__", + [](const learn::dataset_view& dv, int64_t offset) { + std::size_t idx = offset >= 0 + ? 
static_cast(offset) + : dv.size() + offset; + if (idx >= dv.size()) + throw py::index_error(); + return *(dv.begin() + idx); + }) + .def("__getitem__", + [](const learn::dataset_view& dv, py::slice slice) { + return make_sliced_dataset_view(dv, slice); + }, + py::keep_alive<0, 1>()) + .def("__len__", &learn::dataset_view::size) + .def("__iter__", + [](const learn::dataset_view& dv) { + return py::make_iterator(dv.begin(), dv.end()); + }, + py::keep_alive<0, 1>()); + + py::implicitly_convertible(); + + m_learn.def("tfidf_transform", &learn::tfidf_transform); + m_learn.def("l2norm_transform", &learn::l2norm_transform); + + auto m_loss = m_learn.def_submodule("loss"); + + py::class_ pyloss{ + m_loss, "LossFunction"}; + pyloss.def("loss", &learn::loss::loss_function::loss) + .def("derivative", &learn::loss::loss_function::derivative); + + bind_loss_function(m_loss, "Hinge", pyloss); + bind_loss_function(m_loss, "Huber", pyloss); + bind_loss_function(m_loss, "LeastSquares", + pyloss); + bind_loss_function(m_loss, "Logistic", pyloss); + bind_loss_function(m_loss, "ModifiedHuber", + pyloss); + bind_loss_function(m_loss, "Perceptron", pyloss); + bind_loss_function(m_loss, "SmoothHinge", + pyloss); + bind_loss_function(m_loss, "SquaredHinge", + pyloss); + + py::class_ py_sgdmodel{m_learn, "SGDModel"}; + py::class_{py_sgdmodel, "Options"} + .def(py::init<>()) + .def_readwrite("learning_rate", + &learn::sgd_model::options_type::learning_rate) + .def_readwrite("l2_regularizer", + &learn::sgd_model::options_type::l2_regularizer) + .def_readwrite("l1_regularizer", + &learn::sgd_model::options_type::l1_regularizer); + py_sgdmodel + .def_readonly_static("default_learning_rate", + &learn::sgd_model::default_learning_rate) + .def_readonly_static("default_l2_regularizer", + &learn::sgd_model::default_l2_regularizer) + .def_readonly_static("default_l1_regularizer", + &learn::sgd_model::default_l1_regularizer) + .def(py::init()) + .def("predict", &learn::sgd_model::predict) + 
.def("train_one", &learn::sgd_model::train_one); +} diff --git a/metapy/src/metapy_parser.cpp b/metapy/src/metapy_parser.cpp new file mode 100644 index 0000000000..3dbdcea02c --- /dev/null +++ b/metapy/src/metapy_parser.cpp @@ -0,0 +1,330 @@ +/** + * @file metapy_parser.cpp + * @author Chase Geigle + * + * This file defines the metapy.parser submodule and creates bindings for + * that part of the MeTA API. + */ + +#include + +#include +#include +#include + +#include + +#include "meta/parser/trees/evalb.h" +#include "meta/parser/trees/internal_node.h" +#include "meta/parser/trees/leaf_node.h" +#include "meta/parser/trees/visitors/annotation_remover.h" +#include "meta/parser/trees/visitors/binarizer.h" +#include "meta/parser/trees/visitors/debinarizer.h" +#include "meta/parser/trees/visitors/empty_remover.h" +#include "meta/parser/trees/visitors/head_finder.h" +#include "meta/parser/trees/visitors/leaf_node_finder.h" +#include "meta/parser/trees/visitors/unary_chain_remover.h" + +#include "meta/parser/sequence_extractor.h" +#include "meta/parser/sr_parser.h" + +#include "meta/parser/io/ptb_reader.h" + +#include "metapy_identifiers.h" +#include "metapy_parser.h" + +namespace py = pybind11; +using namespace meta; + +template +class visitor_wrapper : public parser::visitor +{ + public: + virtual py::object operator()(parser::leaf_node& ln) override + { + return py::cast(vtor_(ln)); + } + + virtual py::object operator()(parser::internal_node& n) override + { + return py::cast(vtor_(n)); + } + + Visitor& visitor() + { + return vtor_; + } + + private: + Visitor vtor_; +}; + +template +class visitor_wrapper> + : public parser::visitor +{ + public: + virtual py::object operator()(parser::leaf_node& ln) override + { + return py::cast(vtor_(ln).release()); + } + + virtual py::object operator()(parser::internal_node& n) override + { + return py::cast(vtor_(n).release()); + } + + Visitor& visitor() + { + return vtor_; + } + + private: + Visitor vtor_; +}; + +template +class 
visitor_wrapper : public parser::visitor +{ + public: + virtual py::object operator()(parser::leaf_node& ln) override + { + vtor_(ln); + return py::cast(nullptr); + } + + virtual py::object operator()(parser::internal_node& n) override + { + vtor_(n); + return py::cast(nullptr); + } + + Visitor& visitor() + { + return vtor_; + } + + private: + Visitor vtor_; +}; + +class py_visitor : public parser::visitor +{ + public: + virtual py::object operator()(parser::leaf_node& ln) override + { + PYBIND11_OVERLOAD_PURE(py::object, parser::visitor, + visit_leaf, ln); + return py::cast(nullptr); + } + + virtual py::object operator()(parser::internal_node& n) override + { + + PYBIND11_OVERLOAD_PURE(py::object, parser::visitor, + visit_internal, n); + return py::cast(nullptr); + } +}; + +void metapy_bind_parser(py::module& m) +{ + using namespace parser; + + auto m_parse = m.def_submodule("parser"); + + py::class_{m_parse, "Node"} + .def("category", &node::category) + .def("is_leaf", &node::is_leaf) + .def("is_temporary", &node::is_temporary) + .def("equal", &node::equal) + .def("accept", [](node& n, parser::visitor& vtor) { + return n.accept(vtor); + }); + + py::class_{m_parse, "LeafNode"} + .def(py::init()) + .def(py::init()) + .def("word", [](const leaf_node& ln) { return *ln.word(); }); + + py::class_{m_parse, "InternalNode"} + .def("__init__", + [](internal_node& n, class_label cat, py::list pylist) { + std::vector> children(pylist.size()); + for (std::size_t i = 0; i < pylist.size(); ++i) + children[i] = pylist[i].cast().clone(); + + new (&n) internal_node(std::move(cat), std::move(children)); + }) + .def(py::init()) + .def("add_child", [](internal_node& n, + const node& child) { n.add_child(child.clone()); }) + .def("num_children", &internal_node::num_children) + .def("child", &internal_node::child, py::keep_alive<0, 1>()) + .def("head_lexicon", [](internal_node& n) { return n.head_lexicon(); }, + py::keep_alive<0, 1>()) + .def("head_lexicon", + [](internal_node& n, 
const leaf_node* descendent) { + n.head_lexicon(descendent); + }) + .def("head_constituent", + [](internal_node& n) { return n.head_constituent(); }, + py::keep_alive<0, 1>()) + .def("head_constituent", + [](internal_node& n, const node* descendent) { + n.head_constituent(descendent); + }) + // weirdness: need to ensure that the child nodes passed down to + // the python lambda preserve the lifetime of the current internal + // node in case they are stored internally + .def("each_child", + [](internal_node& n, py::function fn) { + n.each_child([&](node* child) { + auto handle = py::cast( + *child, py::return_value_policy::reference_internal, + py::cast(n)); + fn(handle); + }); + }) + .def("__getitem__", + [](internal_node& n, int64_t offset) { + uint64_t idx = offset >= 0 ? static_cast(offset) + : n.num_children() + offset; + if (idx >= n.num_children()) + throw py::index_error(); + return n.child(idx); + }, + py::keep_alive<0, 1>()) + .def("__len__", &internal_node::num_children); + + py::class_{m_parse, "ParseTree"} + .def("__init__", + [](parse_tree& tree, const node& n) { + new (&tree) parse_tree(n.clone()); + }) + .def(py::init()) + .def("__str__", + [](const parse_tree& tree) { + std::stringstream ss; + ss << tree; + return ss.str(); + }) + .def("pretty_str", + [](const parse_tree& tree) { + std::stringstream ss; + tree.pretty_print(ss); + return ss.str(); + }) + // keep_alive here is to ensure that the visitor keeps the tree + // alive as long as it is still referenced, since it might hold an + // internal pointer into the tree + .def("visit", + [](parse_tree& tree, parser::visitor& vtor) { + return tree.visit(vtor); + }, + py::keep_alive<2, 1>()); + + py::implicitly_convertible(); + + py::class_, py_visitor> vtorbase{m_parse, "Visitor"}; + vtorbase.def(py::init<>()) + .def("visit_leaf", + [](visitor& vtor, leaf_node& ln) { return vtor(ln); }) + .def("visit_internal", [](visitor& vtor, + internal_node& in) { return vtor(in); }); + + py::class_>{ + m_parse, 
"AnnotationRemover", vtorbase} + .def(py::init<>()); + py::class_>{m_parse, "Binarizer", vtorbase}.def( + py::init<>()); + py::class_>{m_parse, "Debinarizer", vtorbase} + .def(py::init<>()); + py::class_>{m_parse, "EmptyRemover", + vtorbase} + .def(py::init<>()); + py::class_>{ + m_parse, "UnaryChainRemover", vtorbase} + .def(py::init<>()); + + py::class_>{m_parse, "HeadFinder", vtorbase} + .def(py::init<>()); + py::class_>{m_parse, "LeafNodeFinder", + vtorbase} + .def(py::init<>()) + .def("leaves", [](visitor_wrapper& lnf) { + // need to manually create the py::list here since the + // pybind11 caster for vector operates on a const vector, + // not a mutable one + auto leaves = lnf.visitor().leaves(); + + py::list ret(leaves.size()); + for (std::size_t i = 0; i < leaves.size(); ++i) + { + ret[i] = py::reinterpret_steal( + py::detail::type_caster>::cast( + std::move(leaves[i]), + py::return_value_policy::automatic_reference, + py::handle())); + } + return ret; + }); + + py::class_>{ + m_parse, "SequenceExtractor", vtorbase} + .def(py::init<>()) + .def("sequence", [](visitor_wrapper& vtor) { + return vtor.visitor().sequence(); + }); + + py::class_{m_parse, "EvalB"} + .def(py::init<>()) + .def("matched", &evalb::matched) + .def("proposed_total", &evalb::proposed_total) + .def("gold_total", &evalb::gold_total) + .def("labeled_precision", &evalb::labeled_precision) + .def("labeled_recall", &evalb::labeled_recall) + .def("labeled_f1", &evalb::labeled_f1) + .def("perfect", &evalb::perfect) + .def("average_crossing", &evalb::average_crossing) + .def("zero_crossing", &evalb::zero_crossing) + .def("add_tree", &evalb::add_tree); + + m_parse.def("extract_trees_from_file", [](const std::string& filename) { + return parser::io::extract_trees(filename); + }); + + m_parse.def("extract_trees", [](const std::string& input) { + std::stringstream ss{input}; + return parser::io::extract_trees(ss); + }); + + m_parse.def("read_tree", [](const std::string& input) { + std::stringstream 
ss{input}; + return parser::io::extract_trees(ss).at(0); + }); + + py::class_ parser{m_parse, "Parser"}; + + py::enum_{parser, "TrainingAlgorithm"} + .value("EarlyTermination", + sr_parser::training_algorithm::EARLY_TERMINATION) + .value("BeamSearch", sr_parser::training_algorithm::BEAM_SEARCH); + + py::class_{parser, "TrainingOptions"} + .def(py::init<>()) + .def(py::init()) + .def_readwrite("batch_size", &sr_parser::training_options::batch_size) + .def_readwrite("beam_size", &sr_parser::training_options::beam_size) + .def_readwrite("max_iterations", + &sr_parser::training_options::max_iterations) + .def_readwrite("seed", &sr_parser::training_options::seed) + .def_readwrite("num_threads", &sr_parser::training_options::num_threads) + .def_readwrite("algorithm", &sr_parser::training_options::algorithm); + + parser.def(py::init<>()) + .def(py::init()) + .def("parse", &sr_parser::parse) + .def("train", &sr_parser::train) + .def("save", &sr_parser::save); +} diff --git a/metapy/src/metapy_sequence.cpp b/metapy/src/metapy_sequence.cpp new file mode 100644 index 0000000000..467c438ec7 --- /dev/null +++ b/metapy/src/metapy_sequence.cpp @@ -0,0 +1,134 @@ +/** + * @file metapy_sequence.cpp + * @author Chase Geigle + * + * This file defines the metapy.sequence submodule and creates bindings + * for that part of the MeTA API. 
+ */ + +#include + +#include +#include + +#include "cpptoml.h" + +#include "meta/sequence/io/ptb_parser.h" +#include "meta/sequence/perceptron.h" +#include "meta/sequence/sequence.h" + +#include "metapy_identifiers.h" +#include "metapy_sequence.h" + +namespace py = pybind11; +using namespace meta; + +void metapy_bind_sequence(py::module& m) +{ + auto m_seq = m.def_submodule("sequence"); + + py::class_{m_seq, "Observation"} + .def(py::init()) + .def(py::init()) + .def_property( + "symbol", + [](const sequence::observation& obs) { return obs.symbol(); }, + [](sequence::observation& obs, sequence::symbol_t sym) { + obs.symbol(std::move(sym)); + }) + .def_property( + "tag", [](const sequence::observation& obs) { return obs.tag(); }, + [](sequence::observation& obs, sequence::tag_t tag) { + obs.tag(std::move(tag)); + }) + .def_property( + "label", + [](const sequence::observation& obs) { return obs.label(); }, + [](sequence::observation& obs, label_id lbl) { obs.label(lbl); }) + .def_property( + "features", + [](const sequence::observation& obs) { return obs.features(); }, + [](sequence::observation& obs, + sequence::observation::feature_vector feats) { + obs.features(std::move(feats)); + }) + .def("tagged", &sequence::observation::tagged); + + py::class_{m_seq, "Sequence"} + .def(py::init<>()) + .def("add_observation", &sequence::sequence::add_observation) + .def("add_symbol", &sequence::sequence::add_symbol) + .def("__getitem__", + [](sequence::sequence& seq, int64_t offset) { + std::size_t idx = offset >= 0 + ? static_cast(offset) + : seq.size() + offset; + if (idx >= seq.size()) + throw py::index_error(); + return seq[idx]; + }) + .def("__setitem__", + [](sequence::sequence& seq, int64_t offset, + sequence::observation obs) { + std::size_t idx = offset >= 0 + ? 
static_cast(offset) + : seq.size() + offset; + if (idx >= seq.size()) + throw py::index_error(); + seq[idx] = std::move(obs); + }) + .def("__len__", &sequence::sequence::size) + .def("__iter__", + [](const sequence::sequence& seq) { + return py::make_iterator(seq.begin(), seq.end()); + }, + py::keep_alive<0, 1>()) + .def("__str__", + [](const sequence::sequence& seq) { + std::string res; + for (auto it = seq.begin(); it != seq.end();) + { + res += "(" + static_cast(it->symbol()) + ", " + + (it->tagged() + ? static_cast(it->tag()) + : "???") + + ")"; + if (++it != seq.end()) + res += ", "; + } + return res; + }) + .def("tagged", [](const sequence::sequence& seq) { + std::vector> res(seq.size()); + std::transform(seq.begin(), seq.end(), res.begin(), + [](const sequence::observation& obs) { + return std::make_pair( + obs.symbol(), + obs.tagged() + ? static_cast(obs.tag()) + : "???"); + }); + return res; + }); + + m_seq.def("extract_sequences", &sequence::extract_sequences); + + using sequence::perceptron; + py::class_ perc_tagger{m_seq, "PerceptronTagger"}; + + py::class_{perc_tagger, "TrainingOptions"} + .def(py::init<>()) + .def_readwrite("max_iterations", + &perceptron::training_options::max_iterations) + .def_readwrite("seed", &perceptron::training_options::seed); + + perc_tagger.def(py::init<>()) + .def("__init__", + [](perceptron& model, const std::string& path) { + py::gil_scoped_release rel; + new (&model) perceptron(path); + }) + .def("tag", &perceptron::tag) + .def("train", &perceptron::train) + .def("save", &perceptron::save); +} diff --git a/metapy/src/metapy_stats.cpp b/metapy/src/metapy_stats.cpp new file mode 100644 index 0000000000..8a3cdea6ef --- /dev/null +++ b/metapy/src/metapy_stats.cpp @@ -0,0 +1,50 @@ +/** + * @file metapy_stats.cpp + * @author Chase Geigle + * + * This file defines the metapy.stats submodule and creates bindings + * for that part of the MeTA API. 
+ */ + +#include + +#include +#include + +#include "meta/stats/multinomial.h" + +#include "metapy_identifiers.h" +#include "metapy_stats.h" + +namespace py = pybind11; +using namespace meta; + +void metapy_bind_stats(py::module& m) +{ + auto m_stats = m.def_submodule("stats"); + + py::class_{m_stats, "Multinomial"} + .def("increment", &py_multinomial::increment) + .def("decrement", &py_multinomial::decrement) + .def("counts", [](const py_multinomial& dist, + py::object obj) { return dist.counts(obj); }) + .def("counts", [](const py_multinomial& dist) { return dist.counts(); }) + .def("unique_events", &py_multinomial::unique_events) + .def("each_seen_event", &py_multinomial::each_seen_event) + .def("clear", &py_multinomial::clear) + .def("probability", &py_multinomial::probability) + .def("__repr__", [](const py_multinomial& mult) { + const auto size = mult.unique_events(); + uint64_t i = 0; + std::string result = "(); + result += ": "; + result += std::to_string(mult.probability(obj)); + if (++i != size) + result += ", "; + }); + result += "}>"; + return result; + }); +} diff --git a/metapy/src/metapy_topics.cpp b/metapy/src/metapy_topics.cpp new file mode 100644 index 0000000000..dd8a95ef76 --- /dev/null +++ b/metapy/src/metapy_topics.cpp @@ -0,0 +1,230 @@ +/** + * @file metapy_topics.cpp + * @author Sean Massung + */ + +#include +#include + +#include "cpptoml.h" +#include "meta/learn/dataset.h" +#include "meta/logging/logger.h" +#include "meta/topics/bl_term_scorer.h" +#include "meta/topics/inferencer.h" +#include "meta/topics/lda_cvb.h" +#include "meta/topics/lda_cvb_inferencer.h" +#include "meta/topics/lda_gibbs.h" +#include "meta/topics/lda_gibbs_inferencer.h" +#include "meta/topics/lda_scvb.h" +#include "meta/topics/parallel_lda_gibbs.h" +#include "meta/util/random.h" +#include "metapy_identifiers.h" +#include "metapy_stats.h" +#include "metapy_topics.h" + +namespace py = pybind11; +using namespace meta; + +void metapy_bind_topics(py::module& m) +{ + auto 
m_topics = m.def_submodule("topics"); + + py::class_{m_topics, "LDAModel"} + .def("run", + [](topics::lda_model& model, uint64_t num_iters, + double convergence) { + py::gil_scoped_release release; + model.run(num_iters, convergence); + }) + .def("save_doc_topic_distributions", + [](const topics::lda_model& model, const std::string& filename) { + std::ofstream output{filename, std::ios::binary}; + model.save_doc_topic_distributions(output); + }) + .def("save_topic_term_distributions", + [](const topics::lda_model& model, const std::string& filename) { + std::ofstream output{filename, std::ios::binary}; + model.save_topic_term_distributions(output); + }) + .def("save", &topics::lda_model::save) + .def("compute_term_topic_probability", + &topics::lda_model::compute_term_topic_probability) + .def("compute_doc_topic_probability", + &topics::lda_model::compute_doc_topic_probability) + .def("topic_distribution", + [](const topics::lda_model& model, doc_id doc) { + return py_multinomial{model.topic_distribution(doc)}; + }) + .def("term_distribution", + [](const topics::lda_model& model, topic_id k) { + return py_multinomial{model.term_distribution(k)}; + }) + .def("num_topics", &topics::lda_model::num_topics); + + py::class_{m_topics, "LDAInferencer"} + .def("term_distribution", + [](const topics::inferencer& inf, topic_id k) { + return py_multinomial{inf.term_distribution(k)}; + }, + py::arg("k")) + .def("num_topics", &topics::inferencer::num_topics); + + py::class_{m_topics, "LDACollapsedVB"} + .def(py::init(), + py::keep_alive<0, 1>(), py::arg("docs"), py::arg("num_topics"), + py::arg("alpha"), py::arg("beta")) + .def("run", + [](topics::lda_cvb& lda, uint64_t num_iters, double convergence) { + py::gil_scoped_release release; + lda.run(num_iters, convergence); + }, + py::arg("num_iters"), py::arg("convergence") = 1e-3); + + py::class_{m_topics, + "CVBInferencer"} + .def("__init__", + [](topics::inferencer& inf, const std::string& cfgfile) { + py::gil_scoped_release 
release; + auto config = cpptoml::parse_file(cfgfile); + new (&inf) topics::inferencer(*config); + }, + py::arg("cfg_file")) + .def("__init__", + [](topics::inferencer& inf, const std::string& topicsfile, + double alpha) { + py::gil_scoped_release release; + std::ifstream topics_stream{topicsfile, std::ios::binary}; + new (&inf) topics::inferencer(topics_stream, alpha); + }, + py::arg("topics_file"), py::arg("alpha")) + .def("infer", + [](const topics::lda_cvb::inferencer& inf, + const learn::feature_vector& doc, std::size_t max_iters, + double convergence) { + return py_multinomial{inf(doc, max_iters, convergence)}; + }, + py::arg("doc"), py::arg("max_iters"), py::arg("convergence")); + + py::class_{m_topics, "LDAGibbs"} + .def(py::init(), + py::keep_alive<0, 1>(), py::arg("docs"), py::arg("num_topics"), + py::arg("alpha"), py::arg("beta")) + .def( + "run", + [](topics::lda_gibbs& lda, uint64_t num_iters, double convergence) { + py::gil_scoped_release release; + lda.run(num_iters, convergence); + }, + py::arg("num_iters"), py::arg("convergence") = 1e-6); + + py::class_{ + m_topics, "GibbsInferencer"} + .def("__init__", + [](topics::inferencer& inf, const std::string& cfgfile) { + py::gil_scoped_release release; + auto config = cpptoml::parse_file(cfgfile); + new (&inf) topics::inferencer(*config); + }, + py::arg("cfg_file")) + .def("__init__", + [](topics::inferencer& inf, const std::string& topicsfile, + double alpha) { + py::gil_scoped_release release; + std::ifstream topics_stream{topicsfile, std::ios::binary}; + new (&inf) topics::inferencer(topics_stream, alpha); + }, + py::arg("topics_file"), py::arg("alpha")) + + .def("infer", + [](const topics::lda_gibbs::inferencer& inf, + const learn::feature_vector& doc, std::size_t num_iters, + std::size_t seed) { + random::xoroshiro128 rng{seed}; + return py_multinomial{inf(doc, num_iters, rng)}; + }, + py::arg("doc"), py::arg("max_iters"), py::arg("rng_seed")); + + py::class_{ + m_topics, "LDAParallelGibbs"} + 
.def(py::init(), + py::keep_alive<0, 1>(), py::arg("docs"), py::arg("num_topics"), + py::arg("alpha"), py::arg("beta")); + + py::class_{m_topics, + "LDAStochasticCVB"} + .def(py::init(), + py::keep_alive<0, 1>(), py::arg("docs"), py::arg("num_topics"), + py::arg("alpha"), py::arg("beta"), py::arg("minibatch_size") = 100) + .def("run", + [](topics::lda_scvb& lda, uint64_t num_iters, double convergence) { + py::gil_scoped_release release; + lda.run(num_iters, convergence); + }, + py::arg("num_iters"), py::arg("convergence") = 0); + + py::class_{m_topics, "TopicModel"} + .def("__init__", + [](topics::topic_model& model, const std::string& prefix) { + py::gil_scoped_release release; + + std::ifstream theta{prefix + ".theta.bin", std::ios::binary}; + + if (!theta) + { + throw topics::topic_model_exception{ + "missing document topic probabilities file: " + prefix + + ".theta.bin"}; + } + + std::ifstream phi{prefix + ".phi.bin", std::ios::binary}; + if (!phi) + { + throw topics::topic_model_exception{ + "missing topic term probabilities file: " + prefix + + ".phi.bin"}; + } + + new (&model) topics::topic_model(theta, phi); + }) + .def("top_k", + [](const topics::topic_model& model, topic_id tid, std::size_t k) { + return model.top_k(tid, k); + }, + py::arg("tid"), py::arg("k") = 10) + .def("top_k", + [](const topics::topic_model& model, topic_id tid, std::size_t k, + std::function scorer) { + return model.top_k(tid, k, scorer); + }, + py::arg("tid"), py::arg("k") = 10, py::arg("scorer")) + .def("top_k", + [](const topics::topic_model& model, topic_id tid, std::size_t k, + const topics::bl_term_scorer& scorer) { + return model.top_k(tid, k, scorer); + }, + py::arg("tid"), py::arg("k") = 10, py::arg("scorer")) + .def("topic_distribution", + [](const topics::topic_model& self, doc_id did) { + return py_multinomial{self.topic_distribution(did)}; + }) + .def("term_distribution", + [](const topics::topic_model& self, topic_id k) { + return 
py_multinomial{self.term_distribution(k)}; + }) + .def("term_probability", &topics::topic_model::term_probability) + .def("topic_probability", &topics::topic_model::topic_probability) + .def("num_topics", &topics::topic_model::num_topics) + .def("num_words", &topics::topic_model::num_words) + .def("num_docs", &topics::topic_model::num_docs); + + m_topics.def("load_topic_model", [](const std::string& config_path) { + py::gil_scoped_release release; + auto config = cpptoml::parse_file(config_path); + return topics::load_topic_model(*config); + }); + + py::class_{m_topics, "BLTermScorer"} + .def(py::init(), py::keep_alive<0, 1>()) + .def("__call__", &topics::bl_term_scorer::operator()); +} diff --git a/metapy/src/nltk_additions.py b/metapy/src/nltk_additions.py new file mode 100644 index 0000000000..73a1c8524d --- /dev/null +++ b/metapy/src/nltk_additions.py @@ -0,0 +1,246 @@ +import nltk +import math +import numpy + +nltk.download('punkt') +nltk.download('wordnet') +nltk.download('omw-1.4') +nltk.download('averaged_perceptron_tagger') +nltk.download('vader_lexicon') +nltk.download('stopwords') + + +# NLTK integrated Inl2 retrieval functionality (MP Integration Function #1) +def get_stopwords(lang='english'): + return set(nltk.corpus.stopwords.words(lang)) + + +def inl2_retrieval(documents, query): + # Tokenize documents and query + def tokenize(text): + return nltk.word_tokenize(text.lower()) + + t_doc = [tokenize(doc) for doc in documents] + t_query = tokenize(query) + + # Compute TF-IDF scores for each document + def score(query_terms, document_terms, corpus): + tf_scores = [document_terms.count(term) for term in query_terms] + idf_scores = [math.log(len(corpus) / (1 + corpus.count(term))) for term in query_terms] + tfidf_scores = [tf * idf for tf, idf in zip(tf_scores, idf_scores)] + return sum(tfidf_scores) + + corpus = [token for doc in t_doc for token in doc] + scores = [(doc, score(t_query, doc, corpus)) for doc in t_doc] + + # Sort documents by score (highest 
score first) + scores.sort(key=lambda x: x[1], reverse=True) + + return scores + + +# NLTK integrated Part of speech tagging functionality (MP Integration Function #2) +def pos_tagging(text): + # Tokenize the input text into words + words = nltk.word_tokenize(text) + + # Perform POS tagging + pos_tags = nltk.pos_tag(words) + + return pos_tags + + +# NDCG scoring functionality (MP Integration Function #3) +def ndcg(ranker, queries, relevant_docs, k=10): + ndcg_scores = [] + + def dcg(relevance_scores): + # Calculate the Discounted Cumulative Gain (DCG) + dcg_score = 0.0 + for i, rel in enumerate(relevance_scores): + dcg_score += (2**rel - 1) / math.log2(i + 2) + return dcg_score + + for query in queries: + # Use the ranker to rank documents for the current query + ranked_docs = ranker(query)[:k] # Consider only the top k documents + + # Calculate the relevance scores for the ranked documents + relevance_scores = [1 if doc in relevant_docs.get(query, []) else 0 for doc in ranked_docs] + + # Calculate DCG at k + dcg_at_k = dcg(relevance_scores) + + # Sort the relevance scores in descending order for ideal DCG calculation + ideal_relevance_scores = sorted(relevance_scores, reverse=True) + + # Calculate ideal DCG at k + ideal_dcg_at_k = dcg(ideal_relevance_scores) + + # Calculate NDCG at k + if ideal_dcg_at_k == 0: + ndcg_score = 0.0 + else: + ndcg_score = dcg_at_k / ideal_dcg_at_k + + ndcg_scores.append(ndcg_score) + + # Calculate the average NDCG over all queries + avg_ndcg = numpy.mean(ndcg_scores) + + return avg_ndcg + +# NLTK integrated Naive Bayes Classifier (MP Integration Function #4) +def naive_bayes_classifier(training_data, new_text): + # Define a feature extractor function (simple bag-of-words) + def extract_features(text): + words = set(text) + features = {} + for word in word_features: + features['contains({})'.format(word)] = (word in words) + return features + + # Get the most frequent words as features + all_words = nltk.FreqDist(w.lower() for w in 
nltk.word_tokenize(' '.join([review for review, _ in training_data]))) + word_features = list(all_words.keys())[:2000] # Use the top 2000 words as features + + # Extract features for each review + feature_sets = [(extract_features(review.split()), label) for (review, label) in training_data] + + # Train a Naive Bayes classifier + classifier = nltk.classify.NaiveBayesClassifier.train(feature_sets) + + # Classify the new text + new_features = extract_features(new_text.split()) + classification = classifier.classify(new_features) + + return classification + + +# NLTK integrated stemmer and lemmatizer (MP Integration Function #5) +def stem_lemmatize(text): + # Tokenize the text into words + words = nltk.word_tokenize(text) + + # Initialize the stemmer and lemmatizer + stemmer = nltk.stem.PorterStemmer() + lemmatizer = nltk.stem.WordNetLemmatizer() + + # Apply stemming and lemmatization to each word + stemmed_words = [stemmer.stem(word) for word in words] + lemmatized_words = [lemmatizer.lemmatize(word) for word in words] + + return stemmed_words, lemmatized_words + + +def get_text_sentiment(text, negative_thres=-0.05, positive_thres=0.05): + # Create an instance of SentimentIntensityAnalyzer + from nltk.sentiment import SentimentIntensityAnalyzer + sid = SentimentIntensityAnalyzer() + + # Get the sentiment scores + sentiment_score = sid.polarity_scores(text) + + # Determine sentiment based on the compound score + if sentiment_score['compound'] >= positive_thres: + sentiment = 'Positive' + elif sentiment_score['compound'] <= negative_thres: + sentiment = 'Negative' + else: + sentiment = 'Neutral' + + return sentiment, sentiment_score + + +def max_entropy(train_data, test_data, algorithm='GIS', trace=0, max_iter=1000): + # Feature extraction function + def document_features(document): + return {word: (word in document) for word in document_words} + + # Get all unique words in the dataset + document_words = set(word.lower() for doc, _ in train_data for word in doc) + + # 
Train the MaxEnt classifier + classifier = nltk.classify.MaxentClassifier.train(train_data, algorithm=algorithm, trace=trace, max_iter=max_iter) + + # Evaluate the classifier + probs = [] + for featureset in test_data: + pdist = classifier.prob_classify(featureset) + prob = {} + for word in ['x', 'y']: + prob[word] = round(pdist.prob(word), 2) + probs.append(prob) + + # Get prediction for test data + many = classifier.classify_many(test_data) + + return classifier, probs, many + + +def analyze_collocation(text, gram): + # Tools to utilize depending on the gram number + finders = { + 2: nltk.collocations.BigramCollocationFinder, + 3: nltk.collocations.TrigramCollocationFinder, + 4: nltk.collocations.QuadgramCollocationFinder + } + measures = { + 2: nltk.collocations.BigramAssocMeasures, + 3: nltk.collocations.TrigramAssocMeasures, + 4: nltk.collocations.QuadgramAssocMeasures + } + + # Tokenize the text + words = nltk.wordpunct_tokenize(text) + + # Create a gramCollocationFinder + gram_finder = finders[gram].from_words(words) + + # Filter out collocations based on frequency and other measures + gram_measures = measures[gram]() + + return {'gram_finder': gram_finder, 'gram_measures': gram_measures} + + +def get_collocation_ngram_score(text, gram=2, n=2, method='raw_freq'): + analysis = analyze_collocation(text, gram) + return analysis['gram_finder'].score_ngrams(getattr(analysis['gram_measures'], method)) + + +def get_collocation_n_best(text, gram=2, n=2, method='raw_freq'): + analysis = analyze_collocation(text, gram) + return analysis['gram_finder'].nbest(getattr(analysis['gram_measures'], method), n) + + +def analyze_corpus(corpus): + # Tokenize and remove stopwords + tokenized_corpus = [nltk.word_tokenize(doc.lower()) for doc in corpus] + stop_words = get_stopwords() + filtered_corpus = [[word.lower() for word in doc if word.isalnum() and word not in stop_words] for doc in tokenized_corpus] + + # Perform part-of-speech tagging + pos_tagged_corpus = 
[nltk.pos_tag(doc) for doc in filtered_corpus] + + return pos_tagged_corpus + + +def generate_tree(text, grammar, text_is_formatted=False): + def build_trees_from_text(): + # Tokenize the input string + words = nltk.word_tokenize(text) + + # Perform syntactic parsing + parser = nltk.ChartParser(grammar) + return list(parser.parse(words)) + + if text_is_formatted: + trees = list(nltk.tree.Tree.fromstring(text)) + else: + trees = build_trees_from_text() + + for tree in trees: + # Visualize the tree + tree.pretty_print() + + return trees diff --git a/metapy/src/nltk_test.py b/metapy/src/nltk_test.py new file mode 100644 index 0000000000..9fbfdb7e6b --- /dev/null +++ b/metapy/src/nltk_test.py @@ -0,0 +1,301 @@ +import unittest +import nltk +import math +import numpy + +nltk.download('punkt') +nltk.download('wordnet') +nltk.download('omw-1.4') +nltk.download('averaged_perceptron_tagger') + +# Import the functions to be tested +from nltk_additions import * + + +class TestYourFunctions(unittest.TestCase): + + # Test NLTK integrated inl2 retrieval function + def test_inl2_retrieval(self): + documents = [ + "This is the first document.", + "This document is the second document.", + "And this is the third one." + ] + query = "first document" + result = inl2_retrieval(documents, query) + + eq = [(['this', 'is', 'the', 'first', 'document', '.'], 3.9120230054281464), + (['this', 'document', 'is', 'the', 'second', 'document', '.'], 3.2188758248682006), + (['and', 'this', 'is', 'the', 'third', 'one', '.'], 0.0)] + self.assertEqual(result, eq) + + documents = [ + "This is the second test document.", + "Could this be another sample document?", + "Let's run some more tests on this document?" 
+ ] + query = "a second document that will be sampled for tests" + result = inl2_retrieval(documents, query) + + eq = [(['this', 'is', 'the', 'second', 'test', 'document', '.'], 4.276666119016055), + (['could', 'this', 'be', 'another', 'sample', 'document', '?'], 4.276666119016055), + (['let', "'s", 'run', 'some', 'more', 'tests', 'on', 'this', 'document', '?'], 4.276666119016055)] + self.assertEqual(result, eq) + + documents = [ + "This is the third test we will perform.", + "I wonder what range of values we will get.", + "Let's throw in our names: [Nikil, David]." + ] + query = "a third document that Nikil uses as a range of values" + result = inl2_retrieval(documents, query) + + eq = [(['i', 'wonder', 'what', 'range', 'of', 'values', 'we', 'will', 'get', '.'], 8.317766166719343), + (['this', 'is', 'the', 'third', 'test', 'we', 'will', 'perform', '.'], 2.772588722239781), ( + ['let', "'s", 'throw', 'in', 'our', 'names', ':', '[', 'nikil', ',', 'david', ']', '.'], + 2.772588722239781)] + self.assertEqual(result, eq) + + # Test NLTK integrated part of speech tagging function + def test_pos_tagging(self): + text = "NLTK is a powerful library for natural language processing." + tags = pos_tagging(text) + + eq = [('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), + ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.')] + self.assertEqual(tags, eq) + + text = "Another test to ensure this function works." + tags = pos_tagging(text) + + eq = [('Another', 'DT'), ('test', 'NN'), ('to', 'TO'), ('ensure', 'VB'), ('this', 'DT'), ('function', 'NN'), + ('works', 'VBZ'), ('.', '.')] + self.assertEqual(tags, eq) + + text = "A final test to verify correctness." 
+ tags = pos_tagging(text) + + eq = [('A', 'DT'), ('final', 'JJ'), ('test', 'NN'), ('to', 'TO'), ('verify', 'VB'), ('correctness', 'NN'), + ('.', '.')] + self.assertEqual(tags, eq) + + # Test with a perfect ranking where all relevant documents are at the top + def test_ndcg_perfect_ranking(self): + ranker = lambda query: ["doc1", "doc2", "doc3", "doc4"] + queries = ["query1"] + relevant_docs = { + "query1": ["doc1", "doc2", "doc3"] + } + k = 3 + result = ndcg(ranker, queries, relevant_docs, k) + self.assertAlmostEqual(result, 1.0, places=2, msg="Perfect ranking should result in NDCG=1.0") + + # Test with a partial ranking where only some relevant documents are at the top + def test_ndcg_partial_ranking(self): + ranker = lambda query: ["doc1", "doc4", "doc2", "doc3"] + queries = ["query1"] + relevant_docs = { + "query1": ["doc1", "doc2", "doc3"] + } + k = 3 + result = ndcg(ranker, queries, relevant_docs, k) + self.assertAlmostEqual(result, 0.919, places=2, msg="Partial ranking should result in NDCG ~0.794") + + # Test with no relevant documents in the ranking + def test_ndcg_no_relevant_docs(self): + ranker = lambda query: ["doc4", "doc5", "doc6"] + queries = ["query1"] + relevant_docs = { + "query1": ["doc1", "doc2", "doc3"] + } + k = 3 + result = ndcg(ranker, queries, relevant_docs, k) + self.assertAlmostEqual(result, 0.0, places=2, msg="No relevant docs in ranking should result in NDCG=0.0") + + # Test the classifier with positive training data + def test_nbc_positive(self): + training_data = [ + ("This is a positive review", "positive"), + ("Great product, highly recommended", "positive"), + ("I love this!", "positive") + ] + new_text = "This product is amazing" + result = naive_bayes_classifier(training_data, new_text) + self.assertEqual(result, "positive", "Classification should be positive") + + # Test the classifier with negative training data + def test_nbc_negative(self): + training_data = [ + ("Terrible experience, do not buy", "negative"), + ("Waste of 
money", "negative"), + ("I regret buying this", "negative") + ] + new_text = "I'm so disappointed with this product" + result = naive_bayes_classifier(training_data, new_text) + self.assertEqual(result, "negative", "Classification should be negative") + + # Test the classifier with neutral training data + def test_nbc_neutral(self): + training_data = [ + ("It's okay, not great but not terrible", "neutral"), + ("Average product, nothing special", "neutral"), + ("I have mixed feelings about this", "neutral") + ] + new_text = "It's neither good nor bad" + result = naive_bayes_classifier(training_data, new_text) + self.assertEqual(result, "neutral", "Classification should be neutral") + + # Test the function's stemming capability + def test_stemming(self): + text = "running jumps" + stemmed_words, _ = stem_lemmatize(text) + self.assertEqual(stemmed_words, ["run", "jump"], "Stemming should produce ['run', 'jump']") + + # Test the function's lemmatization capability + def test_lemmatization(self): + text = "better best" + _, lemmatized_words = stem_lemmatize(text) + self.assertEqual(lemmatized_words, ["better", "best"], "Lemmatization should produce ['better', 'best']") + + # Test both stemming and lemmatization together + def test_stem_lemmatize_combined(self): + text = "running better" + stemmed_words, lemmatized_words = stem_lemmatize(text) + self.assertEqual(stemmed_words, ["run", "better"], "Stemming should produce ['run', 'better']") + self.assertEqual(lemmatized_words, ["running", "better"], "Lemmatization should produce ['running', 'better']") + + # Test a declarative statement for neutral sentimentality + def test_neutral_sentiment(self, negative_thres=-0.05, positive_thres=0.05): + text = 'This is a test.' 
+ sentiment, score = get_text_sentiment(text, negative_thres, positive_thres) + self.assertGreaterEqual(score['compound'], negative_thres) + self.assertLessEqual(score['compound'], positive_thres) + + # Test a negative statement for negative sentimentality + def test_negative_sentiment(self, negative_thres=-0.05, positive_thres=0.05): + text = 'This failing test will fail.' + sentiment, score = get_text_sentiment(text, negative_thres, positive_thres) + self.assertLessEqual(score['compound'], negative_thres) + + # Test a positive statement for positive sentimentality + def test_positive_sentiment(self, negative_thres=-0.05, positive_thres=0.05): + text = 'This successful test will pass.' + sentiment, score = get_text_sentiment(text, negative_thres, positive_thres) + self.assertGreaterEqual(score['compound'], positive_thres) + + def test_max_entropy(self): + train = [ + (dict(a=1, b=1, c=1), 'y'), + (dict(a=1, b=1, c=1), 'x'), + (dict(a=1, b=1, c=0), 'y'), + (dict(a=0, b=1, c=1), 'x'), + (dict(a=0, b=1, c=1), 'y'), + (dict(a=0, b=0, c=1), 'y'), + (dict(a=0, b=1, c=0), 'x'), + (dict(a=0, b=0, c=0), 'x'), + (dict(a=0, b=1, c=1), 'y'), + (dict(a=None, b=1, c=0), 'x'), + ] + test = [ + (dict(a=1, b=0, c=1)), # unseen + (dict(a=1, b=0, c=0)), # unseen + (dict(a=0, b=1, c=1)), # seen 3 times, labels=y,y,x + (dict(a=0, b=1, c=0)), # seen 1 time, label=x + ] + + gis = max_entropy(train_data=train, test_data=test, algorithm='GIS') + self.assertEqual([ + {'x': 0.16, "y": 0.84}, + {'x': 0.46, "y": 0.54}, + {'x': 0.41, "y": 0.59}, + {'x': 0.76, "y": 0.24} + ], gis[1]) + self.assertEqual(['y', 'y', 'y', 'x'], gis[2]) + + iis = max_entropy(train_data=train, test_data=test, algorithm='IIS') + self.assertEqual([ + {'x': 0.16, "y": 0.84}, + {'x': 0.46, "y": 0.54}, + {'x': 0.41, "y": 0.59}, + {'x': 0.76, "y": 0.24} + ], iis[1]) + self.assertEqual(['y', 'y', 'y', 'x'], gis[2]) + + def test_analyze_collocation_bigram(self): + # Input text for testing + input_text = "I do not like green 
eggs and ham, I do not like them Sam I am!" + + # Expected collocations based on the input text + expected_collocations = [(',', 'I'), ('I', 'am'), ('I', 'do'), ('Sam', 'I'), ('am', '!'), + ('and', 'ham'), ('do', 'not'), ('eggs', 'and'), ('green', 'eggs'), + ('ham', ','), ('like', 'green'), ('like', 'them'), ('not', 'like'), + ('them', 'Sam')] + + # Perform collocation analysis + result_collocations = get_collocation_ngram_score(input_text, gram=2) + + # Assert that the result matches the expected collocations + score_ngrams = sorted(gram for gram, score in result_collocations) + self.assertEqual(score_ngrams, expected_collocations) + + def test_analyze_collocation_trigram(self): + # Input text for testing + input_text = "I do not like green eggs and ham, I do not like them Sam I am!" + + # Expected collocations based on the input text + expected_collocations = [('I', 'do', 'not'), ('do', 'not', 'like')] + + # Perform collocation analysis + result_collocations = get_collocation_n_best(input_text, gram=3, n=2) + + # Assert that the result matches the expected collocations + n_best = sorted(result_collocations) + self.assertEqual(n_best, expected_collocations) + + def test_analyze_corpus(self): + # Input corpus for testing + corpus = [ + "This is a sample document. It contains various words.", + "Another document with different words and punctuation!" 
+ ] + + # Expected result (dummy result for demonstration purposes) + expected_result = [ + [('sample', 'JJ'), ('document', 'NN'), ('contains', 'VBZ'), ('various', 'JJ'), + ('words', 'NNS')], + [('another', 'DT'), ('document', 'NN'), ('different', 'JJ'), ('words', 'NNS'), ('punctuation', 'NN')] + ] + + # Perform the analysis + result = analyze_corpus(corpus) + + # Assert that the result matches the expected result + self.assertEqual(result, expected_result) + + def test_generate_tree(self): + # Input and (expected) output + text = 'I rode an elephant in my pajamas' + output = ['''(S (NP I) (VP (VP (V rode) (NP (Det an) (N elephant))) (PP (P in) (NP (Det my) (N pajamas)))))''', + '''(S (NP I) (VP (V rode) (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))'''] + + # Perform syntactic parsing + grammar = nltk.CFG.fromstring(""" + S -> NP VP + PP -> P NP + NP -> Det N | Det N PP | 'I' + VP -> V NP | VP PP + Det -> 'an' | 'my' + N -> 'elephant' | 'pajamas' + V -> 'rode' + P -> 'in' + """) + + # For each tree generated due to differentiating sentence structure, confirm it is in the output above + trees = generate_tree(text, grammar) + for tree in trees: + tree = ' '.join(str(tree).split()) + self.assertTrue(tree in output) + + +if __name__ == '__main__': + unittest.main() diff --git a/metapy/travis/build_linux.sh b/metapy/travis/build_linux.sh new file mode 100755 index 0000000000..6bee45165c --- /dev/null +++ b/metapy/travis/build_linux.sh @@ -0,0 +1,6 @@ +#!/bin/bash +sudo docker run --rm \ + -e PYTHON_VERSION=$VERSION \ + -e UNICODE_WIDTH=$UNICODE_WIDTH \ + -v `pwd`:/metapy \ + quay.io/pypa/manylinux1_x86_64 /metapy/travis/build_linux_wheel.sh diff --git a/metapy/travis/build_linux_wheel.sh b/metapy/travis/build_linux_wheel.sh new file mode 100755 index 0000000000..0c87b96a13 --- /dev/null +++ b/metapy/travis/build_linux_wheel.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# based heavily on https://github.com/matthew-brett/manylinux-builds +set -eo pipefail + +# 
UNICODE_WIDTH selects "32"=wide (UCS4) or "16"=narrow (UTF-16) builds +UNICODE_WIDTH="${UNICODE_WIDTH:-32}" + +# Install cmake +wget --no-check-certificate http://www.cmake.org/files/v3.2/cmake-3.2.0-Linux-x86_64.sh +sh cmake-3.2.0-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir + +# Install zlib +yum install -y zlib-devel + +# taken from https://github.com/matthew-brett/manylinux-builds/blob/master/common_vars.sh +function lex_ver { + # Echoes dot-separated version string padded with zeros + # Thus: + # 3.2.1 -> 003002001 + # 3 -> 003000000 + echo $1 | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}' +} + +# taken from https://github.com/matthew-brett/manylinux-builds/blob/master/common_vars.sh +function strip_dots { + # Strip "." characters from string + echo $1 | sed "s/\.//g" +} + +# taken from https://github.com/matthew-brett/manylinux-builds/blob/master/common_vars.sh +function cpython_path { + # Return path to cpython given + # * version (of form "2.7") + # * u_width ("16" or "32" default "32") + # + # For back-compatibility "u" as u_width also means "32" + local py_ver="${1:-2.7}" + local u_width="${2:-${UNICODE_WIDTH}}" + local u_suff=u + # Back-compatibility + if [ "$u_width" == "u" ]; then u_width=32; fi + # For Python >= 3.3, "u" suffix not meaningful + if [ $(lex_ver $py_ver) -ge $(lex_ver 3.3) ] || + [ "$u_width" == "16" ]; then + u_suff="" + elif [ "$u_width" != "32" ]; then + echo "Incorrect u_width value $u_width" + # exit 1 + fi + local no_dots=$(strip_dots $py_ver) + echo "/opt/python/cp${no_dots}-cp${no_dots}m${u_suff}" +} + +# taken from https://github.com/matthew-brett/manylinux-builds/blob/master/common_vars.sh +function repair_wheelhouse { + local in_dir=$1 + local out_dir=$2 + for whl in $in_dir/*.whl; do + if [[ $whl == *none-any.whl ]]; then + cp $whl $out_dir + else + auditwheel repair $whl -w $out_dir/ + fi + done + chmod -R a+rwX $out_dir +} + +# adapted from 
https://github.com/matthew-brett/manylinux-builds/blob/master/build_sklearns.sh +PIP="$(cpython_path $PYTHON_VERSION)/bin/pip" +pushd /metapy +$PIP wheel -w unfixed_wheels --verbose ./ +ls unfixed_wheels/*.whl +repair_wheelhouse unfixed_wheels dist +ls dist/*.whl +$PIP install dist/*.whl +popd diff --git a/metapy/travis/build_osx.sh b/metapy/travis/build_osx.sh new file mode 100755 index 0000000000..407428180f --- /dev/null +++ b/metapy/travis/build_osx.sh @@ -0,0 +1,4 @@ +#!/bin/bash +pip wheel -w dist --verbose ./ +ls dist/*.whl +pip install dist/*.whl diff --git a/metapy/travis/install_linux.sh b/metapy/travis/install_linux.sh new file mode 100755 index 0000000000..03ef4607a3 --- /dev/null +++ b/metapy/travis/install_linux.sh @@ -0,0 +1,2 @@ +#!/bin/bash +sudo docker pull quay.io/pypa/manylinux1_x86_64 diff --git a/metapy/travis/install_osx.sh b/metapy/travis/install_osx.sh new file mode 100755 index 0000000000..c32d55a675 --- /dev/null +++ b/metapy/travis/install_osx.sh @@ -0,0 +1,8 @@ +#!/bin/bash +brew update +brew outdated cmake || brew upgrade cmake +git clone --recursive https://github.com/MacPython/terryfy +git clone https://github.com/matthew-brett/multibuild +source multibuild/osx_utils.sh +get_macpython_environment $VERSION venv +pip install wheel diff --git a/metapy/tutorials/1-analyzers-tokenizers-filters.ipynb b/metapy/tutorials/1-analyzers-tokenizers-filters.ipynb new file mode 100644 index 0000000000..c19d461ec3 --- /dev/null +++ b/metapy/tutorials/1-analyzers-tokenizers-filters.ipynb @@ -0,0 +1,311 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This is the Analyzers, Tokenizers, and Filters metapy tutorial. First, you should read the following two MeTA tutorials:\n", + "- [MeTA System Overview](https://meta-toolkit.org/overview-tutorial.html). 
Everything on this page is relevant to metapy except for the *Unit tests* section (you can't run them in Python).\n", + "- [Analyzers, Tokenizers, and Filters](https://meta-toolkit.org/analyzers-filters-tutorial.html). Everything on this page is relevant except for the *Extending MeTA With Your Own Filters* section.\n", + "\n", + "Let's get started!\n", + "\n", + "First, let's create a document to play with." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import metapy\n", + "doc = metapy.index.Document()\n", + "doc.content(\"I said that I can't believe that it only costs $19.95!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can make our own filter chain and run it on the document's content. Let's start with a simple example of only using `ICUTokenizer`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['',\n", + " 'I',\n", + " 'said',\n", + " 'that',\n", + " 'I',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " '']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.ICUTokenizer()\n", + "tok.set_content(doc.content())\n", + "[t for t in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "See how the begin and end sentence markers (`<s>` and `</s>`) are inserted at the beginning and end of each sentence. We get an ordered list from using a tokenizer or filter.\n", + "\n", + "Next, use `LowercaseFilter` to convert each token to lowercase.
We use the previous `tok` (which is an `ICUTokenizer`) in the constructor to `LowercaseFilter`. This lets us connect an arbitrary amount of filters together with a tokenizer at the start." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['',\n", + " 'i',\n", + " 'said',\n", + " 'that',\n", + " 'i',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " '']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.ICUTokenizer()\n", + "tok = metapy.analyzers.LowercaseFilter(tok)\n", + "tok.set_content(doc.content())\n", + "[t for t in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Just like in MeTA, metapy's filter chain can be created from a config file. Create the following file called `config.toml`. It will perform the same tokenization and filtering as above (`ICUTokenizer -> LowercaseFilter`). Then, it will aggregate token counts together using an *n*-gram words analyzer.\n", + "\n", + "```toml\n", + "[[analyzers]]\n", + "method = \"ngram-word\"\n", + "ngram = 1\n", + "filter = [{type = \"icu-tokenizer\"}, {type = \"lowercase\"}]\n", + "```\n", + "\n", + "Now, you can load this config file to create a unigram words analyzer. This uses the specified tokenizer/filter chain and analyzer type to convert a document into a dictionary of features and their counts." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'!': 1,\n", + " '$': 1,\n", + " '19.95': 1,\n", + " '': 1,\n", + " '': 1,\n", + " 'believe': 1,\n", + " \"can't\": 1,\n", + " 'costs': 1,\n", + " 'i': 2,\n", + " 'it': 1,\n", + " 'only': 1,\n", + " 'said': 1,\n", + " 'that': 2}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana = metapy.analyzers.load('config.toml')\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The tokens *i* and *that* are shown with two counts, while all the other tokens have 1 count. These features can then be passed to other parts of metapy, such as ranking functions or indexers.\n", + "\n", + "We can also manually specify the analyzer instead of loading it from the config file:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'!': 1,\n", + " '$': 1,\n", + " '19.95': 1,\n", + " '': 1,\n", + " '': 1,\n", + " 'believe': 1,\n", + " \"can't\": 1,\n", + " 'costs': 1,\n", + " 'i': 2,\n", + " 'it': 1,\n", + " 'only': 1,\n", + " 'said': 1,\n", + " 'that': 2}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana = metapy.analyzers.NGramWordAnalyzer(1, tok)\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{('$', '19.95', '!'): 1,\n", + " ('19.95', '!', ''): 1,\n", + " ('', 'i', 'said'): 1,\n", + " ('believe', 'that', 'it'): 1,\n", + " (\"can't\", 'believe', 'that'): 1,\n", + " 
('costs', '$', '19.95'): 1,\n", + " ('i', \"can't\", 'believe'): 1,\n", + " ('i', 'said', 'that'): 1,\n", + " ('it', 'only', 'costs'): 1,\n", + " ('only', 'costs', '$'): 1,\n", + " ('said', 'that', 'i'): 1,\n", + " ('that', 'i', \"can't\"): 1,\n", + " ('that', 'it', 'only'): 1}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana = metapy.analyzers.NGramWordAnalyzer(3, tok)\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Usually, metapy applications will create and call analyzers based on a config file, so you won't have to create your own manually. However, it may still be useful if you are performing your own analysis that is not part of MeTA." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/metapy/tutorials/2-search-and-ir-eval.ipynb b/metapy/tutorials/2-search-and-ir-eval.ipynb new file mode 100644 index 0000000000..bc7f4a57f0 --- /dev/null +++ b/metapy/tutorials/2-search-and-ir-eval.ipynb @@ -0,0 +1,538 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This is the Indexing and Search metapy tutorial. First, you should read the following tutorial:\n", + "- [Search Tutorial](https://meta-toolkit.org/search-tutorial.html). Read *Initially setting up the config file* and *Relevance judgements*.\n", + "\n", + "First, let's create an index. We will use the AP News dataset. 
Your current directory should look like this:\n", + "- `apnews`: AP News 88 dataset in MeTA format.\n", + "- `queries.txt`: 100 queries, one per line.\n", + "- `qrels.txt`: Over 10,000 relevance judgements for the queries.\n", + "- `stopwords.txt`: A file containing stopwords that will not be indexed.\n", + "- `apnews-config.toml`: A config file with paths set to all the above files, including index and ranker settings.\n", + "\n", + "Here's how we can use metapy to create the index." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import metapy\n", + "idx = metapy.index.make_inverted_index('apnews-config.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This may take a minute at first, since the index needs to be built. Subsequent calls to `make_inverted_index` with this config file will simply load the index, which will not take any time.\n", + "\n", + "Here's how we can interact with the index object:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "164465" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "idx.num_docs()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "299769" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "idx.unique_terms()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + 
"526.3216552734375" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "idx.avg_doc_length()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "86561496" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "idx.total_corpus_terms()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "All the disk index and inverted index functions from MeTA are implemented in metapy.\n", + "\n", + "Let's create a `Ranker` object so we can search the index;" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ranker = metapy.index.OkapiBM25()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, we need a query. 
Create a `Document` and set its content to our query:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "query = metapy.index.Document()\n", + "query.content('Airbus Subsidies') # query from AP news" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Search our index using our ranker and query:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(49687, 20.686737060546875),\n", + " (8005, 20.367332458496094),\n", + " (8645, 20.20011329650879),\n", + " (13158, 20.071775436401367),\n", + " (10212, 19.91327667236328)]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_docs = ranker.score(idx, query, num_results=5)\n", + "top_docs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We are returned a ranked list of *(doc_id, score)* pairs. The scores are from the ranker, which in this case was Okapi BM25. Since our `line.toml` file in the AP News dataset has `store-full-text = true`, we can verify the content of our top documents by inspecting the document metadata field \"content\"." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. A top West German economic official said Sunday that reduction of government subsidies for Airbus Industrie will be a main topic at a planned September meeting ofthe consortium's member nations in Britain . Erich Riedl , parlimentary state secretary ...\n", + "\n", + "2. 
The United States , angry over European subsidies for the Airbus aircraft - manufacturing consortium , is increasing pressure on Airbus nations to abolish or at least reduce the payments , say diplomatic sources . ` ` The Americans at a minimum want ...\n", + "\n", + "3. U.S . and European trade official sare holding a new round of talks in the lengthy dispute over government subsidies to the Airbus aircraft manufacturing consortium . But both sides remain far apart on the long - simmering issue of subsidies to Airbu...\n", + "\n", + "4. U.S . Trade Representative Clayton Yeutter tol dthe governments of Britain , France , West Germany and Spain onWednesday they are risking a trade war by their ` ` enormous subsidies ' ' to Airbus passenger planes . The major trade bill now before Con...\n", + "\n", + "5. The omnibus trade bill pending in Congres swill include a provision allowing the United States to ` ` get tough ' ' with a European airplane manufacturer which domestic aerospace companies claim receives subsidies and engages in unfair trading practi...\n", + "\n" + ] + } + ], + "source": [ + "for num, (d_id, _) in enumerate(top_docs):\n", + " content = idx.metadata(d_id).get('content')\n", + " print(\"{}. {}...\\n\".format(num + 1, content[0:250]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Since we have the queries file and relevance judgements, we can do an IR evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ev = metapy.index.IREval('apnews-config.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We will loop over the queries file and add each result to the `IREval` object `ev`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query 1 average precision: 1.0\n", + "Query 2 average precision: 1.0\n", + "Query 3 average precision: 0.14694444444444443\n", + "Query 4 average precision: 0.5308730158730158\n", + "Query 5 average precision: 0.08833333333333333\n", + "Query 6 average precision: 0.8154365079365078\n", + "Query 7 average precision: 0.4084126984126984\n", + "Query 8 average precision: 1.0\n", + "Query 9 average precision: 0.042222222222222223\n", + "Query 10 average precision: 0.0\n", + "Query 11 average precision: 0.9\n", + "Query 12 average precision: 0.05\n", + "Query 13 average precision: 0.3\n", + "Query 14 average precision: 0.0\n", + "Query 15 average precision: 0.0\n", + "Query 16 average precision: 0.0\n", + "Query 17 average precision: 0.0\n", + "Query 18 average precision: 0.26666666666666666\n", + "Query 19 average precision: 0.016666666666666666\n", + "Query 20 average precision: 0.671111111111111\n", + "Query 21 average precision: 0.5508730158730158\n", + "Query 22 average precision: 0.0\n", + "Query 23 average precision: 0.0\n", + "Query 24 average precision: 0.0\n", + "Query 25 average precision: 0.0325\n", + "Query 26 average precision: 0.2\n", + "Query 27 average precision: 0.4163095238095238\n", + "Query 28 average precision: 1.0\n", + "Query 29 average precision: 0.0\n", + "Query 30 average precision: 0.0\n", + "Query 31 average precision: 0.24166666666666664\n", + "Query 32 average precision: 0.8154365079365078\n", + "Query 33 average precision: 0.12333333333333334\n", + "Query 34 average precision: 0.05\n", + "Query 35 average precision: 0.6708730158730158\n", + "Query 36 average precision: 0.32666666666666666\n", + "Query 37 average precision: 0.0\n", + "Query 38 average precision: 0.0125\n", + "Query 39 average precision: 
0.014285714285714285\n", + "Query 40 average precision: 0.3796428571428571\n", + "Query 41 average precision: 0.0\n", + "Query 42 average precision: 0.0\n", + "Query 43 average precision: 0.4747619047619048\n", + "Query 44 average precision: 0.0\n", + "Query 45 average precision: 0.0\n", + "Query 46 average precision: 0.0125\n", + "Query 47 average precision: 0.16333333333333333\n", + "Query 48 average precision: 0.3083333333333333\n", + "Query 49 average precision: 0.03888888888888888\n", + "Query 50 average precision: 0.38936507936507936\n", + "Query 51 average precision: 0.24285714285714288\n", + "Query 52 average precision: 0.20714285714285713\n", + "Query 53 average precision: 0.21341269841269842\n", + "Query 54 average precision: 0.15833333333333333\n", + "Query 55 average precision: 0.0\n", + "Query 56 average precision: 0.2596428571428572\n", + "Query 57 average precision: 0.78\n", + "Query 58 average precision: 0.03666666666666667\n", + "Query 59 average precision: 0.0\n", + "Query 60 average precision: 0.075\n", + "Query 61 average precision: 1.0\n", + "Query 62 average precision: 0.2816666666666666\n", + "Query 63 average precision: 0.0\n", + "Query 64 average precision: 0.0\n", + "Query 65 average precision: 0.05\n", + "Query 66 average precision: 0.0325\n", + "Query 67 average precision: 0.22063492063492066\n", + "Query 68 average precision: 0.43555555555555553\n", + "Query 69 average precision: 0.4397619047619047\n", + "Query 70 average precision: 0.06666666666666667\n", + "Query 71 average precision: 0.0\n", + "Query 72 average precision: 0.09\n", + "Query 73 average precision: 0.36944444444444446\n", + "Query 74 average precision: 0.34444444444444444\n", + "Query 75 average precision: 0.8521031746031745\n", + "Query 76 average precision: 0.4514285714285714\n", + "Query 77 average precision: 0.03333333333333333\n", + "Query 78 average precision: 0.569047619047619\n", + "Query 79 average precision: 0.34444444444444444\n", + "Query 80 average 
precision: 0.639047619047619\n", + "Query 81 average precision: 0.0\n", + "Query 82 average precision: 0.18051587301587302\n", + "Query 83 average precision: 0.5380952380952382\n", + "Query 84 average precision: 0.6863095238095237\n", + "Query 85 average precision: 0.3155555555555556\n", + "Query 86 average precision: 0.16190476190476188\n", + "Query 87 average precision: 0.7042063492063492\n", + "Query 88 average precision: 0.0\n", + "Query 89 average precision: 0.07222222222222222\n", + "Query 90 average precision: 0.02\n", + "Query 91 average precision: 0.13333333333333333\n", + "Query 92 average precision: 0.25\n", + "Query 93 average precision: 0.03333333333333333\n", + "Query 94 average precision: 0.21000000000000002\n", + "Query 95 average precision: 0.13583333333333333\n", + "Query 96 average precision: 0.44642857142857145\n", + "Query 97 average precision: 0.15\n", + "Query 98 average precision: 0.315\n", + "Query 99 average precision: 0.27166666666666667\n", + "Query 100 average precision: 0.5258333333333334\n" + ] + } + ], + "source": [ + "num_results = 10\n", + "with open('queries.txt') as query_file:\n", + " for query_num, line in enumerate(query_file):\n", + " query.content(line.strip())\n", + " results = ranker.score(idx, query, num_results) \n", + " avg_p = ev.avg_p(results, query_num, num_results)\n", + " print(\"Query {} average precision: {}\".format(query_num + 1, avg_p))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Afterwards, we can get the mean average precision of all the queries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.26801309523809536" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ev.map()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Try experimenting with different rankers, ranker parameters, tokenization, and filters. What combination gives you the best MAP?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Lastly, it's possible to define your own ranking function in Python." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "class SimpleRanker(metapy.index.RankingFunction): \n", + " \"\"\" \n", + " Create a new ranking function in Python that can be used in MeTA.
\n", + " \"\"\" \n", + " def __init__(self, some_param=1.0): \n", + " self.param = some_param\n", + " # You *must* call the base class constructor here!\n", + " super(SimpleRanker, self).__init__() \n", + " \n", + " def score_one(self, sd):\n", + " \"\"\"\n", + " You need to override this function to return a score for a single term.\n", + " For fields available in the score_data sd object,\n", + " @see https://meta-toolkit.org/doxygen/structmeta_1_1index_1_1score__data.html\n", + " \"\"\"\n", + " return (self.param + sd.doc_term_count) / (self.param * sd.doc_unique_terms + sd.doc_size)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/metapy/tutorials/3-deeper-text-analysis.ipynb b/metapy/tutorials/3-deeper-text-analysis.ipynb new file mode 100644 index 0000000000..5244136e66 --- /dev/null +++ b/metapy/tutorials/3-deeper-text-analysis.ipynb @@ -0,0 +1,3571 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "First, we'll import the `metapy` python bindings." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import metapy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's create a document with some content." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "doc = metapy.index.Document()\n", + "doc.content(\"I said that I can't believe that it only costs $19.95!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "MeTA provides a stream-based interface for performing document tokenization. Each stream starts off with a Tokenizer object, and in most cases you should use the [Unicode standard aware](http://site.icu-project.org) `ICUTokenizer`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tok = metapy.analyzers.ICUTokenizer()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Tokenizers operate on raw text and provide an Iterable that spits out the individual text tokens. Let's try running just the `ICUTokenizer` to see what it does." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['',\n", + " 'I',\n", + " 'said',\n", + " 'that',\n", + " 'I',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " '']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok.set_content(doc.content()) # this could be any string\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "One thing that you likely immediately notice is the insertion of these pseudo-XML looking `<s>` and `</s>` tags.
These are called \"sentence boundary tags\". As a side-effect, a default-constructed `ICUTokenizer` discovers the sentences in a document by delimiting them with the sentence boundary tags. Let's try tokenizing a multi-sentence document to see what that looks like." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['',\n", + " 'I',\n", + " 'said',\n", + " 'that',\n", + " 'I',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " '',\n", + " '',\n", + " 'I',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '$',\n", + " '30',\n", + " 'before',\n", + " '.',\n", + " '']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc.content(\"I said that I can't believe that it only costs $19.95! I could only find it for more than $30 before.\")\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Most of the information retrieval techniques you have likely been learning about in this class don't need to concern themselves with finding the boundaries between separate sentences in a document, but later today we'll explore a scenario where this might matter more.\n", + "\n", + "Let's pass a flag to the `ICUTokenizer` constructor to disable sentence boundary tags for now."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['I',\n", + " 'said',\n", + " 'that',\n", + " 'I',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " 'I',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '$',\n", + " '30',\n", + " 'before',\n", + " '.']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "I mentioned earlier that MeTA treats tokenization as a *streaming* process, and that it *starts* with a tokenizer. As you've learned, for optimal search performance it's often beneficial to modify the raw underlying tokens of a document, and thus change its representation, before adding it to an inverted index structure for searching.\n", + "\n", + "The \"intermediate\" steps in the tokenization stream are represented with objects called Filters. Each filter consumes the content of a previous filter (or a tokenizer) and modifies the tokens coming out of the stream in some way.\n", + "\n", + "Let's start by using a simple filter that can help eliminate a lot of noise that we might encounter when tokenizing web documents: a `LengthFilter`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['said',\n", + " 'that',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '19.95',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '30',\n", + " 'before']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.LengthFilter(tok, min=2, max=30)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Here, we can see that the `LengthFilter` is consuming our original `ICUTokenizer`. It modifies the token stream by only emitting tokens that are of a minimum length of 2 and a maximum length of 30. This can get rid of a lot of punctuation tokens, but also excessively long tokens such as URLs." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Another common trick is to remove stopwords. (Can anyone tell me what a stopword is?) In MeTA, this is done using a `ListFilter`." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2017-03-28 12:52:33-- https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving raw.githubusercontent.com... 151.101.44.133\n", + "Connecting to raw.githubusercontent.com|151.101.44.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 2747 (2.7K) [text/plain]\n", + "Saving to: ‘lemur-stopwords.txt’\n", + "\n", + "lemur-stopwords.txt 100%[===================>] 2.68K --.-KB/s in 0s \n", + "\n", + "2017-03-28 12:52:33 (22.6 MB/s) - ‘lemur-stopwords.txt’ saved [2747/2747]\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "[\"can't\", 'believe', 'costs', '19.95', 'find', '30']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "!wget -nc https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt\n", + "\n", + "tok = metapy.analyzers.ListFilter(tok, \"lemur-stopwords.txt\", metapy.analyzers.ListFilter.Type.Reject)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Here we've downloaded a common list of stopwords obtained from the [Lemur project](http://lemurproject.org) and created a `ListFilter` to reject any tokens that occur in that list of words.\n", + "\n", + "You can see how much of a difference removing stopwords can make on the size of a document's token stream! This translates to a lot of space savings in the inverted index as well." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Another common filter that people use is called a stemmer (a lemmatizer is a closely related tool with the same goal). This kind of filter tries to modify individual tokens in such a way that different inflected forms of a word all reduce to the same representation. This lets you, for example, find documents about a \"run\" when you search \"running\" or \"runs\". A common stemmer is the [Porter2 Stemmer](http://snowball.tartarus.org/algorithms/english/stemmer.html), which MeTA has an implementation of. Let's try it!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[\"can't\", 'believ', 'cost', '19.95', 'find', '30']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.Porter2Filter(tok)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Notice how \"believe\" becomes \"believ\" and \"costs\" becomes \"cost\". Stemming can help search by allowing queries to return more matched documents by relaxing what it means for a document to match a query term. Note that it's important to ensure that queries are tokenized in the *exact same way* as your documents were before indexing them. If you ignore this, your query is unlikely to contain the raw token \"believ\" and you'll miss a lot of results." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Finally, after you've got the token stream configured the way you'd like, it's time to analyze the document by consuming each token from its token stream and performing some actions based on these tokens. In the simplest case, which often is enough for \"good enough\" search results, our action can simply be counting how many times these tokens occur.\n", + "\n", + "For clarity, let's switch back to a simpler token stream first. Write me a token stream that tokenizes using the Unicode standard, and then lowercases each token. 
(Hint: `help(metapy.analyzers)`.)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on module metapy.metapy.analyzers in metapy.metapy:\n", + "\n", + "NAME\n", + " metapy.metapy.analyzers\n", + "\n", + "CLASSES\n", + " pybind11_builtins.pybind11_object_48(builtins.object)\n", + " Analyzer\n", + " MultiAnalyzer\n", + " NGramWordAnalyzer\n", + " TokenStream\n", + " AlphaFilter\n", + " CharacterTokenizer\n", + " EmptySentenceFilter\n", + " EnglishNormalizer\n", + " ICUFilter\n", + " ICUTokenizer\n", + " LengthFilter\n", + " ListFilter\n", + " LowercaseFilter\n", + " PennTreebankNormalizer\n", + " Porter2Filter\n", + " SentenceBoundaryAdder\n", + " \n", + " class AlphaFilter(TokenStream)\n", + " | Method resolution order:\n", + " | AlphaFilter\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.AlphaFilter, arg0: metapy.metapy.analyzers.TokenStream) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) 
from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class Analyzer(pybind11_builtins.pybind11_object_48)\n", + " | Method resolution order:\n", + " | Analyzer\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.Analyzer) -> None\n", + " | \n", + " | analyze(...) from builtins.PyCapsule\n", + " | analyze(self: metapy.metapy.analyzers.Analyzer, arg0: metapy.metapy.index.Document) -> dict\n", + " | \n", + " | featurize(...) from builtins.PyCapsule\n", + " | featurize(self: metapy.metapy.analyzers.Analyzer, arg0: metapy.metapy.index.Document) -> dict\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class CharacterTokenizer(TokenStream)\n", + " | Method resolution order:\n", + " | CharacterTokenizer\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) 
from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.CharacterTokenizer) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class EmptySentenceFilter(TokenStream)\n", + " | Method resolution order:\n", + " | EmptySentenceFilter\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) 
from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.EmptySentenceFilter, arg0: metapy.metapy.analyzers.TokenStream) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class EnglishNormalizer(TokenStream)\n", + " | Method resolution order:\n", + " | EnglishNormalizer\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) 
from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.EnglishNormalizer, arg0: metapy.metapy.analyzers.TokenStream) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class ICUFilter(TokenStream)\n", + " | Method resolution order:\n", + " | ICUFilter\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) 
from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.ICUFilter, arg0: metapy.metapy.analyzers.TokenStream, arg1: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class ICUTokenizer(TokenStream)\n", + " | Method resolution order:\n", + " | ICUTokenizer\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) 
from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.ICUTokenizer, suppress_tags: bool=False) -> None\n", + " | \n", + " | Creates a tokenizer using the UTF text segmentation standard\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class LengthFilter(TokenStream)\n", + " | Method resolution order:\n", + " | LengthFilter\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) 
from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.LengthFilter, source: metapy.metapy.analyzers.TokenStream, min: int, max: int) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class ListFilter(TokenStream)\n", + " | Method resolution order:\n", + " | ListFilter\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) 
from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.ListFilter, arg0: metapy.metapy.analyzers.TokenStream, arg1: str, arg2: metapy.metapy.analyzers.Type) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes defined here:\n", + " | \n", + " | Type = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. 
See help(type) for accurate signature.\n", + " \n", + " class LowercaseFilter(TokenStream)\n", + " | Method resolution order:\n", + " | LowercaseFilter\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.LowercaseFilter, arg0: metapy.metapy.analyzers.TokenStream) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. 
See help(type) for accurate signature.\n", + " \n", + " class MultiAnalyzer(Analyzer)\n", + " | Method resolution order:\n", + " | MultiAnalyzer\n", + " | Analyzer\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(self, /, *args, **kwargs)\n", + " | Initialize self. See help(type(self)) for accurate signature.\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from Analyzer:\n", + " | \n", + " | analyze(...) from builtins.PyCapsule\n", + " | analyze(self: metapy.metapy.analyzers.Analyzer, arg0: metapy.metapy.index.Document) -> dict\n", + " | \n", + " | featurize(...) from builtins.PyCapsule\n", + " | featurize(self: metapy.metapy.analyzers.Analyzer, arg0: metapy.metapy.index.Document) -> dict\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class NGramWordAnalyzer(Analyzer)\n", + " | Method resolution order:\n", + " | NGramWordAnalyzer\n", + " | Analyzer\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.NGramWordAnalyzer, arg0: int, arg1: metapy.metapy.analyzers.TokenStream) -> None\n", + " | \n", + " | analyze(...) from builtins.PyCapsule\n", + " | analyze(self: metapy.metapy.analyzers.NGramWordAnalyzer, arg0: metapy.metapy.index.Document) -> object\n", + " | \n", + " | featurize(...) 
from builtins.PyCapsule\n", + " | featurize(self: metapy.metapy.analyzers.NGramWordAnalyzer, arg0: metapy.metapy.index.Document) -> object\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class PennTreebankNormalizer(TokenStream)\n", + " | Method resolution order:\n", + " | PennTreebankNormalizer\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.PennTreebankNormalizer, arg0: metapy.metapy.analyzers.TokenStream) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) 
from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class Porter2Filter(TokenStream)\n", + " | Method resolution order:\n", + " | Porter2Filter\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.Porter2Filter, arg0: metapy.metapy.analyzers.TokenStream) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) 
from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class SentenceBoundaryAdder(TokenStream)\n", + " | Method resolution order:\n", + " | SentenceBoundaryAdder\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.SentenceBoundaryAdder, arg0: metapy.metapy.analyzers.TokenStream) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from TokenStream:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) 
from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes inherited from TokenStream:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + " \n", + " class TokenStream(pybind11_builtins.pybind11_object_48)\n", + " | Method resolution order:\n", + " | TokenStream\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __bool__(...) from builtins.PyCapsule\n", + " | __bool__(self: metapy.metapy.analyzers.TokenStream) -> bool\n", + " | \n", + " | __deepcopy__(...) from builtins.PyCapsule\n", + " | __deepcopy__(self: metapy.metapy.analyzers.TokenStream, arg0: dict) -> metapy.metapy.analyzers.TokenStream\n", + " | \n", + " | __init__(...) from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.analyzers.TokenStream) -> None\n", + " | \n", + " | __iter__(...) from builtins.PyCapsule\n", + " | __iter__(self: object) -> py_token_stream_iterator\n", + " | \n", + " | next(...) from builtins.PyCapsule\n", + " | next(self: metapy.metapy.analyzers.TokenStream) -> str\n", + " | \n", + " | set_content(...) 
from builtins.PyCapsule\n", + " | set_content(self: metapy.metapy.analyzers.TokenStream, arg0: str) -> None\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data and other attributes defined here:\n", + " | \n", + " | Iterator = \n", + " | \n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. See help(type) for accurate signature.\n", + "\n", + "FUNCTIONS\n", + " load(...) method of builtins.PyCapsule instance\n", + " load(arg0: str) -> metapy.metapy.analyzers.Analyzer\n", + " \n", + " register_filter(...) method of builtins.PyCapsule instance\n", + " register_filter(arg0: object) -> None\n", + "\n", + "FILE\n", + " (built-in)\n", + "\n", + "\n" + ] + } + ], + "source": [ + "help(metapy.analyzers)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['i',\n", + " 'said',\n", + " 'that',\n", + " 'i',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " 'i',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '$',\n", + " '30',\n", + " 'before',\n", + " '.']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)\n", + "tok = metapy.analyzers.LowercaseFilter(tok)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's count how 
often each individual token appears in the stream. You might have called this representation the \"bag of words\" representation, but it is also often called \"unigram word counts\". In MeTA, classes that consume a token stream and emit a document representation are called Analyzers." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I said that I can't believe that it only costs $19.95! I could only find it for more than $30 before.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'!': 1,\n", + " '$': 2,\n", + " '.': 1,\n", + " '19.95': 1,\n", + " '30': 1,\n", + " 'before': 1,\n", + " 'believe': 1,\n", + " \"can't\": 1,\n", + " 'costs': 1,\n", + " 'could': 1,\n", + " 'find': 1,\n", + " 'for': 1,\n", + " 'i': 3,\n", + " 'it': 2,\n", + " 'more': 1,\n", + " 'only': 2,\n", + " 'said': 1,\n", + " 'than': 1,\n", + " 'that': 2}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana = metapy.analyzers.NGramWordAnalyzer(1, tok)\n", + "print(doc.content())\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "If you noticed the name of the analyzer, you might have realized that you can count not just individual tokens, but groups of them. \"Unigram\" means \"1-gram\", and we count individual tokens. \"Bigram\" means \"2-gram\", and we count adjacent tokens together as a group. Let's try that now." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{('!', 'i'): 1,\n", + " ('$', '19.95'): 1,\n", + " ('$', '30'): 1,\n", + " ('19.95', '!'): 1,\n", + " ('30', 'before'): 1,\n", + " ('before', '.'): 1,\n", + " ('believe', 'that'): 1,\n", + " (\"can't\", 'believe'): 1,\n", + " ('costs', '$'): 1,\n", + " ('could', 'only'): 1,\n", + " ('find', 'it'): 1,\n", + " ('for', 'more'): 1,\n", + " ('i', \"can't\"): 1,\n", + " ('i', 'could'): 1,\n", + " ('i', 'said'): 1,\n", + " ('it', 'for'): 1,\n", + " ('it', 'only'): 1,\n", + " ('more', 'than'): 1,\n", + " ('only', 'costs'): 1,\n", + " ('only', 'find'): 1,\n", + " ('said', 'that'): 1,\n", + " ('than', '$'): 1,\n", + " ('that', 'i'): 1,\n", + " ('that', 'it'): 1}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana = metapy.analyzers.NGramWordAnalyzer(2, tok)\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now the individual \"tokens\" we're counting are pairs of tokens. You can analyze any n-gram of tokens you would like to in this way (and this is a simple way to attempt to support phrase search). Note, however, that as you increase the size of the n-grams you are counting, you are also increasing (exponentially!) the number of possible n-grams you could observe, so there's no free lunch here.\n", + "\n", + "This analysis pipeline feeds both the creation of the `InvertedIndex`, which is used for search applications, and the `ForwardIndex`, which is used for topic modeling and classification applications. For classification, sometimes looking at n-grams of characters is useful." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{(' ', '$', '1', '9'): 1,\n", + " (' ', '$', '3', '0'): 1,\n", + " (' ', 'I', ' ', 'c'): 2,\n", + " (' ', 'b', 'e', 'f'): 1,\n", + " (' ', 'b', 'e', 'l'): 1,\n", + " (' ', 'c', 'a', 'n'): 1,\n", + " (' ', 'c', 'o', 's'): 1,\n", + " (' ', 'c', 'o', 'u'): 1,\n", + " (' ', 'f', 'i', 'n'): 1,\n", + " (' ', 'f', 'o', 'r'): 1,\n", + " (' ', 'i', 't', ' '): 2,\n", + " (' ', 'm', 'o', 'r'): 1,\n", + " (' ', 'o', 'n', 'l'): 2,\n", + " (' ', 's', 'a', 'i'): 1,\n", + " (' ', 't', 'h', 'a'): 3,\n", + " ('!', ' ', 'I', ' '): 1,\n", + " ('$', '1', '9', '.'): 1,\n", + " ('$', '3', '0', ' '): 1,\n", + " (\"'\", 't', ' ', 'b'): 1,\n", + " ('.', '9', '5', '!'): 1,\n", + " ('0', ' ', 'b', 'e'): 1,\n", + " ('1', '9', '.', '9'): 1,\n", + " ('3', '0', ' ', 'b'): 1,\n", + " ('5', '!', ' ', 'I'): 1,\n", + " ('9', '.', '9', '5'): 1,\n", + " ('9', '5', '!', ' '): 1,\n", + " ('I', ' ', 'c', 'a'): 1,\n", + " ('I', ' ', 'c', 'o'): 1,\n", + " ('I', ' ', 's', 'a'): 1,\n", + " ('a', 'i', 'd', ' '): 1,\n", + " ('a', 'n', ' ', '$'): 1,\n", + " ('a', 'n', \"'\", 't'): 1,\n", + " ('a', 't', ' ', 'I'): 1,\n", + " ('a', 't', ' ', 'i'): 1,\n", + " ('b', 'e', 'f', 'o'): 1,\n", + " ('b', 'e', 'l', 'i'): 1,\n", + " ('c', 'a', 'n', \"'\"): 1,\n", + " ('c', 'o', 's', 't'): 1,\n", + " ('c', 'o', 'u', 'l'): 1,\n", + " ('d', ' ', 'i', 't'): 1,\n", + " ('d', ' ', 'o', 'n'): 1,\n", + " ('d', ' ', 't', 'h'): 1,\n", + " ('e', ' ', 't', 'h'): 2,\n", + " ('e', 'f', 'o', 'r'): 1,\n", + " ('e', 'l', 'i', 'e'): 1,\n", + " ('e', 'v', 'e', ' '): 1,\n", + " ('f', 'i', 'n', 'd'): 1,\n", + " ('f', 'o', 'r', ' '): 1,\n", + " ('f', 'o', 'r', 'e'): 1,\n", + " ('h', 'a', 'n', ' '): 1,\n", + " ('h', 'a', 't', ' '): 2,\n", + " ('i', 'd', ' ', 't'): 1,\n", + " ('i', 'e', 'v', 'e'): 1,\n", + " ('i', 'n', 'd', ' '): 1,\n", + " ('i', 't', ' 
', 'f'): 1,\n", + " ('i', 't', ' ', 'o'): 1,\n", + " ('l', 'd', ' ', 'o'): 1,\n", + " ('l', 'i', 'e', 'v'): 1,\n", + " ('l', 'y', ' ', 'c'): 1,\n", + " ('l', 'y', ' ', 'f'): 1,\n", + " ('m', 'o', 'r', 'e'): 1,\n", + " ('n', ' ', '$', '3'): 1,\n", + " ('n', \"'\", 't', ' '): 1,\n", + " ('n', 'd', ' ', 'i'): 1,\n", + " ('n', 'l', 'y', ' '): 2,\n", + " ('o', 'n', 'l', 'y'): 2,\n", + " ('o', 'r', ' ', 'm'): 1,\n", + " ('o', 'r', 'e', ' '): 1,\n", + " ('o', 'r', 'e', '.'): 1,\n", + " ('o', 's', 't', 's'): 1,\n", + " ('o', 'u', 'l', 'd'): 1,\n", + " ('r', ' ', 'm', 'o'): 1,\n", + " ('r', 'e', ' ', 't'): 1,\n", + " ('s', ' ', '$', '1'): 1,\n", + " ('s', 'a', 'i', 'd'): 1,\n", + " ('s', 't', 's', ' '): 1,\n", + " ('t', ' ', 'I', ' '): 1,\n", + " ('t', ' ', 'b', 'e'): 1,\n", + " ('t', ' ', 'f', 'o'): 1,\n", + " ('t', ' ', 'i', 't'): 1,\n", + " ('t', ' ', 'o', 'n'): 1,\n", + " ('t', 'h', 'a', 'n'): 1,\n", + " ('t', 'h', 'a', 't'): 2,\n", + " ('t', 's', ' ', '$'): 1,\n", + " ('u', 'l', 'd', ' '): 1,\n", + " ('v', 'e', ' ', 't'): 1,\n", + " ('y', ' ', 'c', 'o'): 1,\n", + " ('y', ' ', 'f', 'i'): 1}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.CharacterTokenizer()\n", + "ana = metapy.analyzers.NGramWordAnalyzer(4, tok)\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Different analyzers can be combined together to create document representations that have many unique perspectives. Once things start to get more complicated, we recommend using a configuration file to specify each of the analyzers you wish to combine for your document representation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's explore something a little bit different. 
MeTA also has a natural language processing (NLP) component, which currently supports two major NLP tasks: part-of-speech tagging and syntactic parsing.\n", + "\n", + "(Does anyone know what part-of-speech tagging is?) POS tagging is a task in NLP that involves identifying a type for each word in a sentence. For example, POS tagging can be used to identify all of the nouns in a sentence, or all of the verbs, or adjectives, or... This is useful as a first step towards developing an understanding of the meaning of a particular sentence." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "MeTA places its POS tagging component in its \"sequences\" library. Let's play with some sequences first to get an idea of how they work. We'll start off by creating a sequence." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq = metapy.sequence.Sequence()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, we can add individual words to this sequence. Sequences consist of a list of `Observation`s, which are essentially (word, tag) pairs. If we don't yet know the tags for a `Sequence`, we can just add individual words and leave the tags unset. Words are called \"symbols\" in the library terminology." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(The, ???), (dog, ???), (ran, ???), (across, ???), (the, ???), (park, ???), (., ???)\n" + ] + } + ], + "source": [ + "for word in [\"The\", \"dog\", \"ran\", \"across\", \"the\", \"park\", \".\"]:\n", + " seq.add_symbol(word)\n", + "print(seq)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The printed form of the sequence shows that we do not yet know the tags for each word. Let's fill them in by using a pre-trained POS-tagger model that's distributed with MeTA." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2017-03-28 12:52:33-- https://github.com/meta-toolkit/meta/releases/download/v3.0.1/greedy-perceptron-tagger.tar.gz\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving github.com... 192.30.253.113, 192.30.253.112\n", + "Connecting to github.com|192.30.253.113|:443... connected.\n", + "HTTP request sent, awaiting response... 
302 Found\n", + "Location: https://github-cloud.s3.amazonaws.com/releases/16466317/5becfb4a-07f9-11e7-9984-0b59d0729937.gz?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAISTNZFOVBIJMK3TQ%2F20170328%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20170328T175233Z&X-Amz-Expires=300&X-Amz-Signature=9ff6c60870700e0c06de2dfd82410a889aeee8a791c98830969c9d27c913149e&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dgreedy-perceptron-tagger.tar.gz&response-content-type=application%2Foctet-stream [following]\n", + "--2017-03-28 12:52:33-- https://github-cloud.s3.amazonaws.com/releases/16466317/5becfb4a-07f9-11e7-9984-0b59d0729937.gz?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAISTNZFOVBIJMK3TQ%2F20170328%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20170328T175233Z&X-Amz-Expires=300&X-Amz-Signature=9ff6c60870700e0c06de2dfd82410a889aeee8a791c98830969c9d27c913149e&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dgreedy-perceptron-tagger.tar.gz&response-content-type=application%2Foctet-stream\n", + "Resolving github-cloud.s3.amazonaws.com... 54.231.121.43\n", + "Connecting to github-cloud.s3.amazonaws.com|54.231.121.43|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 6622528 (6.3M) [application/octet-stream]\n", + "Saving to: ‘greedy-perceptron-tagger.tar.gz’\n", + "\n", + "greedy-perceptron-t 100%[===================>] 6.32M 5.59MB/s in 1.1s \n", + "\n", + "2017-03-28 12:52:35 (5.59 MB/s) - ‘greedy-perceptron-tagger.tar.gz’ saved [6622528/6622528]\n", + "\n", + "perceptron-tagger/\n", + "perceptron-tagger/feature.mapping.gz\n", + "perceptron-tagger/label.mapping\n", + "perceptron-tagger/tagger.model.gz\n" + ] + } + ], + "source": [ + "!wget -nc https://github.com/meta-toolkit/meta/releases/download/v3.0.1/greedy-perceptron-tagger.tar.gz\n", + "!tar xvf greedy-perceptron-tagger.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tagger = metapy.sequence.PerceptronTagger(\"perceptron-tagger/\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now let's fill in the missing tags in our sentence based on the best guess this model has." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(The, DT), (dog, NN), (ran, VBD), (across, IN), (the, DT), (park, NN), (., .)\n" + ] + } + ], + "source": [ + "tagger.tag(seq)\n", + "print(seq)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Each tag indicates the type of a word, and this particular tagger was trained to output the tags present in the [Penn Treebank tagset](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html).\n", + "\n", + "But what if we want to POS-tag a document?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I said that I can't believe that it only costs $19.95! I could only find it for more than $30 before.\n" + ] + } + ], + "source": [ + "print(doc.content())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We need a way of going from a document to a list of `Sequence`s, each representing an individual sentence. I'll get you started." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['',\n", + " 'I',\n", + " 'said',\n", + " 'that',\n", + " 'I',\n", + " 'ca',\n", + " \"n't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " '',\n", + " '',\n", + " 'I',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '$',\n", + " '30',\n", + " 'before',\n", + " '.',\n", + " '']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.ICUTokenizer() # keep sentence boundaries!\n", + "tok = metapy.analyzers.PennTreebankNormalizer(tok)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "(Notice that the `PennTreebankNormalizer` modifies some tokens to better match the conventions of the Penn Treebank training data. 
This should help improve performance a little.)\n", + "\n", + "Now, write me a function that can take a token stream that contains sentence boundary tags and return a list of `Sequence` objects. Don't include the sentence boundary tags in the actual `Sequence` objects." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def extract_sequences(tok):\n", + " sequences = []\n", + " for token in tok:\n", + " if token == '<s>':\n", + " sequences.append(metapy.sequence.Sequence())\n", + " elif token != '</s>':\n", + " sequences[-1].add_symbol(token) \n", + " return sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(I, PRP), (said, VBD), (that, IN), (I, PRP), (ca, MD), (n't, RB), (believe, VB), (that, IN), (it, PRP), (only, RB), (costs, VBZ), ($, $), (19.95, CD), (!, .)\n", + "(I, PRP), (could, MD), (only, RB), (find, VB), (it, PRP), (for, IN), (more, JJR), (than, IN), ($, $), (30, CD), (before, IN), (., .)\n" + ] + } + ], + "source": [ + "tok.set_content(doc.content())\n", + "for seq in extract_sequences(tok):\n", + " tagger.tag(seq)\n", + " print(seq)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This is still a rather shallow understanding of these sentences. The next major leap is to parse these sequences of POS-tagged words to obtain a tree for each sentence. 
These trees, in our case, will represent the hierarchical phrase structure of a single sentence by grouping together tokens that belong to one phrase, and showing how small phrases combine into larger phrases, and eventually a sentence.\n", + "\n", + "Let's try parsing the sentences in our document using a pre-trained constituency parser that's distributed with MeTA." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2017-03-28 12:52:36-- https://github.com/meta-toolkit/meta/releases/download/v3.0.1/greedy-constituency-parser.tar.gz\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving github.com... 192.30.253.113, 192.30.253.112\n", + "Connecting to github.com|192.30.253.113|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://github-cloud.s3.amazonaws.com/releases/16466317/5bec2648-07f9-11e7-9d02-cb0d49fd3f76.gz?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAISTNZFOVBIJMK3TQ%2F20170328%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20170328T175236Z&X-Amz-Expires=300&X-Amz-Signature=5ea88ec218ec18f32366d6d448437787721e819451b371250e390c89cec4ca9c&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dgreedy-constituency-parser.tar.gz&response-content-type=application%2Foctet-stream [following]\n", + "--2017-03-28 12:52:36-- 
https://github-cloud.s3.amazonaws.com/releases/16466317/5bec2648-07f9-11e7-9d02-cb0d49fd3f76.gz?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAISTNZFOVBIJMK3TQ%2F20170328%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20170328T175236Z&X-Amz-Expires=300&X-Amz-Signature=5ea88ec218ec18f32366d6d448437787721e819451b371250e390c89cec4ca9c&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dgreedy-constituency-parser.tar.gz&response-content-type=application%2Foctet-stream\n", + "Resolving github-cloud.s3.amazonaws.com... 54.231.97.200\n", + "Connecting to github-cloud.s3.amazonaws.com|54.231.97.200|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 55347000 (53M) [application/octet-stream]\n", + "Saving to: ‘greedy-constituency-parser.tar.gz’\n", + "\n", + "greedy-constituency 100%[===================>] 52.78M 8.66MB/s in 6.4s \n", + "\n", + "2017-03-28 12:52:43 (8.19 MB/s) - ‘greedy-constituency-parser.tar.gz’ saved [55347000/55347000]\n", + "\n", + "parser/\n", + "parser/parser.trans.gz\n", + "parser/parser.model.gz\n" + ] + } + ], + "source": [ + "!wget -nc https://github.com/meta-toolkit/meta/releases/download/v3.0.1/greedy-constituency-parser.tar.gz\n", + "!tar xvf greedy-constituency-parser.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "parser = metapy.parser.Parser(\"parser/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I could only find it for more than $ 30 before .\n", + "(I, PRP), (could, MD), (only, RB), (find, VB), (it, PRP), (for, IN), (more, JJR), (than, IN), ($, $), (30, CD), (before, IN), (., .)\n", + "(ROOT\n", + " (S\n", + " (NP (PRP I))\n", + " (VP\n", + " 
(MD could)\n", + " (ADVP (RB only))\n", + " (VP\n", + " (VB find)\n", + " (NP (PRP it))\n", + " (PP\n", + " (IN for)\n", + " (NP\n", + " (QP\n", + " (JJR more)\n", + " (IN than)\n", + " ($ $)\n", + " (CD 30))))\n", + " (ADVP (IN before))))\n", + " (. .)))\n", + "\n" + ] + } + ], + "source": [ + "print(' '.join([obs.symbol for obs in seq]))\n", + "print(seq)\n", + "tree = parser.parse(seq)\n", + "print(tree.pretty_str())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "(You can also play with this with a [prettier online demo](https://meta-toolkit.org/nlp-demo.html).)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can now parse all of the sentences in our document." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(ROOT\n", + " (S\n", + " (NP (PRP I))\n", + " (VP\n", + " (VBD said)\n", + " (SBAR\n", + " (IN that)\n", + " (S\n", + " (NP (PRP I))\n", + " (VP\n", + " (MD ca)\n", + " (RB n't)\n", + " (VP\n", + " (VB believe)\n", + " (SBAR\n", + " (IN that)\n", + " (S\n", + " (NP (PRP it))\n", + " (ADVP (RB only))\n", + " (VP\n", + " (VBZ costs)\n", + " (NP\n", + " ($ $)\n", + " (CD 19.95))))))))))\n", + " (. !)))\n", + "\n", + "(ROOT\n", + " (S\n", + " (NP (PRP I))\n", + " (VP\n", + " (MD could)\n", + " (ADVP (RB only))\n", + " (VP\n", + " (VB find)\n", + " (NP (PRP it))\n", + " (PP\n", + " (IN for)\n", + " (NP\n", + " (QP\n", + " (JJR more)\n", + " (IN than)\n", + " ($ $)\n", + " (CD 30))))\n", + " (ADVP (IN before))))\n", + " (. 
.)))\n", + "\n" + ] + } + ], + "source": [ + "tok.set_content(doc.content())\n", + "for seq in extract_sequences(tok):\n", + " tagger.tag(seq)\n", + " print(parser.parse(seq).pretty_str())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now that we know how to build these phrase structure trees from POS-tagged sentences extracted from raw text, let's explore a simple way we might be able to exploit this knowledge to help a downstream task.\n", + "\n", + "Our goal is going to be to extract the Subject-Verb-Object triples from some simple sentences. This will allow us to understand who is doing what to whom, which is knowledge that might be useful for lots of downstream tasks as diverse as question answering and stock market prediction. We should be able to extract these from our constituency parses. (This, of course, isn't the only way, and this method is quite naive. However, the implementation is simple enough that I think you should be able to grasp it in a single lecture.)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "First, let's grab our sample data. This is a collection of BBC news headlines that will serve as our \"simple\" sentences." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2017-03-28 12:52:48-- https://meta-toolkit.org/data/2017-03-27/headlines.tar.gz\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving meta-toolkit.org... 50.116.41.177, 2600:3c02::f03c:91ff:feae:b777\n", + "Connecting to meta-toolkit.org|50.116.41.177|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 31358 (31K) [application/octet-stream]\n", + "Saving to: ‘headlines.tar.gz’\n", + "\n", + "headlines.tar.gz 100%[===================>] 30.62K --.-KB/s in 0.002s \n", + "\n", + "2017-03-28 12:52:48 (15.1 MB/s) - ‘headlines.tar.gz’ saved [31358/31358]\n", + "\n", + "headlines/\n", + "headlines/tech.txt\n", + "headlines/entertainment.txt\n", + "headlines/politics.txt\n", + "headlines/README.md\n", + "headlines/sport.txt\n", + "headlines/business.txt\n", + "\n", + "README:\n", + "http://mlg.ucd.ie/datasets/bbc.html\n", + "\n", + "Exactracted first sentence of each doc from this dataset.\n" + ] + } + ], + "source": [ + "!wget -nc https://meta-toolkit.org/data/2017-03-27/headlines.tar.gz # please be nice!\n", + "!tar xvf headlines.tar.gz\n", + "!echo \"\" && echo \"README:\"\n", + "!cat headlines/README.md" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's look at the first headline of the business category." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Brazil approves bankruptcy reform'" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with open(\"headlines/business.txt\") as f:\n", + " business = f.readlines()\n", + "business[0].strip()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This looks simple enough. Let's see how it gets tagged." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(Brazil, NNP), (approves, VBZ), (bankruptcy, NN), (reform, NN)\n" + ] + } + ], + "source": [ + "tok.set_content(business[0].strip())\n", + "sequence = extract_sequences(tok)[0]\n", + "tagger.tag(sequence)\n", + "print(sequence)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's also parse it." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(ROOT\n", + " (S\n", + " (NP (NNP Brazil))\n", + " (VP\n", + " (VBZ approves)\n", + " (NP\n", + " (NN bankruptcy)\n", + " (NN reform)))))\n", + "\n" + ] + } + ], + "source": [ + "tree = parser.parse(sequence)\n", + "print(tree.pretty_str())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Great. We can now start to develop our technique. We can see that the subject here is the first noun phrase (NP), the verb is the first verb-like token in the VP, and the object is the NP within that VP.\n", + "\n", + "We're going to need to traverse this tree to extract what we want. MeTA supports this by exploiting the [Visitor pattern](https://en.wikipedia.org/wiki/Visitor_pattern), so the easiest way for us to get at what we're looking for is to write some classes that encapsulate the traversal we want to perform and keep track of things within this tree that we are interested in.\n", + "\n", + "Let's write our first simple visitor that traverses the tree to find the first NP node, at which point it will stop and store the root of that subtree." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on class Visitor in module metapy.metapy.parser:\n", + "\n", + "class Visitor(pybind11_builtins.pybind11_object_48)\n", + " | Method resolution order:\n", + " | Visitor\n", + " | pybind11_builtins.pybind11_object_48\n", + " | builtins.object\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(...) from builtins.PyCapsule\n", + " | __init__(self: metapy.metapy.parser.Visitor) -> None\n", + " | \n", + " | visit_internal(...) from builtins.PyCapsule\n", + " | visit_internal(self: metapy.metapy.parser.Visitor, arg0: metapy.metapy.parser.InternalNode) -> object\n", + " | \n", + " | visit_leaf(...) from builtins.PyCapsule\n", + " | visit_leaf(self: metapy.metapy.parser.Visitor, arg0: metapy.metapy.parser.LeafNode) -> object\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Methods inherited from pybind11_builtins.pybind11_object_48:\n", + " | \n", + " | __new__(*args, **kwargs) from pybind11_builtins.pybind11_type\n", + " | Create and return a new object. 
See help(type) for accurate signature.\n", + "\n" + ] + } + ], + "source": [ + "help(metapy.parser.Visitor)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "class NounPhraseFinder(metapy.parser.Visitor):\n", + " def __init__(self):\n", + " self.node = None\n", + " super(NounPhraseFinder, self).__init__() # required; invoke base class __init__\n", + " \n", + " def visit_leaf(self, node):\n", + " pass # we don't care about leaf nodes\n", + " \n", + " def visit_internal(self, node):\n", + " if self.node:\n", + " return\n", + "\n", + " # we do care about internal nodes; check if it is an NP\n", + " if node.category() == 'NP':\n", + " # store this node and stop the traversal\n", + " self.node = node\n", + " else:\n", + " # continue traversing by visiting all of the child nodes\n", + " node.each_child(lambda child: child.accept(self))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NP with 1 child(ren)\n" + ] + } + ], + "source": [ + "npf = NounPhraseFinder()\n", + "tree.visit(npf)\n", + "print(\"{} with {} child(ren)\".format(npf.node.category(), npf.node.num_children()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now that we have that working, we should be able to make a more generic PhraseFinder that finds the first internal node that matches a specific node category. We'll need one for finding the first NP and one for finding the first VP anyway, so this will be helpful." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "class PhraseFinder(metapy.parser.Visitor):\n", + " def __init__(self, category):\n", + " super(PhraseFinder, self).__init__()\n", + " self.node = None\n", + " self.category = category\n", + " \n", + " def visit_leaf(self, node):\n", + " pass # we don't care about leaf nodes\n", + " \n", + " def visit_internal(self, node):\n", + " if self.node:\n", + " return\n", + " \n", + " if node.category() == self.category:\n", + " self.node = node\n", + " else:\n", + " node.each_child(lambda child: child.accept(self))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NP with 1 child(ren)\n", + "VP with 2 child(ren)\n" + ] + } + ], + "source": [ + "npf = PhraseFinder('NP')\n", + "vpf = PhraseFinder('VP')\n", + "tree.visit(npf)\n", + "tree.visit(vpf)\n", + "for node in [npf.node, vpf.node]:\n", + " print(\"{} with {} child(ren)\".format(node.category(), node.num_children()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now that we can find the first internal node matching a category label, we need to set about extracting the actual leaf nodes we care about. Fortunately there is already a visitor that can extract all leaf nodes from a subtree, so we can use that to get started.\n", + "\n", + "From the first noun phrase, we want to extract all leaf nodes that are noun-like tags and join them together to make up our subject." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Brazil\n" + ] + } + ], + "source": [ + "noun_tags = set(['NN', 'NNS', 'NNP', 'NNPS'])\n", + "lnf = metapy.parser.LeafNodeFinder()\n", + "npf.node.accept(lnf)\n", + "subject = ' '.join([leaf.word() for leaf in lnf.leaves() if leaf.category() in noun_tags])\n", + "print(subject)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "And from the first verb phrase, we want to extract (1) the first verb-like leaf node to be the verb and (2) the noun-like tags in the first NP that occurs within that VP. We should be able to re-use some existing code we've already written." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "approves\n" + ] + } + ], + "source": [ + "verb_tags = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])\n", + "lnf = metapy.parser.LeafNodeFinder()\n", + "vpf.node.accept(lnf)\n", + "verb = next(leaf.word() for leaf in lnf.leaves() if leaf.category() in verb_tags)\n", + "print(verb)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bankruptcy reform\n" + ] + } + ], + "source": [ + "np_finder = PhraseFinder('NP')\n", + "vpf.node.accept(np_finder)\n", + "lnf = metapy.parser.LeafNodeFinder()\n", + "np_finder.node.accept(lnf)\n", + "obj = ' '.join([leaf.word() for leaf in lnf.leaves() if leaf.category() in noun_tags])\n", + "print(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": 
false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SUBJ: Brazil VERB: approves OBJ: bankruptcy reform\n" + ] + } + ], + "source": [ + "print(\"SUBJ: {} VERB: {} OBJ: {}\".format(subject, verb, obj))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Putting this all together, we can write a visitor to extract (SUBJ, VERB, OBJ) triples." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "class SVOExtractor(metapy.parser.Visitor):\n", + "    # Visitor that pulls one (subject, verb, object) triple out of a parse tree.\n", + "    # subject/object stay None when no NP is found; verb stays None when no VP\n", + "    # or no verb-tagged leaf is found.\n", + "    # Leaf categories treated as noun-like / verb-like. Always read via self so the\n", + "    # class does not depend on same-named notebook globals.\n", + "    noun_tags = set(['NN', 'NNS', 'NNP', 'NNPS'])\n", + "    verb_tags = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])\n", + "\n", + "    def __init__(self):\n", + "        super(SVOExtractor, self).__init__()\n", + "        self.subject = self.verb = self.object = None\n", + "\n", + "    def extract_noun_tagged_words(self, node):\n", + "        # space-join the words of every noun-tagged leaf under node\n", + "        lnf = metapy.parser.LeafNodeFinder()\n", + "        node.accept(lnf)\n", + "        return ' '.join([leaf.word() for leaf in lnf.leaves() if leaf.category() in self.noun_tags])\n", + "\n", + "    def visit_leaf(self, node):\n", + "        pass # don't care about leaf nodes\n", + "\n", + "    def visit_internal(self, node):\n", + "        # find and handle the first NP\n", + "        first_np = PhraseFinder('NP')\n", + "        node.accept(first_np)\n", + "        if first_np.node:\n", + "            self.subject = self.extract_noun_tagged_words(first_np.node)\n", + "\n", + "        # find and handle the first VP\n", + "        first_vp = PhraseFinder('VP')\n", + "        node.accept(first_vp)\n", + "\n", + "        if first_vp.node:\n", + "            # find the first NP within the first VP\n", + "            vp_first_np = PhraseFinder('NP')\n", + "            first_vp.node.accept(vp_first_np)\n", + "\n", + "            if vp_first_np.node:\n", + "                self.object = self.extract_noun_tagged_words(vp_first_np.node)\n", + "\n", + "            # the verb is the first verb-tagged leaf inside the VP\n", + "            lnf = metapy.parser.LeafNodeFinder()\n",
+ "            first_vp.node.accept(lnf)\n", + "            for leaf in lnf.leaves():\n", + "                if leaf.category() in self.verb_tags:\n", + "                    self.verb = leaf.word()\n", + "                    break\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Brazil approves bankruptcy reform\n", + "SUBJ: Brazil VERB: approves OBJ: bankruptcy reform\n", + "German business confidence slides\n", + "SUBJ: business confidence slides VERB: None OBJ: None\n", + "Dollar slides ahead of New Year\n", + "SUBJ: Dollar slides New Year VERB: None OBJ: None\n", + "Aviation firms eye booming India\n", + "SUBJ: Aviation firms VERB: eye OBJ: India\n", + "Metlife buys up Citigroup insurer\n", + "SUBJ: Metlife VERB: buys OBJ: Citigroup insurer\n", + "US economy still growing says Fed\n", + "SUBJ: VERB: says OBJ: None\n", + "Russia WTO talks 'make progress'\n", + "SUBJ: Russia WTO talks make progress VERB: None OBJ: None\n", + "Deadline nears for Fiat-GM deal\n", + "SUBJ: Deadline VERB: nears OBJ: Fiat GM deal\n", + "Five million Germans out of work\n", + "SUBJ: Germans work VERB: None OBJ: None\n", + "Jobs go at Oracle after takeover\n", + "SUBJ: Jobs VERB: go OBJ: Oracle\n", + "Asian banks halt dollar's slide\n", + "SUBJ: banks VERB: halt OBJ: dollar slide\n", + "Markets signal Brazilian recovery\n", + "SUBJ: Markets VERB: signal OBJ: recovery\n", + "GE sees 'excellent' world economy\n", + "SUBJ: GE VERB: sees OBJ: world economy\n", + "Q&A: Malcolm Glazer and Man Utd\n", + "SUBJ: Q A Malcolm Glazer Man Utd VERB: None OBJ: None\n", + "China continues rapid growth\n", + "SUBJ: China VERB: continues OBJ: growth\n", + "M&S cuts prices by average of 24%\n", + "SUBJ: M S cuts prices average % VERB: None OBJ: None\n", + "Trial begins of Spain's top banker\n", + "SUBJ: Trial VERB: begins OBJ: Spain banker\n", + "Malaysia lifts Islamic bank limit\n", + "SUBJ: Malaysia
VERB: lifts OBJ: Islamic bank limit\n", + "Giant waves damage S Asia economy\n", + "SUBJ: Giant VERB: waves OBJ: damage S Asia economy\n", + "Europe asks Asia for euro help\n", + "SUBJ: Europe VERB: asks OBJ: Asia\n", + "Troubled Marsh under SEC scrutiny\n", + "SUBJ: Marsh SEC scrutiny VERB: None OBJ: None\n", + "US to probe airline travel chaos\n", + "SUBJ: airline travel chaos VERB: probe OBJ: airline travel chaos\n", + "China's Shanda buys stake in Sina\n", + "SUBJ: China Shanda VERB: buys OBJ: stake Sina\n", + "Strong demand triggers oil rally\n", + "SUBJ: demand VERB: triggers OBJ: oil rally\n", + "Karachi stocks hit historic high\n", + "SUBJ: Karachi stocks VERB: hit OBJ: None\n", + "Senior Fannie Mae bosses resign\n", + "SUBJ: Fannie Mae bosses VERB: resign OBJ: None\n", + "Bank opts to leave rates on hold\n", + "SUBJ: Bank VERB: opts OBJ: rates\n", + "Standard Life cuts policy bonuses\n", + "SUBJ: Life cuts policy bonuses VERB: None OBJ: None\n", + "Ukraine revisits state sell-offs\n", + "SUBJ: Ukraine VERB: revisits OBJ: state\n", + "Venezuela reviews foreign deals\n", + "SUBJ: Venezuela VERB: reviews OBJ: deals\n", + "Fed chief warning on US deficit\n", + "SUBJ: Fed chief VERB: deficit OBJ: None\n", + "US industrial output growth eases\n", + "SUBJ: output growth eases VERB: None OBJ: None\n", + "IMF 'cuts' German growth estimate\n", + "SUBJ: IMF cuts growth estimate VERB: None OBJ: None\n", + "Electronics firms eye plasma deal\n", + "SUBJ: Electronics firms VERB: eye OBJ: plasma deal\n", + "Ethiopia's crop production up 24%\n", + "SUBJ: Ethiopia crop production % VERB: None OBJ: None\n", + "India widens access to telecoms\n", + "SUBJ: India VERB: widens OBJ: access telecoms\n", + "'Standoff' on Deutsche's LSE bid\n", + "SUBJ: Standoff Deutsche LSE bid VERB: None OBJ: None\n", + "JP Morgan admits US slavery links\n", + "SUBJ: JP Morgan VERB: admits OBJ: slavery links\n", + "Home loan approvals rising again\n", + "SUBJ: Home loan approvals VERB: rising OBJ: 
None\n", + "Bank payout to Pinochet victims\n", + "SUBJ: Bank payout Pinochet victims VERB: None OBJ: None\n", + "India's Reliance family feud heats up\n", + "SUBJ: India Reliance family feud VERB: heats OBJ: None\n", + "Tsunami cost hits Jakarta shares\n", + "SUBJ: Tsunami cost VERB: hits OBJ: Jakarta shares\n", + "ECB holds rates amid growth fears\n", + "SUBJ: ECB VERB: holds OBJ: rates\n", + "Ad sales boost Time Warner profit\n", + "SUBJ: Ad sales boost Time Warner profit VERB: None OBJ: None\n", + "Deutsche attacks Yukos case\n", + "SUBJ: Deutsche attacks Yukos case VERB: None OBJ: None\n", + "Latin America sees strong growth\n", + "SUBJ: Latin America VERB: sees OBJ: growth\n", + "Chinese exports rise 25% in 2004\n", + "SUBJ: exports VERB: rise OBJ: %\n", + "Mixed signals from French economy\n", + "SUBJ: signals economy VERB: None OBJ: None\n", + "Parmalat to return to stockmarket\n", + "SUBJ: Parmalat VERB: return OBJ: None\n", + "Krispy Kreme shares hit\n", + "SUBJ: Krispy Kreme shares VERB: hit OBJ: None\n", + "Watchdog probes Vivendi bond sale\n", + "SUBJ: Watchdog VERB: probes OBJ: Vivendi bond sale\n", + "Khodorkovsky ally denies charges\n", + "SUBJ: Khodorkovsky VERB: denies OBJ: charges\n", + "Profits jump at China's top bank\n", + "SUBJ: Profits VERB: jump OBJ: China bank\n", + "Parmalat bank barred from suing\n", + "SUBJ: Parmalat bank VERB: barred OBJ: \n", + "Germany nears 1990 jobless level\n", + "SUBJ: Germany VERB: nears OBJ: level\n", + "Peugeot deal boosts Mitsubishi\n", + "SUBJ: Peugeot deal boosts Mitsubishi VERB: None OBJ: None\n", + "WorldCom director admits lying\n", + "SUBJ: WorldCom director VERB: admits OBJ: None\n", + "Ask Jeeves tips online ad revival\n", + "SUBJ: Jeeves tips VERB: Ask OBJ: Jeeves tips\n", + "Euro firms miss out on optimism\n", + "SUBJ: Euro firms VERB: miss OBJ: optimism\n", + "Business fears over sluggish EU economy\n", + "SUBJ: Business fears EU economy VERB: None OBJ: None\n", + "Bush to get 'tough' on 
deficit\n", + "SUBJ: Bush VERB: get OBJ: deficit\n", + "SEC to rethink post-Enron rules\n", + "SUBJ: SEC VERB: rethink OBJ: post\n", + "EU 'too slow' on economic reforms\n", + "SUBJ: EU reforms VERB: None OBJ: None\n", + "Amex shares up on spin-off news\n", + "SUBJ: Amex shares spin news VERB: None OBJ: None\n", + "US bank boss hails 'genius' Smith\n", + "SUBJ: bank boss genius Smith VERB: hails OBJ: genius Smith\n", + "Mystery surrounds new Yukos owner\n", + "SUBJ: Mystery VERB: surrounds OBJ: Yukos owner\n", + "Mexican in US send $16bn home\n", + "SUBJ: VERB: send OBJ: home\n", + "Euronext 'poised to make LSE bid'\n", + "SUBJ: LSE bid VERB: None OBJ: None\n", + "Parmalat founder offers apology\n", + "SUBJ: Parmalat founder VERB: offers OBJ: apology\n", + "Turkey-Iran mobile deal 'at risk'\n", + "SUBJ: Turkey Iran deal risk VERB: None OBJ: None\n", + "Brazil plays down Varig rescue\n", + "SUBJ: Brazil VERB: plays OBJ: Varig rescue\n", + "High fuel prices hit BA's profits\n", + "SUBJ: fuel prices VERB: hit OBJ: BA profits\n", + "Bombardier chief to leave company\n", + "SUBJ: Bombardier chief company VERB: leave OBJ: company\n", + "Christmas shoppers flock to tills\n", + "SUBJ: Christmas shoppers VERB: flock OBJ: tills\n", + "Jobs growth still slow in the US\n", + "SUBJ: Jobs growth VERB: None OBJ: \n", + "Winter freeze keeps oil above $50\n", + "SUBJ: Winter freeze VERB: keeps OBJ: oil\n", + "US prepares for hybrid onslaught\n", + "SUBJ: VERB: prepares OBJ: hybrid onslaught\n", + "Indonesians face fuel price rise\n", + "SUBJ: Indonesians VERB: face OBJ: fuel price rise\n", + "Car giant hit by Mercedes slump\n", + "SUBJ: Car giant Mercedes slump VERB: hit OBJ: Mercedes slump\n", + "Bank voted 8-1 for no rate change\n", + "SUBJ: Bank VERB: voted OBJ: rate change\n", + "Brazil buy boosts Belgium's Inbev\n", + "SUBJ: Brazil VERB: buy OBJ: boosts Belgium Inbev\n", + "Tsunami slows Sri Lanka's growth\n", + "SUBJ: Tsunami VERB: slows OBJ: Sri Lanka growth\n", + "Japanese 
growth grinds to a halt\n", + "SUBJ: growth VERB: grinds OBJ: halt\n", + "Shares rise on new Man Utd offer\n", + "SUBJ: Shares VERB: rise OBJ: Man Utd offer\n", + "Takeover rumour lifts Exel shares\n", + "SUBJ: Takeover rumour VERB: lifts OBJ: Exel shares\n", + "Executive trio leave Aer Lingus\n", + "SUBJ: Executive trio VERB: leave OBJ: Aer Lingus\n", + "Businesses fail to plan for HIV\n", + "SUBJ: Businesses VERB: fail OBJ: HIV\n", + "Iranian MPs threaten mobile deal\n", + "SUBJ: MPs VERB: threaten OBJ: deal\n", + "Air passengers win new EU rights\n", + "SUBJ: Air passengers VERB: win OBJ: EU rights\n", + "Yukos drops banks from court bid\n", + "SUBJ: Yukos drops banks court bid VERB: None OBJ: None\n", + "Yukos seeks court action on sale\n", + "SUBJ: Yukos VERB: seeks OBJ: court action sale\n", + "Lesotho textile workers lose jobs\n", + "SUBJ: Lesotho textile workers VERB: lose OBJ: jobs\n", + "Oil rebounds from weather effect\n", + "SUBJ: Oil rebounds weather effect VERB: None OBJ: None\n", + "Cairn shares up on new oil find\n", + "SUBJ: Cairn shares oil find VERB: None OBJ: None\n", + "Huge rush for Jet Airways shares\n", + "SUBJ: rush Jet Airways shares VERB: None OBJ: None\n", + "Brewers' profits lose their fizz\n", + "SUBJ: Brewers profits VERB: lose OBJ: fizz\n", + "Winemaker rejects Foster's offer\n", + "SUBJ: Winemaker VERB: rejects OBJ: Foster offer\n", + "No seasonal lift for house market\n", + "SUBJ: lift house market VERB: None OBJ: None\n", + "Unilever shake up as profit slips\n", + "SUBJ: Unilever VERB: shake OBJ: profit slips\n", + "Japan's ageing workforce: built to last\n", + "SUBJ: Japan workforce VERB: built OBJ: None\n", + "US interest rates increased to 2%\n", + "SUBJ: interest rates % VERB: increased OBJ: %\n", + "Train strike grips Buenos Aires\n", + "SUBJ: Train strike grips Buenos Aires VERB: None OBJ: None\n", + "Axa Sun Life cuts bonus payments\n", + "SUBJ: Axa Sun Life cuts bonus payments VERB: None OBJ: None\n", + "Russian oil merger 
excludes Yukos\n", + "SUBJ: oil merger VERB: excludes OBJ: Yukos\n", + "Wembley firm won't make a profit\n", + "SUBJ: Wembley firm VERB: make OBJ: profit\n", + "Millions 'to lose textile jobs'\n", + "SUBJ: Millions textile jobs VERB: lose OBJ: textile jobs\n", + "BMW reveals new models pipeline\n", + "SUBJ: BMW VERB: reveals OBJ: models pipeline\n", + "French boss to leave EADS\n", + "SUBJ: boss EADS VERB: leave OBJ: EADS\n", + "Irish markets reach all-time high\n", + "SUBJ: markets VERB: reach OBJ: time\n", + "Argentina, Venezuela in oil deal\n", + "SUBJ: Argentina Venezuela oil deal VERB: None OBJ: None\n", + "Japan economy slides to recession\n", + "SUBJ: Japan economy VERB: slides OBJ: recession\n", + "Green reports shun supply chain\n", + "SUBJ: reports VERB: shun OBJ: supply chain\n", + "Georgia plans hidden asset pardon\n", + "SUBJ: Georgia VERB: plans OBJ: asset pardon\n", + "India's Deccan gets more planes\n", + "SUBJ: India Deccan VERB: gets OBJ: planes\n", + "Yukos unit fetches $9bn at auction\n", + "SUBJ: Yukos unit VERB: fetches OBJ: \n", + "Bargain calls widen Softbank loss\n", + "SUBJ: Bargain VERB: calls OBJ: Softbank loss\n", + "Lufthansa may sue over Bush visit\n", + "SUBJ: Lufthansa VERB: sue OBJ: Bush visit\n", + "Stormy year for property insurers\n", + "SUBJ: Stormy year property insurers VERB: None OBJ: None\n", + "G7 backs Africa debt relief plan\n", + "SUBJ: G7 VERB: backs OBJ: Africa debt relief plan\n", + "Boeing unveils new 777 aircraft\n", + "SUBJ: Boeing VERB: unveils OBJ: aircraft\n", + "Steady job growth continues in US\n", + "SUBJ: job growth VERB: continues OBJ: \n", + "China suspends 26 power projects\n", + "SUBJ: China VERB: suspends OBJ: power projects\n", + "UK firm faces Venezuelan land row\n", + "SUBJ: UK firm VERB: faces OBJ: land row\n", + "J&J agrees $25bn Guidant deal\n", + "SUBJ: J J VERB: agrees OBJ: Guidant deal\n", + "S Korean consumers spending again\n", + "SUBJ: VERB: None OBJ: None\n", + "Industrial revival hope for 
Japan\n", + "SUBJ: revival hope Japan VERB: None OBJ: None\n", + "UK young top Euro earnings league\n", + "SUBJ: UK top Euro earnings league VERB: None OBJ: None\n", + "Dollar hits new low versus euro\n", + "SUBJ: Dollar VERB: hits OBJ: versus euro\n", + "Turkey knocks six zeros off lira\n", + "SUBJ: Turkey VERB: knocks OBJ: zeros lira\n", + "House prices drop as sales slow\n", + "SUBJ: House prices VERB: drop OBJ: sales\n", + "Pension hitch for long-living men\n", + "SUBJ: Pension hitch living men VERB: None OBJ: None\n", + "Ban on forced retirement under 65\n", + "SUBJ: Ban retirement VERB: None OBJ: None\n", + "UK Coal plunges into deeper loss\n", + "SUBJ: UK Coal VERB: plunges OBJ: loss\n", + "US Airways staff agree to pay cut\n", + "SUBJ: Airways staff cut VERB: agree OBJ: cut\n", + "US firm pulls out of Iraq\n", + "SUBJ: VERB: firm OBJ: Iraq\n", + "Card fraudsters 'targeting web'\n", + "SUBJ: Card fraudsters targeting web VERB: None OBJ: None\n", + "Putin backs state grab for Yukos\n", + "SUBJ: Putin VERB: backs OBJ: state grab Yukos\n", + "Iraq to invite phone licence bids\n", + "SUBJ: Iraq phone licence bids VERB: invite OBJ: phone licence bids\n", + "Go-ahead for Balkan oil pipeline\n", + "SUBJ: oil pipeline VERB: Go OBJ: oil pipeline\n", + "Mixed Christmas for US retailers\n", + "SUBJ: Christmas US retailers VERB: None OBJ: None\n", + "Irish duo could block Man Utd bid\n", + "SUBJ: duo VERB: block OBJ: Man Utd bid\n", + "Nortel in $300m profit revision\n", + "SUBJ: Nortel profit revision VERB: None OBJ: None\n", + "China now top trader with Japan\n", + "SUBJ: China trader Japan VERB: None OBJ: None\n", + "Hyundai to build new India plant\n", + "SUBJ: Hyundai India plant VERB: build OBJ: India plant\n", + "House prices suffer festive fall\n", + "SUBJ: House prices VERB: suffer OBJ: fall\n", + "Yukos bankruptcy 'not US matter'\n", + "SUBJ: Yukos bankruptcy VERB: None OBJ: None\n", + "Millions go missing at China bank\n", + "SUBJ: Millions VERB: go OBJ: 
China bank\n", + "Markets fall on weak dollar fears\n", + "SUBJ: Markets VERB: fall OBJ: dollar fears\n", + "German bidder in talks with LSE\n", + "SUBJ: bidder talks LSE VERB: None OBJ: None\n", + "Khodorkovsky quits Yukos shares\n", + "SUBJ: Khodorkovsky VERB: quits OBJ: Yukos shares\n", + "AstraZeneca hit by drug failure\n", + "SUBJ: AstraZeneca drug failure VERB: hit OBJ: drug failure\n", + "BT offers equal access to rivals\n", + "SUBJ: BT VERB: offers OBJ: access rivals\n", + "US consumer confidence up\n", + "SUBJ: VERB: consumer OBJ: confidence\n", + "BA to suspend two Saudi services\n", + "SUBJ: BA Saudi services VERB: suspend OBJ: Saudi services\n", + "Honda wins China copyright ruling\n", + "SUBJ: Honda VERB: wins OBJ: China copyright ruling\n", + "Ebbers denies WorldCom fraud\n", + "SUBJ: Ebbers VERB: denies OBJ: WorldCom fraud\n", + "Parmalat sues 45 banks over crash\n", + "SUBJ: Parmalat VERB: sues OBJ: banks\n", + "Yukos accused of lying to court\n", + "SUBJ: Yukos VERB: accused OBJ: court\n", + "Making your office work for you\n", + "SUBJ: office work VERB: Making OBJ: office work\n", + "Economy 'strong' in election year\n", + "SUBJ: Economy election year VERB: None OBJ: None\n", + "Survey confirms property slowdown\n", + "SUBJ: Survey VERB: confirms OBJ: property slowdown\n", + "The 'ticking budget' facing the US\n", + "SUBJ: ticking budget VERB: facing OBJ: \n", + "Ukraine strikes Turkmen gas deal\n", + "SUBJ: Ukraine VERB: strikes OBJ: Turkmen gas deal\n", + "Mitsubishi in Peugeot link talks\n", + "SUBJ: Mitsubishi Peugeot link talks VERB: None OBJ: None\n", + "Golden rule 'intact' says ex-aide\n", + "SUBJ: Golden rule aide VERB: says OBJ: aide\n", + "Irish company hit by Iraqi report\n", + "SUBJ: company Iraqi report VERB: hit OBJ: Iraqi report\n", + "Fiat chief takes steering wheel\n", + "SUBJ: Fiat chief VERB: takes OBJ: steering wheel\n", + "Bat spit drug firm goes to market\n", + "SUBJ: Bat spit drug firm VERB: goes OBJ: market\n", + "Sales 
'fail to boost High Street'\n", + "SUBJ: Sales fail High Street VERB: boost OBJ: High Street\n", + "Indy buys into India paper\n", + "SUBJ: buys India paper VERB: None OBJ: None\n", + "Industrial output falls in Japan\n", + "SUBJ: output VERB: falls OBJ: Japan\n", + "VW considers opening Indian plant\n", + "SUBJ: VW VERB: considers OBJ: plant\n", + "US retail sales surge in December\n", + "SUBJ: sales surge December VERB: None OBJ: None\n", + "LSE 'sets date for takeover deal'\n", + "SUBJ: LSE sets date takeover deal VERB: None OBJ: None\n", + "Glazer makes new Man Utd approach\n", + "SUBJ: Glazer VERB: makes OBJ: Man Utd approach\n", + "French suitor holds LSE meeting\n", + "SUBJ: suitor VERB: holds OBJ: LSE meeting\n", + "Booming markets shed few tears\n", + "SUBJ: markets tears VERB: Booming OBJ: markets tears\n", + "McDonald's boss Bell dies aged 44\n", + "SUBJ: McDonald boss Bell VERB: dies OBJ: \n", + "Wall Street cheers Bush victory\n", + "SUBJ: Wall Street cheers Bush victory VERB: None OBJ: None\n", + "US Ahold suppliers face charges\n", + "SUBJ: Ahold suppliers charges VERB: face OBJ: charges\n", + "Italy to get economic action plan\n", + "SUBJ: Italy VERB: get OBJ: action plan\n", + "India calls for fair trade rules\n", + "SUBJ: India VERB: calls OBJ: trade rules\n", + "US trade gap hits record in 2004\n", + "SUBJ: VERB: trade OBJ: gap hits record\n", + "Britannia members' £42m windfall\n", + "SUBJ: Britannia members £ windfall VERB: None OBJ: None\n", + "Manufacturing recovery 'slowing'\n", + "SUBJ: recovery VERB: Manufacturing OBJ: recovery\n", + "Asia shares defy post-quake gloom\n", + "SUBJ: Asia shares VERB: defy OBJ: quake gloom\n", + "Weak dollar hits Reuters\n", + "SUBJ: dollar hits Reuters VERB: None OBJ: None\n", + "Cannabis hopes for drug firm\n", + "SUBJ: Cannabis VERB: hopes OBJ: drug firm\n", + "India's rupee hits five-year high\n", + "SUBJ: India rupee VERB: hits OBJ: year\n", + "Yangtze Electric's profits double\n", + "SUBJ: Yangtze 
Electric profits VERB: None OBJ: None\n", + "Dollar drops on reserves concerns\n", + "SUBJ: Dollar drops reserves concerns VERB: None OBJ: None\n", + "Worldcom ex-boss launches defence\n", + "SUBJ: Worldcom boss launches defence VERB: None OBJ: None\n", + "Google shares fall as staff sell\n", + "SUBJ: Google shares VERB: fall OBJ: staff sell\n", + "Absa and Barclays talks continue\n", + "SUBJ: Absa Barclays talks VERB: continue OBJ: None\n", + "US regulator to rule on pain drug\n", + "SUBJ: VERB: regulator OBJ: pain drug\n", + "Hariri killing hits Beirut shares\n", + "SUBJ: Hariri hits Beirut shares VERB: killing OBJ: hits Beirut shares\n", + "Jobs growth still slow in the US\n", + "SUBJ: Jobs growth VERB: None OBJ: \n", + "S Korea spending boost to economy\n", + "SUBJ: S Korea spending boost economy VERB: None OBJ: None\n", + "Marsh executive in guilty plea\n", + "SUBJ: Marsh executive plea VERB: None OBJ: None\n", + "India seeks to boost construction\n", + "SUBJ: India VERB: seeks OBJ: construction\n", + "Tokyo says deflation 'controlled'\n", + "SUBJ: Tokyo VERB: says OBJ: deflation\n", + "Stock market eyes Japan recovery\n", + "SUBJ: Stock market eyes Japan recovery VERB: None OBJ: None\n", + "MCI shares climb on takeover bid\n", + "SUBJ: MCI shares VERB: climb OBJ: takeover bid\n", + "UK homes hit £3.3 trillion total\n", + "SUBJ: UK homes VERB: hit OBJ: £\n", + "US adds more jobs than expected\n", + "SUBJ: VERB: adds OBJ: jobs\n", + "Buyers snap up Jet Airways' shares\n", + "SUBJ: Buyers VERB: snap OBJ: Jet Airways shares\n", + "Electrolux to export Europe jobs\n", + "SUBJ: Electrolux Europe jobs VERB: export OBJ: Europe jobs\n", + "Crude oil prices back above $50\n", + "SUBJ: oil prices VERB: None OBJ: None\n", + "Madagascar completes currency switch\n", + "SUBJ: Madagascar VERB: completes OBJ: currency switch\n", + "Tsunami 'to hit Sri Lanka banks'\n", + "SUBJ: Tsunami Sri Lanka banks VERB: hit OBJ: Sri Lanka banks\n", + "India and Iran in gas export deal\n", 
+ "SUBJ: India Iran gas export deal VERB: None OBJ: None\n", + "Rank 'set to sell off film unit'\n", + "SUBJ: Rank set film unit VERB: sell OBJ: film unit\n", + "Iraqi voters turn to economic issues\n", + "SUBJ: voters VERB: turn OBJ: issues\n", + "Fed warns of more US rate rises\n", + "SUBJ: Fed VERB: warns OBJ: rate rises\n", + "Standard Life concern at LSE bid\n", + "SUBJ: Standard Life concern LSE bid VERB: None OBJ: None\n", + "Ukraine trims privatisation check\n", + "SUBJ: Ukraine VERB: trims OBJ: privatisation check\n", + "US data sparks inflation worries\n", + "SUBJ: VERB: sparks OBJ: inflation worries\n", + "Optimism remains over UK housing\n", + "SUBJ: Optimism VERB: remains OBJ: UK housing\n", + "UK 'risks breaking golden rule'\n", + "SUBJ: UK risks rule VERB: breaking OBJ: rule\n", + "Call centre users 'lose patience'\n", + "SUBJ: Call centre users lose patience VERB: None OBJ: None\n", + "'Strong dollar' call halts slide\n", + "SUBJ: dollar call halts VERB: slide OBJ: None\n", + "Criminal probe on Citigroup deals\n", + "SUBJ: probe Citigroup deals VERB: None OBJ: None\n", + "EU aiming to fuel development aid\n", + "SUBJ: EU VERB: aiming OBJ: development aid\n", + "Why few targets are better than many\n", + "SUBJ: targets VERB: are OBJ: \n", + "Chinese dam firm 'defies Beijing'\n", + "SUBJ: dam firm defies Beijing VERB: None OBJ: None\n", + "Beijingers fume over parking fees\n", + "SUBJ: Beijingers VERB: fume OBJ: parking fees\n", + "EU-US seeking deal on air dispute\n", + "SUBJ: EU VERB: seeking OBJ: deal air dispute\n", + "WMC profits up amid bid criticism\n", + "SUBJ: WMC VERB: profits OBJ: bid criticism\n", + "DaimlerChrysler's 2004 sales rise\n", + "SUBJ: DaimlerChrysler sales rise VERB: None OBJ: None\n", + "Weak data buffets French economy\n", + "SUBJ: Weak data buffets economy VERB: None OBJ: None\n", + "Rover deal 'may cost 2,000 jobs'\n", + "SUBJ: Rover deal jobs VERB: cost OBJ: jobs\n", + "Split-caps pay £194m compensation\n", + "SUBJ: Split 
caps compensation VERB: pay OBJ: compensation\n", + "Cairn Energy in Indian gas find\n", + "SUBJ: Cairn Energy Indian gas find VERB: None OBJ: None\n", + "Kraft cuts snack ads for children\n", + "SUBJ: Kraft VERB: cuts OBJ: snack ads\n", + "EU to probe Alitalia 'state aid'\n", + "SUBJ: EU Alitalia state aid VERB: probe OBJ: Alitalia state aid\n", + "Oil prices reach three-month low\n", + "SUBJ: Oil prices VERB: reach OBJ: month\n", + "Minister hits out at Yukos sale\n", + "SUBJ: Minister VERB: hits OBJ: Yukos sale\n", + "Continental 'may run out of cash'\n", + "SUBJ: Continental VERB: run OBJ: cash\n", + "BMW drives record sales in Asia\n", + "SUBJ: BMW VERB: drives OBJ: record sales Asia\n", + "ID theft surge hits US consumers\n", + "SUBJ: theft surge VERB: hits OBJ: consumers\n", + "Pernod takeover talk lifts Domecq\n", + "SUBJ: Pernod takeover talk VERB: lifts OBJ: Domecq\n", + "Wal-Mart fights back at accusers\n", + "SUBJ: Wal Mart accusers VERB: fights OBJ: accusers\n", + "Saab to build Cadillacs in Sweden\n", + "SUBJ: Saab VERB: build OBJ: Cadillacs\n", + "Police detain Chinese milk bosses\n", + "SUBJ: Police detain milk bosses VERB: None OBJ: None\n", + "Libya takes $1bn in unfrozen funds\n", + "SUBJ: Libya VERB: takes OBJ: \n", + "Singapore growth at 8.1% in 2004\n", + "SUBJ: Singapore growth % VERB: None OBJ: None\n", + "Yukos unit buyer faces loan claim\n", + "SUBJ: Yukos unit buyer VERB: faces OBJ: loan claim\n", + "India opens skies to competition\n", + "SUBJ: India VERB: opens OBJ: skies competition\n", + "LSE doubts boost bidders' shares\n", + "SUBJ: LSE doubts VERB: boost OBJ: bidders shares\n", + "Profits stall at China's Lenovo\n", + "SUBJ: Profits VERB: stall OBJ: China Lenovo\n", + "Profits slide at India's Dr Reddy\n", + "SUBJ: Profits VERB: slide OBJ: India Dr Reddy\n", + "Newest EU members underpin growth\n", + "SUBJ: Newest EU members VERB: underpin OBJ: growth\n", + "S Korean lender faces liquidation\n", + "SUBJ: lender liquidation VERB: 
faces OBJ: liquidation\n", + "GM, Ford cut output as sales fall\n", + "SUBJ: GM Ford VERB: cut OBJ: output\n", + "Giving financial gifts to children\n", + "SUBJ: gifts VERB: Giving OBJ: gifts\n", + "US bank 'loses' customer details\n", + "SUBJ: bank customer details VERB: None OBJ: None\n", + "Fiat mulls Ferrari market listing\n", + "SUBJ: Fiat VERB: mulls OBJ: Ferrari market\n", + "Lloyd's of London head chides FSA\n", + "SUBJ: Lloyd London head chides FSA VERB: None OBJ: None\n", + "Qwest may spark MCI bidding war\n", + "SUBJ: Qwest VERB: spark OBJ: MCI bidding war\n", + "EC calls truce in deficit battle\n", + "SUBJ: EC VERB: calls OBJ: truce deficit battle\n", + "Umbro profits lifted by Euro 2004\n", + "SUBJ: Umbro profits Euro VERB: lifted OBJ: Euro\n", + "US crude prices surge above $53\n", + "SUBJ: VERB: crude OBJ: prices\n", + "China keeps tight rein on credit\n", + "SUBJ: China VERB: keeps OBJ: rein\n", + "Mixed reaction to Man Utd offer\n", + "SUBJ: reaction Man Utd offer VERB: None OBJ: None\n", + "Soaring oil 'hits world economy'\n", + "SUBJ: oil VERB: Soaring OBJ: oil\n", + "India's Deccan seals $1.8bn deal\n", + "SUBJ: India Deccan VERB: seals OBJ: deal\n", + "Japan bank shares up on link talk\n", + "SUBJ: Japan bank shares link talk VERB: None OBJ: None\n", + "Laura Ashley chief stepping down\n", + "SUBJ: Laura Ashley chief VERB: stepping OBJ: None\n", + "Chinese wine tempts Italy's Illva\n", + "SUBJ: wine VERB: tempts OBJ: Italy Illva\n", + "Macy's owner buys rival for $11bn\n", + "SUBJ: Macy owner VERB: buys OBJ: \n", + "Japanese banking battle at an end\n", + "SUBJ: banking battle end VERB: None OBJ: None\n", + "GM issues 2005 profits warning\n", + "SUBJ: GM issues profits VERB: None OBJ: None\n", + "Warning over US pensions deficit\n", + "SUBJ: pensions VERB: Warning OBJ: pensions\n", + "Russia gets investment blessing\n", + "SUBJ: Russia VERB: gets OBJ: investment blessing\n", + "Brazil jobless rate hits new low\n", + "SUBJ: Brazil rate VERB: 
hits OBJ: None\n", + "Small firms 'hit by rising costs'\n", + "SUBJ: firms costs VERB: hit OBJ: costs\n", + "Alfa Romeos 'to get GM engines'\n", + "SUBJ: Alfa Romeos GM engines VERB: get OBJ: GM engines\n", + "BP surges ahead on high oil price\n", + "SUBJ: BP VERB: surges OBJ: oil price\n", + "McDonald's to sponsor MTV show\n", + "SUBJ: McDonald MTV VERB: sponsor OBJ: MTV\n", + "Worldcom boss 'left books alone'\n", + "SUBJ: Worldcom boss VERB: left OBJ: books\n", + "Egypt to sell off state-owned bank\n", + "SUBJ: Egypt state bank VERB: sell OBJ: state\n", + "Insurance bosses plead guilty\n", + "SUBJ: Insurance bosses VERB: plead OBJ: None\n", + "Cactus diet deal for Phytopharm\n", + "SUBJ: Cactus diet deal Phytopharm VERB: None OBJ: None\n", + "Strong quarterly growth for Nike\n", + "SUBJ: growth Nike VERB: None OBJ: None\n", + "Euronext joins bid battle for LSE\n", + "SUBJ: joins VERB: bid OBJ: battle LSE\n", + "US to rule on Yukos refuge call\n", + "SUBJ: VERB: rule OBJ: Yukos refuge call\n", + "Yukos loses US bankruptcy battle\n", + "SUBJ: Yukos VERB: loses OBJ: bankruptcy battle\n", + "Battered dollar hits another low\n", + "SUBJ: dollar VERB: hits OBJ: \n", + "Yukos sues four firms for $20bn\n", + "SUBJ: Yukos VERB: sues OBJ: firms\n", + "Delta cuts fares in survival plan\n", + "SUBJ: Delta cuts fares survival plan VERB: None OBJ: None\n", + "Salary scandal in Cameroon\n", + "SUBJ: Salary scandal Cameroon VERB: None OBJ: None\n", + "Bank set to leave rates on hold\n", + "SUBJ: Bank VERB: set OBJ: rates\n", + "Sluggish economy hits German jobs\n", + "SUBJ: economy VERB: hits OBJ: jobs\n", + "Wipro beats forecasts once again\n", + "SUBJ: Wipro VERB: beats OBJ: forecasts\n", + "SA unveils 'more for all' budget\n", + "SUBJ: SA unveils VERB: None OBJ: None\n", + "Renault boss hails 'great year'\n", + "SUBJ: Renault boss year VERB: None OBJ: None\n", + "Mild winter drives US oil down 6%\n", + "SUBJ: Mild winter drives % VERB: oil OBJ: %\n", + "Egypt and Israel seal 
trade deal\n", + "SUBJ: Egypt Israel seal trade deal VERB: None OBJ: None\n", + "Iraq and Afghanistan in WTO talks\n", + "SUBJ: Iraq Afghanistan WTO talks VERB: None OBJ: None\n", + "China had role in Yukos split-up\n", + "SUBJ: China VERB: had OBJ: role Yukos split\n", + "Venezuela identifies 'idle' farms\n", + "SUBJ: Venezuela farms VERB: None OBJ: None\n", + "Bush budget seeks deep cutbacks\n", + "SUBJ: Bush budget VERB: seeks OBJ: cutbacks\n", + "French wine gets 70m euro top-up\n", + "SUBJ: wine VERB: gets OBJ: euro top\n", + "Ryanair in $4bn Boeing plane deal\n", + "SUBJ: Ryanair Boeing plane deal VERB: None OBJ: None\n", + "Japan turns to beer alternatives\n", + "SUBJ: Japan VERB: turns OBJ: beer alternatives\n", + "MCI shareholder sues to stop bid\n", + "SUBJ: MCI shareholder VERB: sues OBJ: bid\n", + "Novartis hits acquisition trail\n", + "SUBJ: Novartis VERB: hits OBJ: acquisition trail\n", + "SEC to rethink post-Enron rules\n", + "SUBJ: SEC VERB: rethink OBJ: post\n", + "BBC poll indicates economic gloom\n", + "SUBJ: BBC poll VERB: indicates OBJ: gloom\n", + "WMC says Xstrata bid is too low\n", + "SUBJ: WMC VERB: says OBJ: Xstrata bid\n", + "Japanese mogul arrested for fraud\n", + "SUBJ: mogul VERB: arrested OBJ: fraud\n", + "Fannie Mae 'should restate books'\n", + "SUBJ: Fannie Mae VERB: restate OBJ: books\n", + "US trade gap ballooned in October\n", + "SUBJ: VERB: trade OBJ: gap October\n", + "Nasdaq planning $100m-share sale\n", + "SUBJ: Nasdaq share sale VERB: planning OBJ: \n", + "Oil prices fall back from highs\n", + "SUBJ: Oil prices VERB: fall OBJ: highs\n", + "French consumer spending rising\n", + "SUBJ: consumer spending VERB: rising OBJ: None\n", + "Saudi ministry to employ women\n", + "SUBJ: Saudi ministry women VERB: employ OBJ: women\n", + "Telegraph newspapers axe 90 jobs\n", + "SUBJ: Telegraph newspapers VERB: axe OBJ: jobs\n", + "UK interest rates held at 4.75%\n", + "SUBJ: interest rates VERB: held OBJ: %\n", + "US budget deficit to 
reach $368bn\n", + "SUBJ: VERB: budget OBJ: deficit\n", + "UK house prices dip in November\n", + "SUBJ: UK house prices VERB: dip OBJ: November\n", + "Verizon 'seals takeover of MCI'\n", + "SUBJ: Verizon seals takeover MCI VERB: None OBJ: None\n", + "Cars pull down US retail figures\n", + "SUBJ: Cars VERB: pull OBJ: figures\n", + "Christmas sales worst since 1981\n", + "SUBJ: Christmas sales VERB: None OBJ: None\n", + "Orange colour clash set for court\n", + "SUBJ: Orange colour clash court VERB: set OBJ: court\n", + "Steady job growth continues in US\n", + "SUBJ: job growth VERB: continues OBJ: \n", + "Fresh hope after Argentine crisis\n", + "SUBJ: hope crisis VERB: None OBJ: None\n", + "France Telecom gets Orange boost\n", + "SUBJ: France Telecom VERB: gets OBJ: Orange boost\n", + "Tate & Lyle boss bags top award\n", + "SUBJ: Tate Lyle boss bags award VERB: None OBJ: None\n", + "GSK aims to stop Aids profiteers\n", + "SUBJ: GSK VERB: aims OBJ: Aids profiteers\n", + "GM in crunch talks on Fiat future\n", + "SUBJ: GM crunch talks Fiat future VERB: None OBJ: None\n", + "News Corp eyes video games market\n", + "SUBJ: News Corp eyes VERB: video OBJ: games market\n", + "Market unfazed by Aurora setback\n", + "SUBJ: Market Aurora setback VERB: None OBJ: None\n", + "US gives foreign firms extra time\n", + "SUBJ: VERB: gives OBJ: firms\n", + "US economy shows solid GDP growth\n", + "SUBJ: economy GDP growth VERB: shows OBJ: GDP growth\n", + "India power shares jump on debut\n", + "SUBJ: India power shares VERB: jump OBJ: debut\n", + "Liberian economy starts to grow\n", + "SUBJ: economy VERB: starts OBJ: None\n", + "Tobacco giants hail court ruling\n", + "SUBJ: Tobacco giants VERB: hail OBJ: court ruling\n", + "Bad weather hits Nestle sales\n", + "SUBJ: weather VERB: hits OBJ: Nestle sales\n", + "EU ministers to mull jet fuel tax\n", + "SUBJ: EU ministers jet fuel tax VERB: mull OBJ: jet fuel tax\n", + "Indian oil firm eyes Yukos assets\n", + "SUBJ: oil firm VERB: eyes 
OBJ: Yukos assets\n", + "Consumers drive French economy\n", + "SUBJ: Consumers VERB: drive OBJ: economy\n", + "Troubled Marsh under SEC scrutiny\n", + "SUBJ: Marsh SEC scrutiny VERB: None OBJ: None\n", + "Bank holds interest rate at 4.75%\n", + "SUBJ: Bank VERB: holds OBJ: interest rate\n", + "Qantas considers offshore option\n", + "SUBJ: Qantas VERB: considers OBJ: offshore option\n", + "Steel firm 'to cut' 45,000 jobs\n", + "SUBJ: Steel firm jobs VERB: None OBJ: None\n", + "Borussia Dortmund near bust\n", + "SUBJ: Borussia Dortmund bust VERB: None OBJ: None\n", + "Economy 'strong' in election year\n", + "SUBJ: Economy election year VERB: None OBJ: None\n", + "China bans new tobacco factories\n", + "SUBJ: China VERB: bans OBJ: tobacco factories\n", + "India and Russia in energy talks\n", + "SUBJ: India Russia energy talks VERB: None OBJ: None\n", + "Arsenal 'may seek full share listing'\n", + "SUBJ: Arsenal VERB: seek OBJ: share listing\n", + "German jobless rate at new record\n", + "SUBJ: rate record VERB: None OBJ: None\n", + "US company admits Benin bribery\n", + "SUBJ: company Benin bribery VERB: admits OBJ: Benin bribery\n", + "Ailing EuroDisney vows turnaround\n", + "SUBJ: EuroDisney vows turnaround VERB: Ailing OBJ: EuroDisney vows turnaround\n", + "Record year for Chilean copper\n", + "SUBJ: Record year Chilean copper VERB: None OBJ: None\n", + "UK economy ends year with spurt\n", + "SUBJ: UK economy VERB: ends OBJ: year spurt\n", + "India-Pakistan peace boosts trade\n", + "SUBJ: India Pakistan peace boosts VERB: None OBJ: None\n", + "High fuel costs hit US airlines\n", + "SUBJ: fuel costs VERB: hit OBJ: \n", + "$1m payoff for former Shell boss\n", + "SUBJ: payoff Shell boss VERB: None OBJ: None\n", + "Palestinian economy in decline\n", + "SUBJ: economy decline VERB: None OBJ: None\n", + "Air China in $1bn London listing\n", + "SUBJ: Air China London listing VERB: None OBJ: None\n", + "Air Jamaica back in state control\n", + "SUBJ: Air Jamaica state 
control VERB: None OBJ: None\n", + "German growth goes into reverse\n", + "SUBJ: growth VERB: goes OBJ: reverse\n", + "Yukos owner sues Russia for $28bn\n", + "SUBJ: Yukos VERB: owner OBJ: sues Russia\n", + "Weak dollar trims Cadbury profits\n", + "SUBJ: dollar trims Cadbury profits VERB: None OBJ: None\n", + "'Post-Christmas lull' in lending\n", + "SUBJ: Post Christmas lull lending VERB: None OBJ: Post Christmas lull lending\n", + "Barclays shares up on merger talk\n", + "SUBJ: Barclays shares merger talk VERB: None OBJ: None\n", + "Soros group warns of Kazakh close\n", + "SUBJ: Soros group VERB: warns OBJ: Kazakh\n", + "Dollar hovers around record lows\n", + "SUBJ: Dollar hovers record lows VERB: None OBJ: None\n", + "WorldCom trial starts in New York\n", + "SUBJ: WorldCom trial VERB: starts OBJ: New York\n", + "Singapore growth at 8.1% in 2004\n", + "SUBJ: Singapore growth % VERB: None OBJ: None\n", + "US interest rate rise expected\n", + "SUBJ: VERB: rise OBJ: None\n", + "Ex-Boeing director gets jail term\n", + "SUBJ: Ex Boeing director jail term VERB: gets OBJ: jail term\n", + "Glaxo aims high after profit fall\n", + "SUBJ: Glaxo VERB: aims OBJ: profit fall\n", + "Vodafone appoints new Japan boss\n", + "SUBJ: Vodafone VERB: appoints OBJ: Japan boss\n", + "WorldCom bosses' $54m payout\n", + "SUBJ: WorldCom VERB: None OBJ: None\n", + "Ebbers 'aware' of WorldCom fraud\n", + "SUBJ: Ebbers WorldCom fraud VERB: None OBJ: None\n", + "Wall Street cool to eBay's profit\n", + "SUBJ: Wall Street VERB: cool OBJ: eBay profit\n", + "Could Yukos be a blessing in disguise?\n", + "SUBJ: Yukos VERB: be OBJ: blessing disguise\n", + "Budget Aston takes on Porsche\n", + "SUBJ: Budget Aston VERB: takes OBJ: Porsche\n", + "Cash gives way to flexible friend\n", + "SUBJ: Cash VERB: gives OBJ: way\n", + "Asia quake increases poverty risk\n", + "SUBJ: Asia quake increases poverty risk VERB: None OBJ: None\n", + "Parmalat boasts doubled profits\n", + "SUBJ: Parmalat VERB: boasts OBJ: 
profits\n", + "Burren awarded Egyptian contracts\n", + "SUBJ: Burren VERB: awarded OBJ: contracts\n", + "Germany calls for EU reform\n", + "SUBJ: Germany VERB: calls OBJ: EU reform\n", + "Asia shares defy post-quake gloom\n", + "SUBJ: Asia shares VERB: defy OBJ: quake gloom\n", + "EMI shares hit by profit warning\n", + "SUBJ: EMI shares profit warning VERB: hit OBJ: profit warning\n", + "Takeover offer for Sunderland FC\n", + "SUBJ: Takeover offer Sunderland FC VERB: None OBJ: None\n", + "Banker loses sexism claim\n", + "SUBJ: Banker VERB: loses OBJ: sexism claim\n", + "News Corp makes $5.4bn Fox offer\n", + "SUBJ: News Corp VERB: makes OBJ: Fox offer\n", + "India's Maruti sees profits jump\n", + "SUBJ: India Maruti VERB: sees OBJ: profits\n", + "Fosters buys stake in winemaker\n", + "SUBJ: Fosters VERB: buys OBJ: stake winemaker\n", + "Nasdaq planning $100m share sale\n", + "SUBJ: Nasdaq share sale VERB: planning OBJ: share sale\n", + "World leaders gather to face uncertainty\n", + "SUBJ: World leaders VERB: gather OBJ: uncertainty\n", + "Ore costs hit global steel firms\n", + "SUBJ: Ore VERB: costs OBJ: steel firms\n", + "Golden rule boost for Chancellor\n", + "SUBJ: Golden rule boost Chancellor VERB: None OBJ: None\n", + "Swiss cement firm in buying spree\n", + "SUBJ: cement firm spree VERB: buying OBJ: spree\n", + "Qantas sees profits fly to record\n", + "SUBJ: Qantas VERB: sees OBJ: profits\n", + "House prices rebound says Halifax\n", + "SUBJ: House prices VERB: rebound OBJ: Halifax\n", + "Circuit City gets takeover offer\n", + "SUBJ: Circuit City VERB: gets OBJ: takeover offer\n", + "Trade gap narrows as exports rise\n", + "SUBJ: Trade gap VERB: narrows OBJ: exports\n", + "Turkey turns on the economic charm\n", + "SUBJ: Turkey VERB: turns OBJ: charm\n", + "Qatar and Shell in $6bn gas deal\n", + "SUBJ: Qatar Shell gas deal VERB: None OBJ: None\n", + "Worldcom director ends evidence\n", + "SUBJ: Worldcom director VERB: ends OBJ: evidence\n", + "Disney settles 
disclosure charges\n", + "SUBJ: Disney VERB: settles OBJ: disclosure charges\n", + "S Korean credit card firm rescued\n", + "SUBJ: credit card firm VERB: rescued OBJ: None\n", + "Consumer spending lifts US growth\n", + "SUBJ: Consumer spending VERB: lifts OBJ: \n", + "Argentina closes $102.6bn debt swap\n", + "SUBJ: Argentina VERB: closes OBJ: debt swap\n", + "Building giant in asbestos payout\n", + "SUBJ: giant VERB: Building OBJ: giant\n", + "US seeks new $280bn smoker ruling\n", + "SUBJ: VERB: seeks OBJ: smoker ruling\n", + "Gaming firm to sell UK dog tracks\n", + "SUBJ: firm UK dog tracks VERB: Gaming OBJ: firm UK dog tracks\n", + "FAO warns on impact of subsidies\n", + "SUBJ: FAO VERB: warns OBJ: impact subsidies\n", + "Beer giant swallows Russian firm\n", + "SUBJ: Beer giant VERB: swallows OBJ: firm\n", + "Further rise in UK jobless total\n", + "SUBJ: rise UK total VERB: None OBJ: None\n", + "Japan narrowly escapes recession\n", + "SUBJ: Japan VERB: escapes OBJ: recession\n", + "Low-cost airlines hit Eurotunnel\n", + "SUBJ: cost airlines VERB: hit OBJ: Eurotunnel\n", + "UK economy facing 'major risks'\n", + "SUBJ: UK economy risks VERB: None OBJ: None\n", + "Barclays profits hit record level\n", + "SUBJ: Barclays profits VERB: hit OBJ: record level\n", + "MG Rover China tie-up 'delayed'\n", + "SUBJ: MG Rover China tie VERB: None OBJ: None\n", + "Asian quake hits European shares\n", + "SUBJ: quake VERB: hits OBJ: shares\n", + "SBC plans post-takeover job cuts\n", + "SUBJ: SBC VERB: plans OBJ: post takeover job cuts\n", + "Safety alert as GM recalls cars\n", + "SUBJ: Safety GM cars VERB: recalls OBJ: cars\n", + "Two Nigerian banks set to merge\n", + "SUBJ: banks VERB: set OBJ: None\n", + "India unveils anti-poverty budget\n", + "SUBJ: India VERB: unveils OBJ: poverty budget\n", + "Jarvis sells Tube stake to Spain\n", + "SUBJ: Jarvis VERB: sells OBJ: Tube stake\n", + "UK bank seals South Korean deal\n", + "SUBJ: UK bank seals deal VERB: None OBJ: None\n", + "US 
bank in $515m SEC settlement\n", + "SUBJ: VERB: None OBJ: \n", + "Boeing secures giant Japan order\n", + "SUBJ: Boeing secures Japan order VERB: None OBJ: None\n", + "US trade deficit widens sharply\n", + "SUBJ: VERB: trade OBJ: deficit\n", + "Lufthansa flies back to profit\n", + "SUBJ: Lufthansa VERB: flies OBJ: profit\n", + "HealthSouth ex-boss goes on trial\n", + "SUBJ: HealthSouth VERB: goes OBJ: boss trial\n", + "South African car demand surges\n", + "SUBJ: car demand VERB: surges OBJ: None\n", + "Share boost for feud-hit Reliance\n", + "SUBJ: Share boost feud hit Reliance VERB: None OBJ: None\n", + "GM pays $2bn to evade Fiat buyout\n", + "SUBJ: GM VERB: pays OBJ: \n", + "Nissan names successor to Ghosn\n", + "SUBJ: Nissan names successor Ghosn VERB: None OBJ: None\n", + "S&N extends Indian beer venture\n", + "SUBJ: S N VERB: extends OBJ: beer venture\n", + "Israeli economy picking up pace\n", + "SUBJ: economy VERB: picking OBJ: pace\n", + "Ukraine steel sell-off 'illegal'\n", + "SUBJ: Ukraine steel VERB: sell OBJ: \n", + "Dutch bank to lay off 2,850 staff\n", + "SUBJ: bank staff VERB: lay OBJ: staff\n", + "Ad firm WPP's profits surge 15%\n", + "SUBJ: Ad firm WPP profits VERB: surge OBJ: %\n", + "Algeria hit by further gas riots\n", + "SUBJ: Algeria gas riots VERB: hit OBJ: gas riots\n", + "US in EU tariff chaos trade row\n", + "SUBJ: EU tariff chaos trade row VERB: None OBJ: None\n", + "Crossrail link 'to get go-ahead'\n", + "SUBJ: Crossrail link VERB: get OBJ: None\n", + "Israel looks to US for bank chief\n", + "SUBJ: Israel VERB: looks OBJ: bank chief\n", + "Rescue hope for Borussia Dortmund\n", + "SUBJ: Rescue hope Borussia Dortmund VERB: None OBJ: None\n", + "Shares hit by MS drug suspension\n", + "SUBJ: Shares MS drug suspension VERB: hit OBJ: MS drug suspension\n", + "S Korea spending boost to economy\n", + "SUBJ: S Korea spending boost economy VERB: None OBJ: None\n", + "Australia rates at four year high\n", + "SUBJ: Australia VERB: rates OBJ: 
year\n", + "China continues breakneck growth\n", + "SUBJ: China VERB: continues OBJ: breakneck growth\n", + "Iran budget seeks state sell-offs\n", + "SUBJ: Iran budget VERB: seeks OBJ: offs\n", + "Deutsche Boerse boosts dividend\n", + "SUBJ: Deutsche Boerse VERB: boosts OBJ: dividend\n", + "IMF agrees fresh Turkey funding\n", + "SUBJ: IMF VERB: agrees OBJ: Turkey funding\n", + "Rich grab half Colombia poor fund\n", + "SUBJ: grab half Colombia fund VERB: None OBJ: None\n", + "Tsunami to cost Sri Lanka $1.3bn\n", + "SUBJ: Tsunami VERB: cost OBJ: Sri Lanka\n", + "Diageo to buy US wine firm\n", + "SUBJ: Diageo wine firm VERB: buy OBJ: wine firm\n", + "European losses hit GM's profits\n", + "SUBJ: losses VERB: hit OBJ: GM profits\n", + "Water firm Suez in Argentina row\n", + "SUBJ: Water firm Suez Argentina row VERB: None OBJ: None\n", + "Gold falls on IMF sale concerns\n", + "SUBJ: Gold VERB: falls OBJ: IMF sale concerns\n", + "Venezuela and China sign oil deal\n", + "SUBJ: Venezuela China sign oil deal VERB: None OBJ: None\n", + "Dollar gains on Greenspan speech\n", + "SUBJ: Dollar gains Greenspan speech VERB: None OBJ: None\n", + "Lacroix label bought by US firm\n", + "SUBJ: Lacroix label firm VERB: bought OBJ: firm\n", + "Reliance unit loses Anil Ambani\n", + "SUBJ: Reliance unit VERB: loses OBJ: Anil Ambani\n", + "Durex maker SSL awaits firm bid\n", + "SUBJ: Durex maker SSL VERB: awaits OBJ: firm bid\n", + "Call to save manufacturing jobs\n", + "SUBJ: manufacturing jobs VERB: Call OBJ: manufacturing jobs\n", + "German economy rebounds\n", + "SUBJ: economy VERB: rebounds OBJ: None\n", + "Saudi investor picks up the Savoy\n", + "SUBJ: investor VERB: picks OBJ: Savoy\n", + "Nigeria to boost cocoa production\n", + "SUBJ: Nigeria cocoa production VERB: boost OBJ: cocoa production\n", + "Cairn shares slump on oil setback\n", + "SUBJ: Cairn shares VERB: slump OBJ: oil setback\n", + "Wal-Mart to pay $14m in gun suit\n", + "SUBJ: Wal Mart gun suit VERB: pay OBJ: gun 
suit\n", + "Deutsche Telekom sees mobile gain\n", + "SUBJ: Deutsche Telekom VERB: sees OBJ: gain\n", + "Gazprom 'in $36m back-tax claim'\n", + "SUBJ: Gazprom VERB: None OBJ: None\n", + "Brussels raps mobile call charges\n", + "SUBJ: Brussels VERB: raps OBJ: call charges\n", + "Man Utd to open books to Glazer\n", + "SUBJ: Man Utd books Glazer VERB: open OBJ: books\n", + "Retail sales show festive fervour\n", + "SUBJ: sales VERB: show OBJ: fervour\n", + "BMW cash to fuel Mini production\n", + "SUBJ: BMW cash Mini production VERB: fuel OBJ: Mini production\n", + "US insurer Marsh cuts 2,500 jobs\n", + "SUBJ: VERB: insurer OBJ: Marsh cuts jobs\n", + "Ford gains from finance not cars\n", + "SUBJ: Ford VERB: gains OBJ: finance\n", + "Feta cheese battle reaches court\n", + "SUBJ: Feta cheese battle VERB: reaches OBJ: court\n", + "Monsanto fined $1.5m for bribery\n", + "SUBJ: Monsanto VERB: fined OBJ: \n", + "China Aviation seeks rescue deal\n", + "SUBJ: China Aviation VERB: seeks OBJ: rescue deal\n", + "Quake's economic costs emerging\n", + "SUBJ: Quake costs VERB: emerging OBJ: None\n", + "Saudi NCCI's shares soar\n", + "SUBJ: Saudi NCCI shares VERB: soar OBJ: None\n", + "Yukos heading back to US courts\n", + "SUBJ: Yukos courts VERB: heading OBJ: courts\n", + "News Corp eyes video games market\n", + "SUBJ: News Corp eyes VERB: video OBJ: games market\n", + "Firms pump billions into pensions\n", + "SUBJ: Firms VERB: pump OBJ: billions\n", + "US firm 'bids for Lacroix label'\n", + "SUBJ: VERB: None OBJ: None\n", + "US manufacturing expands\n", + "SUBJ: VERB: manufacturing OBJ: expands\n", + "Weak end-of-year sales hit Next\n", + "SUBJ: end year sales VERB: hit OBJ: None\n", + "Business confidence dips in Japan\n", + "SUBJ: Business confidence dips Japan VERB: None OBJ: None\n", + "BMW to recall faulty diesel cars\n", + "SUBJ: BMW diesel cars VERB: recall OBJ: diesel cars\n", + "Quiksilver moves for Rossignol\n", + "SUBJ: Quiksilver VERB: moves OBJ: Rossignol\n", + "House 
prices show slight increase\n", + "SUBJ: House prices VERB: show OBJ: increase\n", + "Winn-Dixie files for bankruptcy\n", + "SUBJ: Winn Dixie files bankruptcy VERB: None OBJ: None\n", + "Deutsche Boerse set to 'woo' LSE\n", + "SUBJ: Deutsche Boerse VERB: set OBJ: woo LSE\n", + "Ericsson sees earnings improve\n", + "SUBJ: Ericsson VERB: sees OBJ: earnings\n", + "Aids and climate top Davos agenda\n", + "SUBJ: Aids climate Davos agenda VERB: None OBJ: None\n", + "Indonesia 'declines debt freeze'\n", + "SUBJ: Indonesia declines debt freeze VERB: None OBJ: None\n", + "Oil companies get Russian setback\n", + "SUBJ: Oil companies VERB: get OBJ: setback\n", + "Economy 'stronger than forecast'\n", + "SUBJ: Economy forecast VERB: None OBJ: None\n", + "Bush to outline 'toughest' budget\n", + "SUBJ: Bush VERB: outline OBJ: budget\n", + "Disaster claims 'less than $10bn'\n", + "SUBJ: Disaster claims VERB: None OBJ: None\n", + "Virgin Blue shares plummet 20%\n", + "SUBJ: Virgin Blue shares VERB: plummet OBJ: %\n", + "Cuba winds back economic clock\n", + "SUBJ: Cuba VERB: winds OBJ: clock\n", + "FBI agent colludes with analyst\n", + "SUBJ: FBI agent VERB: colludes OBJ: analyst\n", + "Court rejects $280bn tobacco case\n", + "SUBJ: Court VERB: rejects OBJ: tobacco case\n", + "Enron bosses in $168m payout\n", + "SUBJ: Enron VERB: bosses OBJ: \n", + "'Golden economic period' to end\n", + "SUBJ: period VERB: end OBJ: period\n", + "Call to overhaul UK state pension\n", + "SUBJ: UK state pension VERB: Call OBJ: UK state pension\n", + "Slowdown hits US factory growth\n", + "SUBJ: Slowdown VERB: hits OBJ: factory growth\n", + "Europe blames US over weak dollar\n", + "SUBJ: Europe VERB: blames OBJ: \n" + ] + } + ], + "source": [ + "for line in business:\n", + " tok.set_content(line.strip())\n", + " seq = extract_sequences(tok)[0]\n", + " \n", + " tagger.tag(seq)\n", + " tree = parser.parse(seq)\n", + " \n", + " extractor = SVOExtractor()\n", + " tree.visit(extractor)\n", + " 
print(line.strip())\n", + " print(\"SUBJ: {} VERB: {} OBJ: {}\".format(extractor.subject, extractor.verb, extractor.object))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/metapy/tutorials/4-classification.ipynb b/metapy/tutorials/4-classification.ipynb new file mode 100644 index 0000000000..2885ce1447 --- /dev/null +++ b/metapy/tutorials/4-classification.ipynb @@ -0,0 +1,1603 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's import the Python bindings, as usual." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import metapy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's download a list of stopwords and a small dataset to begin playing around with classifiers in MeTA." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2017-04-18 15:20:46-- https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving raw.githubusercontent.com... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", + "Connecting to raw.githubusercontent.com|151.101.0.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 2747 (2.7K) [text/plain]\n", + "Saving to: ‘lemur-stopwords.txt’\n", + "\n", + "lemur-stopwords.txt 100%[===================>] 2.68K --.-KB/s in 0s \n", + "\n", + "Last-modified header missing -- time-stamps turned off.\n", + "2017-04-18 15:20:46 (90.4 MB/s) - ‘lemur-stopwords.txt’ saved [2747/2747]\n", + "\n" + ] + } + ], + "source": [ + "!wget -N https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2017-04-18 15:20:47-- https://meta-toolkit.org/data/2016-01-26/ceeaus.tar.gz\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving meta-toolkit.org... 50.116.41.177, 2600:3c02::f03c:91ff:feae:b777\n", + "Connecting to meta-toolkit.org|50.116.41.177|:443... connected.\n", + "HTTP request sent, awaiting response... 304 Not Modified\n", + "File ‘ceeaus.tar.gz’ not modified on server. 
Omitting download.\n", + "\n", + "ceeaus/\n", + "ceeaus/ceeaus.dat.names.gz\n", + "ceeaus/ceeaus.dat.gz\n", + "ceeaus/ceeaus.dat.names\n", + "ceeaus/line.toml\n", + "ceeaus/ceeaus.dat.labels\n", + "ceeaus/ceeaus-nationality-full-corpus.txt\n", + "ceeaus/gz.toml\n", + "ceeaus/qrels.txt\n", + "ceeaus/file.toml\n", + "ceeaus/ceeaus-full-corpus.txt\n", + "ceeaus/japanese/\n", + "ceeaus/japanese/ceejus_M_smk_174.txt\n", + "ceeaus/japanese/ceejus_S_smk_090.txt\n", + "ceeaus/japanese/ceejus_L_ptj_14.txt\n", + "ceeaus/japanese/ceejus_S_ptj_024.txt\n", + "ceeaus/japanese/ceejus_M_smk_068.txt\n", + "ceeaus/japanese/ceejus_S_smk_115.txt\n", + "ceeaus/japanese/ceejus_M_ptj_128.txt\n", + "ceeaus/japanese/ceejus_M_smk_071.txt\n", + "ceeaus/japanese/ceejus_S_ptj_102.txt\n", + "ceeaus/japanese/ceejus_S_smk_075.txt\n", + "ceeaus/japanese/ceejus_S_smk_119.txt\n", + "ceeaus/japanese/ceejus_L_smk_30.txt\n", + "ceeaus/japanese/ceejus_M_smk_069.txt\n", + "ceeaus/japanese/ceejus_M_smk_108.txt\n", + "ceeaus/japanese/ceejus_U_smk_07.txt\n", + "ceeaus/japanese/ceejus_M_smk_048.txt\n", + "ceeaus/japanese/ceejus_M_ptj_129.txt\n", + "ceeaus/japanese/ceejus_M_ptj_147.txt\n", + "ceeaus/japanese/ceejus_M_smk_144.txt\n", + "ceeaus/japanese/ceejus_S_ptj_050.txt\n", + "ceeaus/japanese/ceejus_M_smk_159.txt\n", + "ceeaus/japanese/ceejus_L_ptj_06.txt\n", + "ceeaus/japanese/ceejus_L_smk_32.txt\n", + "ceeaus/japanese/ceejus_M_smk_095.txt\n", + "ceeaus/japanese/ceejus_U_smk_02.txt\n", + "ceeaus/japanese/ceejus_M_smk_127.txt\n", + "ceeaus/japanese/ceejus_M_ptj_013.txt\n", + "ceeaus/japanese/ceejus_M_smk_122.txt\n", + "ceeaus/japanese/ceejus_M_ptj_068.txt\n", + "ceeaus/japanese/ceejus_M_smk_083.txt\n", + "ceeaus/japanese/ceejus_M_smk_053.txt\n", + "ceeaus/japanese/ceejus_S_smk_112.txt\n", + "ceeaus/japanese/ceejus_M_smk_054.txt\n", + "ceeaus/japanese/ceejus_M_smk_163.txt\n", + "ceeaus/japanese/ceejus_L_ptj_31.txt\n", + "ceeaus/japanese/ceejus_S_ptj_090.txt\n", + "ceeaus/japanese/ceejus_M_smk_009.txt\n", + 
"ceeaus/japanese/ceejus_M_ptj_166.txt\n", + "ceeaus/japanese/ceejus_M_smk_086.txt\n", + "ceeaus/japanese/ceejus_M_smk_124.txt\n", + "ceeaus/japanese/ceejus_M_ptj_041.txt\n", + "ceeaus/japanese/ceejus_S_smk_078.txt\n", + "ceeaus/japanese/ceejus_M_ptj_197.txt\n", + "ceeaus/japanese/ceejus_M_smk_013.txt\n", + "ceeaus/japanese/ceejus_S_smk_060.txt\n", + "ceeaus/japanese/ceejus_M_smk_146.txt\n", + "ceeaus/japanese/ceejus_M_ptj_057.txt\n", + "ceeaus/japanese/ceejus_M_smk_005.txt\n", + "ceeaus/japanese/ceejus_M_ptj_007.txt\n", + "ceeaus/japanese/ceejus_U_ptj_07.txt\n", + "ceeaus/japanese/ceejus_M_ptj_010.txt\n", + "ceeaus/japanese/ceejus_S_ptj_064.txt\n", + "ceeaus/japanese/ceejus_M_ptj_037.txt\n", + "ceeaus/japanese/ceejus_M_ptj_192.txt\n", + "ceeaus/japanese/ceejus_M_ptj_044.txt\n", + "ceeaus/japanese/ceejus_S_ptj_122.txt\n", + "ceeaus/japanese/ceejus_M_smk_128.txt\n", + "ceeaus/japanese/ceejus_M_ptj_090.txt\n", + "ceeaus/japanese/ceejus_M_ptj_189.txt\n", + "ceeaus/japanese/ceejus_M_smk_172.txt\n", + "ceeaus/japanese/ceejus_M_ptj_180.txt\n", + "ceeaus/japanese/ceejus_U_smk_04.txt\n", + "ceeaus/japanese/ceejus_L_smk_33.txt\n", + "ceeaus/japanese/ceejus_M_smk_171.txt\n", + "ceeaus/japanese/ceejus_S_ptj_072.txt\n", + "ceeaus/japanese/ceejus_M_smk_119.txt\n", + "ceeaus/japanese/ceejus_L_ptj_04.txt\n", + "ceeaus/japanese/ceejus_M_ptj_004.txt\n", + "ceeaus/japanese/ceejus_M_smk_061.txt\n", + "ceeaus/japanese/ceejus_S_ptj_103.txt\n", + "ceeaus/japanese/ceejus_L_smk_14.txt\n", + "ceeaus/japanese/ceejus_M_smk_106.txt\n", + "ceeaus/japanese/ceejus_M_smk_135.txt\n", + "ceeaus/japanese/ceejus_M_ptj_115.txt\n", + "ceeaus/japanese/ceejus_M_smk_187.txt\n", + "ceeaus/japanese/ceejus_M_smk_073.txt\n", + "ceeaus/japanese/ceejus_M_ptj_191.txt\n", + "ceeaus/japanese/ceejus_S_ptj_029.txt\n", + "ceeaus/japanese/ceejus_S_ptj_002.txt\n", + "ceeaus/japanese/ceejus_M_smk_041.txt\n", + "ceeaus/japanese/ceejus_M_ptj_015.txt\n", + "ceeaus/japanese/ceejus_M_ptj_084.txt\n", + 
"ceeaus/japanese/ceejus_M_ptj_064.txt\n", + "ceeaus/japanese/ceejus_M_ptj_042.txt\n", + "ceeaus/japanese/ceejus_M_ptj_102.txt\n", + "ceeaus/japanese/ceejus_M_smk_084.txt\n", + "ceeaus/japanese/ceejus_M_ptj_110.txt\n", + "ceeaus/japanese/ceejus_M_ptj_130.txt\n", + "ceeaus/japanese/ceejus_M_smk_072.txt\n", + "ceeaus/japanese/ceejus_M_ptj_048.txt\n", + "ceeaus/japanese/ceejus_M_ptj_119.txt\n", + "ceeaus/japanese/ceejus_M_smk_004.txt\n", + "ceeaus/japanese/ceejus_M_ptj_032.txt\n", + "ceeaus/japanese/ceejus_L_smk_10.txt\n", + "ceeaus/japanese/ceejus_M_ptj_185.txt\n", + "ceeaus/japanese/ceejus_S_smk_018.txt\n", + "ceeaus/japanese/ceejus_U_smk_03.txt\n", + "ceeaus/japanese/ceejus_M_smk_038.txt\n", + "ceeaus/japanese/ceejus_S_ptj_040.txt\n", + "ceeaus/japanese/ceejus_M_ptj_096.txt\n", + "ceeaus/japanese/ceejus_M_ptj_127.txt\n", + "ceeaus/japanese/ceejus_S_ptj_092.txt\n", + "ceeaus/japanese/ceejus_M_smk_055.txt\n", + "ceeaus/japanese/ceejus_S_ptj_071.txt\n", + "ceeaus/japanese/ceejus_M_smk_030.txt\n", + "ceeaus/japanese/ceejus_M_smk_011.txt\n", + "ceeaus/japanese/ceejus_M_smk_057.txt\n", + "ceeaus/japanese/ceejus_S_ptj_055.txt\n", + "ceeaus/japanese/ceejus_M_ptj_019.txt\n", + "ceeaus/japanese/ceejus_S_ptj_053.txt\n", + "ceeaus/japanese/ceejus_M_smk_165.txt\n", + "ceeaus/japanese/ceejus_U_smk_08.txt\n", + "ceeaus/japanese/ceejus_M_smk_002.txt\n", + "ceeaus/japanese/ceejus_M_ptj_145.txt\n", + "ceeaus/japanese/ceejus_M_smk_079.txt\n", + "ceeaus/japanese/ceejus_M_ptj_040.txt\n", + "ceeaus/japanese/ceejus_M_smk_032.txt\n", + "ceeaus/japanese/ceejus_S_smk_101.txt\n", + "ceeaus/japanese/ceejus_M_smk_081.txt\n", + "ceeaus/japanese/ceejus_S_smk_068.txt\n", + "ceeaus/japanese/ceejus_S_smk_067.txt\n", + "ceeaus/japanese/ceejus_M_smk_168.txt\n", + "ceeaus/japanese/ceejus_S_ptj_015.txt\n", + "ceeaus/japanese/ceejus_L_smk_04.txt\n", + "ceeaus/japanese/ceejus_S_smk_031.txt\n", + "ceeaus/japanese/ceejus_S_smk_071.txt\n", + "ceeaus/japanese/ceejus_U_ptj_05.txt\n", + 
"ceeaus/japanese/ceejus_M_ptj_124.txt\n", + "ceeaus/japanese/ceejus_M_smk_141.txt\n", + "ceeaus/japanese/ceejus_M_ptj_066.txt\n", + "ceeaus/japanese/ceejus_L_ptj_09.txt\n", + "ceeaus/japanese/ceejus_S_smk_019.txt\n", + "ceeaus/japanese/ceejus_S_ptj_028.txt\n", + "ceeaus/japanese/ceejus_M_smk_183.txt\n", + "ceeaus/japanese/ceejus_S_smk_042.txt\n", + "ceeaus/japanese/ceejus_M_smk_094.txt\n", + "ceeaus/japanese/ceejus_M_ptj_077.txt\n", + "ceeaus/japanese/ceejus_L_ptj_05.txt\n", + "ceeaus/japanese/ceejus_S_smk_056.txt\n", + "ceeaus/japanese/ceejus_M_smk_102.txt\n", + "ceeaus/japanese/ceejus_L_ptj_37.txt\n", + "ceeaus/japanese/ceejus_M_ptj_076.txt\n", + "ceeaus/japanese/ceejus_U_smk_15.txt\n", + "ceeaus/japanese/ceejus_M_ptj_046.txt\n", + "ceeaus/japanese/ceejus_M_smk_049.txt\n", + "ceeaus/japanese/ceejus_L_ptj_21.txt\n", + "ceeaus/japanese/ceejus_M_smk_051.txt\n", + "ceeaus/japanese/ceejus_M_ptj_176.txt\n", + "ceeaus/japanese/ceejus_S_smk_054.txt\n", + "ceeaus/japanese/ceejus_S_ptj_074.txt\n", + "ceeaus/japanese/ceejus_M_ptj_194.txt\n", + "ceeaus/japanese/ceejus_M_ptj_154.txt\n", + "ceeaus/japanese/ceejus_M_ptj_097.txt\n", + "ceeaus/japanese/ceejus_S_smk_110.txt\n", + "ceeaus/japanese/ceejus_M_smk_075.txt\n", + "ceeaus/japanese/ceejus_M_ptj_109.txt\n", + "ceeaus/japanese/ceejus_S_smk_048.txt\n", + "ceeaus/japanese/ceejus_M_smk_062.txt\n", + "ceeaus/japanese/ceejus_S_ptj_065.txt\n", + "ceeaus/japanese/ceejus_M_smk_046.txt\n", + "ceeaus/japanese/ceejus_M_ptj_088.txt\n", + "ceeaus/japanese/ceejus_M_smk_194.txt\n", + "ceeaus/japanese/ceejus_M_smk_078.txt\n", + "ceeaus/japanese/ceejus_M_ptj_156.txt\n", + "ceeaus/japanese/ceejus_M_smk_188.txt\n", + "ceeaus/japanese/ceejus_M_ptj_058.txt\n", + "ceeaus/japanese/ceejus_M_smk_162.txt\n", + "ceeaus/japanese/ceejus_M_smk_125.txt\n", + "ceeaus/japanese/ceejus_M_smk_160.txt\n", + "ceeaus/japanese/ceejus_M_smk_035.txt\n", + "ceeaus/japanese/ceejus_M_smk_158.txt\n", + "ceeaus/japanese/ceejus_L_smk_37.txt\n", + 
"ceeaus/japanese/ceejus_M_ptj_012.txt\n", + "ceeaus/japanese/ceejus_S_ptj_111.txt\n", + "ceeaus/japanese/ceejus_S_ptj_022.txt\n", + "ceeaus/japanese/ceejus_S_ptj_059.txt\n", + "ceeaus/japanese/ceejus_M_smk_169.txt\n", + "ceeaus/japanese/ceejus_L_smk_23.txt\n", + "ceeaus/japanese/ceejus_S_smk_022.txt\n", + "ceeaus/japanese/ceejus_L_ptj_41.txt\n", + "ceeaus/japanese/ceejus_L_ptj_35.txt\n", + "ceeaus/japanese/ceejus_M_ptj_164.txt\n", + "ceeaus/japanese/ceejus_M_ptj_018.txt\n", + "ceeaus/japanese/ceejus_M_ptj_093.txt\n", + "ceeaus/japanese/ceejus_M_ptj_141.txt\n", + "ceeaus/japanese/ceejus_M_ptj_059.txt\n", + "ceeaus/japanese/ceejus_S_smk_051.txt\n", + "ceeaus/japanese/ceejus_M_ptj_152.txt\n", + "ceeaus/japanese/ceejus_S_smk_011.txt\n", + "ceeaus/japanese/ceejus_M_smk_026.txt\n", + "ceeaus/japanese/ceejus_M_ptj_016.txt\n", + "ceeaus/japanese/ceejus_M_ptj_030.txt\n", + "ceeaus/japanese/ceejus_S_smk_106.txt\n", + "ceeaus/japanese/ceejus_M_ptj_049.txt\n", + "ceeaus/japanese/ceejus_M_smk_143.txt\n", + "ceeaus/japanese/ceejus_L_ptj_23.txt\n", + "ceeaus/japanese/ceejus_S_smk_083.txt\n", + "ceeaus/japanese/ceejus_M_smk_012.txt\n", + "ceeaus/japanese/ceejus_S_ptj_045.txt\n", + "ceeaus/japanese/ceejus_M_smk_192.txt\n", + "ceeaus/japanese/ceejus_L_ptj_03.txt\n", + "ceeaus/japanese/ceejus_M_ptj_092.txt\n", + "ceeaus/japanese/ceejus_M_smk_138.txt\n", + "ceeaus/japanese/ceejus_M_smk_080.txt\n", + "ceeaus/japanese/ceejus_M_ptj_144.txt\n", + "ceeaus/japanese/ceejus_S_ptj_073.txt\n", + "ceeaus/japanese/ceejus_M_ptj_163.txt\n", + "ceeaus/japanese/ceejus_M_smk_088.txt\n", + "ceeaus/japanese/ceejus_L_smk_09.txt\n", + "ceeaus/japanese/ceejus_M_ptj_061.txt\n", + "ceeaus/japanese/ceejus_M_ptj_009.txt\n", + "ceeaus/japanese/ceejus_U_ptj_03.txt\n", + "ceeaus/japanese/ceejus_M_smk_130.txt\n", + "ceeaus/japanese/ceejus_M_ptj_183.txt\n", + "ceeaus/japanese/ceejus_S_ptj_021.txt\n", + "ceeaus/japanese/ceejus_M_smk_184.txt\n", + "ceeaus/japanese/ceejus_M_smk_137.txt\n", + 
"ceeaus/japanese/ceejus_M_smk_064.txt\n", + "ceeaus/japanese/ceejus_S_smk_116.txt\n", + "ceeaus/japanese/ceejus_M_smk_112.txt\n", + "ceeaus/japanese/ceejus_M_ptj_078.txt\n", + "ceeaus/japanese/ceejus_S_smk_128.txt\n", + "ceeaus/japanese/ceejus_M_smk_129.txt\n", + "ceeaus/japanese/ceejus_M_smk_178.txt\n", + "ceeaus/japanese/ceejus_M_smk_036.txt\n", + "ceeaus/japanese/ceejus_M_smk_082.txt\n", + "ceeaus/japanese/ceejus_S_smk_012.txt\n", + "ceeaus/japanese/ceejus_S_ptj_078.txt\n", + "ceeaus/japanese/ceejus_S_ptj_086.txt\n", + "ceeaus/japanese/ceejus_M_smk_118.txt\n", + "ceeaus/japanese/ceejus_S_ptj_014.txt\n", + "ceeaus/japanese/ceejus_S_ptj_081.txt\n", + "ceeaus/japanese/ceejus_M_smk_024.txt\n", + "ceeaus/japanese/ceejus_U_ptj_06.txt\n", + "ceeaus/japanese/ceejus_M_smk_173.txt\n", + "ceeaus/japanese/ceejus_S_ptj_042.txt\n", + "ceeaus/japanese/ceejus_S_ptj_084.txt\n", + "ceeaus/japanese/ceejus_S_ptj_070.txt\n", + "ceeaus/japanese/ceejus_S_smk_118.txt\n", + "ceeaus/japanese/ceejus_M_ptj_056.txt\n", + "ceeaus/japanese/ceejus_S_smk_027.txt\n", + "ceeaus/japanese/ceejus_M_ptj_116.txt\n", + "ceeaus/japanese/ceejus_S_smk_044.txt\n", + "ceeaus/japanese/ceejus_S_ptj_124.txt\n", + "ceeaus/japanese/ceejus_M_smk_197.txt\n", + "ceeaus/japanese/ceejus_S_ptj_034.txt\n", + "ceeaus/japanese/ceejus_M_ptj_080.txt\n", + "ceeaus/japanese/ceejus_S_ptj_046.txt\n", + "ceeaus/japanese/ceejus_U_ptj_18.txt\n", + "ceeaus/japanese/ceejus_M_smk_017.txt\n", + "ceeaus/japanese/ceejus_S_ptj_095.txt\n", + "ceeaus/japanese/ceejus_U_ptj_11.txt\n", + "ceeaus/japanese/ceejus_L_smk_20.txt\n", + "ceeaus/japanese/ceejus_M_smk_010.txt\n", + "ceeaus/japanese/ceejus_M_ptj_071.txt\n", + "ceeaus/japanese/ceejus_M_ptj_085.txt\n", + "ceeaus/japanese/ceejus_S_ptj_044.txt\n", + "ceeaus/japanese/ceejus_M_smk_139.txt\n", + "ceeaus/japanese/ceejus_M_ptj_155.txt\n", + "ceeaus/japanese/ceejus_M_ptj_023.txt\n", + "ceeaus/japanese/ceejus_L_ptj_02.txt\n", + "ceeaus/japanese/ceejus_L_ptj_20.txt\n", + 
"ceeaus/japanese/ceejus_U_smk_16.txt\n", + "ceeaus/japanese/ceejus_S_ptj_119.txt\n", + "ceeaus/japanese/ceejus_M_ptj_123.txt\n", + "ceeaus/japanese/ceejus_M_ptj_003.txt\n", + "ceeaus/japanese/ceejus_S_ptj_104.txt\n", + "ceeaus/japanese/ceejus_M_smk_132.txt\n", + "ceeaus/japanese/ceejus_M_smk_033.txt\n", + "ceeaus/japanese/ceejus_M_ptj_065.txt\n", + "ceeaus/japanese/ceejus_S_ptj_099.txt\n", + "ceeaus/japanese/ceejus_S_ptj_058.txt\n", + "ceeaus/japanese/ceejus_S_ptj_114.txt\n", + "ceeaus/japanese/ceejus_S_ptj_123.txt\n", + "ceeaus/japanese/ceejus_M_ptj_120.txt\n", + "ceeaus/japanese/ceejus_M_smk_098.txt\n", + "ceeaus/japanese/ceejus_S_ptj_105.txt\n", + "ceeaus/japanese/ceejus_S_smk_072.txt\n", + "ceeaus/japanese/ceejus_S_ptj_051.txt\n", + "ceeaus/japanese/ceejus_L_ptj_07.txt\n", + "ceeaus/japanese/ceejus_S_smk_120.txt\n", + "ceeaus/japanese/ceejus_S_ptj_013.txt\n", + "ceeaus/japanese/ceejus_L_ptj_33.txt\n", + "ceeaus/japanese/ceejus_M_ptj_172.txt\n", + "ceeaus/japanese/ceejus_U_ptj_14.txt\n", + "ceeaus/japanese/ceejus_M_smk_155.txt\n", + "ceeaus/japanese/ceejus_M_smk_103.txt\n", + "ceeaus/japanese/ceejus_M_smk_116.txt\n", + "ceeaus/japanese/ceejus_S_smk_007.txt\n", + "ceeaus/japanese/ceejus_M_ptj_175.txt\n", + "ceeaus/japanese/ceejus_S_smk_074.txt\n", + "ceeaus/japanese/ceejus_M_smk_091.txt\n", + "ceeaus/japanese/ceejus_M_smk_001.txt\n", + "ceeaus/japanese/ceejus_M_ptj_182.txt\n", + "ceeaus/japanese/ceejus_M_ptj_108.txt\n", + "ceeaus/japanese/ceejus_M_smk_123.txt\n", + "ceeaus/japanese/ceejus_S_ptj_008.txt\n", + "ceeaus/japanese/ceejus_M_smk_066.txt\n", + "ceeaus/japanese/ceejus_S_smk_064.txt\n", + "ceeaus/japanese/ceejus_L_smk_03.txt\n", + "ceeaus/japanese/ceejus_M_smk_096.txt\n", + "ceeaus/japanese/ceejus_S_ptj_061.txt\n", + "ceeaus/japanese/ceejus_S_smk_004.txt\n", + "ceeaus/japanese/ceejus_M_smk_113.txt\n", + "ceeaus/japanese/ceejus_M_ptj_027.txt\n", + "ceeaus/japanese/ceejus_S_ptj_077.txt\n", + "ceeaus/japanese/ceejus_M_ptj_187.txt\n", + 
"ceeaus/japanese/ceejus_U_smk_14.txt\n", + "ceeaus/japanese/ceejus_S_ptj_047.txt\n", + "ceeaus/japanese/ceejus_M_ptj_117.txt\n", + "ceeaus/japanese/ceejus_M_ptj_050.txt\n", + "ceeaus/japanese/ceejus_S_ptj_043.txt\n", + "ceeaus/japanese/ceejus_S_ptj_125.txt\n", + "ceeaus/japanese/ceejus_M_smk_145.txt\n", + "ceeaus/japanese/ceejus_S_ptj_116.txt\n", + "ceeaus/japanese/ceejus_S_ptj_089.txt\n", + "ceeaus/japanese/ceejus_S_ptj_011.txt\n", + "ceeaus/japanese/ceejus_U_ptj_12.txt\n", + "ceeaus/japanese/ceejus_S_smk_070.txt\n", + "ceeaus/japanese/ceejus_U_ptj_13.txt\n", + "ceeaus/japanese/ceejus_U_smk_13.txt\n", + "ceeaus/japanese/ceejus_S_smk_001.txt\n", + "ceeaus/japanese/ceejus_M_ptj_179.txt\n", + "ceeaus/japanese/ceejus_U_ptj_01.txt\n", + "ceeaus/japanese/ceejus_M_ptj_171.txt\n", + "ceeaus/japanese/ceejus_M_ptj_195.txt\n", + "ceeaus/japanese/ceejus_M_ptj_022.txt\n", + "ceeaus/japanese/ceejus_L_smk_29.txt\n", + "ceeaus/japanese/ceejus_M_smk_150.txt\n", + "ceeaus/japanese/ceejus_S_smk_005.txt\n", + "ceeaus/japanese/ceejus_S_ptj_079.txt\n", + "ceeaus/japanese/ceejus_L_ptj_13.txt\n", + "ceeaus/japanese/ceejus_S_ptj_049.txt\n", + "ceeaus/japanese/ceejus_M_smk_040.txt\n", + "ceeaus/japanese/ceejus_S_ptj_106.txt\n", + "ceeaus/japanese/ceejus_S_smk_124.txt\n", + "ceeaus/japanese/ceejus_S_smk_055.txt\n", + "ceeaus/japanese/ceejus_S_ptj_019.txt\n", + "ceeaus/japanese/ceejus_M_ptj_137.txt\n", + "ceeaus/japanese/ceejus_S_ptj_027.txt\n", + "ceeaus/japanese/ceejus_M_smk_067.txt\n", + "ceeaus/japanese/ceejus_M_ptj_160.txt\n", + "ceeaus/japanese/ceejus_S_ptj_068.txt\n", + "ceeaus/japanese/ceejus_S_smk_016.txt\n", + "ceeaus/japanese/ceejus_L_smk_15.txt\n", + "ceeaus/japanese/ceejus_M_smk_142.txt\n", + "ceeaus/japanese/ceejus_S_smk_026.txt\n", + "ceeaus/japanese/ceejus_S_smk_082.txt\n", + "ceeaus/japanese/ceejus_S_smk_066.txt\n", + "ceeaus/japanese/ceejus_S_ptj_063.txt\n", + "ceeaus/japanese/ceejus_M_smk_003.txt\n", + "ceeaus/japanese/ceejus_M_smk_177.txt\n", + 
"ceeaus/japanese/ceejus_M_ptj_063.txt\n", + "ceeaus/japanese/ceejus_M_smk_007.txt\n", + "ceeaus/japanese/ceejus_L_ptj_32.txt\n", + "ceeaus/japanese/ceejus_M_ptj_025.txt\n", + "ceeaus/japanese/ceejus_M_ptj_036.txt\n", + "ceeaus/japanese/ceejus_M_ptj_029.txt\n", + "ceeaus/japanese/ceejus_U_smk_01.txt\n", + "ceeaus/japanese/ceejus_M_ptj_052.txt\n", + "ceeaus/japanese/ceejus_M_ptj_122.txt\n", + "ceeaus/japanese/ceejus_M_smk_100.txt\n", + "ceeaus/japanese/ceejus_S_ptj_030.txt\n", + "ceeaus/japanese/ceejus_S_ptj_001.txt\n", + "ceeaus/japanese/ceejus_M_ptj_151.txt\n", + "ceeaus/japanese/ceejus_S_smk_035.txt\n", + "ceeaus/japanese/ceejus_M_smk_107.txt\n", + "ceeaus/japanese/ceejus_M_smk_076.txt\n", + "ceeaus/japanese/ceejus_M_ptj_094.txt\n", + "ceeaus/japanese/ceejus_M_smk_186.txt\n", + "ceeaus/japanese/ceejus_S_smk_123.txt\n", + "ceeaus/japanese/ceejus_M_ptj_161.txt\n", + "ceeaus/japanese/ceejus_M_smk_157.txt\n", + "ceeaus/japanese/ceejus_S_smk_024.txt\n", + "ceeaus/japanese/ceejus_S_ptj_107.txt\n", + "ceeaus/japanese/ceejus_S_smk_063.txt\n", + "ceeaus/japanese/ceejus_S_smk_059.txt\n", + "ceeaus/japanese/ceejus_S_ptj_031.txt\n", + "ceeaus/japanese/ceejus_M_ptj_177.txt\n", + "ceeaus/japanese/ceejus_L_ptj_10.txt\n", + "ceeaus/japanese/ceejus_U_ptj_09.txt\n", + "ceeaus/japanese/ceejus_M_ptj_002.txt\n", + "ceeaus/japanese/ceejus_M_ptj_162.txt\n", + "ceeaus/japanese/ceejus_S_ptj_091.txt\n", + "ceeaus/japanese/ceejus_M_smk_134.txt\n", + "ceeaus/japanese/ceejus_S_ptj_017.txt\n", + "ceeaus/japanese/ceejus_S_smk_046.txt\n", + "ceeaus/japanese/ceejus_L_smk_16.txt\n", + "ceeaus/japanese/ceejus_U_ptj_10.txt\n", + "ceeaus/japanese/ceejus_L_smk_08.txt\n", + "ceeaus/japanese/ceejus_M_smk_070.txt\n", + "ceeaus/japanese/ceejus_M_ptj_073.txt\n", + "ceeaus/japanese/ceejus_S_ptj_016.txt\n", + "ceeaus/japanese/ceejus_M_ptj_034.txt\n", + "ceeaus/japanese/ceejus_M_smk_019.txt\n", + "ceeaus/japanese/ceejus_S_ptj_128.txt\n", + "ceeaus/japanese/ceejus_L_ptj_29.txt\n", + 
"ceeaus/japanese/ceejus_S_smk_102.txt\n", + "ceeaus/japanese/ceejus_M_smk_133.txt\n", + "ceeaus/japanese/ceejus_M_smk_195.txt\n", + "ceeaus/japanese/ceejus_M_ptj_142.txt\n", + "ceeaus/japanese/ceejus_S_smk_009.txt\n", + "ceeaus/japanese/ceejus_U_smk_17.txt\n", + "ceeaus/japanese/ceejus_L_ptj_01.txt\n", + "ceeaus/japanese/ceejus_S_ptj_052.txt\n", + "ceeaus/japanese/ceejus_M_smk_101.txt\n", + "ceeaus/japanese/ceejus_S_smk_023.txt\n", + "ceeaus/japanese/ceejus_M_smk_087.txt\n", + "ceeaus/japanese/ceejus_S_smk_021.txt\n", + "ceeaus/japanese/ceejus_M_ptj_105.txt\n", + "ceeaus/japanese/ceejus_S_smk_095.txt\n", + "ceeaus/japanese/ceejus_S_smk_093.txt\n", + "ceeaus/japanese/ceejus_M_smk_182.txt\n", + "ceeaus/japanese/ceejus_S_ptj_037.txt\n", + "ceeaus/japanese/ceejus_L_smk_26.txt\n", + "ceeaus/japanese/ceejus_M_ptj_062.txt\n", + "ceeaus/japanese/ceejus_M_ptj_051.txt\n", + "ceeaus/japanese/ceejus_S_smk_038.txt\n", + "ceeaus/japanese/ceejus_M_smk_060.txt\n", + "ceeaus/japanese/ceejus_M_ptj_150.txt\n", + "ceeaus/japanese/ceejus_S_smk_013.txt\n", + "ceeaus/japanese/ceejus_L_smk_38.txt\n", + "ceeaus/japanese/ceejus_L_ptj_30.txt\n", + "ceeaus/japanese/ceejus_S_ptj_113.txt\n", + "ceeaus/japanese/ceejus_S_smk_037.txt\n", + "ceeaus/japanese/ceejus_S_smk_033.txt\n", + "ceeaus/japanese/ceejus_M_ptj_100.txt\n", + "ceeaus/japanese/ceejus_M_ptj_132.txt\n", + "ceeaus/japanese/ceejus_S_smk_036.txt\n", + "ceeaus/japanese/ceejus_M_ptj_143.txt\n", + "ceeaus/japanese/ceejus_S_ptj_005.txt\n", + "ceeaus/japanese/ceejus_M_ptj_107.txt\n", + "ceeaus/japanese/ceejus_M_smk_117.txt\n", + "ceeaus/japanese/ceejus_S_ptj_026.txt\n", + "ceeaus/japanese/ceejus_S_ptj_115.txt\n", + "ceeaus/japanese/ceejus_M_smk_093.txt\n", + "ceeaus/japanese/ceejus_S_ptj_032.txt\n", + "ceeaus/japanese/ceejus_L_smk_27.txt\n", + "ceeaus/japanese/ceejus_U_smk_19.txt\n", + "ceeaus/japanese/ceejus_M_ptj_101.txt\n", + "ceeaus/japanese/ceejus_S_smk_077.txt\n", + "ceeaus/japanese/ceejus_S_smk_103.txt\n", + 
"ceeaus/japanese/ceejus_U_smk_10.txt\n", + "ceeaus/japanese/ceejus_M_ptj_125.txt\n", + "ceeaus/japanese/ceejus_M_smk_140.txt\n", + "ceeaus/japanese/ceejus_M_ptj_121.txt\n", + "ceeaus/japanese/ceejus_M_smk_109.txt\n", + "ceeaus/japanese/ceejus_M_smk_056.txt\n", + "ceeaus/japanese/ceejus_U_smk_06.txt\n", + "ceeaus/japanese/ceejus_S_ptj_023.txt\n", + "ceeaus/japanese/ceejus_M_smk_044.txt\n", + "ceeaus/japanese/ceejus_S_ptj_087.txt\n", + "ceeaus/japanese/ceejus_S_ptj_109.txt\n", + "ceeaus/japanese/ceejus_S_smk_117.txt\n", + "ceeaus/japanese/ceejus_M_smk_099.txt\n", + "ceeaus/japanese/ceejus_S_ptj_006.txt\n", + "ceeaus/japanese/ceejus_S_smk_104.txt\n", + "ceeaus/japanese/ceejus_M_smk_180.txt\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ceeaus/japanese/ceejus_L_smk_11.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_035.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_149.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_196.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_135.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_06.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_110.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_39.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_193.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_090.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_047.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_099.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_13.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_184.txt\r\n", + "ceeaus/japanese/ceejus_U_ptj_04.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_176.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_029.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_125.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_085.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_104.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_010.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_164.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_028.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_149.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_115.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_077.txt\r\n", + 
"ceeaus/japanese/ceejus_M_ptj_086.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_030.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_12.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_120.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_067.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_022.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_196.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_166.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_026.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_099.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_052.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_021.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_055.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_100.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_167.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_133.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_170.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_167.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_054.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_028.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_118.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_27.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_074.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_053.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_120.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_039.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_158.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_003.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_089.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_061.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_179.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_165.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_087.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_069.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_107.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_181.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_043.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_117.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_112.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_094.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_098.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_006.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_186.txt\r\n", + 
"ceeaus/japanese/ceejus_M_smk_154.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_092.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_134.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_17.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_140.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_114.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_063.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_096.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_12.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_053.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_161.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_126.txt\r\n", + "ceeaus/japanese/ceejus_U_ptj_02.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_006.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_41.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_151.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_34.txt\r\n", + "ceeaus/japanese/ceejus_U_smk_18.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_152.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_127.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_098.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_104.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_010.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_041.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_38.txt\r\n", + "ceeaus/japanese/ceejus_U_smk_12.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_027.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_015.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_121.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_042.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_106.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_032.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_181.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_012.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_081.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_047.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_18.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_065.txt\r\n", + "ceeaus/japanese/ceejus_U_smk_05.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_25.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_033.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_015.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_136.txt\r\n", + 
"ceeaus/japanese/ceejus_M_ptj_038.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_121.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_138.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_139.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_193.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_105.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_169.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_111.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_122.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_065.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_057.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_050.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_014.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_01.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_127.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_097.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_108.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_060.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_173.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_025.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_113.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_174.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_039.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_18.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_047.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_082.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_054.txt\r\n", + "ceeaus/japanese/ceejus_U_ptj_19.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_037.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_083.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_017.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_07.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_043.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_093.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_088.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_16.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_025.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_22.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_111.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_21.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_118.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_033.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_058.txt\r\n", + 
"ceeaus/japanese/ceejus_L_smk_24.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_079.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_40.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_020.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_105.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_136.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_014.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_25.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_023.txt\r\n", + "ceeaus/japanese/ceejus_U_smk_09.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_17.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_034.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_062.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_092.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_146.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_073.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_005.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_19.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_004.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_057.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_008.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_049.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_018.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_045.txt\r\n", + "ceeaus/japanese/ceejus_U_ptj_08.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_096.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_088.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_098.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_082.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_031.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_087.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_091.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_111.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_017.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_34.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_39.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_085.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_056.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_016.txt\r\n", + "ceeaus/japanese/ceejus_U_ptj_15.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_089.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_190.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_175.txt\r\n", + 
"ceeaus/japanese/ceejus_S_ptj_110.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_159.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_36.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_024.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_156.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_31.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_074.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_113.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_190.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_008.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_103.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_15.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_114.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_097.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_035.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_126.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_091.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_050.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_178.txt\r\n", + "ceeaus/japanese/ceejus_U_ptj_16.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_126.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_148.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_011.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_081.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_35.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_094.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_079.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_11.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_070.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_052.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_072.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_40.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_36.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_02.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_157.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_006.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_170.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_08.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_28.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_084.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_121.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_148.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_185.txt\r\n", + 
"ceeaus/japanese/ceejus_M_smk_034.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_020.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_05.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_089.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_097.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_147.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_048.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_007.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_001.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_058.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_168.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_025.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_076.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_043.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_080.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_26.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_039.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_021.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_188.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_039.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_062.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_100.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_028.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_075.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_069.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_031.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_069.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_040.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_086.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_153.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_101.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_059.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_24.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_095.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_060.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_029.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_041.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_085.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_191.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_131.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_189.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_080.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_045.txt\r\n", + 
"ceeaus/japanese/ceejus_M_ptj_014.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_131.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_036.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_126.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_075.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_108.txt\r\n", + "ceeaus/japanese/ceejus_L_ptj_28.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_038.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_083.txt\r\n", + "ceeaus/japanese/ceejus_U_ptj_17.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_003.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_018.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_009.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_045.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_066.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_020.txt\r\n", + "ceeaus/japanese/ceejus_U_smk_11.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_19.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_076.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_067.txt\r\n", + "ceeaus/japanese/ceejus_M_smk_153.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_020.txt\r\n", + "ceeaus/japanese/ceejus_L_smk_22.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_002.txt\r\n", + "ceeaus/japanese/ceejus_S_smk_109.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_008.txt\r\n", + "ceeaus/japanese/ceejus_S_ptj_112.txt\r\n", + "ceeaus/japanese/ceejus_M_ptj_114.txt\r\n", + "ceeaus/english/\r\n", + "ceeaus/english/ceenas_smk_11.txt\r\n", + "ceeaus/english/ceenas_ptj_02.txt\r\n", + "ceeaus/english/ceenas_smk_20.txt\r\n", + "ceeaus/english/ceenas_smk_06.txt\r\n", + "ceeaus/english/ceenas_ptj_57.txt\r\n", + "ceeaus/english/ceenas_ptj_65.txt\r\n", + "ceeaus/english/ceenas_ptj_36.txt\r\n", + "ceeaus/english/ceenas_ptj_64.txt\r\n", + "ceeaus/english/ceenas_smk_08.txt\r\n", + "ceeaus/english/ceenas_ptj_42.txt\r\n", + "ceeaus/english/ceenas_ptj_14.txt\r\n", + "ceeaus/english/ceenas_ptj_63.txt\r\n", + "ceeaus/english/ceenas_ptj_45.txt\r\n", + "ceeaus/english/ceenas_ptj_60.txt\r\n", + "ceeaus/english/ceenas_ptj_31.txt\r\n", + "ceeaus/english/ceenas_smk_51.txt\r\n", + 
"ceeaus/english/ceenas_smk_60.txt\r\n", + "ceeaus/english/ceenas_ptj_37.txt\r\n", + "ceeaus/english/ceenas_ptj_22.txt\r\n", + "ceeaus/english/ceenas_ptj_24.txt\r\n", + "ceeaus/english/ceenas_smk_56.txt\r\n", + "ceeaus/english/ceenas_smk_37.txt\r\n", + "ceeaus/english/ceenas_ptj_71.txt\r\n", + "ceeaus/english/ceenas_ptj_62.txt\r\n", + "ceeaus/english/ceenas_smk_31.txt\r\n", + "ceeaus/english/ceenas_smk_66.txt\r\n", + "ceeaus/english/ceenas_smk_40.txt\r\n", + "ceeaus/english/ceenas_ptj_41.txt\r\n", + "ceeaus/english/ceenas_smk_42.txt\r\n", + "ceeaus/english/ceenas_smk_30.txt\r\n", + "ceeaus/english/ceenas_ptj_50.txt\r\n", + "ceeaus/english/ceenas_smk_29.txt\r\n", + "ceeaus/english/ceenas_ptj_34.txt\r\n", + "ceeaus/english/ceenas_smk_34.txt\r\n", + "ceeaus/english/ceenas_smk_45.txt\r\n", + "ceeaus/english/ceenas_ptj_08.txt\r\n", + "ceeaus/english/ceenas_ptj_05.txt\r\n", + "ceeaus/english/ceenas_smk_52.txt\r\n", + "ceeaus/english/ceenas_ptj_55.txt\r\n", + "ceeaus/english/ceenas_smk_02.txt\r\n", + "ceeaus/english/ceenas_ptj_30.txt\r\n", + "ceeaus/english/ceenas_ptj_09.txt\r\n", + "ceeaus/english/ceenas_smk_58.txt\r\n", + "ceeaus/english/ceenas_smk_71.txt\r\n", + "ceeaus/english/ceenas_ptj_04.txt\r\n", + "ceeaus/english/ceenas_smk_17.txt\r\n", + "ceeaus/english/ceenas_smk_69.txt\r\n", + "ceeaus/english/ceenas_ptj_73.txt\r\n", + "ceeaus/english/ceenas_ptj_35.txt\r\n", + "ceeaus/english/ceenas_smk_10.txt\r\n", + "ceeaus/english/ceenas_smk_27.txt\r\n", + "ceeaus/english/ceenas_ptj_18.txt\r\n", + "ceeaus/english/ceenas_ptj_19.txt\r\n", + "ceeaus/english/ceenas_ptj_26.txt\r\n", + "ceeaus/english/ceenas_smk_68.txt\r\n", + "ceeaus/english/ceenas_smk_24.txt\r\n", + "ceeaus/english/ceenas_smk_63.txt\r\n", + "ceeaus/english/ceenas_ptj_07.txt\r\n", + "ceeaus/english/ceenas_ptj_51.txt\r\n", + "ceeaus/english/ceenas_ptj_10.txt\r\n", + "ceeaus/english/ceenas_smk_39.txt\r\n", + "ceeaus/english/ceenas_ptj_66.txt\r\n", + "ceeaus/english/ceenas_smk_21.txt\r\n", + 
"ceeaus/english/ceenas_ptj_40.txt\r\n", + "ceeaus/english/ceenas_ptj_72.txt\r\n", + "ceeaus/english/ceenas_ptj_06.txt\r\n", + "ceeaus/english/ceenas_smk_09.txt\r\n", + "ceeaus/english/ceenas_smk_01.txt\r\n", + "ceeaus/english/ceenas_ptj_15.txt\r\n", + "ceeaus/english/ceenas_smk_04.txt\r\n", + "ceeaus/english/ceenas_ptj_32.txt\r\n", + "ceeaus/english/ceenas_smk_50.txt\r\n", + "ceeaus/english/ceenas_smk_54.txt\r\n", + "ceeaus/english/ceenas_ptj_12.txt\r\n", + "ceeaus/english/ceenas_ptj_17.txt\r\n", + "ceeaus/english/ceenas_smk_43.txt\r\n", + "ceeaus/english/ceenas_ptj_54.txt\r\n", + "ceeaus/english/ceenas_ptj_67.txt\r\n", + "ceeaus/english/ceenas_smk_05.txt\r\n", + "ceeaus/english/ceenas_smk_14.txt\r\n", + "ceeaus/english/ceenas_ptj_03.txt\r\n", + "ceeaus/english/ceenas_smk_35.txt\r\n", + "ceeaus/english/ceenas_ptj_29.txt\r\n", + "ceeaus/english/ceenas_ptj_70.txt\r\n", + "ceeaus/english/ceenas_smk_65.txt\r\n", + "ceeaus/english/ceenas_smk_28.txt\r\n", + "ceeaus/english/ceenas_ptj_13.txt\r\n", + "ceeaus/english/ceenas_smk_72.txt\r\n", + "ceeaus/english/ceenas_ptj_68.txt\r\n", + "ceeaus/english/ceenas_smk_64.txt\r\n", + "ceeaus/english/ceenas_ptj_20.txt\r\n", + "ceeaus/english/ceenas_smk_22.txt\r\n", + "ceeaus/english/ceenas_smk_07.txt\r\n", + "ceeaus/english/ceenas_ptj_46.txt\r\n", + "ceeaus/english/ceenas_smk_18.txt\r\n", + "ceeaus/english/ceenas_ptj_27.txt\r\n", + "ceeaus/english/ceenas_smk_73.txt\r\n", + "ceeaus/english/ceenas_smk_48.txt\r\n", + "ceeaus/english/ceenas_smk_36.txt\r\n", + "ceeaus/english/ceenas_smk_49.txt\r\n", + "ceeaus/english/ceenas_ptj_52.txt\r\n", + "ceeaus/english/ceenas_smk_13.txt\r\n", + "ceeaus/english/ceenas_ptj_16.txt\r\n", + "ceeaus/english/ceenas_smk_25.txt\r\n", + "ceeaus/english/ceenas_smk_53.txt\r\n", + "ceeaus/english/ceenas_smk_12.txt\r\n", + "ceeaus/english/ceenas_ptj_53.txt\r\n", + "ceeaus/english/ceenas_ptj_43.txt\r\n", + "ceeaus/english/ceenas_ptj_49.txt\r\n", + "ceeaus/english/ceenas_ptj_39.txt\r\n", + 
"ceeaus/english/ceenas_smk_70.txt\r\n", + "ceeaus/english/ceenas_ptj_69.txt\r\n", + "ceeaus/english/ceenas_smk_55.txt\r\n", + "ceeaus/english/ceenas_ptj_25.txt\r\n", + "ceeaus/english/ceenas_ptj_47.txt\r\n", + "ceeaus/english/ceenas_ptj_28.txt\r\n", + "ceeaus/english/ceenas_smk_47.txt\r\n", + "ceeaus/english/ceenas_ptj_48.txt\r\n", + "ceeaus/english/ceenas_smk_67.txt\r\n", + "ceeaus/english/ceenas_smk_32.txt\r\n", + "ceeaus/english/ceenas_smk_23.txt\r\n", + "ceeaus/english/ceenas_ptj_58.txt\r\n", + "ceeaus/english/ceenas_smk_38.txt\r\n", + "ceeaus/english/ceenas_smk_59.txt\r\n", + "ceeaus/english/ceenas_ptj_44.txt\r\n", + "ceeaus/english/ceenas_ptj_61.txt\r\n", + "ceeaus/english/ceenas_ptj_59.txt\r\n", + "ceeaus/english/ceenas_smk_57.txt\r\n", + "ceeaus/english/ceenas_ptj_38.txt\r\n", + "ceeaus/english/ceenas_ptj_11.txt\r\n", + "ceeaus/english/ceenas_smk_41.txt\r\n", + "ceeaus/english/ceenas_smk_26.txt\r\n", + "ceeaus/english/ceenas_smk_46.txt\r\n", + "ceeaus/english/ceenas_smk_62.txt\r\n", + "ceeaus/english/ceenas_ptj_56.txt\r\n", + "ceeaus/english/ceenas_smk_03.txt\r\n", + "ceeaus/english/ceenas_ptj_33.txt\r\n", + "ceeaus/english/ceenas_smk_19.txt\r\n", + "ceeaus/english/ceenas_smk_33.txt\r\n", + "ceeaus/english/ceenas_smk_61.txt\r\n", + "ceeaus/english/ceenas_ptj_21.txt\r\n", + "ceeaus/english/ceenas_smk_16.txt\r\n", + "ceeaus/english/ceenas_ptj_01.txt\r\n", + "ceeaus/english/ceenas_ptj_23.txt\r\n", + "ceeaus/english/ceenas_smk_44.txt\r\n", + "ceeaus/english/ceenas_smk_15.txt\r\n", + "ceeaus/ceeaus.dat.labels.gz\r\n", + "ceeaus/ceeaus.dat\r\n", + "ceeaus/chinese/\r\n", + "ceeaus/chinese/ceecus_smk_19.txt\r\n", + "ceeaus/chinese/ceecus_smk_05.txt\r\n", + "ceeaus/chinese/ceecus_smk_02.txt\r\n", + "ceeaus/chinese/ceecus_smk_29.txt\r\n", + "ceeaus/chinese/ceecus_smk_41.txt\r\n", + "ceeaus/chinese/ceecus_smk_16.txt\r\n", + "ceeaus/chinese/ceecus_ptj_02.txt\r\n", + "ceeaus/chinese/ceecus_ptj_31.txt\r\n", + "ceeaus/chinese/ceecus_smk_34.txt\r\n", + 
"ceeaus/chinese/ceecus_ptj_36.txt\r\n", + "ceeaus/chinese/ceecus_ptj_14.txt\r\n", + "ceeaus/chinese/ceecus_smk_38.txt\r\n", + "ceeaus/chinese/ceecus_ptj_07.txt\r\n", + "ceeaus/chinese/ceecus_smk_01.txt\r\n", + "ceeaus/chinese/ceecus_ptj_17.txt\r\n", + "ceeaus/chinese/ceecus_ptj_24.txt\r\n", + "ceeaus/chinese/ceecus_smk_26.txt\r\n", + "ceeaus/chinese/ceecus_ptj_38.txt\r\n", + "ceeaus/chinese/ceecus_ptj_22.txt\r\n", + "ceeaus/chinese/ceecus_smk_40.txt\r\n", + "ceeaus/chinese/ceecus_smk_46.txt\r\n", + "ceeaus/chinese/ceecus_ptj_25.txt\r\n", + "ceeaus/chinese/ceecus_smk_39.txt\r\n", + "ceeaus/chinese/ceecus_ptj_28.txt\r\n", + "ceeaus/chinese/ceecus_ptj_06.txt\r\n", + "ceeaus/chinese/ceecus_smk_07.txt\r\n", + "ceeaus/chinese/ceecus_ptj_30.txt\r\n", + "ceeaus/chinese/ceecus_smk_09.txt\r\n", + "ceeaus/chinese/ceecus_smk_11.txt\r\n", + "ceeaus/chinese/ceecus_smk_24.txt\r\n", + "ceeaus/chinese/ceecus_ptj_39.txt\r\n", + "ceeaus/chinese/ceecus_ptj_45.txt\r\n", + "ceeaus/chinese/ceecus_ptj_41.txt\r\n", + "ceeaus/chinese/ceecus_ptj_20.txt\r\n", + "ceeaus/chinese/ceecus_smk_27.txt\r\n", + "ceeaus/chinese/ceecus_smk_12.txt\r\n", + "ceeaus/chinese/ceecus_smk_30.txt\r\n", + "ceeaus/chinese/ceecus_ptj_33.txt\r\n", + "ceeaus/chinese/ceecus_ptj_16.txt\r\n", + "ceeaus/chinese/ceecus_smk_13.txt\r\n", + "ceeaus/chinese/ceecus_smk_25.txt\r\n", + "ceeaus/chinese/ceecus_smk_45.txt\r\n", + "ceeaus/chinese/ceecus_smk_36.txt\r\n", + "ceeaus/chinese/ceecus_ptj_27.txt\r\n", + "ceeaus/chinese/ceecus_smk_23.txt\r\n", + "ceeaus/chinese/ceecus_ptj_29.txt\r\n", + "ceeaus/chinese/ceecus_ptj_35.txt\r\n", + "ceeaus/chinese/ceecus_smk_08.txt\r\n", + "ceeaus/chinese/ceecus_ptj_21.txt\r\n", + "ceeaus/chinese/ceecus_smk_31.txt\r\n", + "ceeaus/chinese/ceecus_ptj_19.txt\r\n", + "ceeaus/chinese/ceecus_smk_17.txt\r\n", + "ceeaus/chinese/ceecus_smk_10.txt\r\n", + "ceeaus/chinese/ceecus_smk_37.txt\r\n", + "ceeaus/chinese/ceecus_smk_04.txt\r\n", + "ceeaus/chinese/ceecus_ptj_18.txt\r\n", + 
"ceeaus/chinese/ceecus_smk_35.txt\r\n", + "ceeaus/chinese/ceecus_smk_44.txt\r\n", + "ceeaus/chinese/ceecus_ptj_32.txt\r\n", + "ceeaus/chinese/ceecus_smk_43.txt\r\n", + "ceeaus/chinese/ceecus_ptj_11.txt\r\n", + "ceeaus/chinese/ceecus_ptj_23.txt\r\n", + "ceeaus/chinese/ceecus_smk_18.txt\r\n", + "ceeaus/chinese/ceecus_ptj_10.txt\r\n", + "ceeaus/chinese/ceecus_smk_06.txt\r\n", + "ceeaus/chinese/ceecus_ptj_05.txt\r\n", + "ceeaus/chinese/ceecus_ptj_42.txt\r\n", + "ceeaus/chinese/ceecus_ptj_01.txt\r\n", + "ceeaus/chinese/ceecus_smk_32.txt\r\n", + "ceeaus/chinese/ceecus_smk_42.txt\r\n", + "ceeaus/chinese/ceecus_smk_33.txt\r\n", + "ceeaus/chinese/ceecus_smk_20.txt\r\n", + "ceeaus/chinese/ceecus_smk_21.txt\r\n", + "ceeaus/chinese/ceecus_ptj_44.txt\r\n", + "ceeaus/chinese/ceecus_ptj_12.txt\r\n", + "ceeaus/chinese/ceecus_ptj_13.txt\r\n", + "ceeaus/chinese/ceecus_ptj_04.txt\r\n", + "ceeaus/chinese/ceecus_smk_22.txt\r\n", + "ceeaus/chinese/ceecus_ptj_03.txt\r\n", + "ceeaus/chinese/ceecus_smk_14.txt\r\n", + "ceeaus/chinese/ceecus_ptj_46.txt\r\n", + "ceeaus/chinese/ceecus_smk_03.txt\r\n", + "ceeaus/chinese/ceecus_ptj_43.txt\r\n", + "ceeaus/chinese/ceecus_smk_28.txt\r\n", + "ceeaus/chinese/ceecus_ptj_08.txt\r\n", + "ceeaus/chinese/ceecus_ptj_34.txt\r\n", + "ceeaus/chinese/ceecus_ptj_37.txt\r\n", + "ceeaus/chinese/ceecus_ptj_09.txt\r\n", + "ceeaus/chinese/ceecus_ptj_15.txt\r\n", + "ceeaus/chinese/ceecus_ptj_26.txt\r\n", + "ceeaus/chinese/ceecus_ptj_40.txt\r\n", + "ceeaus/chinese/ceecus_smk_15.txt\r\n" + ] + } + ], + "source": [ + "!wget -N https://meta-toolkit.org/data/2016-01-26/ceeaus.tar.gz\n", + "!tar xvf ceeaus.tar.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's index this dataset. Since we are doing classification experiments, we will most likely be concerning ourselves with a `ForwardIndex`, since we want to map document ids to their feature vector representations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "fidx = metapy.index.make_forward_index('ceeaus-config.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the feature set used for classification depends on your settings in the configuration file _at the time of indexing_. If you want to play with different feature sets, remember to change your `analyzer` pipeline in the configuration file, and also to **reindex** your documents!\n", + "\n", + "Here, we've just chosen simple unigram words. This is actually a surprisingly good baseline feature set for many text classification problems.\n", + "\n", + "Now that we have a `ForwardIndex` on disk, we need to load the documents we want to start playing with into memory. Since this is a small enough dataset, let's load the whole thing into memory at once.\n", + "\n", + "We need to decide what kind of dataset we're using. MeTA has classes for binary classification (`BinaryDataset`) and multi-class classification (`MulticlassDataset`), which you should choose from depending on the kind of classification problem you're dealing with. Let's see how many labels we have in our corpus." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fidx.num_labels()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since this is more than 2, we likely want a `MulticlassDataset` so we can learn a classifier that can predict which of these three labels a document should have. 
(But we might be interested in only determining one particular class from the rest, in which case we might actually want a `BinaryDataset`.)\n", + "\n", + "For now, let's focus on the multi-class case, as that likely makes the most sense for this kind of data. Let's load our documents." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1008" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset = metapy.classify.MulticlassDataset(fidx)\n", + "len(dset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have 1008 documents, split across three labels. What are our labels?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'chinese', 'english', 'japanese'}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set([dset.label(instance) for instance in dset])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This dataset is a small collection of essays written by a bunch of students with different first languages. Our goal will be to try to identify whether an essay was written by a native-Chinese speaker, a native-English speaker, or a native-Japanese speaker.\n", + "\n", + "Now, because these in-memory datasets can potentially be quite large, it's beneficial to not make unnecessary copies of them to, for example, create a new list that's shuffled that contains the same documents. In most cases, you'll be operating with a `DatasetView` (either `MulticlassDatasetView` or `BinaryDatasetView`) so that you can do things like shuffle or rotate the contents of a dataset without having to actually modify it. Doing so is pretty easy: you can use Python's slicing API, or you can just construct one directly."
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "view = dset[0:len(dset)+1]\n", + "# or\n", + "view = metapy.classify.MulticlassDatasetView(dset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can, for example, shuffle this view without changing the underlying dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "929 vs 0\n" + ] + } + ], + "source": [ + "view.shuffle()\n", + "print(\"{} vs {}\".format(view[0].id, dset[0].id))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The view has been shuffled and now has documents in random order (useful in many cases to make sure that you don't have clumps of the same-labeled documents together, or to just permute the documents in a stochastic learning algorithm), but the underlying dataset is still sorted by id.\n", + "\n", + "We can also use this slicing API to create a random training and testing set from our shuffled views (views also support slicing). Let's make a 75-25 split of training-testing data. (Note that it's really important that we already shuffled the view!)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "training = view[0:int(0.75*len(view))]\n", + "testing = view[int(0.75*len(view)):len(view)+1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we're ready to train a classifier! Let's start with a very simple one: [Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier).\n", + "\n", + "In MeTA, construction of a classifier implies training of that model. Let's train a Naive Bayes classifier on our training view now."
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "nb = metapy.classify.NaiveBayes(training)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now classify individual documents like so." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'english'" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb.classify(testing[0].weights)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We might be more interested in how well we classify the testing set." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " chinese english japanese \n", + " ------------------------------\n", + " chinese | \u001b[1m0.75\u001b[22m - 0.25 \n", + " english | 0.0278 \u001b[1m0.917\u001b[22m 0.0556 \n", + " japanese | 0.015 - \u001b[1m0.985\u001b[22m \n", + "\n", + "\n" + ] + } + ], + "source": [ + "mtrx = nb.test(testing)\n", + "print(mtrx)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `test()` method of MeTA's classifiers returns to you a `ConfusionMatrix`, which contains useful information about what kinds of mistakes your classifier is making.\n", + "\n", + "(Note that, due to the random shuffling, you might see different results than we do here.)\n", + "\n", + "For example, we can see that this classifier seems to have some trouble with confusing native-Chinese students' essays with those of native-Japanese students. We can tell that by looking at the rows of the confusion matrix. Each row tells you what fraction of documents with that _true_ label were assigned the label for each column by the classifier. 
In the case of the native-Chinese label, we can see that 25% of the time they were miscategorized as being native-Japanese.\n", + "\n", + "The `ConfusionMatrix` also computes a lot of metrics that are commonly used in classifier evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "chinese 0.75 0.75 0.75 0.0635 \n", + "english 0.957 1 0.917 0.143 \n", + "japanese 0.978 0.97 0.985 0.794 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.96\u001b[22m \u001b[1m0.961\u001b[22m \u001b[1m0.96\u001b[22m \n", + "------------------------------------------------------------\n", + "252 predictions attempted, overall accuracy: 0.96\n", + "\n" + ] + } + ], + "source": [ + "mtrx.print_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we want to make sure that the classifier isn't overfitting to our training data, a common approach is to do [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)). Let's run CV for our Naive Bayes classifier across the whole dataset, using 5-folds, to get an idea of how well we might generalize to new data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mtrx = metapy.classify.cross_validate(lambda fold: metapy.classify.NaiveBayes(fold), view, 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`cross_validate()` returns a `ConfusionMatrix` just like `test()` does. 
We give it a function to use to create the trained classifiers for each fold, and then pass in the dataset view containing all of our documents, and the number of folds we want to use.\n", + "\n", + "Let's see how we did." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " chinese english japanese \n", + " ------------------------------\n", + " chinese | \u001b[1m0.674\u001b[22m 0.0435 0.283 \n", + " english | 0.0137 \u001b[1m0.945\u001b[22m 0.0411 \n", + " japanese | 0.0013 0.00261 \u001b[1m0.996\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "chinese 0.79 0.954 0.674 0.0915 \n", + "english 0.952 0.958 0.945 0.145 \n", + "japanese 0.978 0.96 0.996 0.763 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.959\u001b[22m \u001b[1m0.959\u001b[22m \u001b[1m0.959\u001b[22m \n", + "------------------------------------------------------------\n", + "1005 predictions attempted, overall accuracy: 0.959\n", + "\n" + ] + } + ], + "source": [ + "print(mtrx)\n", + "mtrx.print_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's do the same thing, but for an arguably stronger baseline: [SVM](https://en.wikipedia.org/wiki/Support_vector_machine).\n", + "\n", + "MeTA's implementation of SVM is actually an approximation using [stochastic gradient descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) on the [hinge loss](https://en.wikipedia.org/wiki/Hinge_loss). 
It's implemented as a `BinaryClassifier`, so we will need to adapt it before it can be used to solve our multi-class classification problem.\n", + "\n", + "MeTA provides two different adapters for this scenario: [One-vs-All](https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) and [One-vs-One](https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-one)." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "ova = metapy.classify.OneVsAll(training, metapy.classify.SGD, loss_id='hinge')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We construct the `OneVsAll` reduction by providing it the training documents, the name of a binary classifier, and then (as keyword arguments) any additional arguments to that chosen classifier. In this case, we use `loss_id` to specify the loss function to use.\n", + "\n", + "We can now use `OneVsAll` just like any other classifier."
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " chinese english japanese \n", + " ------------------------------\n", + " chinese | \u001b[1m0.75\u001b[22m - 0.25 \n", + " english | 0.0556 \u001b[1m0.806\u001b[22m 0.139 \n", + " japanese | - 0.005 \u001b[1m0.995\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "chinese 0.8 0.857 0.75 0.0635 \n", + "english 0.879 0.967 0.806 0.143 \n", + "japanese 0.975 0.957 0.995 0.794 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.952\u001b[22m \u001b[1m0.952\u001b[22m \u001b[1m0.952\u001b[22m \n", + "------------------------------------------------------------\n", + "252 predictions attempted, overall accuracy: 0.952\n", + "\n" + ] + } + ], + "source": [ + "mtrx = ova.test(testing)\n", + "print(mtrx)\n", + "mtrx.print_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " chinese english japanese \n", + " ------------------------------\n", + " chinese | \u001b[1m0.674\u001b[22m 0.0435 0.283 \n", + " english | 0.0137 \u001b[1m0.945\u001b[22m 0.0411 \n", + " japanese | 0.0013 0.00261 \u001b[1m0.996\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "chinese 0.79 0.954 0.674 0.0915 \n", + "english 0.952 0.958 
0.945 0.145 \n", + "japanese 0.978 0.96 0.996 0.763 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.959\u001b[22m \u001b[1m0.959\u001b[22m \u001b[1m0.959\u001b[22m \n", + "------------------------------------------------------------\n", + "1005 predictions attempted, overall accuracy: 0.959\n", + "\n" + ] + } + ], + "source": [ + "mtrx = metapy.classify.cross_validate(lambda fold: metapy.classify.OneVsAll(fold, metapy.classify.SGD, loss_id='hinge'), view, 5)\n", + "print(mtrx)\n", + "mtrx.print_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That should be enough to get you started! Try looking at `help(metapy.classify)` for a list of what's included in the bindings." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/metapy/tutorials/5-topic-modeling.ipynb b/metapy/tutorials/5-topic-modeling.ipynb new file mode 100644 index 0000000000..ab48572ba4 --- /dev/null +++ b/metapy/tutorials/5-topic-modeling.ipynb @@ -0,0 +1,812 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "First, let's import the Python bindings, as usual." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import metapy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.2.10'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metapy.__version__ # you will want your version to be >= to this" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "If you would like to, you can inform MeTA to output log data to stderr like so:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "metapy.log_to_stderr()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's download a list of stopwords and a sample dataset to begin exploring MeTA's topic models." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2018-05-01 15:41:33-- https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.184.133\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.184.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 2747 (2.7K) [text/plain]\n", + "Saving to: ‘lemur-stopwords.txt’\n", + "\n", + "lemur-stopwords.txt 100%[===================>] 2.68K --.-KB/s in 0s \n", + "\n", + "Last-modified header missing -- time-stamps turned off.\n", + "2018-05-01 15:41:33 (45.5 MB/s) - ‘lemur-stopwords.txt’ saved [2747/2747]\n", + "\n" + ] + } + ], + "source": [ + "!wget -N https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2018-05-01 15:41:35-- https://meta-toolkit.org/data/2016-01-26/ceeaus.tar.gz\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving meta-toolkit.org (meta-toolkit.org)... 50.116.41.177, 2600:3c02::f03c:91ff:feae:b777\n", + "Connecting to meta-toolkit.org (meta-toolkit.org)|50.116.41.177|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1182344 (1.1M) [application/octet-stream]\n", + "Saving to: ‘ceeaus.tar.gz’\n", + "\n", + "ceeaus.tar.gz 100%[===================>] 1.13M 2.30MB/s in 0.5s \n", + "\n", + "2018-05-01 15:41:36 (2.30 MB/s) - ‘ceeaus.tar.gz’ saved [1182344/1182344]\n", + "\n" + ] + } + ], + "source": [ + "!wget -N https://meta-toolkit.org/data/2016-01-26/ceeaus.tar.gz\n", + "!tar xf ceeaus.tar.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We will need to index our data to proceed. We eventually want to be able to extract the bag-of-words representation for our individual documents, so we will want a `ForwardIndex` in this case." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " > Counting lines in file: [=================================] 100% ETA 00:00:00 \n", + "1525207335: [info] Creating forward index: ceeaus-idx/fwd (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/forward_index.cpp:239)\n", + " > Tokenizing Docs: [========================================] 100% ETA 00:00:00 \n", + " > Merging: [================================================] 100% ETA 00:00:00 \n", + "1525207335: [info] Done creating index: ceeaus-idx/fwd (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/forward_index.cpp:278)\n" + ] + } + ], + "source": [ + "fidx = metapy.index.make_forward_index('ceeaus-config.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Just like in classification, the feature set used for the topic modeling will be the feature set used at the time of indexing, so if you want to play with a different set of features (like bigram words), you will need to re-index your data.\n", + "\n", + "For now, we've just stuck with the default filter chain for unigram words, so we're operating in the traditional bag-of-words space.\n", + "\n", + "Let's load our documents into memory to run the topic model inference now." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Loading instances into memory: [> ] 0% ETA 00:00:00 \r", + " > Loading instances into memory: [==========================] 100% ETA 00:00:00 \n" + ] + } + ], + "source": [ + "dset = metapy.learn.Dataset(fidx)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's try to find some topics for this dataset. To do so, we're going to use a generative model called a topic model.\n", + "\n", + "There are many different topic models in the literature, but the most commonly used topic model is Latent Dirichlet Allocation. Here, we propose that there are K topics (represented with a categorical distribution over words) $\\phi_k$ from which all of our documents are generated. These K topics are modeled as being sampled from a Dirichlet distribution with parameter $\\vec{\\alpha}$. Then, to generate a document $d$, we first sample a distribution over the K topics $\\theta_d$ from another Dirichlet distribution with parameter $\\vec{\\beta}$. Then, for each word in this document, we first sample a topic identifier $z \\sim \\theta_d$ and then the word by drawing from the topic we selected ($w \\sim \\phi_z$). Refer to the [Wikipedia article on LDA](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) for more information.\n", + "\n", + "The goal of running inference for an LDA model is to infer the latent variables $\\phi_k$ and $\\theta_d$ for all of the $K$ topics and $D$ documents, respectively.
MeTA provides a number of different inference algorithms for LDA, as each one entails a different set of trade-offs (inference in LDA is intractable, so all inference algorithms are approximations; different algorithms entail different approximation guarantees, running times, and required memory consumption). For now, let's run a Variational Inference algorithm called CVB0 to find two topics. (In practice you will likely be finding many more topics than just two, but this is a very small toy dataset.)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Initialization: [============================================] 100% ETA 00:00:00 \n", + "Iteration 1 maximum change in gamma: 1.78732 \n", + "Iteration 2 maximum change in gamma: 0.397587 \n", + "Iteration 3 maximum change in gamma: 0.585143 \n", + "Iteration 4 maximum change in gamma: 0.639287 \n", + "Iteration 5 maximum change in gamma: 0.543676 \n", + "Iteration 6 maximum change in gamma: 1.04887 \n", + "Iteration 7 maximum change in gamma: 1.26393 \n", + "Iteration 8 maximum change in gamma: 1.43444 \n", + "Iteration 9 maximum change in gamma: 1.36808 \n", + "Iteration 10 maximum change in gamma: 1.39221 \n", + "Iteration 11 maximum change in gamma: 1.3227 \n", + "Iteration 12 maximum change in gamma: 1.21134 \n", + "Iteration 13 maximum change in gamma: 0.759168 \n", + "Iteration 14 maximum change in gamma: 0.805964 \n", + "Iteration 15 maximum change in gamma: 0.802893 \n", + "Iteration 16 maximum change in gamma: 0.571095 \n", + "Iteration 17 maximum change in gamma: 0.516594 \n", + "Iteration 18 maximum change in gamma: 0.523159 \n", + "Iteration 19 maximum change in gamma: 0.275293 \n", + "Iteration 20 maximum change in gamma: 0.228512 \n", + "Iteration 21 maximum change in gamma: 0.237871 \n", + "Iteration 22 maximum change in gamma:
0.203393 \n", + "Iteration 23 maximum change in gamma: 0.214538 \n", + "Iteration 24 maximum change in gamma: 0.205745 \n", + "Iteration 25 maximum change in gamma: 0.169748 \n", + "Iteration 26 maximum change in gamma: 0.122061 \n", + "Iteration 27 maximum change in gamma: 0.119245 \n", + "Iteration 28 maximum change in gamma: 0.129667 \n", + "Iteration 29 maximum change in gamma: 0.134692 \n", + "Iteration 30 maximum change in gamma: 0.132623 \n", + "Iteration 31 maximum change in gamma: 0.129393 \n", + "Iteration 32 maximum change in gamma: 0.151032 \n", + "Iteration 33 maximum change in gamma: 0.164985 \n", + "Iteration 34 maximum change in gamma: 0.165914 \n", + "Iteration 35 maximum change in gamma: 0.152212 \n", + "Iteration 36 maximum change in gamma: 0.142474 \n", + "Iteration 37 maximum change in gamma: 0.171535 \n", + "Iteration 38 maximum change in gamma: 0.191064 \n", + "Iteration 39 maximum change in gamma: 0.192934 \n", + "Iteration 40 maximum change in gamma: 0.202834 \n", + "Iteration 41 maximum change in gamma: 0.220982 \n", + "Iteration 42 maximum change in gamma: 0.210217 \n", + "Iteration 43 maximum change in gamma: 0.213731 \n", + "Iteration 44 maximum change in gamma: 0.192772 \n", + "Iteration 45 maximum change in gamma: 0.15127 \n", + "Iteration 46 maximum change in gamma: 0.105296 \n", + "Iteration 47 maximum change in gamma: 0.0670212 \n", + "Iteration 48 maximum change in gamma: 0.0402179 \n", + "Iteration 49 maximum change in gamma: 0.0360913 \n", + "Iteration 50 maximum change in gamma: 0.0366095 \n", + "Iteration 51 maximum change in gamma: 0.037 \n", + "Iteration 52 maximum change in gamma: 0.0372273 \n", + "Iteration 53 maximum change in gamma: 0.0374239 \n", + "Iteration 54 maximum change in gamma: 0.0379712 \n", + "Iteration 55 maximum change in gamma: 0.0393836 \n", + "Iteration 56 maximum change in gamma: 0.0430323 \n", + "Iteration 57 maximum change in gamma: 0.0467638 \n", + "Iteration 58 maximum change in gamma: 0.0504449 
\n", + "Iteration 59 maximum change in gamma: 0.0539003 \n", + "Iteration 60 maximum change in gamma: 0.0569188 \n", + "Iteration 61 maximum change in gamma: 0.0592677 \n", + "Iteration 62 maximum change in gamma: 0.0607189 \n", + "Iteration 63 maximum change in gamma: 0.0610821 \n", + "Iteration 64 maximum change in gamma: 0.0616854 \n", + "Iteration 65 maximum change in gamma: 0.0622165 \n", + "Iteration 66 maximum change in gamma: 0.0614795 \n", + "Iteration 67 maximum change in gamma: 0.0594567 \n", + "Iteration 68 maximum change in gamma: 0.0562496 \n", + "Iteration 69 maximum change in gamma: 0.0520662 \n", + "Iteration 70 maximum change in gamma: 0.0471889 \n", + "Iteration 71 maximum change in gamma: 0.0419296 \n", + "Iteration 72 maximum change in gamma: 0.0365863 \n", + "Iteration 73 maximum change in gamma: 0.0314094 \n", + "Iteration 74 maximum change in gamma: 0.0265839 \n", + "Iteration 75 maximum change in gamma: 0.0222263 \n", + "Iteration 76 maximum change in gamma: 0.0183923 \n", + "Iteration 77 maximum change in gamma: 0.0150897 \n", + "Iteration 78 maximum change in gamma: 0.0122935 \n", + "Iteration 79 maximum change in gamma: 0.00995884 \n", + "Iteration 80 maximum change in gamma: 0.00803111 \n", + "Iteration 81 maximum change in gamma: 0.00645346 \n", + "Iteration 82 maximum change in gamma: 0.00517133 \n", + "Iteration 83 maximum change in gamma: 0.0041351 \n", + "Iteration 84 maximum change in gamma: 0.00330117 \n", + "Iteration 85 maximum change in gamma: 0.00263226 \n", + "Iteration 86 maximum change in gamma: 0.00209708 \n", + "Iteration 87 maximum change in gamma: 0.00166969 \n", + "Iteration 88 maximum change in gamma: 0.00132887 \n", + "Iteration 89 maximum change in gamma: 0.00121476 \n", + "Iteration 90 maximum change in gamma: 0.00116639 \n", + "Iteration 91 maximum change in gamma: 0.00111979 \n", + "Iteration 92 maximum change in gamma: 0.00107491 \n", + "Iteration 93 maximum change in gamma: 0.00103169 \n", + "Iteration 94 
maximum change in gamma: 0.00099009 \n", + "1525207346: [info] Finished maximum iterations, or found convergence! (/tmp/pip-req-build-m473bt6z/deps/meta/src/topics/lda_cvb.cpp:60)\n" + ] + } + ], + "source": [ + "lda_inf = metapy.topics.LDACollapsedVB(dset, num_topics=2, alpha=1.0, beta=0.01)\n", + "lda_inf.run(num_iters=1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "source": [ + "The above ran the CVB0 algorithm for 1000 iterations, or until an algorithm-specific convergence criterion was met. Now let's save the current estimate for our topics and topic proportions." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "lda_inf.save('lda-cvb0')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can interrogate the topic inference results by using the `TopicModel` query class. Let's load our inference results back in." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Loading topic term probabilities: [===========> ] 50% ETA 00:00:00 \r", + " > Loading topic term probabilities: [=======================] 100% ETA 00:00:00 \n", + " \r", + " > Loading document topic probabilities: [> ] 0% ETA 00:00:00 \r", + " > Loading document topic probabilities: [===================] 100% ETA 00:00:00 \n" + ] + } + ], + "source": [ + "model = metapy.topics.TopicModel('lda-cvb0')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's have a look at our topics. A typical way of doing this is to print the top $k$ words in each topic, so let's do that." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3341, 0.1311039703325231),\n", + " (3045, 0.05434932034406297),\n", + " (2677, 0.036780095760011296),\n", + " (3346, 0.033492639884972024),\n", + " (281, 0.022530673690313033),\n", + " (3729, 0.015620482424303144),\n", + " (1953, 0.012780918673797484),\n", + " (707, 0.012635069663149857),\n", + " (592, 0.011987183284170312),\n", + " (2448, 0.01131774038055637)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.top_k(tid=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The models operate on term ids instead of raw text strings, so let's convert this to a human readable format by using the vocabulary contained in our `ForwardIndex` to map the term ids to strings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('smoke', 0.1311039703325231),\n", + " ('restaur', 0.05434932034406297),\n", + " ('peopl', 0.036780095760011296),\n", + " ('smoker', 0.033492639884972024),\n", + " ('ban', 0.022530673690313033),\n", + " ('think', 0.015620482424303144),\n", + " ('japan', 0.012780918673797484),\n", + " ('complet', 0.012635069663149857),\n", + " ('cigarett', 0.011987183284170312),\n", + " ('non', 0.01131774038055637)]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(fidx.term_text(pr[0]), pr[1]) for pr in model.top_k(tid=0)]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('time', 0.06705646364996719),\n", + " ('job', 0.056059299650279136),\n", + " ('part', 0.05222306274365633),\n", + " ('student', 0.046429384401537266),\n", + " ('colleg', 0.03488140708901945),\n", + " ('work', 0.029067480910345566),\n", + " ('money', 0.02885021953621179),\n", + " ('think', 0.0223313502030152),\n", + " ('import', 0.02075570151328543),\n", + " ('studi', 0.015483035500804767)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(fidx.term_text(pr[0]), pr[1]) for pr in model.top_k(tid=1)]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can pretty clearly see that this particular dataset was about two major issues: smoking in public and part time jobs for students. 
This dataset is actually a collection of essays written by students, and there just so happen to be two different topics they can choose from!\n", + "\n", + "The topics are pretty clear in this case, but in some cases it is also useful to score the terms in a topic using some function of the probability of the word in the topic and the probability of the word in the other topics. Intuitively, we might want to select words from each topic that best reflect that topic's content by picking words that both have high probability in that topic **and** have low probability in the other topics. In other words, we want to balance between high probability terms and highly specific terms (this is kind of like a tf-idf weighting). One such scoring function is provided by the toolkit in `BLTermScorer`, which implements a scoring function proposed by Blei and Lafferty." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('smoke', 0.8741642954704982),\n", + " ('restaur', 0.3174635157613981),\n", + " ('smoker', 0.20060264558827434),\n", + " ('ban', 0.12853037061168004),\n", + " ('cigarett', 0.06557603445386917),\n", + " ('non', 0.06128422163678793),\n", + " ('complet', 0.06105372135982501),\n", + " ('japan', 0.05846453427778453),\n", + " ('health', 0.05054834419152887),\n", + " ('seat', 0.04533989518585457)]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scorer = metapy.topics.BLTermScorer(model)\n", + "[(fidx.term_text(pr[0]), pr[1]) for pr in model.top_k(tid=0, scorer=scorer)]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('job', 0.34822052999812314),\n", + " ('part', 0.31311071220949405),\n", + " ('student', 
0.28328931730015805),\n", + " ('colleg', 0.20808996037214555),\n", + " ('time', 0.17797810952278859),\n", + " ('money', 0.16234681881133195),\n", + " ('work', 0.15585089245616857),\n", + " ('studi', 0.08228292617409116),\n", + " ('learn', 0.06491899302248028),\n", + " ('experi', 0.05494526738519956)]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(fidx.term_text(pr[0]), pr[1]) for pr in model.top_k(tid=1, scorer=scorer)]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Here we can see that the uninformative word stem \"think\" was downweighted from the word list from each topic, since it had relatively high probability in either topic." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can also see the inferred topic distribution for each document." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.topic_distribution(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "It looks like our first document was written by a student who chose the part-time job essay topic..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.topic_distribution(900)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "...whereas this document looks like it was written by a student who chose the public smoking essay topic." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "source": [ + "We can also infer topics for a brand new document. First, let's create the document and use the forward index we loaded before to convert it to a feature vector:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "doc = metapy.index.Document()\n", + "doc.content(\"I think smoking in public is bad for others' health.\")\n", + "fvec = fidx.tokenize(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's load a topic model inferencer that uses the same CVB inference method we used earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Loading topic term probabilities: [===========> ] 50% ETA 00:00:00 \r", + " > Loading topic term probabilities: [=======================] 100% ETA 00:00:00 \n" + ] + } + ], + "source": [ + "inferencer = metapy.topics.CVBInferencer('lda-cvb0.phi.bin', alpha=1.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "deletable": 
true, + "editable": true + }, + "source": [ + "Now, let's infer the topic proportions for the new document:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "proportions = inferencer.infer(fvec, max_iters=20, convergence=1e-4)\n", + "print(proportions)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/metapy/tutorials/KDD 2017.ipynb b/metapy/tutorials/KDD 2017.ipynb new file mode 100644 index 0000000000..36b99f8ad8 --- /dev/null +++ b/metapy/tutorials/KDD 2017.ipynb @@ -0,0 +1,4144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Part 1: Feature Engineering for Text Data with MeTA" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In this part of the tutorial, we'll explore how to go from raw text data to feature representations for documents using MeTA. Everything downstream depends on this representation, so it's important that we spend some time talking about the many different ways you can analyze documents into feature representations." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "First, we'll import the `metapy` python bindings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import metapy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "For reference, this tutorial was written against the following metapy version:" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.2.6'" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metapy.__version__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "If you'd like, you can tell MeTA to log to stderr so you can get progress output when running long-running function calls." + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "metapy.log_to_stderr()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's create a document with some content." + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "doc = metapy.index.Document()\n", + "doc.content(\"I said that I can't believe that it only costs $19.95!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "MeTA provides a stream-based interface for performing document tokenization. Each stream starts off with a Tokenizer object, and in most cases you should use the [Unicode standard aware](http://site.icu-project.org) `ICUTokenizer`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tok = metapy.analyzers.ICUTokenizer()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Tokenizers operate on raw text and provide an Iterable that spits out the individual text tokens. Let's try running just the `ICUTokenizer` to see what it does." + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['',\n", + " 'I',\n", + " 'said',\n", + " 'that',\n", + " 'I',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " '']" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok.set_content(doc.content()) # this could be any string\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "One thing that you likely immediately notice is the insertion of these pseudo-XML looking `<s>` and `</s>` tags. These are called \"sentence boundary tags\". As a side-effect, a default-constructed `ICUTokenizer` discovers the sentences in a document by delimiting them with the sentence boundary tags. Let's try tokenizing a multi-sentence document to see what that looks like." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['',\n", + " 'I',\n", + " 'said',\n", + " 'that',\n", + " 'I',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " '',\n", + " '',\n", + " 'I',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '$',\n", + " '30',\n", + " 'before',\n", + " '.',\n", + " '']" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc.content(\"I said that I can't believe that it only costs $19.95! I could only find it for more than $30 before.\")\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Most of the information retrieval techniques you have likely been learning about in this class don't need to concern themselves with finding the boundaries between separate sentences in a document, but later today we'll explore a scenario where this might matter more.\n", + "\n", + "Let's pass a flag to the `ICUTokenizer` constructor to disable sentence boundary tags for now." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['I',\n", + " 'said',\n", + " 'that',\n", + " 'I',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " 'I',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '$',\n", + " '30',\n", + " 'before',\n", + " '.']" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "I mentioned earlier that MeTA treats tokenization as a *streaming* process, and that it *starts* with a tokenizer. As you've learned, for optimal search performance it's often beneficial to modify the raw underlying tokens of a document, and thus change its representation, before adding it to an inverted index structure for searching.\n", + "\n", + "The \"intermediate\" steps in the tokenization stream are represented with objects called Filters. Each filter consumes the content of a previous filter (or a tokenizer) and modifies the tokens coming out of the stream in some way.\n", + "\n", + "Let's start by using a simple filter that can help eliminate a lot of noise that we might encounter when tokenizing web documents: a `LengthFilter`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['said',\n", + " 'that',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '19.95',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '30',\n", + " 'before']" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.LengthFilter(tok, min=2, max=30)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Here, we can see that the `LengthFilter` is consuming our original `ICUTokenizer`. It modifies the token stream by only emitting tokens that are of a minimum length of 2 and a maximum length of 30. This can get rid of a lot of punctuation tokens, but also excessively long tokens such as URLs." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Another common trick is to remove stopwords. (Can anyone tell me what a stopword is?) In MeTA, this is done using a `ListFilter`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File ‘lemur-stopwords.txt’ already there; not retrieving.\r\n", + "\r\n" + ] + }, + { + "data": { + "text/plain": [ + "[\"can't\", 'believe', 'costs', '19.95', 'find', '30']" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "!wget -nc https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt\n", + "\n", + "tok = metapy.analyzers.ListFilter(tok, \"lemur-stopwords.txt\", metapy.analyzers.ListFilter.Type.Reject)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Here we've downloaded a common list of stopwords obtained from the [Lemur project](http://lemurproject.org) and created a `ListFilter` to reject any tokens that occur in that list of words.\n", + "\n", + "You can see how much of a difference removing stopwords can make on the size of a document's token stream! This translates to a lot of space savings in the inverted index as well." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Another common filter that people use is called a stemmer, or lemmatizer. This kind of filter tries to modify individual tokens in such a way that different inflected forms of a word all reduce to the same representation. This lets you, for example, find documents about a \"run\" when you search \"running\" or \"runs\". A common stemmer is the [Porter2 Stemmer](http://snowball.tartarus.org/algorithms/english/stemmer.html), which MeTA has an implementation of. Let's try it!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[\"can't\", 'believ', 'cost', '19.95', 'find', '30']" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.Porter2Filter(tok)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Notice how \"believe\" becomes \"believ\" and \"costs\" becomes \"cost\". Stemming can help search by allowing queries to return more matched documents by relaxing what it means for a document to match a query term. Note that it's important to ensure that queries are tokenized in the *exact same way* as your documents were before indexing them. If you ignore this, your query is unlikely to contain the raw token \"believ\" and you'll miss a lot of results." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Finally, after you've got the token stream configured the way you'd like, it's time to analyze the document by consuming each token from its token stream and performing some actions based on these tokens. In the simplest case, which often is enough for \"good enough\" search results, our action can simply be counting how many times these tokens occur.\n", + "\n", + "For clarity, let's switch back to a simpler token stream first. Write me a token stream that tokenizes using the Unicode standard, and then lowercases each token. 
(Hint: `help(metapy.analyzers)`.)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['i',\n", + " 'said',\n", + " 'that',\n", + " 'i',\n", + " \"can't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " 'i',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '$',\n", + " '30',\n", + " 'before',\n", + " '.']" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)\n", + "tok = metapy.analyzers.LowercaseFilter(tok)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's count how often each individual token appears in the stream. You might have called this representation the \"bag of words\" representation, but it is also often called \"unigram word counts\". In MeTA, classes that consume a token stream and emit a document representation are called Analyzers." + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I said that I can't believe that it only costs $19.95! 
I could only find it for more than $30 before.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'!': 1,\n", + " '$': 2,\n", + " '.': 1,\n", + " '19.95': 1,\n", + " '30': 1,\n", + " 'before': 1,\n", + " 'believe': 1,\n", + " \"can't\": 1,\n", + " 'costs': 1,\n", + " 'could': 1,\n", + " 'find': 1,\n", + " 'for': 1,\n", + " 'i': 3,\n", + " 'it': 2,\n", + " 'more': 1,\n", + " 'only': 2,\n", + " 'said': 1,\n", + " 'than': 1,\n", + " 'that': 2}" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana = metapy.analyzers.NGramWordAnalyzer(1, tok)\n", + "print(doc.content())\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "If you noticed the name of the analyzer, you might have realized that you can count not just individual tokens, but groups of them. \"Unigram\" means \"1-gram\", and we count individual tokens. \"Bigram\" means \"2-gram\", and we count adjacent tokens together as a group. Let's try that now." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{('!', 'i'): 1,\n", + " ('$', '19.95'): 1,\n", + " ('$', '30'): 1,\n", + " ('19.95', '!'): 1,\n", + " ('30', 'before'): 1,\n", + " ('before', '.'): 1,\n", + " ('believe', 'that'): 1,\n", + " (\"can't\", 'believe'): 1,\n", + " ('costs', '$'): 1,\n", + " ('could', 'only'): 1,\n", + " ('find', 'it'): 1,\n", + " ('for', 'more'): 1,\n", + " ('i', \"can't\"): 1,\n", + " ('i', 'could'): 1,\n", + " ('i', 'said'): 1,\n", + " ('it', 'for'): 1,\n", + " ('it', 'only'): 1,\n", + " ('more', 'than'): 1,\n", + " ('only', 'costs'): 1,\n", + " ('only', 'find'): 1,\n", + " ('said', 'that'): 1,\n", + " ('than', '$'): 1,\n", + " ('that', 'i'): 1,\n", + " ('that', 'it'): 1}" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana = metapy.analyzers.NGramWordAnalyzer(2, tok)\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now the individual \"tokens\" we're counting are pairs of tokens. You can analyze any n-gram of tokens you would like to in this way (and this is a simple way to attempt to support phrase search). Note, however, that as you increase the size of the n-grams you are counting, you are also increasing (exponentially!) the number of possible n-grams you could observe, so there's no free lunch here.\n", + "\n", + "This analysis pipeline feeds both the creation of the `InvertedIndex`, which is used for search applications, and the `ForwardIndex`, which is used for topic modeling and classification applications. For classification, sometimes looking at n-grams of characters is useful." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{(' ', '$', '1', '9'): 1,\n", + " (' ', '$', '3', '0'): 1,\n", + " (' ', 'I', ' ', 'c'): 2,\n", + " (' ', 'b', 'e', 'f'): 1,\n", + " (' ', 'b', 'e', 'l'): 1,\n", + " (' ', 'c', 'a', 'n'): 1,\n", + " (' ', 'c', 'o', 's'): 1,\n", + " (' ', 'c', 'o', 'u'): 1,\n", + " (' ', 'f', 'i', 'n'): 1,\n", + " (' ', 'f', 'o', 'r'): 1,\n", + " (' ', 'i', 't', ' '): 2,\n", + " (' ', 'm', 'o', 'r'): 1,\n", + " (' ', 'o', 'n', 'l'): 2,\n", + " (' ', 's', 'a', 'i'): 1,\n", + " (' ', 't', 'h', 'a'): 3,\n", + " ('!', ' ', 'I', ' '): 1,\n", + " ('$', '1', '9', '.'): 1,\n", + " ('$', '3', '0', ' '): 1,\n", + " (\"'\", 't', ' ', 'b'): 1,\n", + " ('.', '9', '5', '!'): 1,\n", + " ('0', ' ', 'b', 'e'): 1,\n", + " ('1', '9', '.', '9'): 1,\n", + " ('3', '0', ' ', 'b'): 1,\n", + " ('5', '!', ' ', 'I'): 1,\n", + " ('9', '.', '9', '5'): 1,\n", + " ('9', '5', '!', ' '): 1,\n", + " ('I', ' ', 'c', 'a'): 1,\n", + " ('I', ' ', 'c', 'o'): 1,\n", + " ('I', ' ', 's', 'a'): 1,\n", + " ('a', 'i', 'd', ' '): 1,\n", + " ('a', 'n', ' ', '$'): 1,\n", + " ('a', 'n', \"'\", 't'): 1,\n", + " ('a', 't', ' ', 'I'): 1,\n", + " ('a', 't', ' ', 'i'): 1,\n", + " ('b', 'e', 'f', 'o'): 1,\n", + " ('b', 'e', 'l', 'i'): 1,\n", + " ('c', 'a', 'n', \"'\"): 1,\n", + " ('c', 'o', 's', 't'): 1,\n", + " ('c', 'o', 'u', 'l'): 1,\n", + " ('d', ' ', 'i', 't'): 1,\n", + " ('d', ' ', 'o', 'n'): 1,\n", + " ('d', ' ', 't', 'h'): 1,\n", + " ('e', ' ', 't', 'h'): 2,\n", + " ('e', 'f', 'o', 'r'): 1,\n", + " ('e', 'l', 'i', 'e'): 1,\n", + " ('e', 'v', 'e', ' '): 1,\n", + " ('f', 'i', 'n', 'd'): 1,\n", + " ('f', 'o', 'r', ' '): 1,\n", + " ('f', 'o', 'r', 'e'): 1,\n", + " ('h', 'a', 'n', ' '): 1,\n", + " ('h', 'a', 't', ' '): 2,\n", + " ('i', 'd', ' ', 't'): 1,\n", + " ('i', 'e', 'v', 'e'): 1,\n", + " ('i', 'n', 'd', ' '): 1,\n", + " ('i', 't', ' 
', 'f'): 1,\n", + " ('i', 't', ' ', 'o'): 1,\n", + " ('l', 'd', ' ', 'o'): 1,\n", + " ('l', 'i', 'e', 'v'): 1,\n", + " ('l', 'y', ' ', 'c'): 1,\n", + " ('l', 'y', ' ', 'f'): 1,\n", + " ('m', 'o', 'r', 'e'): 1,\n", + " ('n', ' ', '$', '3'): 1,\n", + " ('n', \"'\", 't', ' '): 1,\n", + " ('n', 'd', ' ', 'i'): 1,\n", + " ('n', 'l', 'y', ' '): 2,\n", + " ('o', 'n', 'l', 'y'): 2,\n", + " ('o', 'r', ' ', 'm'): 1,\n", + " ('o', 'r', 'e', ' '): 1,\n", + " ('o', 'r', 'e', '.'): 1,\n", + " ('o', 's', 't', 's'): 1,\n", + " ('o', 'u', 'l', 'd'): 1,\n", + " ('r', ' ', 'm', 'o'): 1,\n", + " ('r', 'e', ' ', 't'): 1,\n", + " ('s', ' ', '$', '1'): 1,\n", + " ('s', 'a', 'i', 'd'): 1,\n", + " ('s', 't', 's', ' '): 1,\n", + " ('t', ' ', 'I', ' '): 1,\n", + " ('t', ' ', 'b', 'e'): 1,\n", + " ('t', ' ', 'f', 'o'): 1,\n", + " ('t', ' ', 'i', 't'): 1,\n", + " ('t', ' ', 'o', 'n'): 1,\n", + " ('t', 'h', 'a', 'n'): 1,\n", + " ('t', 'h', 'a', 't'): 2,\n", + " ('t', 's', ' ', '$'): 1,\n", + " ('u', 'l', 'd', ' '): 1,\n", + " ('v', 'e', ' ', 't'): 1,\n", + " ('y', ' ', 'c', 'o'): 1,\n", + " ('y', ' ', 'f', 'i'): 1}" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.CharacterTokenizer()\n", + "ana = metapy.analyzers.NGramWordAnalyzer(4, tok)\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Different analyzers can be combined together to create document representations that have many unique perspectives. Once things start to get more complicated, we recommend using a configuration file to specify each of the analyzers you wish to combine for your document representation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's explore something a little bit different. 
MeTA also has a natural language processing (NLP) component, which currently supports two major NLP tasks: part-of-speech tagging and syntactic parsing.\n", + "\n", + "(Does anyone know what part-of-speech tagging is?) POS tagging is a task in NLP that involves identifying a type for each word in a sentence. For example, POS tagging can be used to identify all of the nouns in a sentence, or all of the verbs, or adjectives, or... This is useful as a first step towards developing an understanding of the meaning of a particular sentence." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "MeTA places its POS tagging component in its \"sequences\" library. Let's play with some sequences first to get an idea of how they work. We'll start off by creating a sequence." + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq = metapy.sequence.Sequence()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, we can add individual words to this sequence. Sequences consist of a list of `Observation`s, which are essentially (word, tag) pairs. If we don't yet know the tags for a `Sequence`, we can just add individual words and leave the tags unset. Words are called \"symbols\" in the library terminology." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(The, ???), (dog, ???), (ran, ???), (across, ???), (the, ???), (park, ???), (., ???)\n" + ] + } + ], + "source": [ + "for word in [\"The\", \"dog\", \"ran\", \"across\", \"the\", \"park\", \".\"]:\n", + " seq.add_symbol(word)\n", + "print(seq)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The printed form of the sequence shows that we do not yet know the tags for each word. Let's fill them in by using a pre-trained POS-tagger model that's distributed with MeTA." + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File ‘greedy-perceptron-tagger.tar.gz’ already there; not retrieving.\n", + "\n", + "perceptron-tagger/\n", + "perceptron-tagger/feature.mapping.gz\n", + "perceptron-tagger/label.mapping\n", + "perceptron-tagger/tagger.model.gz\n" + ] + } + ], + "source": [ + "!wget -nc https://github.com/meta-toolkit/meta/releases/download/v3.0.1/greedy-perceptron-tagger.tar.gz\n", + "!tar xvf greedy-perceptron-tagger.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " > Loading feature mapping: [================================] 100% ETA 00:00:00 \n", + " \n" + ] + } + ], + "source": [ + "tagger = metapy.sequence.PerceptronTagger(\"perceptron-tagger/\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now let's fill in the missing tags in our sentence based 
on the best guess this model has." + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(The, DT), (dog, NN), (ran, VBD), (across, IN), (the, DT), (park, NN), (., .)\n" + ] + } + ], + "source": [ + "tagger.tag(seq)\n", + "print(seq)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Each tag indicates the type of a word, and this particular tagger was trained to output the tags present in the [Penn Treebank tagset](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html).\n", + "\n", + "But what if we want to POS-tag a document?" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I said that I can't believe that it only costs $19.95! I could only find it for more than $30 before.\n" + ] + } + ], + "source": [ + "print(doc.content())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We need a way of going from a document to a list of `Sequence`s, each representing an individual sentence. I'll get you started." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['<s>',\n", + " 'I',\n", + " 'said',\n", + " 'that',\n", + " 'I',\n", + " 'ca',\n", + " \"n't\",\n", + " 'believe',\n", + " 'that',\n", + " 'it',\n", + " 'only',\n", + " 'costs',\n", + " '$',\n", + " '19.95',\n", + " '!',\n", + " '</s>',\n", + " '<s>',\n", + " 'I',\n", + " 'could',\n", + " 'only',\n", + " 'find',\n", + " 'it',\n", + " 'for',\n", + " 'more',\n", + " 'than',\n", + " '$',\n", + " '30',\n", + " 'before',\n", + " '.',\n", + " '</s>']" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.ICUTokenizer() # keep sentence boundaries!\n", + "tok = metapy.analyzers.PennTreebankNormalizer(tok)\n", + "tok.set_content(doc.content())\n", + "[token for token in tok]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "(Notice that the `PennTreebankNormalizer` modifies some tokens to better match the conventions of the Penn Treebank training data. This should help improve performance a little.)\n", + "\n", + "Now, write me a function that can take a token stream that contains sentence boundary tags and return a list of `Sequence` objects. Don't include the sentence boundary tags in the actual `Sequence` objects."
+ ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def extract_sequences(tok):\n", + " sequences = []\n", + " for token in tok:\n", + " if token == '<s>':\n", + " sequences.append(metapy.sequence.Sequence())\n", + " elif token != '</s>':\n", + " sequences[-1].add_symbol(token) \n", + " return sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(I, PRP), (said, VBD), (that, IN), (I, PRP), (ca, MD), (n't, RB), (believe, VB), (that, IN), (it, PRP), (only, RB), (costs, VBZ), ($, $), (19.95, CD), (!, .)\n", + "(I, PRP), (could, MD), (only, RB), (find, VB), (it, PRP), (for, IN), (more, JJR), (than, IN), ($, $), (30, CD), (before, IN), (., .)\n" + ] + } + ], + "source": [ + "tok.set_content(doc.content())\n", + "for seq in extract_sequences(tok):\n", + " tagger.tag(seq)\n", + " print(seq)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This is still a rather shallow understanding of these sentences. The next major leap is to parse these sequences of POS-tagged words to obtain a tree for each sentence. These trees, in our case, will represent the hierarchical phrase structure of a single sentence by grouping together tokens that belong to one phrase together, and showing how small phrases combine into larger phrases, and eventually a sentence.\n", + "\n", + "Let's try parsing the sentences in our document using a pre-trained constituency parser that's distributed with MeTA."
+ ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File ‘greedy-constituency-parser.tar.gz’ already there; not retrieving.\n", + "\n", + "parser/\n", + "parser/parser.trans.gz\n", + "parser/parser.model.gz\n" + ] + } + ], + "source": [ + "!wget -nc https://github.com/meta-toolkit/meta/releases/download/v3.0.1/greedy-constituency-parser.tar.gz\n", + "!tar xvf greedy-constituency-parser.tar.gz" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "parser = metapy.parser.Parser(\"parser/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I could only find it for more than $ 30 before .\n", + "(I, PRP), (could, MD), (only, RB), (find, VB), (it, PRP), (for, IN), (more, JJR), (than, IN), ($, $), (30, CD), (before, IN), (., .)\n", + "(ROOT\n", + " (S\n", + " (NP (PRP I))\n", + " (VP\n", + " (MD could)\n", + " (ADVP (RB only))\n", + " (VP\n", + " (VB find)\n", + " (NP (PRP it))\n", + " (PP\n", + " (IN for)\n", + " (NP\n", + " (QP\n", + " (JJR more)\n", + " (IN than)\n", + " ($ $)\n", + " (CD 30))))\n", + " (ADVP (IN before))))\n", + " (. 
.)))\n", + "\n" + ] + } + ], + "source": [ + "print(' '.join([obs.symbol for obs in seq]))\n", + "print(seq)\n", + "tree = parser.parse(seq)\n", + "print(tree.pretty_str())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "(You can also play with this with a [prettier online demo](https://meta-toolkit.org/nlp-demo.html).)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can now parse all of the sentences in our document." + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(ROOT\n", + " (S\n", + " (NP (PRP I))\n", + " (VP\n", + " (VBD said)\n", + " (SBAR\n", + " (IN that)\n", + " (S\n", + " (NP (PRP I))\n", + " (VP\n", + " (MD ca)\n", + " (RB n't)\n", + " (VP\n", + " (VB believe)\n", + " (SBAR\n", + " (IN that)\n", + " (S\n", + " (NP (PRP it))\n", + " (ADVP (RB only))\n", + " (VP\n", + " (VBZ costs)\n", + " (NP\n", + " ($ $)\n", + " (CD 19.95))))))))))\n", + " (. !)))\n", + "\n", + "(ROOT\n", + " (S\n", + " (NP (PRP I))\n", + " (VP\n", + " (MD could)\n", + " (ADVP (RB only))\n", + " (VP\n", + " (VB find)\n", + " (NP (PRP it))\n", + " (PP\n", + " (IN for)\n", + " (NP\n", + " (QP\n", + " (JJR more)\n", + " (IN than)\n", + " ($ $)\n", + " (CD 30))))\n", + " (ADVP (IN before))))\n", + " (. 
.)))\n", + "\n" + ] + } + ], + "source": [ + "tok.set_content(doc.content())\n", + "for seq in extract_sequences(tok):\n", + " tagger.tag(seq)\n", + " print(parser.parse(seq).pretty_str())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now that we know how POS-tagging and syntactic parsing work in MeTA, let's explore some features that we can add to our document representations using these techniques.\n", + "\n", + "The simplest feature we can imagine that uses the POS-tagged sequences might be n-grams of POS tags. (As a quick detour, we'll need to download and extract a CRF-based POS tagging model.)" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File ‘crf.tar.gz’ already there; not retrieving.\r\n", + "\r\n" + ] + } + ], + "source": [ + "!wget -nc https://github.com/meta-toolkit/meta/releases/download/v3.0.1/crf.tar.gz\n", + "!tar xf crf.tar.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, we can use the following analysis pipeline to get n-gram POS tag features by using the `NGramPOSAnalyzer`:" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " > Loading feature mapping: [================================] 100% ETA 00:00:00 \n", + " \n" + ] + }, + { + "data": { + "text/plain": [ + "{('$', 'CD'): 2,\n", + " ('CD', '.'): 1,\n", + " ('CD', 'RB'): 1,\n", + " ('IN', '$'): 1,\n", + " ('IN', 'JJR'): 1,\n", + " ('IN', 'PRP'): 2,\n", + " ('JJR', 'IN'): 1,\n", + " ('MD', 'RB'): 2,\n", + " ('PRP', 'IN'): 1,\n", + " ('PRP', 'MD'): 2,\n", + " ('PRP', 'RB'): 1,\n", + " ('PRP',
'VBD'): 1,\n", + " ('RB', '.'): 1,\n", + " ('RB', 'VB'): 2,\n", + " ('RB', 'VBZ'): 1,\n", + " ('VB', 'IN'): 1,\n", + " ('VB', 'PRP'): 1,\n", + " ('VBD', 'IN'): 1,\n", + " ('VBZ', '$'): 1}" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = metapy.analyzers.ICUTokenizer()\n", + "tok = metapy.analyzers.PennTreebankNormalizer(tok)\n", + "ana = metapy.analyzers.NGramPOSAnalyzer(2, tok, 'crf')\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can also parse the sentences in the document and extract a number of different structural features from the parse trees using a `TreeAnalyzer`." + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " > Loading feature mapping: [================================] 100% ETA 00:00:00 \n", + " \n" + ] + } + ], + "source": [ + "ana = metapy.analyzers.TreeAnalyzer(tok, 'perceptron-tagger', 'parser')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The `TreeAnalyzer` has a function `add()` that takes `TreeFeaturizer` subclasses. Conceptually, the extraction of structural features from parse trees looks something like this:\n", + "\n", + "1. The tokenizer is run until a full sentence is read.\n", + "2. The greedy perceptron tagger is run to tag the words in the sentence.\n", + "3. The shift-reduce constituency parser is run to produce a parse tree.\n", + "4. 
Each `TreeFeaturizer` that is part of the `TreeAnalyzer` is run over the parse tree to produce features.\n", + "\n", + "This process is repeated for each sentence found in the document.\n", + "\n", + "Let's try adding just one `TreeFeaturizer` to the analyzer for now and see what features we get." + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'depth-12': 1, 'depth-8': 1}" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana.add(metapy.analyzers.DepthFeaturizer())\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The featurizer we used here simply extracts the depth of each subtree and creates a new feature for each depth encountered.\n", + "\n", + "We can also see some features that utilize the structure of the trees if we use some different `TreeFeaturizer`s."
+ ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " > Loading feature mapping: [================================] 100% ETA 00:00:00 \n", + " \n" + ] + }, + { + "data": { + "text/plain": [ + "{'subtree-($)': 2,\n", + " 'subtree-(.)': 2,\n", + " 'subtree-(ADVP (IN))': 1,\n", + " 'subtree-(ADVP (RB))': 2,\n", + " 'subtree-(CD)': 2,\n", + " 'subtree-(IN)': 5,\n", + " 'subtree-(JJR)': 1,\n", + " 'subtree-(MD)': 2,\n", + " 'subtree-(NP ($) (CD))': 1,\n", + " 'subtree-(NP (PRP))': 5,\n", + " 'subtree-(NP (QP))': 1,\n", + " 'subtree-(PP (IN) (NP))': 1,\n", + " 'subtree-(PRP)': 5,\n", + " 'subtree-(QP (JJR) (IN) ($) (CD))': 1,\n", + " 'subtree-(RB)': 3,\n", + " 'subtree-(ROOT (S))': 2,\n", + " 'subtree-(S (NP) (ADVP) (VP))': 1,\n", + " 'subtree-(S (NP) (VP) (.))': 2,\n", + " 'subtree-(S (NP) (VP))': 1,\n", + " 'subtree-(SBAR (IN) (S))': 2,\n", + " 'subtree-(VB)': 2,\n", + " 'subtree-(VBD)': 1,\n", + " 'subtree-(VBZ)': 1,\n", + " 'subtree-(VP (MD) (ADVP) (VP))': 1,\n", + " 'subtree-(VP (MD) (RB) (VP))': 1,\n", + " 'subtree-(VP (VB) (NP) (PP) (ADVP))': 1,\n", + " 'subtree-(VP (VB) (SBAR))': 1,\n", + " 'subtree-(VP (VBD) (SBAR))': 1,\n", + " 'subtree-(VP (VBZ) (NP))': 1}" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana = metapy.analyzers.TreeAnalyzer(tok, 'perceptron-tagger', 'parser')\n", + "ana.add(metapy.analyzers.SubtreeFeaturizer())\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The `SubtreeFeaturizer` creates a new feature for each unique subtree seen in the data, to a depth of 1. This can create quite a lot of features, but describes how the sentence is decomposed structurally.
This kind of feature is also known as a \"rewrite rule\" feature.\n", + "\n", + "We can also ignore the labels of the subtrees entirely and just extract their structure if we use a `SkeletonFeaturizer`." + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " > Loading feature mapping: [================================] 100% ETA 00:00:00 \n", + " \n" + ] + }, + { + "data": { + "text/plain": [ + "{'(((())(()(()((())(()()(()(()((())(())(()(()())))))))))()))': 1,\n", + " '(((())(()(())(()(())(()((()()()())))(())))()))': 1,\n", + " '((()()()()))': 1,\n", + " '((())(()(()((())(()()(()(()((())(())(()(()())))))))))())': 1,\n", + " '((())(()(())(()(())(()((()()()())))(())))())': 1,\n", + " '((())(()()(()(()((())(())(()(()())))))))': 1,\n", + " '((())(())(()(()())))': 1,\n", + " '(()((()()()())))': 1,\n", + " '(()((())(()()(()(()((())(())(()(()()))))))))': 1,\n", + " '(()((())(())(()(()()))))': 1,\n", + " '(()(()((())(()()(()(()((())(())(()(()())))))))))': 1,\n", + " '(()(()((())(())(()(()())))))': 1,\n", + " '(()(()()))': 1,\n", + " '(()(())(()((()()()())))(()))': 1,\n", + " '(()(())(()(())(()((()()()())))(())))': 1,\n", + " '(()()(()(()((())(())(()(()()))))))': 1,\n", + " '(()()()())': 1,\n", + " '(()())': 1,\n", + " '(())': 8,\n", + " '()': 26}" + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana = metapy.analyzers.TreeAnalyzer(tok, 'perceptron-tagger', 'parser')\n", + "ana.add(metapy.analyzers.SkeletonFeaturizer())\n", + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Play with the other featurizers to see what they do!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In practice, it is often beneficial to combine multiple feature sets together. We can do this with a `MultiAnalyzer`. Let's combine unigram words, bigram POS tags, and rewrite rules for our document feature representation.\n", + "\n", + "We can certainly do this programmatically, but doing so can become tedious quite quickly. Instead, let's use MeTA's configuration file format to specify our analyzer, which we can then load in one line of code. MeTA uses [TOML](https://en.wikipedia.org/wiki/TOML) configuration files for all of its configuration. If you haven't heard of TOML before, don't panic! It's a very simple, readable format that looks like old school INI files.\n", + "\n", + "Let's create a simple configuration file now." + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "config = \"\"\"stop-words = \"lemur-stopwords.txt\"\n", + "\n", + "[[analyzers]]\n", + "method = \"ngram-word\"\n", + "ngram = 1\n", + "filter = \"default-unigram-chain\"\n", + "\n", + "[[analyzers]]\n", + "method = \"ngram-pos\"\n", + "ngram = 2\n", + "filter = [{type = \"icu-tokenizer\"}, {type = \"ptb-normalizer\"}]\n", + "crf-prefix = \"crf\"\n", + "\n", + "[[analyzers]]\n", + "method = \"tree\"\n", + "filter = [{type = \"icu-tokenizer\"}, {type = \"ptb-normalizer\"}]\n", + "features = [\"subtree\"]\n", + "tagger = \"perceptron-tagger/\"\n", + "parser = \"parser/\"\n", + "\"\"\"\n", + "with open('config.toml', 'w') as f:\n", + " f.write(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Each `[[analyzers]]` block defines another analyzer to combine for our feature representation. 
Since \"ngram-word\" is such a common analyzer, we have defined some default filter chains that can be used with shortcuts. \"default-unigram-chain\" is a filter chain suitable for unigram words; \"default-chain\" is a filter chain suitable for bigram words and above.\n", + "\n", + "We can now load an analyzer from this configuration file like so:" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " > Loading feature mapping: [================================] 100% ETA 00:00:00 \n", + " \n", + " > Loading feature mapping: [================================] 100% ETA 00:00:00 \n", + " \n" + ] + } + ], + "source": [ + "ana = metapy.analyzers.load('config.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now let's see what we get!" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'$_CD': 2,\n", + " 'CD_.': 1,\n", + " 'CD_RB': 1,\n", + " 'IN_$': 1,\n", + " 'IN_JJR': 1,\n", + " 'IN_PRP': 2,\n", + " 'JJR_IN': 1,\n", + " 'MD_RB': 2,\n", + " 'PRP_IN': 1,\n", + " 'PRP_MD': 2,\n", + " 'PRP_RB': 1,\n", + " 'PRP_VBD': 1,\n", + " 'RB_.': 1,\n", + " 'RB_VB': 2,\n", + " 'RB_VBZ': 1,\n", + " 'VBD_IN': 1,\n", + " 'VBZ_$': 1,\n", + " 'VB_IN': 1,\n", + " 'VB_PRP': 1,\n", + " 'believ': 1,\n", + " \"can't\": 1,\n", + " 'cost': 1,\n", + " 'find': 1,\n", + " 'subtree-($)': 2,\n", + " 'subtree-(.)': 2,\n", + " 'subtree-(ADVP (IN))': 1,\n", + " 'subtree-(ADVP (RB))': 2,\n", + " 'subtree-(CD)': 2,\n", + " 'subtree-(IN)': 5,\n", + " 'subtree-(JJR)': 1,\n", + " 'subtree-(MD)': 2,\n", + " 'subtree-(NP ($) (CD))': 1,\n", + " 'subtree-(NP (PRP))': 5,\n", + " 'subtree-(NP (QP))': 1,\n", + " 'subtree-(PP 
(IN) (NP))': 1,\n", + " 'subtree-(PRP)': 5,\n", + " 'subtree-(QP (JJR) (IN) ($) (CD))': 1,\n", + " 'subtree-(RB)': 3,\n", + " 'subtree-(ROOT (S))': 2,\n", + " 'subtree-(S (NP) (ADVP) (VP))': 1,\n", + " 'subtree-(S (NP) (VP) (.))': 2,\n", + " 'subtree-(S (NP) (VP))': 1,\n", + " 'subtree-(SBAR (IN) (S))': 2,\n", + " 'subtree-(VB)': 2,\n", + " 'subtree-(VBD)': 1,\n", + " 'subtree-(VBZ)': 1,\n", + " 'subtree-(VP (MD) (ADVP) (VP))': 1,\n", + " 'subtree-(VP (MD) (RB) (VP))': 1,\n", + " 'subtree-(VP (VB) (NP) (PP) (ADVP))': 1,\n", + " 'subtree-(VP (VB) (SBAR))': 1,\n", + " 'subtree-(VP (VBD) (SBAR))': 1,\n", + " 'subtree-(VP (VBZ) (NP))': 1}" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ana.analyze(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Part 2: Information Retrieval with MeTA" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In this part of the tutorial, we'll play with the first major application of MeTA: search engines. We will be having the first contest in this part! Once we finish going through how to create an inverted index, search it, and evaluate retrieval algorithms, I will give you instructions on how to participate in the competition. There will be a leader board to keep track of the best submissions, and I intend on leaving it running until the end of the conference for people to play around with." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's get a publicly available retrieval dataset with relevance judgments first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2017-08-16 19:19:41-- https://meta-toolkit.org/data/2016-11-10/cranfield.tar.gz\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving meta-toolkit.org... 50.116.41.177, 2600:3c02::f03c:91ff:feae:b777\n", + "Connecting to meta-toolkit.org|50.116.41.177|:443... connected.\n", + "HTTP request sent, awaiting response... 304 Not Modified\n", + "File ‘cranfield.tar.gz’ not modified on server. Omitting download.\n", + "\n" + ] + } + ], + "source": [ + "!wget -N https://meta-toolkit.org/data/2016-11-10/cranfield.tar.gz\n", + "!tar xf cranfield.tar.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We're going to add a flag to our corpus' configuration file to force it to store full text for later." + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with open('cranfield/tutorial.toml', 'w') as f:\n", + " f.write('type = \"line-corpus\"\\n')\n", + " f.write('store-full-text = true\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's set up a MeTA configuration file to index the `cranfield` dataset we just downloaded using the default unigram words filter chain."
+ ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "config = \"\"\"prefix = \".\" # tells MeTA where to search for datasets\n", + "\n", + "dataset = \"cranfield\" # a subfolder under the prefix directory\n", + "corpus = \"tutorial.toml\" # a configuration file for the corpus specifying its format & additional args\n", + "\n", + "index = \"cranfield-idx\" # subfolder of the current working directory to place index files\n", + "\n", + "query-judgements = \"cranfield/cranfield-qrels.txt\" # file containing the relevance judgments for this dataset\n", + "\n", + "stop-words = \"lemur-stopwords.txt\"\n", + "\n", + "[[analyzers]]\n", + "method = \"ngram-word\"\n", + "ngram = 1\n", + "filter = \"default-unigram-chain\"\n", + "\"\"\"\n", + "with open('cranfield-config.toml', 'w') as f:\n", + " f.write(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's index our data using the `InvertedIndex` format. In a search engine, we want to quickly determine what documents mention a specific query term, so the `InvertedIndex` stores a mapping from term to a list of documents that contain that term (along with how many times they do)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1502921982: [info] Loading index from disk: cranfield-idx/inv (/tmp/pip-bneszy3v-build/deps/meta/src/index/inverted_index.cpp:171)\n", + "1502921982: [info] Loading index from disk: cranfield-idx/inv (/tmp/pip-bneszy3v-build/deps/meta/src/index/inverted_index.cpp:171)\n" + ] + } + ], + "source": [ + "inv_idx = metapy.index.make_inverted_index('cranfield-config.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This may take a minute at first, since the index needs to be built. Subsequent calls to `make_inverted_index` with this config file will simply load the index, which will not take any time.\n", + "\n", + "Here's how we can interact with the index object:" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1400" + ] + }, + "execution_count": 126, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inv_idx.num_docs()" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "4137" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inv_idx.unique_terms()" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "87.17857360839844" + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"inv_idx.avg_doc_length()" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "122050" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inv_idx.total_corpus_terms()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's search our index. We'll start by creating a ranker:" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ranker = metapy.index.OkapiBM25()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now we need a query. Let's create an example query." + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "query = metapy.index.Document()\n", + "query.content(\"flow equilibrium\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now we can use this to search our index like so:" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(235, 6.424363136291504),\n", + " (1009, 6.096038818359375),\n", + " (1229, 5.877272129058838),\n", + " (1251, 5.866937160491943),\n", + " (316, 5.859640121459961)]" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_docs = ranker.score(inv_idx, query, num_results=5)\n", + "top_docs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": 
true + }, + "source": [ + "We are returned a ranked list of *(doc_id, score)* pairs. The scores are from the ranker, which in this case was Okapi BM25. Since the `tutorial.toml` file we created for the cranfield dataset has `store-full-text = true`, we can verify the content of our top documents by inspecting the document metadata field \"content\"." + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. criteria for thermodynamic equilibrium in gas flow . when gases flow at high velocity, the rates of internal processes may not be fast enough to maintain thermodynamic equilibrium . by defining quasi-equilibrium in flow as the condition in which the...\n", + "\n", + "2. free-flight measurements of the static and dynamic . air-flow properties in nozzles were calculated and charted for equilibrium flow and two types of frozen flows . in one type of frozen flow, air was assumed to be in equilibrium from the nozzle res...\n", + "\n", + "3. hypersonic nozzle expansion of air with atom recombination present . an experimental investigation on the expansion of high- temperature, high-pressure air to hypersonic flow mach numbers in a conical nozzle of a hypersonic shock tunnel has been carr...\n", + "\n", + "4. on the approach to chemical and vibrational equilibrium behind a strong normal shock wave . the concurrent approach to chemical and vibrational equilibrium of a pure diatomic gas passing through a strong normal shock wave is investigated . it is dem...\n", + "\n", + "5. non-equilibrium flow of an ideal dissociating gas . 
the theory of an'ideal dissociating'gas developed by lighthill/1957/for conditions of thermodynamic equilibrium is extended to non-equilibrium conditions by postulating a simple rate equation for th...\n", + "\n" + ] + } + ], + "source": [ + "for num, (d_id, _) in enumerate(top_docs):\n", + " content = inv_idx.metadata(d_id).get('content')\n", + " print(\"{}. {}...\\n\".format(num + 1, content[0:250]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Since we have the queries file and relevance judgements, we can do an IR evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ev = metapy.index.IREval('cranfield-config.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We will loop over the queries file and add each result to the `IREval` object `ev`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query 1 average precision: 0.24166666666666664\n", + "Query 2 average precision: 0.4196428571428571\n", + "Query 3 average precision: 0.6383928571428572\n", + "Query 4 average precision: 0.25\n", + "Query 5 average precision: 0.3333333333333333\n", + "Query 6 average precision: 0.125\n", + "Query 7 average precision: 0.11666666666666665\n", + "Query 8 average precision: 0.1\n", + "Query 9 average precision: 0.6388888888888888\n", + "Query 10 average precision: 0.0625\n", + "Query 11 average precision: 0.09285714285714286\n", + "Query 12 average precision: 0.18\n", + "Query 13 average precision: 0.0\n", + "Query 14 average precision: 0.5\n", + "Query 15 average precision: 1.0\n", + "Query 16 average precision: 0.16666666666666666\n", + "Query 17 average precision: 0.08333333333333333\n", + "Query 18 average precision: 0.3333333333333333\n", + "Query 19 average precision: 0.0\n", + "Query 20 average precision: 0.4302469135802469\n", + "Query 21 average precision: 0.0\n", + "Query 22 average precision: 0.0\n", + "Query 23 average precision: 0.19952380952380952\n", + "Query 24 average precision: 0.3333333333333333\n", + "Query 25 average precision: 0.6507936507936507\n", + "Query 26 average precision: 0.19444444444444442\n", + "Query 27 average precision: 0.12962962962962962\n", + "Query 28 average precision: 0.0\n", + "Query 29 average precision: 0.35\n", + "Query 30 average precision: 0.023809523809523808\n", + "Query 31 average precision: 0.0\n", + "Query 32 average precision: 0.1111111111111111\n", + "Query 33 average precision: 0.6388888888888888\n", + "Query 34 average precision: 0.1111111111111111\n", + "Query 35 average precision: 0.0\n", + "Query 36 average precision: 0.5\n", + "Query 37 average precision: 0.0\n", + "Query 38 average 
precision: 0.0\n", + "Query 39 average precision: 0.1\n", + "Query 40 average precision: 0.045\n", + "Query 41 average precision: 0.6666666666666666\n", + "Query 42 average precision: 0.16714285714285712\n", + "Query 43 average precision: 0.4583333333333333\n", + "Query 44 average precision: 0.0\n", + "Query 45 average precision: 0.1\n", + "Query 46 average precision: 0.4058333333333334\n", + "Query 47 average precision: 0.27341269841269844\n", + "Query 48 average precision: 0.17666666666666667\n", + "Query 49 average precision: 0.1\n", + "Query 50 average precision: 0.05555555555555555\n", + "Query 51 average precision: 0.4730952380952381\n", + "Query 52 average precision: 0.47916666666666663\n", + "Query 53 average precision: 0.15222222222222223\n", + "Query 54 average precision: 0.05555555555555555\n", + "Query 55 average precision: 0.19444444444444445\n", + "Query 56 average precision: 0.1\n", + "Query 57 average precision: 0.03333333333333333\n", + "Query 58 average precision: 0.0380952380952381\n", + "Query 59 average precision: 0.027777777777777776\n", + "Query 60 average precision: 0.42000000000000004\n", + "Query 61 average precision: 0.5638888888888889\n", + "Query 62 average precision: 0.0\n", + "Query 63 average precision: 0.0\n", + "Query 64 average precision: 0.5\n", + "Query 65 average precision: 0.24\n", + "Query 66 average precision: 0.02857142857142857\n", + "Query 67 average precision: 0.575\n", + "Query 68 average precision: 0.04\n", + "Query 69 average precision: 0.02857142857142857\n", + "Query 70 average precision: 0.05\n", + "Query 71 average precision: 0.017857142857142856\n", + "Query 72 average precision: 0.12\n", + "Query 73 average precision: 0.4680952380952381\n", + "Query 74 average precision: 0.020833333333333332\n", + "Query 75 average precision: 0.0\n", + "Query 76 average precision: 0.07142857142857142\n", + "Query 77 average precision: 0.2333333333333333\n", + "Query 78 average precision: 0.7222222222222222\n", + "Query 79 
average precision: 0.0\n", + "Query 80 average precision: 0.0\n", + "Query 81 average precision: 0.75\n", + "Query 82 average precision: 0.24\n", + "Query 83 average precision: 0.0625\n", + "Query 84 average precision: 0.3\n", + "Query 85 average precision: 0.25\n", + "Query 86 average precision: 0.5833333333333333\n", + "Query 87 average precision: 0.0\n", + "Query 88 average precision: 0.6496031746031746\n", + "Query 89 average precision: 0.05555555555555555\n", + "Query 90 average precision: 0.15607142857142856\n", + "Query 91 average precision: 0.2577160493827161\n", + "Query 92 average precision: 0.5014285714285714\n", + "Query 93 average precision: 0.5\n", + "Query 94 average precision: 0.5264285714285715\n", + "Query 95 average precision: 0.5\n", + "Query 96 average precision: 0.38976190476190475\n", + "Query 97 average precision: 0.15416666666666665\n", + "Query 98 average precision: 0.0\n", + "Query 99 average precision: 0.18333333333333335\n", + "Query 100 average precision: 0.16666666666666663\n", + "Query 101 average precision: 0.6958333333333333\n", + "Query 102 average precision: 0.3214285714285714\n", + "Query 103 average precision: 0.0\n", + "Query 104 average precision: 0.06666666666666667\n", + "Query 105 average precision: 0.3833333333333333\n", + "Query 106 average precision: 0.38571428571428573\n", + "Query 107 average precision: 0.17261904761904762\n", + "Query 108 average precision: 0.5901360544217686\n", + "Query 109 average precision: 0.0\n", + "Query 110 average precision: 0.125\n", + "Query 111 average precision: 0.08333333333333333\n", + "Query 112 average precision: 0.25\n", + "Query 113 average precision: 0.08333333333333333\n", + "Query 114 average precision: 0.0\n", + "Query 115 average precision: 0.05\n", + "Query 116 average precision: 0.05\n", + "Query 117 average precision: 0.0\n", + "Query 118 average precision: 0.21666666666666667\n", + "Query 119 average precision: 1.0\n", + "Query 120 average precision: 
0.39589947089947086\n", + "Query 121 average precision: 0.369047619047619\n", + "Query 122 average precision: 0.21164021164021163\n", + "Query 123 average precision: 0.0\n", + "Query 124 average precision: 0.0\n", + "Query 125 average precision: 0.2095238095238095\n", + "Query 126 average precision: 0.20833333333333331\n", + "Query 127 average precision: 0.05\n", + "Query 128 average precision: 0.0\n", + "Query 129 average precision: 0.369047619047619\n", + "Query 130 average precision: 0.5\n", + "Query 131 average precision: 0.10238095238095238\n", + "Query 132 average precision: 0.48476190476190484\n", + "Query 133 average precision: 0.05215419501133787\n", + "Query 134 average precision: 0.25\n", + "Query 135 average precision: 0.3839285714285714\n", + "Query 136 average precision: 0.3333333333333333\n", + "Query 137 average precision: 0.225\n", + "Query 138 average precision: 0.1\n", + "Query 139 average precision: 0.0\n", + "Query 140 average precision: 0.13888888888888887\n", + "Query 141 average precision: 0.075\n", + "Query 142 average precision: 0.0\n", + "Query 143 average precision: 0.7\n", + "Query 144 average precision: 0.28439153439153436\n", + "Query 145 average precision: 0.21995464852607707\n", + "Query 146 average precision: 0.5833333333333333\n", + "Query 147 average precision: 0.22666666666666666\n", + "Query 148 average precision: 0.16666666666666666\n", + "Query 149 average precision: 0.24861111111111106\n", + "Query 150 average precision: 0.8333333333333333\n", + "Query 151 average precision: 0.0\n", + "Query 152 average precision: 0.0\n", + "Query 153 average precision: 0.2738095238095238\n", + "Query 154 average precision: 0.8333333333333333\n", + "Query 155 average precision: 0.125\n", + "Query 156 average precision: 0.5607142857142857\n", + "Query 157 average precision: 0.29861111111111105\n", + "Query 158 average precision: 0.3625\n", + "Query 159 average precision: 0.043402777777777776\n", + "Query 160 average precision: 0.1\n", + 
"Query 161 average precision: 0.5\n", + "Query 162 average precision: 0.10416666666666666\n", + "Query 163 average precision: 0.24444444444444446\n", + "Query 164 average precision: 0.31805555555555554\n", + "Query 165 average precision: 0.5833333333333333\n", + "Query 166 average precision: 0.013888888888888888\n", + "Query 167 average precision: 0.5\n", + "Query 168 average precision: 0.08333333333333333\n", + "Query 169 average precision: 0.25\n", + "Query 170 average precision: 0.5694444444444444\n", + "Query 171 average precision: 0.6388888888888888\n", + "Query 172 average precision: 0.6791666666666667\n", + "Query 173 average precision: 1.0\n", + "Query 174 average precision: 0.03333333333333333\n", + "Query 175 average precision: 0.02\n", + "Query 176 average precision: 0.0\n", + "Query 177 average precision: 0.5888888888888889\n", + "Query 178 average precision: 0.3333333333333333\n", + "Query 179 average precision: 0.29166666666666663\n", + "Query 180 average precision: 0.33095238095238094\n", + "Query 181 average precision: 0.2\n", + "Query 182 average precision: 0.5833333333333333\n", + "Query 183 average precision: 0.5580952380952381\n", + "Query 184 average precision: 0.21428571428571427\n", + "Query 185 average precision: 0.6388888888888888\n", + "Query 186 average precision: 0.16619047619047617\n", + "Query 187 average precision: 0.13888888888888887\n", + "Query 188 average precision: 0.3196428571428571\n", + "Query 189 average precision: 0.05952380952380952\n", + "Query 190 average precision: 0.3\n", + "Query 191 average precision: 0.05333333333333333\n", + "Query 192 average precision: 0.5666666666666667\n", + "Query 193 average precision: 0.8282627865961198\n", + "Query 194 average precision: 0.041666666666666664\n", + "Query 195 average precision: 0.05555555555555555\n", + "Query 196 average precision: 0.11666666666666665\n", + "Query 197 average precision: 0.5555555555555555\n", + "Query 198 average precision: 0.44375\n", + "Query 199 average 
precision: 0.025\n", + "Query 200 average precision: 0.1851851851851852\n", + "Query 201 average precision: 0.3583333333333333\n", + "Query 202 average precision: 0.19166666666666665\n", + "Query 203 average precision: 0.1595238095238095\n", + "Query 204 average precision: 0.0\n", + "Query 205 average precision: 0.75\n", + "Query 206 average precision: 0.2619047619047619\n", + "Query 207 average precision: 0.14444444444444443\n", + "Query 208 average precision: 0.6916666666666668\n", + "Query 209 average precision: 0.05555555555555556\n", + "Query 210 average precision: 0.27777777777777773\n", + "Query 211 average precision: 0.0125\n", + "Query 212 average precision: 0.4891666666666666\n", + "Query 213 average precision: 0.4625\n", + "Query 214 average precision: 0.125\n", + "Query 215 average precision: 0.0\n", + "Query 216 average precision: 0.0\n", + "Query 217 average precision: 0.2\n", + "Query 218 average precision: 0.24285714285714283\n", + "Query 219 average precision: 0.0\n", + "Query 220 average precision: 0.13333333333333333\n", + "Query 221 average precision: 0.275\n", + "Query 222 average precision: 0.5978835978835979\n", + "Query 223 average precision: 0.44166666666666665\n", + "Query 224 average precision: 0.04285714285714286\n", + "Query 225 average precision: 0.1775\n" + ] + } + ], + "source": [ + "num_results = 10\n", + "with open('cranfield/cranfield-queries.txt') as query_file:\n", + " for query_num, line in enumerate(query_file):\n", + " query.content(line.strip())\n", + " results = ranker.score(inv_idx, query, num_results) \n", + " avg_p = ev.avg_p(results, query_num + 1, num_results)\n", + " print(\"Query {} average precision: {}\".format(query_num + 1, avg_p))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Afterwards, we can get the mean average precision of all the queries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.25511867318944054" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ev.map()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In the competition, you should try experimenting with different rankers, ranker parameters, tokenization, and filters. What combination can give you the best results?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Lastly, it's possible to define your own ranking function in Python." + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "class SimpleRanker(metapy.index.RankingFunction): \n", + " \"\"\" \n", + " Create a new ranking function in Python that can be used in MeTA. 
\n", + " \"\"\" \n", + " def __init__(self, some_param=1.0): \n", + " self.param = some_param\n", + " # You *must* invoke the base class __init__() here!\n", + " super(SimpleRanker, self).__init__() \n", + " \n", + " def score_one(self, sd):\n", + " \"\"\"\n", + " You need to override this function to return a score for a single term.\n", + " For fields available in the score_data sd object,\n", + " @see https://meta-toolkit.org/doxygen/structmeta_1_1index_1_1score__data.html\n", + " \"\"\"\n", + " return (self.param + sd.doc_term_count) / (self.param * sd.doc_unique_terms + sd.doc_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "**COMPETITION TIME**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Part 3: Document Classification with MeTA" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In this part of the tutorial, we'll play with the next major application for MeTA: creating classifiers. We will be having the second contest in this part! Once we finish going through how to create a forward index, train classifiers on top of it, and perform classifier evaluation and cross validation, I will give you instructions on how to participate in the competition (it will be similar to the first competition). Again, there will be another leader board to keep track of the best submissions, and I intend on leaving it running until the end of the conference for people to play around with." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's switch back to using the `ceeaus` dataset we downloaded before. 
If you're just joining us, grab it now:" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2017-08-16 19:19:42-- https://meta-toolkit.org/data/2016-01-26/ceeaus.tar.gz\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving meta-toolkit.org... 50.116.41.177, 2600:3c02::f03c:91ff:feae:b777\n", + "Connecting to meta-toolkit.org|50.116.41.177|:443... connected.\n", + "HTTP request sent, awaiting response... 304 Not Modified\n", + "File ‘ceeaus.tar.gz’ not modified on server. Omitting download.\n", + "\n" + ] + } + ], + "source": [ + "!wget -N https://meta-toolkit.org/data/2016-01-26/ceeaus.tar.gz\n", + "!tar xf ceeaus.tar.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We'll also need our standard stopword list. Grab it now if you don't already have it:" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2017-08-16 19:19:43-- https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving raw.githubusercontent.com... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", + "Connecting to raw.githubusercontent.com|151.101.0.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 2747 (2.7K) [text/plain]\n", + "Saving to: ‘lemur-stopwords.txt’\n", + "\n", + "lemur-stopwords.txt 100%[===================>] 2.68K --.-KB/s in 0s \n", + "\n", + "Last-modified header missing -- time-stamps turned off.\n", + "2017-08-16 19:19:43 (63.8 MB/s) - ‘lemur-stopwords.txt’ saved [2747/2747]\n", + "\n" + ] + } + ], + "source": [ + "!wget -N https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's create our MeTA configuration file for this part of the tutorial. We'll be using standard unigram words for now, but you're strongly encouraged to play with different features for the competition!" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "config = \"\"\"prefix = \".\"\n", + "dataset = \"ceeaus\"\n", + "corpus = \"line.toml\"\n", + "index = \"ceeaus-idx\"\n", + "stop-words = \"lemur-stopwords.txt\"\n", + "\n", + "[[analyzers]]\n", + "method = \"ngram-word\"\n", + "ngram = 1\n", + "filter = \"default-unigram-chain\"\n", + "\"\"\"\n", + "with open('ceeaus-config.toml', 'w') as f:\n", + " f.write(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's index this dataset. Since we are doing classification experiments, we will most likely be concerning ourselves with a `ForwardIndex`, since we want to map document ids to their feature vector representations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1502921983: [info] Loading index from disk: ceeaus-idx/fwd (/tmp/pip-bneszy3v-build/deps/meta/src/index/forward_index.cpp:171)\n", + "1502921983: [info] Loading index from disk: ceeaus-idx/fwd (/tmp/pip-bneszy3v-build/deps/meta/src/index/forward_index.cpp:171)\n" + ] + } + ], + "source": [ + "fidx = metapy.index.make_forward_index('ceeaus-config.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Note that the feature set used for classification depends on your settings in the configuration file _at the time of indexing_. If you want to play with different feature sets, remember to change your `analyzer` pipeline in the configuration file, and also to **reindex** your documents!\n", + "\n", + "Here, we've just chosen simple unigram words. This is actually a surprisingly good baseline feature set for many text classification problems.\n", + "\n", + "Now that we have a `ForwardIndex` on disk, we need to load the documents we want to start playing with into memory. Since this is a small enough dataset, let's load the whole thing into memory at once.\n", + "\n", + "We need to decide what kind of dataset we're using. MeTA has classes for binary classification (`BinaryDataset`) and multi-class classification (`MulticlassDataset`), which you should choose from depending on the kind of classification problem you're dealing with. Let's see how many labels we have in our corpus." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fidx.num_labels()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Since this is more than 2, we likely want a `MulticlassDataset` so we can learn a classifier that can predict which of these three labels a document should have. (But we might be interested in only determining one particular class from the rest, in which case we might actually want a `BinaryDataset`.)\n", + "\n", + "For now, let's focus on the multi-class case, as that likely makes the most sense for this kind of data. Let's load our documents." + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Loading instances into memory: [> ] 0% ETA 00:00:00 \r", + " > Loading instances into memory: [> ] 0% ETA 00:00:00 \r", + " > Loading instances into memory: [==========================] 100% ETA 00:00:00 \r", + " > Loading instances into memory: [==========================] 100% ETA 00:00:00 \n", + " \n" + ] + }, + { + "data": { + "text/plain": [ + "1008" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset = metapy.classify.MulticlassDataset(fidx)\n", + "len(dset)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We have 1008 documents, split across three labels. What are our labels?"
+ ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'chinese', 'english', 'japanese'}" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set([dset.label(instance) for instance in dset])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This dataset is a small collection of essays written by a bunch of students with different first languages. Our goal will be to try to identify whether an essay was written by a native-Chinese speaker, a native-English speaker, or a native-Japanese speaker.\n", + "\n", + "Now, because these in-memory datasets can potentially be quite large, it's beneficial to not make unnecessary copies of them to, for example, create a new list that's shuffled that contains the same documents. In most cases, you'll be operating with a `DatasetView` (either `MulticlassDatasetView` or `BinaryDatasetView`) so that you can do things like shuffle or rotate the contents of a dataset without having to actually modify it. Doing so is pretty easy: you can use Python's slicing API, or you can just construct one directly." + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "view = dset[0:len(dset)+1]\n", + "# or\n", + "view = metapy.classify.MulticlassDatasetView(dset)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now we can, for example, shuffle this view without changing the underlying dataset."
+ ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "869 vs 0\n" + ] + } + ], + "source": [ + "view.shuffle()\n", + "print(\"{} vs {}\".format(view[0].id, dset[0].id))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The view has been shuffled and now has documents in random order (useful in many cases to make sure that you don't have clumps of the same-labeled documents together, or to just permute the documents in a stochastic learning algorithm), but the underlying dataset is still sorted by id.\n", + "\n", + "We can also use this slicing API to create a random training and testing set from our shuffled views (views also support slicing). Let's make a 75-25 split of training-testing data. (Note that it's really important that we already shuffled the view!)" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "training = view[0:int(0.75*len(view))]\n", + "testing = view[int(0.75*len(view)):]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, we're ready to train a classifier! Let's start with a very simple one: [Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier).\n", + "\n", + "In MeTA, construction of a classifier implies training of that model. Let's train a Naive Bayes classifier on our training view now."
+ ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "nb = metapy.classify.NaiveBayes(training)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can now classify individual documents like so." + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'japanese'" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb.classify(testing[0].weights)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We might be more interested in how well we classify the testing set." + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " chinese english japanese \n", + " ------------------------------\n", + " chinese | \u001b[1m0.96\u001b[22m - 0.04 \n", + " english | - \u001b[1m0.909\u001b[22m 0.0909 \n", + " japanese | 0.0155 0.0155 \u001b[1m0.969\u001b[22m \n", + "\n", + "\n" + ] + } + ], + "source": [ + "mtrx = nb.test(testing)\n", + "print(mtrx)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The `test()` method of MeTA's classifiers returns to you a `ConfusionMatrix`, which contains useful information about what kinds of mistakes your classifier is making.\n", + "\n", + "(Note that, due to the random shuffling, you might see different results than we do here.)\n", + "\n", + "For example, we can see that this classifier seems to have some trouble with confusing native-Chinese 
students' essays with those of native-Japanese students. We can tell that by looking at the rows of the confusion matrix. Each row tells you what fraction of documents with that _true_ label were assigned the label for each column by the classifier. In the case of the native-Chinese label, we can see that 4% of the time they were miscategorized as being native-Japanese.\n", + "\n", + "The `ConfusionMatrix` also computes a lot of metrics that are commonly used in classifier evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "chinese 0.923 0.889 0.96 0.0992 \n", + "english 0.909 0.909 0.909 0.131 \n", + "japanese 0.974 0.979 0.969 0.77 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.961\u001b[22m \u001b[1m0.961\u001b[22m \u001b[1m0.96\u001b[22m \n", + "------------------------------------------------------------\n", + "252 predictions attempted, overall accuracy: 0.96\n", + "\n" + ] + } + ], + "source": [ + "mtrx.print_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "If we want to make sure that the classifier isn't overfitting to our training data, a common approach is to do [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)). Let's run CV for our Naive Bayes classifier across the whole dataset, using 5-folds, to get an idea of how well we might generalize to new data."
+ ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1502921983: [info] Cross-validating fold 1/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 1/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 2/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 2/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 3/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 3/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 4/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 4/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 5/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 5/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n" + ] + } + ], + "source": [ + "mtrx = metapy.classify.cross_validate(lambda fold: metapy.classify.NaiveBayes(fold), view, 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "`cross_validate()` returns a `ConfusionMatrix` just like `test()` does. 
We give it a function to use to create the trained classifiers for each fold, and then pass in the dataset view containing all of our documents, and the number of folds we want to use.\n", + "\n", + "Let's see how we did." + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " chinese english japanese \n", + " ------------------------------\n", + " chinese | \u001b[1m0.868\u001b[22m 0.011 0.121 \n", + " english | 0.0342 \u001b[1m0.918\u001b[22m 0.0479 \n", + " japanese | 0.0195 0.00911 \u001b[1m0.971\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "chinese 0.832 0.798 0.868 0.0905 \n", + "english 0.931 0.944 0.918 0.145 \n", + "japanese 0.974 0.976 0.971 0.764 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.955\u001b[22m \u001b[1m0.956\u001b[22m \u001b[1m0.954\u001b[22m \n", + "------------------------------------------------------------\n", + "1005 predictions attempted, overall accuracy: 0.954\n", + "\n" + ] + } + ], + "source": [ + "print(mtrx)\n", + "mtrx.print_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now let's do the same thing, but for an arguably stronger baseline: [SVM](https://en.wikipedia.org/wiki/Support_vector_machine).\n", + "\n", + "MeTA's implementation of SVM is actually an approximation using [stochastic gradient descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) on the [hinge loss](https://en.wikipedia.org/wiki/Hinge_loss). 
It's implemented as a `BinaryClassifier`, so we will need to adapt it before it can be used to solve our multi-class classification problem.\n", + "\n", + "MeTA provides two different adapters for this scenario: [One-vs-All](https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) and [One-vs-One](https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-one)." + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ova = metapy.classify.OneVsAll(training, metapy.classify.SGD, loss_id='hinge')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We construct the `OneVsAll` reduction by providing it the training documents, the name of a binary classifier, and then (as keyword arguments) any additional arguments to that chosen classifier. In this case, we use `loss_id` to specify the loss function to use.\n", + "\n", + "We can now use `OneVsAll` just like any other classifier."
+ ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " chinese english japanese \n", + " ------------------------------\n", + " chinese | \u001b[1m0.72\u001b[22m - 0.28 \n", + " english | - \u001b[1m0.909\u001b[22m 0.0909 \n", + " japanese | - 0.0103 \u001b[1m0.99\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "chinese 0.837 1 0.72 0.0992 \n", + "english 0.923 0.938 0.909 0.131 \n", + "japanese 0.97 0.95 0.99 0.77 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.953\u001b[22m \u001b[1m0.954\u001b[22m \u001b[1m0.952\u001b[22m \n", + "------------------------------------------------------------\n", + "252 predictions attempted, overall accuracy: 0.952\n", + "\n" + ] + } + ], + "source": [ + "mtrx = ova.test(testing)\n", + "print(mtrx)\n", + "mtrx.print_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " chinese english japanese \n", + " ------------------------------\n", + " chinese | \u001b[1m0.835\u001b[22m 0.022 0.143 \n", + " english | - \u001b[1m0.911\u001b[22m 0.089 \n", + " japanese | 0.00391 0.00651 \u001b[1m0.99\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + 
"------------------------------------------------------------\n", + "chinese 0.894 0.962 0.835 0.0905 \n", + "english 0.93 0.95 0.911 0.145 \n", + "japanese 0.978 0.967 0.99 0.764 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.964\u001b[22m \u001b[1m0.964\u001b[22m \u001b[1m0.964\u001b[22m \n", + "------------------------------------------------------------\n", + "1005 predictions attempted, overall accuracy: 0.964\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1502921983: [info] Cross-validating fold 1/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 1/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 2/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 2/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 3/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 3/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 4/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 4/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 5/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n", + "1502921983: [info] Cross-validating fold 5/5 (/tmp/pip-bneszy3v-build/deps/meta/include/meta/classify/classifier/classifier.h:103)\n" + ] + } + ], + "source": [ + "mtrx = 
metapy.classify.cross_validate(lambda fold: metapy.classify.OneVsAll(fold, metapy.classify.SGD, loss_id='hinge'), view, 5)\n", + "print(mtrx)\n", + "mtrx.print_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "That should be enough to get you started! Try looking at `help(metapy.classify)` for a list of what's included in the bindings." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "source": [ + "**COMPETITION TIME**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Part 4: Topic Modeling" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In this part of the tutorial we will discuss how to run a topic model over data indexed as a `ForwardIndex`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We will need to index our data to proceed. We eventually want to be able to extract the bag-of-words representation for our individual documents, so we will want a `ForwardIndex` in this case." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1502921983: [info] Loading index from disk: ceeaus-idx/fwd (/tmp/pip-bneszy3v-build/deps/meta/src/index/forward_index.cpp:171)\n", + "1502921983: [info] Loading index from disk: ceeaus-idx/fwd (/tmp/pip-bneszy3v-build/deps/meta/src/index/forward_index.cpp:171)\n" + ] + } + ], + "source": [ + "fidx = metapy.index.make_forward_index('ceeaus-config.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Just like in classification, the feature set used for the topic modeling will be the feature set used at the time of indexing, so if you want to play with a different set of features (like bigram words), you will need to re-index your data.\n", + "\n", + "For now, we've just stuck with the default filter chain for unigram words, so we're operating in the traditional bag-of-words space.\n", + "\n", + "Let's load our documents into memory to run the topic model inference now." + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Loading instances into memory: [> ] 0% ETA 00:00:00 \r", + " > Loading instances into memory: [> ] 0% ETA 00:00:00 \r", + " > Loading instances into memory: [==========================] 100% ETA 00:00:00 \r", + " > Loading instances into memory: [==========================] 100% ETA 00:00:00 \n", + " \n" + ] + } + ], + "source": [ + "dset = metapy.learn.Dataset(fidx)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's try to find some topics for this dataset. 
To do so, we're going to use a generative model called a topic model.\n", + "\n", + "There are many different topic models in the literature, but the most commonly used topic model is Latent Dirichlet Allocation. Here, we propose that there are K topics (represented with a categorical distribution over words) $\\phi_k$ from which all of our documents are generated. These K topics are modeled as being sampled from a Dirichlet distribution with parameter $\\vec{\\alpha}$. Then, to generate a document $d$, we first sample a distribution over the K topics $\\theta_d$ from another Dirichlet distribution with parameter $\\vec{\\beta}$. Then, for each word in this document, we first sample a topic identifier $z \\sim \\theta_d$ and then the word by drawing from the topic we selected ($w \\sim \\phi_z$). Refer to the [Wikipedia article on LDA](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) for more information.\n", + "\n", + "The goal of running inference for an LDA model is to infer the latent variables $\\phi_k$ and $\\theta_d$ for all of the $K$ topics and $D$ documents, respectively. MeTA provides a number of different inference algorithms for LDA, as each one entails a different set of trade-offs (inference in LDA is intractable, so all inference algorithms are approximations; different algorithms entail different approximation guarantees, running times, and required memory consumption). For now, let's run a Variational Inference algorithm called CVB0 to find two topics. 
(In practice you will likely be finding many more topics than just two, but this is a very small toy dataset.)" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Initialization: [============================================] 100% ETA 00:00:00 \n", + " \n", + "Iteration 1 maximum change in gamma: 1.94892 \n", + "Iteration 1 maximum change in gamma: 1.94892 \n", + "Iteration 2 maximum change in gamma: 0.489304 \n", + "Iteration 2 maximum change in gamma: 0.489304 \n", + "Iteration 3 maximum change in gamma: 0.353439 \n", + "Iteration 3 maximum change in gamma: 0.353439 \n", + "Iteration 4 maximum change in gamma: 0.437895 \n", + "Iteration 4 maximum change in gamma: 0.437895 \n", + "Iteration 5 maximum change in gamma: 0.646495 \n", + "Iteration 5 maximum change in gamma: 0.646495 \n", + "Iteration 6 maximum change in gamma: 1.08145 \n", + "Iteration 6 maximum change in gamma: 1.08145 \n", + "Iteration 7 maximum change in gamma: 1.37714 \n", + "Iteration 7 maximum change in gamma: 1.37714 \n", + "Iteration 8 maximum change in gamma: 1.5234 \n", + "Iteration 8 maximum change in gamma: 1.5234 \n", + "Iteration 9 maximum change in gamma: 1.41999 \n", + "Iteration 9 maximum change in gamma: 1.41999 \n", + "Iteration 10 maximum change in gamma: 1.35315 \n", + "Iteration 10 maximum change in gamma: 1.35315 \n", + "Iteration 11 maximum change in gamma: 1.16084 \n", + "Iteration 11 maximum change in gamma: 1.16084 \n", + "Iteration 12 maximum change in gamma: 0.985118 \n", + "Iteration 12 maximum change in gamma: 0.985118 \n", + "Iteration 13 maximum change in gamma: 0.529624 \n", + "Iteration 13 maximum change in gamma: 0.529624 \n", + "Iteration 14 maximum change in gamma: 0.505101 \n", + "Iteration 14 maximum change in gamma: 0.505101 \n", + "Iteration 15 maximum change in gamma: 0.357362 \n", + 
"Iteration 15 maximum change in gamma: 0.357362 \n", + "Iteration 16 maximum change in gamma: 0.33901 \n", + "Iteration 16 maximum change in gamma: 0.33901 \n", + "Iteration 17 maximum change in gamma: 0.277928 \n", + "Iteration 17 maximum change in gamma: 0.277928 \n", + "Iteration 18 maximum change in gamma: 0.242615 \n", + "Iteration 18 maximum change in gamma: 0.242615 \n", + "Iteration 19 maximum change in gamma: 0.237642 \n", + "Iteration 19 maximum change in gamma: 0.237642 \n", + "Iteration 20 maximum change in gamma: 0.198027 \n", + "Iteration 20 maximum change in gamma: 0.198027 \n", + "Iteration 21 maximum change in gamma: 0.192858 \n", + "Iteration 21 maximum change in gamma: 0.192858 \n", + "Iteration 22 maximum change in gamma: 0.185038 \n", + "Iteration 22 maximum change in gamma: 0.185038 \n", + "Iteration 23 maximum change in gamma: 0.168724 \n", + "Iteration 23 maximum change in gamma: 0.168724 \n", + "Iteration 24 maximum change in gamma: 0.157681 \n", + "Iteration 24 maximum change in gamma: 0.157681 \n", + "Iteration 25 maximum change in gamma: 0.13898 \n", + "Iteration 25 maximum change in gamma: 0.13898 \n", + "Iteration 26 maximum change in gamma: 0.131065 \n", + "Iteration 26 maximum change in gamma: 0.131065 \n", + "Iteration 27 maximum change in gamma: 0.126334 \n", + "Iteration 27 maximum change in gamma: 0.126334 \n", + "Iteration 28 maximum change in gamma: 0.148569 \n", + "Iteration 28 maximum change in gamma: 0.148569 \n", + "Iteration 29 maximum change in gamma: 0.177806 \n", + "Iteration 29 maximum change in gamma: 0.177806 \n", + "Iteration 30 maximum change in gamma: 0.19599 \n", + "Iteration 30 maximum change in gamma: 0.19599 \n", + "Iteration 31 maximum change in gamma: 0.195208 \n", + "Iteration 31 maximum change in gamma: 0.195208 \n", + "Iteration 32 maximum change in gamma: 0.207592 \n", + "Iteration 32 maximum change in gamma: 0.207592 \n", + "Iteration 33 maximum change in gamma: 0.222097 \n", + "Iteration 33 maximum 
change in gamma: 0.222097 \n", + "Iteration 34 maximum change in gamma: 0.209845 \n", + "Iteration 34 maximum change in gamma: 0.209845 \n", + "Iteration 35 maximum change in gamma: 0.211747 \n", + "Iteration 35 maximum change in gamma: 0.211747 \n", + "Iteration 36 maximum change in gamma: 0.185753 \n", + "Iteration 36 maximum change in gamma: 0.185753 \n", + "Iteration 37 maximum change in gamma: 0.142088 \n", + "Iteration 37 maximum change in gamma: 0.142088 \n", + "Iteration 38 maximum change in gamma: 0.096915 \n", + "Iteration 38 maximum change in gamma: 0.096915 \n", + "Iteration 39 maximum change in gamma: 0.0608104 \n", + "Iteration 39 maximum change in gamma: 0.0608104 \n", + "Iteration 40 maximum change in gamma: 0.0361621 \n", + "Iteration 40 maximum change in gamma: 0.0361621 \n", + "Iteration 41 maximum change in gamma: 0.0208601 \n", + "Iteration 41 maximum change in gamma: 0.0208601 \n", + "Iteration 42 maximum change in gamma: 0.0192793 \n", + "Iteration 42 maximum change in gamma: 0.0192793 \n", + "Iteration 43 maximum change in gamma: 0.0184543 \n", + "Iteration 43 maximum change in gamma: 0.0184543 \n", + "Iteration 44 maximum change in gamma: 0.0176882 \n", + "Iteration 44 maximum change in gamma: 0.0176882 \n", + "Iteration 45 maximum change in gamma: 0.0169772 \n", + "Iteration 45 maximum change in gamma: 0.0169772 \n", + "Iteration 46 maximum change in gamma: 0.0163172 \n", + "Iteration 46 maximum change in gamma: 0.0163172 \n", + "Iteration 47 maximum change in gamma: 0.0157038 \n", + "Iteration 47 maximum change in gamma: 0.0157038 \n", + "Iteration 48 maximum change in gamma: 0.0151331 \n", + "Iteration 48 maximum change in gamma: 0.0151331 \n", + "Iteration 49 maximum change in gamma: 0.0146011 \n", + "Iteration 49 maximum change in gamma: 0.0146011 \n", + "Iteration 50 maximum change in gamma: 0.0141041 \n", + "Iteration 50 maximum change in gamma: 0.0141041 \n", + "Iteration 51 maximum change in gamma: 0.0136389 \n", + "Iteration 51 
maximum change in gamma: 0.0136389 \n", + "Iteration 52 maximum change in gamma: 0.0132024 \n", + "Iteration 52 maximum change in gamma: 0.0132024 \n", + "Iteration 53 maximum change in gamma: 0.0127917 \n", + "Iteration 53 maximum change in gamma: 0.0127917 \n", + "Iteration 54 maximum change in gamma: 0.0124045 \n", + "Iteration 54 maximum change in gamma: 0.0124045 \n", + "Iteration 55 maximum change in gamma: 0.0120384 \n", + "Iteration 55 maximum change in gamma: 0.0120384 \n", + "Iteration 56 maximum change in gamma: 0.0116915 \n", + "Iteration 56 maximum change in gamma: 0.0116915 \n", + "Iteration 57 maximum change in gamma: 0.0113617 \n", + "Iteration 57 maximum change in gamma: 0.0113617 \n", + "Iteration 58 maximum change in gamma: 0.0110477 \n", + "Iteration 58 maximum change in gamma: 0.0110477 \n", + "Iteration 59 maximum change in gamma: 0.0107477 \n", + "Iteration 59 maximum change in gamma: 0.0107477 \n", + "Iteration 60 maximum change in gamma: 0.0104606 \n", + "Iteration 60 maximum change in gamma: 0.0104606 \n", + "Iteration 61 maximum change in gamma: 0.0101851 \n", + "Iteration 61 maximum change in gamma: 0.0101851 \n", + "Iteration 62 maximum change in gamma: 0.00992002 \n", + "Iteration 62 maximum change in gamma: 0.00992002 \n", + "Iteration 63 maximum change in gamma: 0.00966452 \n", + "Iteration 63 maximum change in gamma: 0.00966452 \n", + "Iteration 64 maximum change in gamma: 0.00941766 \n", + "Iteration 64 maximum change in gamma: 0.00941766 \n", + "Iteration 65 maximum change in gamma: 0.00917865 \n", + "Iteration 65 maximum change in gamma: 0.00917865 \n", + "Iteration 66 maximum change in gamma: 0.00908822 \n", + "Iteration 66 maximum change in gamma: 0.00908822 \n", + "Iteration 67 maximum change in gamma: 0.0091286 \n", + "Iteration 67 maximum change in gamma: 0.0091286 \n", + "Iteration 68 maximum change in gamma: 0.00916622 \n", + "Iteration 68 maximum change in gamma: 0.00916622 \n", + "Iteration 69 maximum change in gamma: 
0.00920064 \n", + "Iteration 69 maximum change in gamma: 0.00920064 \n", + "Iteration 70 maximum change in gamma: 0.00923141 \n", + "Iteration 70 maximum change in gamma: 0.00923141 \n", + "Iteration 71 maximum change in gamma: 0.00925807 \n", + "Iteration 71 maximum change in gamma: 0.00925807 \n", + "Iteration 72 maximum change in gamma: 0.00928021 \n", + "Iteration 72 maximum change in gamma: 0.00928021 \n", + "Iteration 73 maximum change in gamma: 0.00929737 \n", + "Iteration 73 maximum change in gamma: 0.00929737 \n", + "Iteration 74 maximum change in gamma: 0.00930916 \n", + "Iteration 74 maximum change in gamma: 0.00930916 \n", + "Iteration 75 maximum change in gamma: 0.00931517 \n", + "Iteration 75 maximum change in gamma: 0.00931517 \n", + "Iteration 76 maximum change in gamma: 0.00931502 \n", + "Iteration 76 maximum change in gamma: 0.00931502 \n", + "Iteration 77 maximum change in gamma: 0.00930835 \n", + "Iteration 77 maximum change in gamma: 0.00930835 \n", + "Iteration 78 maximum change in gamma: 0.00929481 \n", + "Iteration 78 maximum change in gamma: 0.00929481 \n", + "Iteration 79 maximum change in gamma: 0.00927412 \n", + "Iteration 79 maximum change in gamma: 0.00927412 \n", + "Iteration 80 maximum change in gamma: 0.00924599 \n", + "Iteration 80 maximum change in gamma: 0.00924599 \n", + "Iteration 81 maximum change in gamma: 0.00921019 \n", + "Iteration 81 maximum change in gamma: 0.00921019 \n", + "Iteration 82 maximum change in gamma: 0.00916651 \n", + "Iteration 82 maximum change in gamma: 0.00916651 \n", + "Iteration 83 maximum change in gamma: 0.00911479 \n", + "Iteration 83 maximum change in gamma: 0.00911479 \n", + "Iteration 84 maximum change in gamma: 0.00905492 \n", + "Iteration 84 maximum change in gamma: 0.00905492 \n", + "Iteration 85 maximum change in gamma: 0.00898683 \n", + "Iteration 85 maximum change in gamma: 0.00898683 \n", + "Iteration 86 maximum change in gamma: 0.00891048 \n", + "Iteration 86 maximum change in gamma: 
0.00891048 \n", + "Iteration 87 maximum change in gamma: 0.00882591 \n", + "Iteration 87 maximum change in gamma: 0.00882591 \n", + "Iteration 88 maximum change in gamma: 0.00873318 \n", + "Iteration 88 maximum change in gamma: 0.00873318 \n", + "Iteration 89 maximum change in gamma: 0.0086324 \n", + "Iteration 89 maximum change in gamma: 0.0086324 \n", + "Iteration 90 maximum change in gamma: 0.00852376 \n", + "Iteration 90 maximum change in gamma: 0.00852376 \n", + "Iteration 91 maximum change in gamma: 0.00840745 \n", + "Iteration 91 maximum change in gamma: 0.00840745 \n", + "Iteration 92 maximum change in gamma: 0.00828374 \n", + "Iteration 92 maximum change in gamma: 0.00828374 \n", + "Iteration 93 maximum change in gamma: 0.00815293 \n", + "Iteration 93 maximum change in gamma: 0.00815293 \n", + "Iteration 94 maximum change in gamma: 0.00801536 \n", + "Iteration 94 maximum change in gamma: 0.00801536 \n", + "Iteration 95 maximum change in gamma: 0.00787141 \n", + "Iteration 95 maximum change in gamma: 0.00787141 \n", + "Iteration 96 maximum change in gamma: 0.00772149 \n", + "Iteration 96 maximum change in gamma: 0.00772149 \n", + "Iteration 97 maximum change in gamma: 0.00756605 \n", + "Iteration 97 maximum change in gamma: 0.00756605 \n", + "Iteration 98 maximum change in gamma: 0.00740556 \n", + "Iteration 98 maximum change in gamma: 0.00740556 \n", + "Iteration 99 maximum change in gamma: 0.0072405 \n", + "Iteration 99 maximum change in gamma: 0.0072405 \n", + "Iteration 100 maximum change in gamma: 0.00707137 \n", + "Iteration 100 maximum change in gamma: 0.00707137 \n", + "Iteration 101 maximum change in gamma: 0.0068987 \n", + "Iteration 101 maximum change in gamma: 0.0068987 \n", + "Iteration 102 maximum change in gamma: 0.00672302 \n", + "Iteration 102 maximum change in gamma: 0.00672302 \n", + "Iteration 103 maximum change in gamma: 0.00654484 \n", + "Iteration 103 maximum change in gamma: 0.00654484 \n", + "Iteration 104 maximum change in gamma: 
0.00636471 \n", + "Iteration 104 maximum change in gamma: 0.00636471 \n", + "Iteration 105 maximum change in gamma: 0.00618313 \n", + "Iteration 105 maximum change in gamma: 0.00618313 \n", + "Iteration 106 maximum change in gamma: 0.00600062 \n", + "Iteration 106 maximum change in gamma: 0.00600062 \n", + "Iteration 107 maximum change in gamma: 0.00581768 \n", + "Iteration 107 maximum change in gamma: 0.00581768 \n", + "Iteration 108 maximum change in gamma: 0.00563479 \n", + "Iteration 108 maximum change in gamma: 0.00563479 \n", + "Iteration 109 maximum change in gamma: 0.00545242 \n", + "Iteration 109 maximum change in gamma: 0.00545242 \n", + "Iteration 110 maximum change in gamma: 0.00527099 \n", + "Iteration 110 maximum change in gamma: 0.00527099 \n", + "Iteration 111 maximum change in gamma: 0.00509093 \n", + "Iteration 111 maximum change in gamma: 0.00509093 \n", + "Iteration 112 maximum change in gamma: 0.00491263 \n", + "Iteration 112 maximum change in gamma: 0.00491263 \n", + "Iteration 113 maximum change in gamma: 0.00473645 \n", + "Iteration 113 maximum change in gamma: 0.00473645 \n", + "Iteration 114 maximum change in gamma: 0.00456272 \n", + "Iteration 114 maximum change in gamma: 0.00456272 \n", + "Iteration 115 maximum change in gamma: 0.00439176 \n", + "Iteration 115 maximum change in gamma: 0.00439176 \n", + "Iteration 116 maximum change in gamma: 0.00422383 \n", + "Iteration 116 maximum change in gamma: 0.00422383 \n", + "Iteration 117 maximum change in gamma: 0.00405918 \n", + "Iteration 117 maximum change in gamma: 0.00405918 \n", + "Iteration 118 maximum change in gamma: 0.00389803 \n", + "Iteration 118 maximum change in gamma: 0.00389803 \n", + "Iteration 119 maximum change in gamma: 0.00374058 \n", + "Iteration 119 maximum change in gamma: 0.00374058 \n", + "Iteration 120 maximum change in gamma: 0.00358697 \n", + "Iteration 120 maximum change in gamma: 0.00358697 \n", + "Iteration 121 maximum change in gamma: 0.00343736 \n", + 
"Iteration 121 maximum change in gamma: 0.00343736 \n", + "Iteration 122 maximum change in gamma: 0.00329185 \n", + "Iteration 122 maximum change in gamma: 0.00329185 \n", + "Iteration 123 maximum change in gamma: 0.00315053 \n", + "Iteration 123 maximum change in gamma: 0.00315053 \n", + "Iteration 124 maximum change in gamma: 0.00301346 \n", + "Iteration 124 maximum change in gamma: 0.00301346 \n", + "Iteration 125 maximum change in gamma: 0.00288069 \n", + "Iteration 125 maximum change in gamma: 0.00288069 \n", + "Iteration 126 maximum change in gamma: 0.00275224 \n", + "Iteration 126 maximum change in gamma: 0.00275224 \n", + "Iteration 127 maximum change in gamma: 0.00262812 \n", + "Iteration 127 maximum change in gamma: 0.00262812 \n", + "Iteration 128 maximum change in gamma: 0.00250831 \n", + "Iteration 128 maximum change in gamma: 0.00250831 \n", + "Iteration 129 maximum change in gamma: 0.00239279 \n", + "Iteration 129 maximum change in gamma: 0.00239279 \n", + "Iteration 130 maximum change in gamma: 0.00228152 \n", + "Iteration 130 maximum change in gamma: 0.00228152 \n", + "Iteration 131 maximum change in gamma: 0.00217445 \n", + "Iteration 131 maximum change in gamma: 0.00217445 \n", + "Iteration 132 maximum change in gamma: 0.00207151 \n", + "Iteration 132 maximum change in gamma: 0.00207151 \n", + "Iteration 133 maximum change in gamma: 0.00197264 \n", + "Iteration 133 maximum change in gamma: 0.00197264 \n", + "Iteration 134 maximum change in gamma: 0.00187775 \n", + "Iteration 134 maximum change in gamma: 0.00187775 \n", + "Iteration 135 maximum change in gamma: 0.00178675 \n", + "Iteration 135 maximum change in gamma: 0.00178675 \n", + "Iteration 136 maximum change in gamma: 0.00169956 \n", + "Iteration 136 maximum change in gamma: 0.00169956 \n", + "Iteration 137 maximum change in gamma: 0.00161608 \n", + "Iteration 137 maximum change in gamma: 0.00161608 \n", + "Iteration 138 maximum change in gamma: 0.00153619 \n", + "Iteration 138 maximum 
change in gamma: 0.00153619 \n", + "Iteration 139 maximum change in gamma: 0.00145981 \n", + "Iteration 139 maximum change in gamma: 0.00145981 \n", + "Iteration 140 maximum change in gamma: 0.00138681 \n", + "Iteration 140 maximum change in gamma: 0.00138681 \n", + "Iteration 141 maximum change in gamma: 0.0013171 \n", + "Iteration 141 maximum change in gamma: 0.0013171 \n", + "Iteration 142 maximum change in gamma: 0.00125055 \n", + "Iteration 142 maximum change in gamma: 0.00125055 \n", + "Iteration 143 maximum change in gamma: 0.00118707 \n", + "Iteration 143 maximum change in gamma: 0.00118707 \n", + "Iteration 144 maximum change in gamma: 0.00112654 \n", + "Iteration 144 maximum change in gamma: 0.00112654 \n", + "Iteration 145 maximum change in gamma: 0.00106885 \n", + "Iteration 145 maximum change in gamma: 0.00106885 \n", + "Iteration 146 maximum change in gamma: 0.00101389 \n", + "Iteration 146 maximum change in gamma: 0.00101389 \n", + "Iteration 147 maximum change in gamma: 0.000961562 \n", + "Iteration 147 maximum change in gamma: 0.000961562 \n", + "1502921989: [info] Finished maximum iterations, or found convergence! (/tmp/pip-bneszy3v-build/deps/meta/src/topics/lda_cvb.cpp:60)\n", + "1502921989: [info] Finished maximum iterations, or found convergence! (/tmp/pip-bneszy3v-build/deps/meta/src/topics/lda_cvb.cpp:60)\n" + ] + } + ], + "source": [ + "lda_inf = metapy.topics.LDACollapsedVB(dset, num_topics=2, alpha=1.0, beta=0.01)\n", + "lda_inf.run(num_iters=1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "source": [ + "The above ran the CVB0 algorithm for 1000 iterations, or until an algorithm-specific convergence criterion was met. Now let's save the current estimate for our topics and topic proportions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "lda_inf.save('lda-cvb0')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can interrogate the topic inference results by using the `TopicModel` query class. Let's load our inference results back in." + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Loading topic term probabilities: [===========> ] 50% ETA 00:00:00 \r", + " > Loading topic term probabilities: [===========> ] 50% ETA 00:00:00 \r", + " > Loading topic term probabilities: [=======================] 100% ETA 00:00:00 \r", + " > Loading topic term probabilities: [=======================] 100% ETA 00:00:00 \n", + " \n", + " \r", + " > Loading document topic probabilities: [> ] 0% ETA 00:00:00 \r", + " > Loading document topic probabilities: [> ] 0% ETA 00:00:00 \r", + " > Loading document topic probabilities: [===================] 100% ETA 00:00:00 \r", + " > Loading document topic probabilities: [===================] 100% ETA 00:00:00 \n", + " \n" + ] + } + ], + "source": [ + "model = metapy.topics.TopicModel('lda-cvb0')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now, let's have a look at our topics. A typical way of doing this is to print the top $k$ words in each topic, so let's do that." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(3759, 0.06705637112528769),\n", + " (1968, 0.05605930810442864),\n", + " (2635, 0.05222307061872271),\n", + " (3549, 0.04642939140343873),\n", + " (665, 0.03488141234942433),\n", + " (4157, 0.02906748539640022),\n", + " (2322, 0.02885022388702368),\n", + " (3729, 0.022331344581221765),\n", + " (1790, 0.020755699719924883),\n", + " (3554, 0.015483037834133842)]" + ] + }, + "execution_count": 162, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.top_k(tid=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The models operate on term ids instead of raw text strings, so let's convert this to a human readable format by using the vocabulary contained in our `ForwardIndex` to map the term ids to strings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('time', 0.06705637112528769),\n", + " ('job', 0.05605930810442864),\n", + " ('part', 0.05222307061872271),\n", + " ('student', 0.04642939140343873),\n", + " ('colleg', 0.03488141234942433),\n", + " ('work', 0.02906748539640022),\n", + " ('money', 0.02885022388702368),\n", + " ('think', 0.022331344581221765),\n", + " ('import', 0.020755699719924883),\n", + " ('studi', 0.015483037834133842)]" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(fidx.term_text(pr[0]), pr[1]) for pr in model.top_k(tid=0)]" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('smoke', 0.13110394941553408),\n", + " ('restaur', 0.054349311633512025),\n", + " ('peopl', 0.036780087802958536),\n", + " ('smoker', 0.03349263454160484),\n", + " ('ban', 0.022530670096022554),\n", + " ('think', 0.015620489442527752),\n", + " ('japan', 0.012780916901417468),\n", + " ('complet', 0.012635067649017825),\n", + " ('cigarett', 0.011987181371938055),\n", + " ('non', 0.011317738574939687)]" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(fidx.term_text(pr[0]), pr[1]) for pr in model.top_k(tid=1)]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can pretty clearly see that this particular dataset was about two major issues: part time jobs for students and smoking in public. 
This dataset is actually a collection of essays written by students, and there just so happen to be two different topics they can choose from!\n", + "\n", + "The topics are pretty clear in this case, but in some cases it is also useful to score the terms in a topic using some function of the probability of the word in the topic and the probability of the word in the other topics. Intuitively, we might want to select words from each topic that best reflect that topic's content by picking words that both have high probability in that topic **and** have low probability in the other topics. In other words, we want to balance between high probability terms and highly specific terms (this is kind of like a tf-idf weighting). One such scoring function is provided by the toolkit in `BLTermScorer`, which implements a scoring function proposed by Blei and Lafferty." + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('job', 0.34822058296128233),\n", + " ('part', 0.31311075688049606),\n", + " ('student', 0.2832893627599442),\n", + " ('colleg', 0.20809000481963835),\n", + " ('time', 0.17796675292712294),\n", + " ('money', 0.16234684321361126),\n", + " ('work', 0.1558533795913366),\n", + " ('studi', 0.08228291023281153),\n", + " ('learn', 0.06491900298193354),\n", + " ('experi', 0.054945276562063716)]" + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scorer = metapy.topics.BLTermScorer(model)\n", + "[(fidx.term_text(pr[0]), pr[1]) for pr in model.top_k(tid=0, scorer=scorer)]" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('smoke', 0.874164081128221),\n", + " ('restaur', 0.31746129947227786),\n", + " ('smoker', 
0.20060262327581713),\n", + " ('ban', 0.128530349360076),\n", + " ('cigarett', 0.06557605570188008),\n", + " ('non', 0.061284206154067045),\n", + " ('complet', 0.0610537364588466),\n", + " ('japan', 0.0584657324517579),\n", + " ('health', 0.05054833214552534),\n", + " ('seat', 0.04533989023870699)]" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(fidx.term_text(pr[0]), pr[1]) for pr in model.top_k(tid=1, scorer=scorer)]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Here we can see that the uninformative word stem \"think\" was downweighted and no longer appears in the top-word list for either topic, since it had relatively high probability in both topics." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can also see the inferred topic distribution for each document." + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 167, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.topic_distribution(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "It looks like our first document was written by a student who chose the part-time job essay topic..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.topic_distribution(900)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "...whereas this document looks like it was written by a student who chose the public smoking essay topic." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/metapy/tutorials/apnews-config.toml b/metapy/tutorials/apnews-config.toml new file mode 100644 index 0000000000..c03e911368 --- /dev/null +++ b/metapy/tutorials/apnews-config.toml @@ -0,0 +1,16 @@ +prefix = "." +dataset = "apnews" +corpus = "line.toml" +index = "idx" +query-judgements = "qrels.txt" + +[[analyzers]] +method = "ngram-word" +ngram = 1 +filter = [{type = "icu-tokenizer", suppress-tags = true}, {type = "lowercase"}] + +[ranker] +method = "bm25" +k1 = 1.2 +b = 0.75 +k3 = 500 diff --git a/metapy/tutorials/ceeaus-config.toml b/metapy/tutorials/ceeaus-config.toml new file mode 100644 index 0000000000..3a3eb912ea --- /dev/null +++ b/metapy/tutorials/ceeaus-config.toml @@ -0,0 +1,10 @@ +prefix = "." 
+dataset = "ceeaus" +corpus = "line.toml" +index = "ceeaus-idx" +stop-words = "lemur-stopwords.txt" + +[[analyzers]] +method = "ngram-word" +ngram = 1 +filter = "default-unigram-chain" diff --git a/metapy/tutorials/config.toml b/metapy/tutorials/config.toml new file mode 100644 index 0000000000..575e98c4a0 --- /dev/null +++ b/metapy/tutorials/config.toml @@ -0,0 +1,4 @@ +[[analyzers]] +method = "ngram-word" +ngram = 1 +filter = [{type = "icu-tokenizer"}, {type = "lowercase"}] diff --git a/metapy/tutorials/sigir18-topic-models/sigir18-retrieval.ipynb b/metapy/tutorials/sigir18-topic-models/sigir18-retrieval.ipynb new file mode 100644 index 0000000000..838407dca2 --- /dev/null +++ b/metapy/tutorials/sigir18-topic-models/sigir18-retrieval.ipynb @@ -0,0 +1,983 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 1: Pseudo-feedback with Two-component Mixture Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's import the Python bindings for MeTA:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import metapy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you don't have `metapy` installed, you can install it with a\n", + "\n", + "```bash\n", + "pip install metapy\n", + "```\n", + "\n", + "on the command line on Linux, macOS, or Windows for either Python 2.7 or Python 3.x. (I will be using Python 3.6 in this tutorial.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Double-check that you are running the latest version. Right now, that should be `0.2.10`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.2.10'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metapy.__version__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's set MeTA to log to standard error so we can see progress output for long-running commands. (Only do this once, or you'll get double the output.)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "metapy.log_to_stderr()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's download all of the files we need for the tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import urllib.request\n", + "import os\n", + "import tarfile\n", + "\n", + "if not os.path.exists('sigir18-tutorial.tar.gz'):\n", + " urllib.request.urlretrieve('https://meta-toolkit.org/data/2018-06-25/sigir18-tutorial.tar.gz',\n", + " 'sigir18-tutorial.tar.gz')\n", + " \n", + "if not os.path.exists('data'):\n", + " with tarfile.open('sigir18-tutorial.tar.gz', 'r:gz') as files:\n", + " files.extractall()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's index our data using the `InvertedIndex` format. In a search engine, we want to quickly determine what documents mention a specific query term, so the `InvertedIndex` stores a mapping from term to a list of documents that contain that term (along with how many times they do)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Counting lines in file: [> ] 0% ETA 00:00:00 \r", + " > Counting lines in file: [=================================] 100% ETA 00:00:00 \n", + "1529953996: [info] Creating index: cranfield-idx/inv (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/inverted_index.cpp:119)\n", + " \r", + " > Tokenizing Docs: [> ] 0% ETA 00:00:00 \n", + "1529953996: [warning] Empty document (id = 470) generated! (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/inverted_index.cpp:228)\n", + " \n", + "1529953996: [warning] Empty document (id = 994) generated! (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/inverted_index.cpp:228)\n", + " \r", + " > Tokenizing Docs: [========================================] 100% ETA 00:00:00 \n", + " \r", + " > Merging: [> ] 0% ETA 00:00:00 \r", + " > Merging: [================================================] 100% ETA 00:00:00 \n", + "1529953996: [info] Created uncompressed postings file cranfield-idx/inv/postings.index (197.770000 KB) (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/inverted_index.cpp:148)\n", + " \r", + " > Compressing postings: [> ] 0% ETA 00:00:00 \r", + " > Compressing postings: [===================================] 100% ETA 00:00:00 \n", + "1529953996: [info] Created compressed postings file (168.060000 KB) (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/inverted_index.cpp:279)\n", + "1529953996: [info] Done creating index: cranfield-idx/inv (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/inverted_index.cpp:166)\n" + ] + } + ], + "source": [ + "inv_idx = metapy.index.make_inverted_index('cranfield.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This may take a minute at first, since the index needs to be built. 
Subsequent calls to `make_inverted_index` with this config file will simply load the index, which will not take any time.\n", + "\n", + "Here's how we can interact with the index object:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1400" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inv_idx.num_docs()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4137" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inv_idx.unique_terms()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "87.17857360839844" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inv_idx.avg_doc_length()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "122050" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inv_idx.total_corpus_terms()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's search our index. We'll start by creating a ranker:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "ranker = metapy.index.DirichletPrior()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we need a query. Let's create an example query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "query = metapy.index.Document()\n", + "query.content(\"flow equilibrium\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can use this to search our index like so:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(235, 1.2931444644927979),\n", + " (1251, 1.256299614906311),\n", + " (316, 1.1081531047821045),\n", + " (655, 1.0878994464874268),\n", + " (574, 1.076568841934204)]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_docs = ranker.score(inv_idx, query, num_results=5)\n", + "top_docs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are returned a ranked list of *(doc_id, score)* pairs. The scores are from the ranker, which in this case was the Dirichlet prior language-model ranker (`DirichletPrior`). Since the `cranfield.toml` config file we are using for the cranfield dataset has `store-full-text = true`, we can verify the content of our top documents by inspecting the document metadata field \"content\"." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. criteria for thermodynamic equilibrium in gas flow . when gases flow at high velocity, the rates of internal processes may not be fast enough to maintain thermodynamic equilibrium . by defining quasi-equilibrium in flow as the condition in which the...\n", + "\n", + "2. on the approach to chemical and vibrational equilibrium behind a strong normal shock wave . the concurrent approach to chemical and vibrational equilibrium of a pure diatomic gas passing through a strong normal shock wave is investigated . it is dem...\n", + "\n", + "3. non-equilibrium flow of an ideal dissociating gas . 
the theory of an'ideal dissociating'gas developed by lighthill/1957/for conditions of thermodynamic equilibrium is extended to non-equilibrium conditions by postulating a simple rate equation for th...\n", + "\n", + "4. departure from dissociation equilibrium in a hypersonic nozzle . the equations of motion for the flow of an ideal dissociating gas through a nearly conical nozzle have been solved numerically, assuming a simple equation for the rate of dissociation, ...\n", + "\n", + "5. atomic recombination in a hypersonic wind tunnel nozzle . the flow of an ideal dissociating gas through a nearly conical nozzle is considered . the equations of one-dimensional motion are solved numerically assuming a simple rate equation together wi...\n", + "\n" + ] + } + ], + "source": [ + "for num, (d_id, _) in enumerate(top_docs):\n", + " content = inv_idx.metadata(d_id).get('content')\n", + " print(\"{}. {}...\\n\".format(num + 1, content[0:250]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since we have the queries file and relevance judgements, we can do an IR evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "ev = metapy.index.IREval('cranfield.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will loop over the queries file and add each result to the `IREval` object `ev`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query 1 average precision: 0.19\n", + "Query 2 average precision: 0.5433333333333332\n", + "Query 3 average precision: 0.6541666666666666\n", + "Query 4 average precision: 0.5\n", + "Query 5 average precision: 0.35\n", + "Query 6 average precision: 0.0625\n", + "Query 7 average precision: 0.10666666666666666\n", + "Query 8 average precision: 0.0\n", + "Query 9 average precision: 0.6984126984126983\n", + "Query 10 average precision: 0.0625\n", + "Query 11 average precision: 0.028571428571428574\n", + "Query 12 average precision: 0.18\n", + "Query 13 average precision: 0.0\n", + "Query 14 average precision: 0.5\n", + "Query 15 average precision: 0.7\n", + "Query 16 average precision: 0.08333333333333333\n", + "Query 17 average precision: 0.07142857142857142\n", + "Query 18 average precision: 0.3333333333333333\n", + "Query 19 average precision: 0.0\n", + "Query 20 average precision: 0.2685185185185185\n", + "Query 21 average precision: 0.0\n", + "Query 22 average precision: 0.0\n", + "Query 23 average precision: 0.04722222222222222\n", + "Query 24 average precision: 0.3333333333333333\n", + "Query 25 average precision: 0.6507936507936507\n", + "Query 26 average precision: 0.13888888888888887\n", + "Query 27 average precision: 0.0\n", + "Query 28 average precision: 0.0\n", + "Query 29 average precision: 0.14166666666666666\n", + "Query 30 average precision: 0.11904761904761904\n", + "Query 31 average precision: 0.0\n", + "Query 32 average precision: 0.023809523809523808\n", + "Query 33 average precision: 0.5317460317460317\n", + "Query 34 average precision: 0.20555555555555557\n", + "Query 35 average precision: 0.0\n", + "Query 36 average precision: 0.25\n", + "Query 37 average precision: 0.05555555555555555\n", + "Query 38 average precision: 0.01111111111111111\n", + "Query 39 average precision: 
0.12857142857142856\n", + "Query 40 average precision: 0.07222222222222222\n", + "Query 41 average precision: 0.7666666666666666\n", + "Query 42 average precision: 0.1638095238095238\n", + "Query 43 average precision: 0.48333333333333334\n", + "Query 44 average precision: 0.0\n", + "Query 45 average precision: 0.155\n", + "Query 46 average precision: 0.5380952380952382\n", + "Query 47 average precision: 0.15083333333333332\n", + "Query 48 average precision: 0.1433333333333333\n", + "Query 49 average precision: 0.0\n", + "Query 50 average precision: 0.0\n", + "Query 51 average precision: 0.16666666666666666\n", + "Query 52 average precision: 0.027777777777777776\n", + "Query 53 average precision: 0.15857142857142856\n", + "Query 54 average precision: 0.05555555555555555\n", + "Query 55 average precision: 0.11499999999999999\n", + "Query 56 average precision: 0.05\n", + "Query 57 average precision: 0.01\n", + "Query 58 average precision: 0.15873015873015872\n", + "Query 59 average precision: 0.03571428571428571\n", + "Query 60 average precision: 0.4666666666666667\n", + "Query 61 average precision: 0.45999999999999996\n", + "Query 62 average precision: 0.0\n", + "Query 63 average precision: 0.0\n", + "Query 64 average precision: 0.5\n", + "Query 65 average precision: 0.0\n", + "Query 66 average precision: 0.0\n", + "Query 67 average precision: 0.0365079365079365\n", + "Query 68 average precision: 0.1\n", + "Query 69 average precision: 0.06666666666666667\n", + "Query 70 average precision: 0.1\n", + "Query 71 average precision: 0.03125\n", + "Query 72 average precision: 0.0\n", + "Query 73 average precision: 0.2861111111111111\n", + "Query 74 average precision: 0.020833333333333332\n", + "Query 75 average precision: 0.03333333333333333\n", + "Query 76 average precision: 0.028571428571428574\n", + "Query 77 average precision: 0.28888888888888886\n", + "Query 78 average precision: 0.6666666666666666\n", + "Query 79 average precision: 0.0\n", + "Query 80 average 
precision: 0.0\n", + "Query 81 average precision: 0.25\n", + "Query 82 average precision: 0.09\n", + "Query 83 average precision: 0.0625\n", + "Query 84 average precision: 0.3\n", + "Query 85 average precision: 0.05\n", + "Query 86 average precision: 0.41666666666666663\n", + "Query 87 average precision: 0.0\n", + "Query 88 average precision: 0.594047619047619\n", + "Query 89 average precision: 0.075\n", + "Query 90 average precision: 0.19\n", + "Query 91 average precision: 0.0873015873015873\n", + "Query 92 average precision: 0.4726984126984126\n", + "Query 93 average precision: 0.5\n", + "Query 94 average precision: 0.4\n", + "Query 95 average precision: 0.5\n", + "Query 96 average precision: 0.4463095238095239\n", + "Query 97 average precision: 0.15416666666666665\n", + "Query 98 average precision: 0.0\n", + "Query 99 average precision: 0.19642857142857142\n", + "Query 100 average precision: 0.2185185185185185\n", + "Query 101 average precision: 0.6180555555555555\n", + "Query 102 average precision: 0.08333333333333333\n", + "Query 103 average precision: 0.05555555555555555\n", + "Query 104 average precision: 0.1\n", + "Query 105 average precision: 0.35333333333333333\n", + "Query 106 average precision: 0.24666666666666667\n", + "Query 107 average precision: 0.10476190476190476\n", + "Query 108 average precision: 0.6261904761904761\n", + "Query 109 average precision: 0.0\n", + "Query 110 average precision: 0.0\n", + "Query 111 average precision: 0.07619047619047618\n", + "Query 112 average precision: 0.3611111111111111\n", + "Query 113 average precision: 0.10416666666666666\n", + "Query 114 average precision: 0.0\n", + "Query 115 average precision: 0.0\n", + "Query 116 average precision: 0.03333333333333333\n", + "Query 117 average precision: 0.0\n", + "Query 118 average precision: 0.041666666666666664\n", + "Query 119 average precision: 0.5\n", + "Query 120 average precision: 0.17724867724867724\n", + "Query 121 average precision: 0.4768707482993197\n", + 
"Query 122 average precision: 0.05925925925925926\n", + "Query 123 average precision: 0.0\n", + "Query 124 average precision: 0.0\n", + "Query 125 average precision: 0.02\n", + "Query 126 average precision: 0.20833333333333331\n", + "Query 127 average precision: 0.025\n", + "Query 128 average precision: 0.0\n", + "Query 129 average precision: 0.4773809523809524\n", + "Query 130 average precision: 0.3933333333333333\n", + "Query 131 average precision: 0.09375\n", + "Query 132 average precision: 0.6592063492063491\n", + "Query 133 average precision: 0.12976190476190477\n", + "Query 134 average precision: 0.5\n", + "Query 135 average precision: 0.38690476190476186\n", + "Query 136 average precision: 0.03333333333333333\n", + "Query 137 average precision: 0.1875\n", + "Query 138 average precision: 0.25\n", + "Query 139 average precision: 0.0\n", + "Query 140 average precision: 0.10833333333333334\n", + "Query 141 average precision: 0.05555555555555555\n", + "Query 142 average precision: 0.0\n", + "Query 143 average precision: 0.6111111111111112\n", + "Query 144 average precision: 0.14722222222222223\n", + "Query 145 average precision: 0.07142857142857142\n", + "Query 146 average precision: 0.41666666666666663\n", + "Query 147 average precision: 0.14666666666666667\n", + "Query 148 average precision: 0.125\n", + "Query 149 average precision: 0.18285714285714286\n", + "Query 150 average precision: 1.0\n", + "Query 151 average precision: 0.0\n", + "Query 152 average precision: 0.0\n", + "Query 153 average precision: 0.19999999999999998\n", + "Query 154 average precision: 1.0\n", + "Query 155 average precision: 0.06481481481481481\n", + "Query 156 average precision: 0.6365476190476189\n", + "Query 157 average precision: 0.43809523809523804\n", + "Query 158 average precision: 0.175\n", + "Query 159 average precision: 0.015625\n", + "Query 160 average precision: 0.2\n", + "Query 161 average precision: 0.43333333333333335\n", + "Query 162 average precision: 
0.020833333333333332\n", + "Query 163 average precision: 0.21666666666666667\n", + "Query 164 average precision: 0.40208333333333335\n", + "Query 165 average precision: 0.16666666666666666\n", + "Query 166 average precision: 0.0\n", + "Query 167 average precision: 0.325\n", + "Query 168 average precision: 0.08333333333333333\n", + "Query 169 average precision: 0.125\n", + "Query 170 average precision: 0.4611111111111111\n", + "Query 171 average precision: 0.4888888888888889\n", + "Query 172 average precision: 0.5416666666666666\n", + "Query 173 average precision: 0.8333333333333333\n", + "Query 174 average precision: 0.03333333333333333\n", + "Query 175 average precision: 0.02222222222222222\n", + "Query 176 average precision: 0.0\n", + "Query 177 average precision: 0.475\n", + "Query 178 average precision: 0.49166666666666664\n", + "Query 179 average precision: 0.125\n", + "Query 180 average precision: 0.34523809523809523\n", + "Query 181 average precision: 0.2\n", + "Query 182 average precision: 0.7\n", + "Query 183 average precision: 0.5308730158730158\n", + "Query 184 average precision: 0.06938775510204082\n", + "Query 185 average precision: 0.6388888888888888\n", + "Query 186 average precision: 0.016666666666666666\n", + "Query 187 average precision: 0.16666666666666666\n", + "Query 188 average precision: 0.09\n", + "Query 189 average precision: 0.015873015873015872\n", + "Query 190 average precision: 0.1\n", + "Query 191 average precision: 0.014285714285714285\n", + "Query 192 average precision: 0.5\n", + "Query 193 average precision: 0.6481481481481481\n", + "Query 194 average precision: 0.08888888888888889\n", + "Query 195 average precision: 0.1111111111111111\n", + "Query 196 average precision: 0.03333333333333333\n", + "Query 197 average precision: 0.6666666666666666\n", + "Query 198 average precision: 0.3125\n", + "Query 199 average precision: 0.05208333333333333\n", + "Query 200 average precision: 0.24074074074074073\n", + "Query 201 average precision: 
0.3\n", + "Query 202 average precision: 0.26\n", + "Query 203 average precision: 0.07222222222222222\n", + "Query 204 average precision: 0.0\n", + "Query 205 average precision: 0.75\n", + "Query 206 average precision: 0.3333333333333333\n", + "Query 207 average precision: 0.15\n", + "Query 208 average precision: 0.6829365079365081\n", + "Query 209 average precision: 0.02\n", + "Query 210 average precision: 0.27777777777777773\n", + "Query 211 average precision: 0.0125\n", + "Query 212 average precision: 0.36666666666666664\n", + "Query 213 average precision: 0.5583333333333333\n", + "Query 214 average precision: 0.08333333333333333\n", + "Query 215 average precision: 0.0\n", + "Query 216 average precision: 0.0\n", + "Query 217 average precision: 0.05833333333333333\n", + "Query 218 average precision: 0.0\n", + "Query 219 average precision: 0.014285714285714285\n", + "Query 220 average precision: 0.05333333333333333\n", + "Query 221 average precision: 0.24333333333333332\n", + "Query 222 average precision: 0.5148148148148147\n", + "Query 223 average precision: 0.44375\n", + "Query 224 average precision: 0.0\n", + "Query 225 average precision: 0.13333333333333333\n" + ] + } + ], + "source": [ + "def evaluate_ranker(ranker, ev, num_results):\n", + " ev.reset_stats()\n", + " with open('data/cranfield/cranfield-queries.txt') as query_file:\n", + " for query_num, line in enumerate(query_file):\n", + " query.content(line.strip())\n", + " results = ranker.score(inv_idx, query, num_results) \n", + " avg_p = ev.avg_p(results, query_num + 1, num_results)\n", + " print(\"Query {} average precision: {}\".format(query_num + 1, avg_p))\n", + " \n", + "evaluate_ranker(ranker, ev, 10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Afterwards, we can get the mean average precision of all the queries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAP: 0.21512203955656342\n" + ] + } + ], + "source": [ + "dp_map = ev.map()\n", + "print(\"MAP: {}\".format(dp_map))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's use the two-component mixture model we discussed as an implementation of pseudo-feedback for retrieval and see if it helps improve performance. The actual ranking function used here is KL-divergence, where the query model is adjusted to include pseudo-feedback from the retrieved documents.\n", + "\n", + "In order to work, the ranker needs to be able to quickly determine what words were used in the feedback document set. The `InvertedIndex` does not provide fast access to this (since it is a mapping from term to documents, rather than from documents to terms), so we will want to first create a `ForwardIndex` to get the document -> terms mapping." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Counting lines in file: [> ] 0% ETA 00:00:00 \r", + " > Counting lines in file: [=================================] 100% ETA 00:00:00 \n", + "1529954010: [info] Creating forward index: cranfield-idx/fwd (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/forward_index.cpp:239)\n", + " \r", + " > Tokenizing Docs: [> ] 0% ETA 00:00:00 \n", + "1529954010: [warning] Empty document (id = 470) generated! (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/forward_index.cpp:335)\n", + " \n", + "1529954010: [warning] Empty document (id = 994) generated! 
(/tmp/pip-req-build-m473bt6z/deps/meta/src/index/forward_index.cpp:335)\n", + " \r", + " > Tokenizing Docs: [========================================] 100% ETA 00:00:00 \n", + " \r", + " > Merging: [> ] 0% ETA 00:00:00 \r", + " > Merging: [================================================] 100% ETA 00:00:00 \n", + "1529954010: [info] Done creating index: cranfield-idx/fwd (/tmp/pip-req-build-m473bt6z/deps/meta/src/index/forward_index.cpp:278)\n" + ] + } + ], + "source": [ + "fwd_idx = metapy.index.make_forward_index('cranfield.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can construct the KL-divergence pseudo-feedback ranker. The main components are:\n", + "1. The forward index\n", + "2. A base language-model ranker (here we'll use `DirichletPrior`)\n", + "3. $\\alpha$, the query interpolation parameter (how strongly do we prefer terms from the feedback model? default 0.5)\n", + "4. $\\lambda$, the language-model interpolation parameter (how strong is the background model in the two-component mixture? default 0.5)\n", + "5. $k$, the number of documents to retrieve for the feedback set (default 10)\n", + "6. 
`max_terms`, the number of terms from the feedback model to incorporate into the new query model (default 50) " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "feedback = metapy.index.KLDivergencePRF(fwd_idx, metapy.index.DirichletPrior())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query 1 average precision: 0.13999999999999999\n", + "Query 2 average precision: 0.524047619047619\n", + "Query 3 average precision: 0.6642857142857143\n", + "Query 4 average precision: 0.5\n", + "Query 5 average precision: 0.6875\n", + "Query 6 average precision: 0.0625\n", + "Query 7 average precision: 0.11666666666666665\n", + "Query 8 average precision: 0.0\n", + "Query 9 average precision: 0.49999999999999994\n", + "Query 10 average precision: 0.0625\n", + "Query 11 average precision: 0.023809523809523808\n", + "Query 12 average precision: 0.15714285714285714\n", + "Query 13 average precision: 0.0\n", + "Query 14 average precision: 0.5\n", + "Query 15 average precision: 0.6428571428571428\n", + "Query 16 average precision: 0.05555555555555555\n", + "Query 17 average precision: 0.05\n", + "Query 18 average precision: 0.16666666666666666\n", + "Query 19 average precision: 0.0\n", + "Query 20 average precision: 0.33201058201058203\n", + "Query 21 average precision: 0.0\n", + "Query 22 average precision: 0.0\n", + "Query 23 average precision: 0.075\n", + "Query 24 average precision: 0.38888888888888884\n", + "Query 25 average precision: 0.7530864197530864\n", + "Query 26 average precision: 0.061111111111111116\n", + "Query 27 average precision: 0.0\n", + "Query 28 average precision: 0.0\n", + "Query 29 average precision: 0.091005291005291\n", + "Query 30 average precision: 0.19999999999999998\n", + "Query 31 average precision: 0.0\n", + "Query 32 average precision: 0.023809523809523808\n", + "Query 33 
average precision: 0.6666666666666666\n", + "Query 34 average precision: 0.21031746031746032\n", + "Query 35 average precision: 0.03333333333333333\n", + "Query 36 average precision: 0.6\n", + "Query 37 average precision: 0.1111111111111111\n", + "Query 38 average precision: 0.02\n", + "Query 39 average precision: 0.12857142857142856\n", + "Query 40 average precision: 0.07222222222222222\n", + "Query 41 average precision: 0.7916666666666666\n", + "Query 42 average precision: 0.14523809523809522\n", + "Query 43 average precision: 0.3333333333333333\n", + "Query 44 average precision: 0.0\n", + "Query 45 average precision: 0.13999999999999999\n", + "Query 46 average precision: 0.49071428571428566\n", + "Query 47 average precision: 0.14027777777777778\n", + "Query 48 average precision: 0.1433333333333333\n", + "Query 49 average precision: 0.0\n", + "Query 50 average precision: 0.0\n", + "Query 51 average precision: 0.22999999999999998\n", + "Query 52 average precision: 0.08125\n", + "Query 53 average precision: 0.15\n", + "Query 54 average precision: 0.037037037037037035\n", + "Query 55 average precision: 0.11833333333333333\n", + "Query 56 average precision: 0.05\n", + "Query 57 average precision: 0.0125\n", + "Query 58 average precision: 0.16666666666666666\n", + "Query 59 average precision: 0.03571428571428571\n", + "Query 60 average precision: 0.45999999999999996\n", + "Query 61 average precision: 0.4083333333333333\n", + "Query 62 average precision: 0.0\n", + "Query 63 average precision: 0.0\n", + "Query 64 average precision: 0.5\n", + "Query 65 average precision: 0.0\n", + "Query 66 average precision: 0.0\n", + "Query 67 average precision: 0.03111111111111111\n", + "Query 68 average precision: 0.06666666666666667\n", + "Query 69 average precision: 0.2\n", + "Query 70 average precision: 0.1\n", + "Query 71 average precision: 0.041666666666666664\n", + "Query 72 average precision: 0.0\n", + "Query 73 average precision: 0.38916666666666666\n", + "Query 74 average 
precision: 0.05555555555555555\n", + "Query 75 average precision: 0.02222222222222222\n", + "Query 76 average precision: 0.02040816326530612\n", + "Query 77 average precision: 0.34027777777777773\n", + "Query 78 average precision: 0.7916666666666666\n", + "Query 79 average precision: 0.0\n", + "Query 80 average precision: 0.0\n", + "Query 81 average precision: 0.25\n", + "Query 82 average precision: 0.07333333333333333\n", + "Query 83 average precision: 0.05\n", + "Query 84 average precision: 0.2375\n", + "Query 85 average precision: 0.05\n", + "Query 86 average precision: 0.30952380952380953\n", + "Query 87 average precision: 0.0\n", + "Query 88 average precision: 0.594047619047619\n", + "Query 89 average precision: 0.10833333333333334\n", + "Query 90 average precision: 0.1875\n", + "Query 91 average precision: 0.1111111111111111\n", + "Query 92 average precision: 0.5747619047619048\n", + "Query 93 average precision: 0.5\n", + "Query 94 average precision: 0.4\n", + "Query 95 average precision: 0.5\n", + "Query 96 average precision: 0.6042063492063492\n", + "Query 97 average precision: 0.24166666666666664\n", + "Query 98 average precision: 0.0\n", + "Query 99 average precision: 0.25\n", + "Query 100 average precision: 0.25555555555555554\n", + "Query 101 average precision: 0.5444444444444444\n", + "Query 102 average precision: 0.08333333333333333\n", + "Query 103 average precision: 0.1\n", + "Query 104 average precision: 0.1\n", + "Query 105 average precision: 0.4333333333333333\n", + "Query 106 average precision: 0.3746031746031746\n", + "Query 107 average precision: 0.1619047619047619\n", + "Query 108 average precision: 0.5488095238095239\n", + "Query 109 average precision: 0.0\n", + "Query 110 average precision: 0.0\n", + "Query 111 average precision: 0.014285714285714287\n", + "Query 112 average precision: 0.3611111111111111\n", + "Query 113 average precision: 0.10555555555555556\n", + "Query 114 average precision: 0.0\n", + "Query 115 average precision: 
0.0\n", + "Query 116 average precision: 0.02857142857142857\n", + "Query 117 average precision: 0.0\n", + "Query 118 average precision: 0.037037037037037035\n", + "Query 119 average precision: 0.3333333333333333\n", + "Query 120 average precision: 0.1527777777777778\n", + "Query 121 average precision: 0.42857142857142855\n", + "Query 122 average precision: 0.07777777777777778\n", + "Query 123 average precision: 0.0\n", + "Query 124 average precision: 0.0\n", + "Query 125 average precision: 0.03333333333333333\n", + "Query 126 average precision: 0.20833333333333331\n", + "Query 127 average precision: 0.02222222222222222\n", + "Query 128 average precision: 0.0\n", + "Query 129 average precision: 0.6738095238095239\n", + "Query 130 average precision: 0.55\n", + "Query 131 average precision: 0.18333333333333335\n", + "Query 132 average precision: 0.7254365079365078\n", + "Query 133 average precision: 0.18010204081632653\n", + "Query 134 average precision: 0.5\n", + "Query 135 average precision: 0.37351190476190477\n", + "Query 136 average precision: 0.047619047619047616\n", + "Query 137 average precision: 0.1125\n", + "Query 138 average precision: 0.5\n", + "Query 139 average precision: 0.0\n", + "Query 140 average precision: 0.10833333333333334\n", + "Query 141 average precision: 0.041666666666666664\n", + "Query 142 average precision: 0.0\n", + "Query 143 average precision: 0.39285714285714285\n", + "Query 144 average precision: 0.20555555555555557\n", + "Query 145 average precision: 0.07142857142857142\n", + "Query 146 average precision: 0.30952380952380953\n", + "Query 147 average precision: 0.14999999999999997\n", + "Query 148 average precision: 0.075\n", + "Query 149 average precision: 0.17666666666666667\n", + "Query 150 average precision: 1.0\n", + "Query 151 average precision: 0.0\n", + "Query 152 average precision: 0.0\n", + "Query 153 average precision: 0.2571428571428571\n", + "Query 154 average precision: 1.0\n", + "Query 155 average precision: 
0.06666666666666667\n", + "Query 156 average precision: 0.7532142857142856\n", + "Query 157 average precision: 0.40555555555555556\n", + "Query 158 average precision: 0.19375\n", + "Query 159 average precision: 0.020833333333333332\n", + "Query 160 average precision: 0.2\n", + "Query 161 average precision: 0.3611111111111111\n", + "Query 162 average precision: 0.013888888888888888\n", + "Query 163 average precision: 0.19444444444444442\n", + "Query 164 average precision: 0.40208333333333335\n", + "Query 165 average precision: 0.125\n", + "Query 166 average precision: 0.0\n", + "Query 167 average precision: 0.5\n", + "Query 168 average precision: 0.08333333333333333\n", + "Query 169 average precision: 0.25\n", + "Query 170 average precision: 0.46990740740740744\n", + "Query 171 average precision: 0.5555555555555555\n", + "Query 172 average precision: 0.5583333333333333\n", + "Query 173 average precision: 0.7\n", + "Query 174 average precision: 0.03333333333333333\n", + "Query 175 average precision: 0.02857142857142857\n", + "Query 176 average precision: 0.0\n", + "Query 177 average precision: 0.475\n", + "Query 178 average precision: 0.75\n", + "Query 179 average precision: 0.25\n", + "Query 180 average precision: 0.3880952380952381\n", + "Query 181 average precision: 0.1\n", + "Query 182 average precision: 0.7\n", + "Query 183 average precision: 0.6842063492063492\n", + "Query 184 average precision: 0.05215419501133787\n", + "Query 185 average precision: 0.6296296296296297\n", + "Query 186 average precision: 0.04722222222222222\n", + "Query 187 average precision: 0.125\n", + "Query 188 average precision: 0.19\n", + "Query 189 average precision: 0.022222222222222223\n", + "Query 190 average precision: 0.05\n", + "Query 191 average precision: 0.05357142857142857\n", + "Query 192 average precision: 0.575\n", + "Query 193 average precision: 0.6565255731922398\n", + "Query 194 average precision: 0.19999999999999998\n", + "Query 195 average precision: 
0.3333333333333333\n", + "Query 196 average precision: 0.05\n", + "Query 197 average precision: 0.6666666666666666\n", + "Query 198 average precision: 0.1875\n", + "Query 199 average precision: 0.048611111111111105\n", + "Query 200 average precision: 0.40740740740740744\n", + "Query 201 average precision: 0.33999999999999997\n", + "Query 202 average precision: 0.22666666666666666\n", + "Query 203 average precision: 0.07333333333333333\n", + "Query 204 average precision: 0.0\n", + "Query 205 average precision: 0.7\n", + "Query 206 average precision: 0.38888888888888884\n", + "Query 207 average precision: 0.3\n", + "Query 208 average precision: 0.7345238095238096\n", + "Query 209 average precision: 0.03333333333333333\n", + "Query 210 average precision: 0.27777777777777773\n", + "Query 211 average precision: 0.025\n", + "Query 212 average precision: 0.36666666666666664\n", + "Query 213 average precision: 0.49309523809523814\n", + "Query 214 average precision: 0.08333333333333333\n", + "Query 215 average precision: 0.0\n", + "Query 216 average precision: 0.0\n", + "Query 217 average precision: 0.07857142857142857\n", + "Query 218 average precision: 0.0\n", + "Query 219 average precision: 0.014285714285714285\n", + "Query 220 average precision: 0.07333333333333333\n", + "Query 221 average precision: 0.24333333333333332\n", + "Query 222 average precision: 0.5238095238095238\n", + "Query 223 average precision: 0.4583333333333333\n", + "Query 224 average precision: 0.0\n", + "Query 225 average precision: 0.15\n" + ] + } + ], + "source": [ + "evaluate_ranker(feedback, ev, 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feedback MAP: 0.22816526133086987\n", + "DP MAP: 0.21512203955656342\n" + ] + } + ], + "source": [ + "fb_map = ev.map()\n", + "print(\"Feedback MAP: {}\".format(fb_map))\n", + "print(\"DP MAP: {}\".format(dp_map))" + ] + } + ], + "metadata": { 
+ "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/metapy/tutorials/sigir18-topic-models/sigir18-topic-models.ipynb b/metapy/tutorials/sigir18-topic-models/sigir18-topic-models.ipynb new file mode 100644 index 0000000000..84ad41f4c4 --- /dev/null +++ b/metapy/tutorials/sigir18-topic-models/sigir18-topic-models.ipynb @@ -0,0 +1,1188 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 2: Topic Model Inference in LDA" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's import the Python bindings for MeTA:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import metapy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you don't have `metapy` installed, you can install it with a\n", + "\n", + "```bash\n", + "pip install metapy\n", + "```\n", + "\n", + "on the command line on Linux, macOS, or Windows for either Python 2.7 or Python 3.x. (I will be using Python 3.6 in this tutorial.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Double-check that you are running the latest version. Right now, that should be `0.2.11`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.2.11'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metapy.__version__" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's set MeTA to log to standard error so we can see progress output for long-running commands. (Only do this once, or you'll get double the output.)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "metapy.log_to_stderr()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's download all of the files we need for the tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import urllib.request\n", + "import os\n", + "import tarfile\n", + "\n", + "if not os.path.exists('sigir18-tutorial.tar.gz'):\n", + " urllib.request.urlretrieve('https://meta-toolkit.org/data/2018-06-25/sigir18-tutorial.tar.gz',\n", + " 'sigir18-tutorial.tar.gz')\n", + " \n", + "if not os.path.exists('data'):\n", + " with tarfile.open('sigir18-tutorial.tar.gz', 'r:gz') as files:\n", + " files.extractall()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The tutorial files come with a dataset consisting of four years of NIPS proceedings (full text): 2002, 2007, 2012, and 2017.\n", + "\n", + "To start, we first want to understand what topics are being discussed in NIPS in these four years. To do that, we'll first index the dataset in the `ForwardIndex` format (we want to map documents to the terms that they contain)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1530029662: [info] Creating forward index: nips-idx/fwd (/tmp/pip-req-build-s8007td9/deps/meta/src/index/forward_index.cpp:239)\n", + " > Tokenizing Docs: [===================================> ] 88% ETA 00:00:00 \n", + "1530029664: [warning] Empty document (id = 1435) generated! (/tmp/pip-req-build-s8007td9/deps/meta/src/index/forward_index.cpp:335)\n", + " > Tokenizing Docs: [========================================] 100% ETA 00:00:00 \n", + " > Merging: [================================================] 100% ETA 00:00:00 \n", + "1530029664: [info] Done creating index: nips-idx/fwd (/tmp/pip-req-build-s8007td9/deps/meta/src/index/forward_index.cpp:278)\n" + ] + } + ], + "source": [ + "fidx = metapy.index.make_forward_index('nips.toml')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's load in all of the documents into memory so we can start to infer a topic model. I'm going to load them in as a `MulticlassDataset` because each document here has been associated with a label (the year it came from), but you could also load them in as just a standard `Dataset` with no associated labels if you don't plan to use them." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Loading instances into memory: [> ] 0% ETA 00:00:00 \r", + " > Loading instances into memory: [==========================] 100% ETA 00:00:00 \n" + ] + } + ], + "source": [ + "dset = metapy.classify.MulticlassDataset(fidx)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the documents loaded into memory, we can start to run LDA inference on them to infer the topics and their coverage in each of the documents. 
There are several choices for inference algorithm in MeTA, so in general you can just pick your favorite. Here, I'm going to pick a parallelized version of Gibbs sampling.\n", + "\n", + "The below will run the sampler for either 1000 iterations or until the log likelihood ($\\log P(W \\mid Z)$) stabilizes, whichever comes first. (If you want to disable the convergence checking and just run the sampler for a fixed number of iterations, you can add the parameter `convergence=0`.)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Initialization log likelihood (log P(W|Z)): -3.05507e+07 \n", + "Iteration 1 log likelihood (log P(W|Z)): -3.05155e+07 \n", + "Iteration 2 log likelihood (log P(W|Z)): -3.04318e+07 \n", + "Iteration 3 log likelihood (log P(W|Z)): -3.03566e+07 \n", + "Iteration 4 log likelihood (log P(W|Z)): -3.02892e+07 \n", + "Iteration 5 log likelihood (log P(W|Z)): -3.0229e+07 \n", + "Iteration 6 log likelihood (log P(W|Z)): -3.01748e+07 \n", + "Iteration 7 log likelihood (log P(W|Z)): -3.0123e+07 \n", + "Iteration 8 log likelihood (log P(W|Z)): -3.00742e+07 \n", + "Iteration 9 log likelihood (log P(W|Z)): -3.00266e+07 \n", + "Iteration 10 log likelihood (log P(W|Z)): -2.99788e+07 \n", + "Iteration 11 log likelihood (log P(W|Z)): -2.99365e+07 \n", + "Iteration 12 log likelihood (log P(W|Z)): -2.98953e+07 \n", + "Iteration 13 log likelihood (log P(W|Z)): -2.98584e+07 \n", + "Iteration 14 log likelihood (log P(W|Z)): -2.9823e+07 \n", + "Iteration 15 log likelihood (log P(W|Z)): -2.97915e+07 \n", + "Iteration 16 log likelihood (log P(W|Z)): -2.9764e+07 \n", + "Iteration 17 log likelihood (log P(W|Z)): -2.97375e+07 \n", + "Iteration 18 log likelihood (log P(W|Z)): -2.97107e+07 \n", + "Iteration 19 log likelihood (log P(W|Z)): -2.96852e+07 \n", + "Iteration 20 log likelihood (log P(W|Z)): -2.96599e+07 \n", + "Iteration 21 log likelihood (log P(W|Z)): 
-2.96358e+07 \n", + "Iteration 22 log likelihood (log P(W|Z)): -2.9616e+07 \n", + "Iteration 23 log likelihood (log P(W|Z)): -2.95975e+07 \n", + "Iteration 24 log likelihood (log P(W|Z)): -2.95787e+07 \n", + "Iteration 25 log likelihood (log P(W|Z)): -2.95601e+07 \n", + "Iteration 26 log likelihood (log P(W|Z)): -2.95423e+07 \n", + "Iteration 27 log likelihood (log P(W|Z)): -2.95256e+07 \n", + "Iteration 28 log likelihood (log P(W|Z)): -2.95089e+07 \n", + "Iteration 29 log likelihood (log P(W|Z)): -2.94934e+07 \n", + "Iteration 30 log likelihood (log P(W|Z)): -2.94771e+07 \n", + "Iteration 31 log likelihood (log P(W|Z)): -2.94604e+07 \n", + "Iteration 32 log likelihood (log P(W|Z)): -2.94411e+07 \n", + "Iteration 33 log likelihood (log P(W|Z)): -2.94258e+07 \n", + "Iteration 34 log likelihood (log P(W|Z)): -2.94121e+07 \n", + "Iteration 35 log likelihood (log P(W|Z)): -2.93981e+07 \n", + "Iteration 36 log likelihood (log P(W|Z)): -2.93842e+07 \n", + "Iteration 37 log likelihood (log P(W|Z)): -2.93683e+07 \n", + "Iteration 38 log likelihood (log P(W|Z)): -2.93535e+07 \n", + "Iteration 39 log likelihood (log P(W|Z)): -2.93385e+07 \n", + "Iteration 40 log likelihood (log P(W|Z)): -2.93255e+07 \n", + "Iteration 41 log likelihood (log P(W|Z)): -2.93134e+07 \n", + "Iteration 42 log likelihood (log P(W|Z)): -2.93024e+07 \n", + "Iteration 43 log likelihood (log P(W|Z)): -2.92893e+07 \n", + "Iteration 44 log likelihood (log P(W|Z)): -2.92773e+07 \n", + "Iteration 45 log likelihood (log P(W|Z)): -2.92652e+07 \n", + "Iteration 46 log likelihood (log P(W|Z)): -2.92528e+07 \n", + "Iteration 47 log likelihood (log P(W|Z)): -2.92426e+07 \n", + "Iteration 48 log likelihood (log P(W|Z)): -2.92319e+07 \n", + "Iteration 49 log likelihood (log P(W|Z)): -2.92214e+07 \n", + "Iteration 50 log likelihood (log P(W|Z)): -2.92125e+07 \n", + "Iteration 51 log likelihood (log P(W|Z)): -2.92024e+07 \n", + "Iteration 52 log likelihood (log P(W|Z)): -2.91915e+07 \n", + "Iteration 53 log 
likelihood (log P(W|Z)): -2.91817e+07 \n", + "Iteration 54 log likelihood (log P(W|Z)): -2.91718e+07 \n", + "Iteration 55 log likelihood (log P(W|Z)): -2.91624e+07 \n", + "Iteration 56 log likelihood (log P(W|Z)): -2.91524e+07 \n", + "Iteration 57 log likelihood (log P(W|Z)): -2.91437e+07 \n", + "Iteration 58 log likelihood (log P(W|Z)): -2.9136e+07 \n", + "Iteration 59 log likelihood (log P(W|Z)): -2.91267e+07 \n", + "Iteration 60 log likelihood (log P(W|Z)): -2.91166e+07 \n", + "Iteration 61 log likelihood (log P(W|Z)): -2.91116e+07 \n", + "Iteration 62 log likelihood (log P(W|Z)): -2.91032e+07 \n", + "Iteration 63 log likelihood (log P(W|Z)): -2.90974e+07 \n", + "Iteration 64 log likelihood (log P(W|Z)): -2.90928e+07 \n", + "Iteration 65 log likelihood (log P(W|Z)): -2.90862e+07 \n", + "Iteration 66 log likelihood (log P(W|Z)): -2.90802e+07 \n", + "Iteration 67 log likelihood (log P(W|Z)): -2.9076e+07 \n", + "Iteration 68 log likelihood (log P(W|Z)): -2.90701e+07 \n", + "Iteration 69 log likelihood (log P(W|Z)): -2.90642e+07 \n", + "Iteration 70 log likelihood (log P(W|Z)): -2.90588e+07 \n", + "Iteration 71 log likelihood (log P(W|Z)): -2.90521e+07 \n", + "Iteration 72 log likelihood (log P(W|Z)): -2.90461e+07 \n", + "Iteration 73 log likelihood (log P(W|Z)): -2.90403e+07 \n", + "Iteration 74 log likelihood (log P(W|Z)): -2.90336e+07 \n", + "Iteration 75 log likelihood (log P(W|Z)): -2.90275e+07 \n", + "Iteration 76 log likelihood (log P(W|Z)): -2.9024e+07 \n", + "Iteration 77 log likelihood (log P(W|Z)): -2.90169e+07 \n", + "Iteration 78 log likelihood (log P(W|Z)): -2.90139e+07 \n", + "Iteration 79 log likelihood (log P(W|Z)): -2.90059e+07 \n", + "Iteration 80 log likelihood (log P(W|Z)): -2.90029e+07 \n", + "Iteration 81 log likelihood (log P(W|Z)): -2.89997e+07 \n", + "Iteration 82 log likelihood (log P(W|Z)): -2.8994e+07 \n", + "Iteration 83 log likelihood (log P(W|Z)): -2.89882e+07 \n", + "Iteration 84 log likelihood (log P(W|Z)): -2.89821e+07 \n", + 
"Iteration 85 log likelihood (log P(W|Z)): -2.89808e+07 \n", + "Iteration 86 log likelihood (log P(W|Z)): -2.89763e+07 \n", + "Iteration 87 log likelihood (log P(W|Z)): -2.89707e+07 \n", + "Iteration 88 log likelihood (log P(W|Z)): -2.89659e+07 \n", + "Iteration 89 log likelihood (log P(W|Z)): -2.89618e+07 \n", + "Iteration 90 log likelihood (log P(W|Z)): -2.89592e+07 \n", + "Iteration 91 log likelihood (log P(W|Z)): -2.89556e+07 \n", + "Iteration 92 log likelihood (log P(W|Z)): -2.89521e+07 \n", + "Iteration 93 log likelihood (log P(W|Z)): -2.89499e+07 \n", + "Iteration 94 log likelihood (log P(W|Z)): -2.89452e+07 \n", + "Iteration 95 log likelihood (log P(W|Z)): -2.8943e+07 \n", + "Iteration 96 log likelihood (log P(W|Z)): -2.89395e+07 \n", + "Iteration 97 log likelihood (log P(W|Z)): -2.89344e+07 \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Iteration 98 log likelihood (log P(W|Z)): -2.89316e+07 \n", + "Iteration 99 log likelihood (log P(W|Z)): -2.89272e+07 \n", + "Iteration 100 log likelihood (log P(W|Z)): -2.89221e+07 \n", + "Iteration 101 log likelihood (log P(W|Z)): -2.89194e+07 \n", + "Iteration 102 log likelihood (log P(W|Z)): -2.89157e+07 \n", + "Iteration 103 log likelihood (log P(W|Z)): -2.89128e+07 \n", + "Iteration 104 log likelihood (log P(W|Z)): -2.89098e+07 \n", + "Iteration 105 log likelihood (log P(W|Z)): -2.89056e+07 \n", + "Iteration 106 log likelihood (log P(W|Z)): -2.89036e+07 \n", + "Iteration 107 log likelihood (log P(W|Z)): -2.88998e+07 \n", + "Iteration 108 log likelihood (log P(W|Z)): -2.88973e+07 \n", + "Iteration 109 log likelihood (log P(W|Z)): -2.88933e+07 \n", + "Iteration 110 log likelihood (log P(W|Z)): -2.88906e+07 \n", + "Iteration 111 log likelihood (log P(W|Z)): -2.88857e+07 \n", + "Iteration 112 log likelihood (log P(W|Z)): -2.88837e+07 \n", + "Iteration 113 log likelihood (log P(W|Z)): -2.88801e+07 \n", + "Iteration 114 log likelihood (log P(W|Z)): -2.88774e+07 \n", + "Iteration 115 log 
likelihood (log P(W|Z)): -2.8874e+07 \n", + "Iteration 116 log likelihood (log P(W|Z)): -2.88712e+07 \n", + "Iteration 117 log likelihood (log P(W|Z)): -2.88682e+07 \n", + "Iteration 118 log likelihood (log P(W|Z)): -2.88675e+07 \n", + "Iteration 119 log likelihood (log P(W|Z)): -2.88655e+07 \n", + "Iteration 120 log likelihood (log P(W|Z)): -2.88631e+07 \n", + "Iteration 121 log likelihood (log P(W|Z)): -2.88604e+07 \n", + "Iteration 122 log likelihood (log P(W|Z)): -2.886e+07 \n", + "Iteration 123 log likelihood (log P(W|Z)): -2.88581e+07 \n", + "Iteration 124 log likelihood (log P(W|Z)): -2.88562e+07 \n", + "Iteration 125 log likelihood (log P(W|Z)): -2.88539e+07 \n", + "Iteration 126 log likelihood (log P(W|Z)): -2.88511e+07 \n", + "Iteration 127 log likelihood (log P(W|Z)): -2.88496e+07 \n", + "Iteration 128 log likelihood (log P(W|Z)): -2.88483e+07 \n", + "Iteration 129 log likelihood (log P(W|Z)): -2.88485e+07 \n", + "Iteration 130 log likelihood (log P(W|Z)): -2.88463e+07 \n", + "Iteration 131 log likelihood (log P(W|Z)): -2.88444e+07 \n", + "Iteration 132 log likelihood (log P(W|Z)): -2.8841e+07 \n", + "Iteration 133 log likelihood (log P(W|Z)): -2.88389e+07 \n", + "Iteration 134 log likelihood (log P(W|Z)): -2.8839e+07 \n", + "Iteration 135 log likelihood (log P(W|Z)): -2.88364e+07 \n", + "Iteration 136 log likelihood (log P(W|Z)): -2.88347e+07 \n", + "Iteration 137 log likelihood (log P(W|Z)): -2.8835e+07 \n", + "Iteration 138 log likelihood (log P(W|Z)): -2.88348e+07 \n", + "Iteration 139 log likelihood (log P(W|Z)): -2.8833e+07 \n", + "Iteration 140 log likelihood (log P(W|Z)): -2.88309e+07 \n", + "Iteration 141 log likelihood (log P(W|Z)): -2.8828e+07 \n", + "Iteration 142 log likelihood (log P(W|Z)): -2.8827e+07 \n", + "Iteration 143 log likelihood (log P(W|Z)): -2.88244e+07 \n", + "Iteration 144 log likelihood (log P(W|Z)): -2.88224e+07 \n", + "Iteration 145 log likelihood (log P(W|Z)): -2.88207e+07 \n", + "Iteration 146 log likelihood (log P(W|Z)): 
-2.88156e+07 \n", + "Iteration 147 log likelihood (log P(W|Z)): -2.88159e+07 \n", + "Iteration 148 log likelihood (log P(W|Z)): -2.88156e+07 \n", + "Iteration 149 log likelihood (log P(W|Z)): -2.88144e+07 \n", + "Iteration 150 log likelihood (log P(W|Z)): -2.88137e+07 \n", + "Iteration 151 log likelihood (log P(W|Z)): -2.88135e+07 \n", + "Iteration 152 log likelihood (log P(W|Z)): -2.8813e+07 \n", + "Iteration 153 log likelihood (log P(W|Z)): -2.88128e+07 \n", + "Iteration 154 log likelihood (log P(W|Z)): -2.88114e+07 \n", + "Iteration 155 log likelihood (log P(W|Z)): -2.88099e+07 \n", + "Iteration 156 log likelihood (log P(W|Z)): -2.88091e+07 \n", + "Iteration 157 log likelihood (log P(W|Z)): -2.88062e+07 \n", + "Iteration 158 log likelihood (log P(W|Z)): -2.88021e+07 \n", + "Iteration 159 log likelihood (log P(W|Z)): -2.88032e+07 \n", + "Iteration 160 log likelihood (log P(W|Z)): -2.88007e+07 \n", + "Iteration 161 log likelihood (log P(W|Z)): -2.88005e+07 \n", + "Iteration 162 log likelihood (log P(W|Z)): -2.87996e+07 \n", + "Iteration 163 log likelihood (log P(W|Z)): -2.87982e+07 \n", + "Iteration 164 log likelihood (log P(W|Z)): -2.87974e+07 \n", + "Iteration 165 log likelihood (log P(W|Z)): -2.87959e+07 \n", + "Iteration 166 log likelihood (log P(W|Z)): -2.8795e+07 \n", + "Iteration 167 log likelihood (log P(W|Z)): -2.87936e+07 \n", + "Iteration 168 log likelihood (log P(W|Z)): -2.87928e+07 \n", + "Iteration 169 log likelihood (log P(W|Z)): -2.87938e+07 \n", + "Iteration 170 log likelihood (log P(W|Z)): -2.87925e+07 \n", + "Iteration 171 log likelihood (log P(W|Z)): -2.87933e+07 \n", + "Iteration 172 log likelihood (log P(W|Z)): -2.87899e+07 \n", + "Iteration 173 log likelihood (log P(W|Z)): -2.8791e+07 \n", + "Iteration 174 log likelihood (log P(W|Z)): -2.8792e+07 \n", + "Iteration 175 log likelihood (log P(W|Z)): -2.87905e+07 \n", + "Iteration 176 log likelihood (log P(W|Z)): -2.8789e+07 \n", + "Iteration 177 log likelihood (log P(W|Z)): -2.87893e+07 \n", + 
"Iteration 178 log likelihood (log P(W|Z)): -2.87886e+07 \n", + "Iteration 179 log likelihood (log P(W|Z)): -2.87889e+07 \n", + "Iteration 180 log likelihood (log P(W|Z)): -2.87903e+07 \n", + "Iteration 181 log likelihood (log P(W|Z)): -2.87883e+07 \n", + "Iteration 182 log likelihood (log P(W|Z)): -2.87883e+07 \n", + " Found convergence after 182 iterations!\n", + "1530029871: [info] Finished maximum iterations, or found convergence! (/tmp/pip-req-build-s8007td9/deps/meta/src/topics/lda_gibbs.cpp:77)\n" + ] + } + ], + "source": [ + "model = metapy.topics.LDAParallelGibbs(docs=dset, num_topics=10, alpha=0.1, beta=0.1)\n", + "model.run(num_iters=1000)\n", + "model.save('lda-pgibbs-nips')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the above converges, it will save the results to disk. We can load the results into memory for inspection by loading an instance of the `TopicModel` class:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Loading topic term probabilities: [==> ] 10% ETA 00:00:00 \r", + " > Loading topic term probabilities: [=======================] 100% ETA 00:00:00 \n", + " \r", + " > Loading document topic probabilities: [> ] 0% ETA 00:00:00 \r", + " > Loading document topic probabilities: [===================] 100% ETA 00:00:00 \n" + ] + } + ], + "source": [ + "model = metapy.topics.TopicModel('lda-pgibbs-nips')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What do the topics discussed in NIPS over the last two decades roughly look like?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 1:\n", + "model: 0.047845971473469015\n", + "item: 0.036164145122550555\n", + "topic: 0.03582248886330763\n", + "document: 0.0301900041003383\n", + "latent: 0.029632168260620255\n", + "word: 0.02883241321447034\n", + "user: 0.0262748217085278\n", + "languag: 0.021573178212931057\n", + "lda: 0.014349119364181297\n", + "dirichlet: 0.013994334038283054\n", + "======\n", + "\n", + "Topic 2:\n", + "neuron: 0.10418809366208093\n", + "spike: 0.07989410305538147\n", + "stimulus: 0.03402944466933788\n", + "respons: 0.027507819831649183\n", + "cell: 0.024650823093126214\n", + "signal: 0.023464992947839394\n", + "brain: 0.021329972080580163\n", + "time: 0.017871200820055985\n", + "fire: 0.017004659525601817\n", + "stimuli: 0.016437347395342597\n", + "======\n", + "\n", + "Topic 3:\n", + "polici: 0.18494968655605182\n", + "action: 0.09615364214854322\n", + "reward: 0.08377584211666766\n", + "agent: 0.0808785827262104\n", + "game: 0.05217837257413512\n", + "state: 0.04843198689741257\n", + "reinforc: 0.04066073218825473\n", + "trajectori: 0.03331838788594098\n", + "mdp: 0.02327701848342051\n", + "player: 0.022277731661894288\n", + "======\n", + "\n", + "Topic 4:\n", + "kernel: 0.058391015341538656\n", + "label: 0.05470488694787862\n", + "classifi: 0.04543920685948777\n", + "classif: 0.031377691511149046\n", + "featur: 0.025877748375414726\n", + "train: 0.02553568880708937\n", + "svm: 0.022467854412508627\n", + "loss: 0.02242496964410883\n", + "data: 0.01836611529415411\n", + "class: 0.015976898254645416\n", + "======\n", + "\n", + "Topic 5:\n", + "posterior: 0.043363904458780314\n", + "estim: 0.039974266551518416\n", + "distribut: 0.037820339074187546\n", + "gaussian: 0.03719766917650302\n", + "model: 0.026377288657187144\n", + "densiti: 0.026100306065981422\n", + "log: 
0.025795145745510784\n", + "bayesian: 0.025720736258683725\n", + "likelihood: 0.02534425739469013\n", + "infer: 0.023800936808848146\n", + "======\n", + "\n", + "Topic 6:\n", + "imag: 0.1848253958028947\n", + "featur: 0.03939641238659535\n", + "pixel: 0.03318427554113301\n", + "object: 0.03283463933643374\n", + "video: 0.025197762764204847\n", + "detect: 0.024957985815931668\n", + "patch: 0.024620771066662606\n", + "visual: 0.02064621098389648\n", + "recognit: 0.02001959511406888\n", + "segment: 0.019845380852013594\n", + "======\n", + "\n", + "Topic 7:\n", + "network: 0.07910535955554107\n", + "layer: 0.07722388990594499\n", + "train: 0.06835261643490363\n", + "deep: 0.055934507818227536\n", + "arxiv: 0.040912721761014216\n", + "gan: 0.036840912548190934\n", + "imag: 0.02695400262646877\n", + "neural: 0.02637093023830018\n", + "adversari: 0.026152695140267815\n", + "convolut: 0.025571579176695725\n", + "======\n", + "\n", + "Topic 8:\n", + "regret: 0.06361698186025525\n", + "bound: 0.058111744797171745\n", + "algorithm: 0.03820927485218057\n", + "theorem: 0.033408446197696236\n", + "arm: 0.0277968207933802\n", + "bandit: 0.027590683375529772\n", + "xt: 0.024684500137442878\n", + "loss: 0.022575873415732153\n", + "lemma: 0.021536924307693606\n", + "submodular: 0.0213692020268514\n", + "======\n", + "\n", + "Topic 9:\n", + "convex: 0.040845864136262344\n", + "matrix: 0.040525229678109884\n", + "norm: 0.025644169130572214\n", + "gradient: 0.022916528479412043\n", + "theorem: 0.019813100486113736\n", + "converg: 0.018802910828488756\n", + "algorithm: 0.01688950995760653\n", + "xk: 0.016387059495701076\n", + "spars: 0.016093131882315426\n", + "descent: 0.013754336933754541\n", + "======\n", + "\n", + "Topic 10:\n", + "cluster: 0.08828989453659127\n", + "node: 0.08591088457917238\n", + "graph: 0.08147474857171028\n", + "tree: 0.04597631491614074\n", + "edg: 0.04420999696187278\n", + "algorithm: 0.01969405292165792\n", + "hash: 0.01941101971247532\n", + "partit: 
0.016920807850431176\n", + "xi: 0.014332616516548316\n", + "network: 0.012698787363157476\n", + "======\n", + "\n" + ] + } + ], + "source": [ + "for topic in range(0, model.num_topics()):\n", + " print(\"Topic {}:\".format(topic + 1))\n", + " for tid, val in model.top_k(topic, 10, metapy.topics.BLTermScorer(model)):\n", + " print(\"{}: {}\".format(fidx.term_text(tid), val))\n", + " print(\"======\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 3: Text Mining using Topic Models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Topics over Time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An interesting \"mining\" question to ask on top of this is whether or not the topics used in NIPS have changed over time. Are certain topics exhibited only in the earlier years, or vice-versa?\n", + "\n", + "To do this, let's take a look at the other output of LDA---the topic proportion vectors associated with each document. Since each document also has a label in our dataset, we can create plots for each topic to see the number of documents that mention a specific topic in a specific year, and to what degree.\n", + "\n", + "We'll start by creating a simple dataset with `pandas`:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "data = []\n", + "for doc in dset:\n", + " proportions = model.topic_distribution(doc.id)\n", + " data.append([dset.label(doc)] + [proportions.probability(i) for i in range(0, model.num_topics())])\n", + "df = pd.DataFrame(data, columns=['label'] + [\"Topic {}\".format(i + 1) for i in range(0, model.num_topics())])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's plot the results. There are a lot of ways to do this, but here I'm going to use a \"swarm plot\" so we can see where each and every document falls." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 1\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEKCAYAAAD9xUlFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzs3Xd4VNX28PHvmZLeAwkhHRI60nsLvXgBG4LYEET02lH5XcvrRdQr9gI2EBFRkKICKk2a9N57Aqmk955p5/3jhAlDAgRIMjPJ/jxPHjn7nJks8mDWnLP3XkuSZRlBEARBAFBZOwBBEATBdoikIAiCIJiJpCAIgiCYiaQgCIIgmImkIAiCIJiJpCAIgiCYiaQgCIIgmImkIAiCIJiJpCAIgiCYaawdwM1q1KiRHBYWZu0wBEEQ7MqhQ4cyZVlufKPr7C4phIWFcfDgQWuHIQiCYFckSYqvznXi8ZEgCIJgJpKCIAiCYCaSgiAIgmBWa0lBkqTvJUlKlyTp5DXOS5IkfSFJUowkScclSepcW7EIgiAI1VObdwo/ACOuc34kEFn+9QTwdS3GIgiCIFRDrSUFWZa3A9nXuWQs8KOs2At4SZIUUFvxCIIgCDdmzTmFQCDxiuOk8jFBEATBSuxiolmSpCckSTooSdLBjIwMa4cjCIJQb1kzKVwCgq84Diofq0SW5XmyLHeVZblr48Y33JAnCEIVUotS+e/u//LoukdZdGoRJtlk7ZAEG2TNHc1rgGckSfoF6AHkybKcYsV4BKFee3rz05zPOQ/A4fTD6E16Hm//uJWjEmxNbS5JXQrsAVpKkpQkSdIUSZKelCTpyfJL1gIXgRhgPvDv2opFsGPHlsGvj8OOj0FXZO1o7FZ8frw5IVy2KX6TlaIRbFmt3SnIsvzADc7LwNO19f2FemD3HNj4RsVx4n6YuMx68dgxXydfnDXOlBhKzGNB7kFWjEiwVXYx0Sw0UEd+sjw+vx6KMq0Ti51zc3BjRrcZOKodAQhyC+LZTs9aOSrBFtldlVShAXHxtTzWuoDW2Tqx1AP3tbiPoaFDSSlKIdIrErVKbe2QBBsk7hQE2zXoDXBwKz+QYOBr4OBq1ZDsnaejJ618WomEIFyTuFMQbFdob3jhBCTsgcatwLe5tSMShHpPJAXBtrn4QKs7rR2FIDQY4vGRIAiCYCbuFGpYdpGOv06k4KRRcecdAbg4iB+xIAj2Q/zGqkGpeaX8a85OMgvLAPhuRyyrn+mDk1ZM6gmCYB/E46MatOxAojkhAJxLK2DTmTQrRiQIgnBzRFKoQSZZrjRmNFUeEwRBsFUiKdSgcV2D8HLRmo/DfF0Y2sbfihHVA/kpcHw5pJ6wdiT1SomhhJ/P/Mzs/bM5kHrA2uEINkSSq/h0a8u6du0qHzx40NphXFNcZhFfbInGxUHN9KEt8XF1sHZI9uvCVlgyHozlj+QGvgEDXrFuTDauzFjG+tj15JTmMDRsKIFuVfetmvb3NHYn7zYffzjgQ0aEXa97rmDvJEk6JMty1xtdJyaaa1B2kY5JC/cTl1UMwLnUApZM7YlWLW7Ibsm22RUJAWDHR9DzKXB0u/ZrGjBZlnl8w+MczTgKwFfHvmLxyMV4OnryycFPiM6NpnfT3twTeY9FQgBYfm65SAoCIJJCjVp2INGcEAAOxOWw5Ww6w9s2sWJUdqyswPLYUKZ8iaRQpUNph8wJAZRHRL+c+4XTWac5nXUagJjcGAp1hagkl
UWTHReNS53HK9gm8RG2BhWU6qsYM1ghknqi62OWx23GgKtv1dcKVSo1lJoTwmUH0g5wf4v7zcfOGmfRbEcwE3cKNeiezoF8vyuWUr3yCayxu6OYaL4d3aeCewDE/A1+baDLJGtHZNM6+3emQ+MOHMs4Bii/7B9q8xC7k3eTXZptvq65Z3Ne7/k6I8NHklCQQHpROq/vfB0HtQPT7pjGiHDxGKkhExPNNexcagHLDiTipFXxYM9QAr1Eqefbkh0LF7eBX2sI6WntaGxeqaGU9XHKRPOwsGEEugXyT+I/vLHrDXLLcglxD+HO8DsJdA9kaOhQjqYfZdqmaebXqyQVv435jeZeovhgfVPdiWaRFGqRySSzP075hNY9zAeVSrJyRHbm/Ab4ZSKYyh/B9Z0OQ/5r3ZjslM6o43TWaZ7f8jzZZcq/yUjvSHoG9GTx6cUW177a/VUmtp5ojTCFWlTdpCDmFGpJqd7Ivd/sZsK8vUyYt5dx3+6hVG+0dlj2ZfuHFQkBYM9cKM23Xjx2zEHtwJ7kPeaEABCdE01VHwpb+7auy9AEGyPmFGrJmmPJHEnINR8fis/hr+Mp3NtF9MW9pl2fw4EFSmOdga+CvtTyvMlgmSSE69IZdfxw6gcOpx3mjsZ3UHbl8t5y7Rq1Y0LLCfwa/SsalYbJ7SbTya+TFaIVbIVICrUku0hXrTGh3Ok18PebFccrJkHUa5B2xU7m9uOU/gqleZCbAI1bg1r8E76W2ftns+L8CgB2Je9iUPAgPBw8yNcpd1uhHqEMChnEnc3u5KWuL6GSVDioxWbLhk78H3UbVhxMZMOpNMJ8XXgyqjmN3ByRZZnjSXm0buKOi4OaYp3yyMjVQc3I9mK/wjXF/mN5bDKAuz88/DtEb1ImmjtMUEpe/PE86IvBMxgeXAl+rawTs41bF7vO4nh70nbW3rOWtbFrcdI4Mbr5aJw1ykIIJ42TNUIUbJBICrdo8d54/t+qk+bjfbHZ/PJETx5asM/82KhvhC/B3i5IKolHeoUS5C02CF1TQIfKY8XZsPMzZRNb18fAZIS1LysJASAvETb9FyYuq9tY7YS/iz+FeYXmYz8XP7YkbuHv+L9p5NyIzn6d8fD1sGKEgi0SSeEWrT5yyeL4xKU8vtoaYzGPsDMmi0WTmzOgReO6Ds/+dJgISQfg6BLQOCt7FDbNBLl8cn7be+Bc/ujoStmxdR6qvZjRbQYvbnuRYkMxzhpnooKjmL1/tvn88YzjbLhvg/luQRBArD66Zf4elrfbDmoVBWWVJ0FT80rqKiT7ptbAmDnwahLMuAiNW1UkhMvSTkLgVSvq2oypuxjtTEe/jjzX6Tnui7yPxSMXk1Zs2dsjpyyHo+lHr/FqoaESdwq36IUhkeyPyyajoAyVBC8MjaRfRGN+3pdg7qHg7qhhUCuxo/mmaMs/tTbtWPlc45ZKsnDygLJCaDEM+rxYt/HZCYPJwKT1kziTfQaAdXHrKhW8U0kqQj1CrRGeYMNEUrhFkf7u7JgxkCMJuQT7OJvnCxZP7s5P++Jx0qh5vF8zGrs7WjlSO9W4JYz6CLa+C7oi5Y5g+0dQUr7OvlkU9HsZJLEhsCp7U/aaEwJAkb4ISZLo5NeJI+lHcFA58GynZ2nq1tSKUQq2SCSF2+CkVdOruWWBtt4RjWjZxB2NWoWns/YarxRuqCQHJBUMfB3ajIUdn1QkBFBKXyTsgdDeVgvRlqmqeDLspHbi0baP0smvE4OCB9HBr4rJfaHBE0mhBumNJqYvP8afx5PRqCQm9wnn1VFid+hNK86GbwdAXoJyvOsLCOtT+Tq9mK+5lh4BPWjn246TWcoKOXetO8mFybyw9QUAFp1axJxBc+gX1M+aYQo2SCSFGvTb4ST+OJYMgN4o8+32iwxp40+3MB8rR2ZnTv5akRBA+bPr3aB2rGi607gVhA+wTnx2QK1Ss3DEQjbEbSBfl08X/y5M+HOC+bxRNvLDqR9EUrgNuoQEcpYtA
6MJr/vH4dismbVDqhG1mhQkSRoBfA6oge9kWZ591fkQYBHgVX7Nf2RZXlubMdWm6LTCSmPn0wpEUrhZVzR/MfMMgie2wfFl4OILnR8Wu5lvwEnjxNiIsQCkF6dXOm+q6ucsVIshI4O4cfdjzFOWSOeuWEGzNavRBlbd/tSe1NqSVEmS1MCXwEigDfCAJEltrrrsDWC5LMudgAnAV7UVT10Y2MrP4lirlugfKfYoVNulQ3Dwe2jaGdyvmAD1CIT294F/Gxj6FvR5Dpy9rRenHfJz8WNUs1HmY5Wk4tG2j1oxIvuWv2GjOSEAmIqKyPvLbj/PWqjNj1rdgRhZli8CSJL0CzAWuLINlAxc3lLpCSTXYjy1rk9EIz4a14FFu+Nw0qp4emAEwT5iF3O17PgENr+l/FlSwejPlWWnCXugLB92fgr9potkcBve7fMuUcFRxOfFMyB4AK18WlFmLGNb4jZMsomo4CicNc6kFqXy3YnvuFR4ieFhw7kr4i5rh25z1O6VW8JWNWaPajMpBAKJVxwnAT2uumYmsFGSpGcBV2BILcZTJ+7rEsR9ohLqzTHoYMfHFceyCfZ+DW3vhjNrlLGL2yDlKDz6h1VCtEcm2YRRNqJVKavg1Cq1xV6FYn0xD617iOicaEApkPfTqJ+YunEqcflxAOy8tBNZlrk78u46j9+WuQ8fjtOPiyk9dQoAx8gIPEaPtnJUNcPaO5ofAH6QZTkIGAUsliSpUkySJD0hSdJBSZIOZmRk1HmQQi2TjWC4qqyzvhhOrLAci90OhZWfjQuV/Xr+V6KWRdH95+68uetN9Calf3iBroCYnBhMsomN8RvNCQEgPj+eBScWmBPCZWtj68djkZqkcnIiZOH3eE96FO+HHybkxx9Ru4k7hRu5BARfcRxUPnalKcAIAFmW90iS5AQ0Aiz+z5dleR4wD5TOa7UVsGAlWmdl4vjg9xVj3Z+A6I2Qeb5izNEDHN3rPj47k1iQyKy9s8wTyb/H/E5Ln5a4aFx4b/97lBhKCPMIY1T4qEqv1Uga1JIa4xUlRpq4iuq+VzMWFhE3fgK6WKX2VuGWLYT/uhK1l5eVI7t9tZkUDgCRkiSFoySDCcDVPf4SgMHAD5IktQacAHEr0BCN+giCe0DKMWWpacsRENILUo4rm9ZUWhg6q6IMhnBNZ7LOVFpZdCz9GNuStlFiUPZ2xOXHcS77HD5OPmSXKpsCPRw8GN9qPE4aJ7469hUm2USgWyBP3PFEnf8dbF3BhvXmhACgv3SJvDV/4PPIw1aMqmbUWlKQZdkgSdIzwAaU5abfy7J8SpKkWcBBWZbXAC8B8yVJehFl0nmSbG9No4WaoVIr/RI6VKylJ7AzvHgKkg+Db6TSX0G4oY5+HdGoNBiu6FIX6R3JujjL/gopxSks+9cyfo/+HaNs5O7Iu2ni2oRpHaYxNmIsqUWptGvUDo1KLP29mqzXVzFWP5poSfb2O7hr167ywYMHrR2GINi0LQlbmHNkDrlludwdcTfPdHyGe/+4l5jcGPM1z3Z6VtwF3CJDTg6xY+/CkK486VZ7exO+ehVaP78bvNJ6JEk6JMty1xteJ5KCIDQMKYUpzD06l7i8OAaGDOSxto+hVqmtHZbdMmRkkLtqFRhNeN59F1p/276TFUlBEARBMKtuUrD2ktR6p8xgZNPpNHZGZ2Iy2VfCFeqXEkMJifmJ2NsHP8G6xAxSDcop0nHv17u5mFkEQPdwH5Y83gONWuReoW5tiNvAW7vfokBfQDPPZswdNJdgj+Abv1CoNmNuLnl//AmyCY9//QuNT/2ocSZ+W9WgZQcTzQkBYH9sNlvOis1WQt0qM5bx1h4lIQBczLvIp4c/RW/U83v073x66FPRhvM2GfPzib3nXtLefZe0/72nTDpnZVk7rBohkkINyi+pvEwtv7Ry32ZBqE1ZJVkU6AosxmLzYpmxfQZv7n6T709+zyPrHmFj3EYrRWj/8tetR59cUarNkJFB3pr6U
YJFJIUadHenQBw1FT/SRm6ODG1t2ysShPqnqVtTWni3sBjr6t+VTQmbzMcyMj+f+bmuQ6s/quoCW086w4qkUIMi/d1Z9XQfJvUOY9qAZqx6ujeeLqIlp1D35gyaw8iwkbT0bsnU9lOZ3G4yqqvKijmoHawUnf3zGDkSbXDFHI3G3x/PMWOsGFHNEUtSBaGBeHvP2yw/vxwArUrLV0O+omdATytHZb+M+fnkr12LbDTiMWoUGm/bLute3SWpYvXRbTiVnIdaJdGqiceNLxZqTm4ipJ2EoO7g6mvtaOzGGz3fYEjoEOLy4+gb2Jdgd7Ea6XaYioowpGcgyyZMRUVg40mhusSdwi0o1RuZsugAu2KU1QaDW/nx7cNdxNLT23VxG8TvhqBuEDm06msOL4Y/nlfKbWucYcJPEGH3bTgEO2PIzubi6DEYy1ccqTw9abZ6FdomtltRVmxeq0V/HEs2JwSAzWfT2Xg6rcprT17KY+6WaDacShWb2a5n91z4cSz88z78fB9sfU8ZL8mFc+shOxZMRvj7TSUhABhKYNNMq4UsNFz569ebEwKAKS+PvD/qx+oj8fjoFqTklVYaS8op5vudsRxLyqVHuC8TugWz/lQqTy85zOWbsQe6h/DePe3rOFo7sXuO5fGeL5US2kvuB10BIMGQt6A0z/K6ovqxNtwaFpxYwA+nfkAlqZjSbgqPtH3E2iHZDZVT5RLuKuf60XpX3CncgpHtmuBwxaMiJ62KY4l5zPrzNKuPJvPa7yf4YMM55m2/yJVP55YfTCS3uH6U161xVxdmk1Sw9d3yhAAgK3cRba5a4dHx6hYdwmV6k57UotQqz+1O3s1nhz8jtyyX7NJsPjz4IYfTDtdxhPbLY8RwHFtULPt1CAvDc0z9aMcp7hRuQaS/Oz9P7cEPu+JQqSQe7RXKA/P3Wlyz4mAiwT7145NDnej7Iqx9+YrjF+D4Mstr9EUw7F0I6Kj0aw4fAJ0frds47cSuS7t4fefrZJVmEeEVwWcDPyPUI9R8/lj6sUqvOZJ+hM7+nesyTLulcnEhbOUKCrdsBZMRt4EDUTnXjwZQIincom5hPnQLU2qdyLKMp7OWzMKKuwBvVwem9W/G00sOc3kqYUK3YLxcxNrwKnWfqvyyj98FQV0hrK9y9/D3mxXXRA4Dz0AlYQjXZDQZeXPXm2SVKo/WYnJj+ODAB3w5+EsOpx0mLj+uypVHHf061nWodk3l4IDHiOHWDqPGiaRwG347nMSaY8kEeDozbUBzZq87i9Ek46BWMWN4S4a1bcIfz/Zl27kMWvi7M7iV7TbgsAnB3ZSvy/o8D66NIfpv8GsDPZ+yXmx2JE+XR3qJZc2tC7kXeH//+/x05icAHNWO3Bt5L5sSNqFCxZT2U+ji38Ua4dYruvh4Ss+fx6VLF7stkCeSwi367XAS05dX3IKH+rqwZfoAFuyKZX9sNvN3XMRBoyKqpR9tm3paMVI713GimDe4ST5OPrT1bcuprFPmsW7+3Vh6dqn5uMxYRkpRCjsn7LRGiPVS9o+LSXvvPZBlJCcngr/5Gtee9rc5UEw036I/jiVbHMdnFbP1XDo/7onnbGoBB+JymPrjQRKzi60UoZ1LOgixO5RlqMJN+zTqU4aGDiXMI4yJrSYyud1kjLLlz7LEUGKl6OxP2cVY0j//nKwF32PMza103lRWRsbnn3N5ZYlcWkrGZ5/XdZg1Qtwp3KImnpaTSioJzqZYVqbUG2W2R2fwYI9QhGoyGWHpBIgur+Dp3x4e+wucxN3WzQhwC+CTqE8sxqKCotiWtM18PKHlhDqOyj6VRUcTe/945BIlieYuX074mtUYc3PJXbkSTDLuw4ZiKrb8AGjMybFGuLdNJIVb9PTA5uy+kEl8VjEqCZ4ZFEmglxO/HEy0uK6Fv7uVIrRTMZsqEgJA2gk4/CP0ftZ6MdUTH0V9xKroVcTlxzEweCDdA7pbOyS7kLtypTkhgDJvkL92Hekff4wxMxOAn
J9+wrV/P4r+2W6+zvPee+s81pogksItCvJ2YfP0ARxLyqOJpxOBXs7ojSZ2xmTx5/FkNCqJyX3DzSuUhGoqyqjemHDTHNWOjG813tph2B3JwbHSWMmJ4+aEAGDMy8OlY0dcu3en9Ow53Pr2wXPs2LoMs8aIpHAbNGoVXUIrimBp1SrmPNCJmaPboFGr8HQWZbNvWouR4OwNJeW33iottB9X+Tp9iVIHKSsGWo2CZlF1GaXQgHg/MIHc3383JwHnTp1watO20nUqN3d8Hn6orsOrcaIgnmB7MqNh79dQlg/FOZByRFmSOupD8GutXPPzOMvHTPcugPb3WSdeG6U36TmYehAPRw/a+lb+JQaQUpjC9qTtBLsH06tpLySpnnSKqWHGvDwKNm1G5eaG+6CByHo9cePHUxYdA4A2NITw5ctRe9ru3Fd1C+KJpCDYrjXPweFFFce+kfDMAchNgM/vsLw2pDdMXle38dmwzJJMJq2fRHx+PAAjw0bywYAPLK45nHaYJ/5+gjJjGQBjm4/lnb7v1Hms9spUWkrhli3IRhPugwehcrHtCgaiSqqV6AwmtpxNY3dMpqiKeiMJ++CXB8s/9W+qfD5uh+VxVjQUpIDWGaSraiU52Pb/kHVtyZkl5oQAsC5uHUfTj1pcs/DkQnNCAFhzYQ3JhZZLrYVrM+bkUBYTQ9mFGAxZ9acwo5hTqEG5xTru+Xo3FzOKAOjZzIefpvQQfRaqkpsIP44BQ3nF2ZhN8PhmCLyi9k5AR8i+WHHs3hTc/JXyF92fgH1fK+MaZ+h3Rd0kgezS7EpjiQWJbE/aTnx+PANDBqIzWRZnlJHRm/R1FaJdM2RlEXvvfRizlZ9zzpKlSj+FgAArR3b7xG+rGrTsQKI5IQDsvZjN1nNi5UyVzq+vSAgAsgnOXFWPfvj/ILSv8mfvcLj3O6UJz5c9lcdKEUPgX5/Cc0cgtFfdxW4HRjcfbdGTubFzY5afW878E/PZGL+RV3e8SpBbkMU1fQP7WhTNE64tf/16c0IAMOXnk/fnn1aMqOaIO4UalFtS+VOWKJV9DV5V/PLxvmrMI0DZuKYvAY2T8t9PWkNp+Y7SmE3QpL1ynWChi38X5g+dz6qYVXg4ejAkZAiPbXjM4pqz2WdZPHIxmxM2E+wezOjm9aP0c00w5ucjqdWoXF2rPF9V7wTRT0Go5J5OgThqKn6kjdwcGNbGdtvzWVXEEGh/f8VxSE84tRo+aaNMMJcVKuNHl8Cqf8P2DyH5SEVCuCzBsmS5UMHVwZVg92DaN2pPuGc4WpXlEmkfZx/cHZTNlRnFGeSV5VX1Ng2KbDCQ/NrrnO/Vm/O9epP+yadVXucxYjiOLVuaj+tTPwWx+qiGnUrOY9mBRJy0ah7uGSp6KtxIdiwY9bDiUUg/XTHedYpy53Bl6ezmg+HSQcvua72fg2Fv1128dmJb4jae3/o8JtkEwIiwEbT0ackXh79ARsbT0ZNZvWfx6o5XKTYo5Rn8XPxYNXaVOVE0RLmrVpHyn1ctxkJ/Woxzly6UnTmD2tcXrb8/ACadjsKt2yr6KTg5YczNJeeXZRgyM/Ec/S+cO3Swwt+iatVdfVSrj48kSRoBfA6oge9kWZ5dxTX3AzMBGTgmy7Jdl8Rs29STWWNtd62yzfEJh4JUy4QAcGELaK7aSXphM4xbBFvehpx4pQvbgP+ru1jtyOLTi80JAWB93Hpe6voSfZv25UTmCUaGj2ThqYXmhACQXpzO1sStjGk+pqq3bBDKzkdXGis+coTUt9+h7Nw5UKnwfWIqfi+8QOmpU+SvWwdGI2pfX1y6dCH+4YfNexdyli4lZOH3uHa3r3IitZYUJElSA18CQ4Ek4IAkSWtkWT59xTWRwKtAH1mWcyRJsvuGA+JO4Ra4NAL3AGW56WVN2iu7mjPOVoxpXZVGO23vqvsY7czVm9AkJDbGbWTu0bmUGEr48fSPRAVHVXqds6Z+dA+7VW79+pL9/fcVA2o1urh4J
SEAmExkffMtLj16kvTkk8hlypLegm3bCHhrpjkhAGA0krtipd0lhdqcU+gOxMiyfFGWZR3wC3B1MZCpwJeyLOcAyLKcjh2LTivgnq928+OeeOZtv8jdX+0ir1gs8bshtQbu/gY8gpTjpp2VlUeD3gDzowyp/Fgk2eqY3HYyGqniM9+oZqPMCQEgLj+OhPwEAt0Czde0b9SeqKCoug7Vprj26kWTt2fh2KIFTu3bE/TF55gKCytdV7B+nTkhAKDXU3L8eKXrVG5VT1Tbstp8fBQIXFkyNAnocdU1LQAkSdqF8ohppizL669+I0mSngCeAAgJCamVYGvC70cuUWaouGXPLNSx8XQq47pWbn0oXKVZFLxwXJkvcCkvIugVDC+eVCaT/VqBd5gVA7QvvQN78+vYX9mRtINQj1BC3EP46+JfFtekFqfy25jf+CfpHxzVjvQL6ldpMroh8h43Du9xFfW2ZJ2Ogg0bzMeaxo1x6dad3GXLLV7n3KEjxuwcCjYq5VfU3t74Pmp/PcStvSRVA0QCUUAQsF2SpPayLFssMZFleR4wD5SJ5roOsro8qiiAJ3oy3wSVuiIhABjKYOenytJTv9YwZCZ4BlkrOrthNBk5nXWaJq5NeLSt8ktJlmUivCKIya14vDE4ZDAuWhdGho+0Vqh2wWPkSGSdjrzVq9E0bozvtCdxCAkmf906CjdvBsC1d2887hyF511jKd63H0NmJm4D+qN2t79J+9pMCpeAKz8iB5WPXSkJ2CfLsh6IlSTpPEqSOFCLcdWaCd2CWX4w0WJH88CWja0clR3bPAv2zFX+nHZSKZQ37R/rxmTjkgqSmLpxKkmFSWgkDc92fpbJ7SZTaixlfMvxbIrfRJG+iEEhg5jUdhK7k3eTX5ZPv6B+uGrt71FHXfEcO7ZSKezgL+dSFhODbDTh1LKFedy159UPROxLbSaFA0CkJEnhKMlgAnD1yqJVwAPAQkmSGqE8TrqInfJycWD98/3ZEZ2Bk1ZNr2a+qFQSZ1LyWX00mUZuDtzfLRgPJ3GLXi3nripwl3IU8lPEZrXr+ObYNyQVJgFgkA3MOTKHvoF9eXbzsyQXKXWNujXpxmNtH+OZLc+wK3kXAH7Ofvw06icC3MTP9mY4RkRYO4QaV2sTzbIsG4BngA3AGWC5LMunJEmaJUnS5TVvG4AsSZJOA1uBV2RZtqvKUglZxRTrDOZjB42Kwa396RPRCJVK4khCDmPn7uKbfy7wzl9nuP+bPRhFobzqadTC8tjFV/kSrimlKMXi2GAy8MuZX8wJAeBA6gEWnV5kTggA6SXpLD23tM7iFGzXLd0pSJIUKcty5QW9V5FleS2w9qqxN6+xd/vaAAAgAElEQVT4swxML/+yK8m5JUz+4QBnUwtwc9Qwa2xb7ukcxNaz6fy4Jw5nBzVP9G/Okn0J6IwVk89nUwvYdzGL3hGNrBe8vRj2jtJEJysanLxg9BegEXM01zM8bDj7U/ebj8M9w3HSOFW6rqrdy0W6okpjQsNzq4+PNgO2uwyoDny08RxnUwsAKCwz8Maqk/h5ODJl0QEu3whsO5fBqPaVb8cdtepKY0IVGkUo/RNyYpV9DNqGvYa+Ou5veT8qScXGuI0Eugcy7Y5p5OvyWX5+ublMtr+LP5PbTWZTwiYSC5QFghqVhnsi77Fm6DbFkJODpFLZdNOc2nLNpCBJ0ifXOgU0vJ/UVS5kWH6qKtYZWXUkmSufDBXrjIT6uODloiW3fL/CgBaNLVp4CuUyo+Gv6ZB6ApoNhDs/VlYiFaRC0kFl9VGT9taO0i7c1+I+ejTpwZKzS/j2+LfcF3kfP4/6md+ifyOxIBF/F39icmNYPHIxy88vJ78snzHNx9Dat7W1Q7c62Wgk5fU3yFuzBlQqvCc+QJPXXrN2WHXqencKTwAzgLIqzjX4HVlDWvlxLLFi5WyYrwvtAj1Yecjyuk4h3mzpGcqmM2k0cnNgQAu737RdO1ZMUlYYAZz6DdRa6
PAALBkPlxvBDHwDBrxitRDtRW5pLg+ufZCcMqXP9ZqYNfzyr19IKUphxyWlcdHK6JV82P9DnurwlDVDtTn5a9eRt2qVcmAykfPjYtwGDMCtTx/0ycmoPDxR32BDmrGwCGNONg7B9rk/6XpJ4QBwRJblPVefkCRpZq1FZCeeimqOUZbZeCqNsEYuzBjeCn8PJzacTGPPRWWu/J7OgfSJ8EWSJO4XG9iurTi7IiFcFrtDqW90RWcwdnwEPZ8CR7e6jc/ObE3cak4IADqTjuXnlrM1cavFdUvPLmVE+Ii6Ds+mlV2IqTRWeuIEmV99TcmhQ0hOTvhNfxGfRx4h99ffyJo3D9lkwuexSfhMnEjOsuWkvf8+cnExTm3bEvzN12ga29ey9OslhfuB4qpOyLLc4H/DadQqXhjSgheGWK6QWfpET86nFeCkURPiK0oyVIuzt9JEJye2YqxpJ8iJs7zOUKZ8iaRwXd5OlR9Pejt5o5bUGGWjecxR7VjpuobOrX9/sr75tmJArUaXmETJIeURgFxaStr7H6ANCSXl9dfNl6XNehuNvz9p77yDrFcepJSeOkXGl18SMHNmXf4Vbts1l6TKspwhy7JYjnALWvi7i4RwMyRJ6ap2eQlqSG8Y9QF0tWwKQ5sx4CqWpN5I38C+9Aio2EDV3LM5D7Z+kPtbVvSvcFA5MPWOqZQaSjGajFW9TYPk0rkzTT/8AKd27XDu2JGgOV9YdFgDwGikcOuWSq8t3LLVnBAu012MrXSdrbN2mQtBUAR1VVYa6UtBW76EsvtUZdVRzN/g1wa6TLJqiPZCo9Iwf+h8DqUdosxYRveA7mhVWl7r8RpDQ4cSlx9HN/9ufHn0Sx7f+DieDp681PUlxkZcXa+yYfIcPRrP0RUNcwyZmRRurXj0pvb2xm3w4Eq1j9yiBlC0ezeGlIq9Im6DBtZ+wDVMNNkR7Et+ivK4SVt57b1QffOOz2POkTnmY42kYcN9G/BzEQshribLMtkLfyDvzz/QNvaj8QvP49S6NRlz5pK1cCEYjXg/9CD+r7xC2YULZHz2GbpLl/AYMRLfx6cgqWyjwWV1m+yIpCDYptxESDoAgV2UDmwFabDsQWXMyQtGfQR3jLvx+wgWivXFOGuceXHbi2xO2Gxx7ushX9M3sK+VIrNPsl4PsozkYPubKmus85okSeuBCZcrl0qS5A38JMvynbcfpiBU4eRv8NtUMBlAUsHYryBhj5IQQOnT/Mfz0GI4OHlYN1Y7kVqUyiv/vMLRjKOEuIcwIHiAxXlnjTPtG4l9IDdL0ta/OmbVua/xv7KUdXlDnKa1F5LQ4G1+S0kIALIJNs2E9DOW1+iLIDehzkOzV7P3z+ZoxlEAEgoS2BS3icntJuPn7Edrn9Z8PvBzPB0b/J5UgepNNJskSQqSZTkJQJKkBl3e4kZ2X8jk570JOGpVTO3XjNYB4pPsTSvJsTwuzYXIKZBUUdMHzxBll7NQLaezLHtgpxSn8GjbR3mg1QM4a5xFQhDMqpMU3gR2SZK0BaXERRQgtkFW4URSHg8v2G+ugvr3qTS2vBxFY3exHvymdH4EdldMgtLpYeg7HQylcOYPZU/D0LeUpjxCtXRv0p3VF1abjyO8Inhj5xvsuLQDjUrDY20f47nOz1kxQsFW3DApyLL8lyRJ3YFe5UMz7L2Xcm3583iyRVnsgjIDW86mMb6buLm6KUNmgW8kxO8CXSGknYb1/wdRr8LgN2/8egFZlvk1+ld2XtpJhFcET3d8GqNsZHfyblp4t6CNbxu+P6k0qDeYDMw/MZ+hoUNF/aPbYCorQy4pQe3lVTFWWoo+MRGHsDC7mX+4XkG8SFmWoyVJuqN86HLzmyaSJDWRZblyl+oGzt+j8jLJJp6isudNU6mgy6OQlwjbP1TGEnZB2imYXKmFt1CFBScX8PnhzwHYnLCZk5kn+WboN+bzM3fPrPSa2LxYkRRuQNbp0KdnoA1siiRJ5vGcpUtJ//gTTIWFu
EVFEfjxRxQfPsyll1/BlJeHxs+PoC+/xLl9OytGXz3Xu1P4DzAF+LKKczLQv1YismPjuwWz5lgyR8sL5Y1q34R+om/CrTu1yvI4YY+yNNXd3zrx2JE/LvxhcbwreRcZxRmcyjpFXF4crXxaWZx31jhb7IIWKivcvp3k/7yKMTsbh2bNCJo7F8dm4eiTk0l9+x0wKX1TCrdtI3PhQvJ++w1TntK3wpCeTtq77xL2i+03MrpmUpBleUr5f/vVXTj2zdVRw+//7s3xpDyctGpaNrG/pt02xStEabBzmZOn8iXcUCPnRlzMq+hs66Jx4bNDn7Hm4hpA2az2UOuHOJp+FHcHd57s8CS+zqKEyLXIBgPJr79uLnmhu3iRtPdnE/Ltt5TFxJgTwmVlZ85iSLbsgqeLj6+zeG9HdfYpOALTgL4odwg7gPmyLFdVUrtBKdEZORCXTaivC6G+SjldSZLoEOx1g1cK1TJ0FqSfhoIU0DjDyA/ETuZqeq7zczz191MU6AtQS2oeb/+4xQ5mg2wgOieapf+y/U+utsCYn48xI9NiTBdzAQDnTp1QubpiKqooFec2oD+yTkfRjh3mMfchQ+om2Nt0wx3NkiT9gtJT4afyoYmAsyzLE2o5tirZyo7ms6n5PDh/H1lFOiQJXhzSgucGR7LiYCKL9sThrFXz9MAIolqKsgG3xahXGu/4hCvlLYRqK9IXcSz9GM28miEhMWSl5S+lrv5dWThioZWisz+x94+n9HjFVKrH2DGYCgopO38ep1atMGRlYczJwfOusfhOm4YxN5eMzz+n9OQpXHv2oNEzz6Byst6HmhorcyFJ0mlZltvcaKyu2EpSeOqnQ6w7mWo+1qolvpjQiad+PmwxtuWlKIJ9RMVUwfpm/DODdXHrAFBJKj6N+pRBIYOsHJX90KekkP7hR5SeO4db3z4U7tqNLqai/4L3xIk0efP/WTHC66uxMhfAMUmSusmyfKD8jbsAR243QHuXUWD59ExvlNl8Jq3S2PboDB7sEVqXoQlClf7X739EBUcRnx/PgOABtPG1yuc6u6UNCCDwk48B0Kelk73oR4vzhbt2WiOsGledMhftgX2SJMVIkhQD7Ac6SJJ0RJKkwzd4bb11T+cgi+MOwV50C/epdF0LfzHZLNS81KJUzmaf5co7/WJ9MWezz6I3Vt0tV6PSMKrZKJ7q+NRNJwSdUcf2pO0cTmuw/8ublZ47j2wwoG5subLQqUWLa7zCvlTnTkEUWa/CxB4huDqq2XAqlVBfV57o1ww3Jw27YrL443gyGpXE5L7hdAurnCgE4XZ8fPBjFp1ahIxMa5/WzBs6j0Pph3h95+sU6Yto7NyYOYPn0Na37S29f3x+PNsStxHsHsyAoAHklOXwyLpHSCxIBKB/UH/mDpprsU6/ITDk5JA45XFKT58GtRr34cMpOXAAQ0YGjm1a4/d//7F2iDWiWqWzJUlqC1xemrpDluVTtRrVddjKnMK1GIwm/j6dhqujmv4txCSzULOic6K5Z809FmOPt3uc1RdWk1GSYR7r4t+FH0b8cN33SixIZObumRzLOEYnv07M6j2LxIJEntz0JHqTcrcxKnwUoR6hfH3sa4vXLhi2gO4B3WvmL2Un0j/+hKz58y3Gwv/4A423F5pGtr8fqSZLZz8D/Bu4vJNouSRJX8qy/NVtxljv5JXoGf/tHs6mFgDQv0VjFk7qhlrVsD5RCbUnpSil0lhiYaJFQgBIzE+84Xu9sfMNDqcrj4P2puzlv7v/i0alMScEgLWxaxnTfEyl1+aW5VYaq+/0l5IqjRlSU3CKjLBCNLWnOnMKTwDdZVl+TZbl14AewJO1G5Z9Wn4g0ZwQALafz2DbOVEmSqg5Xf274uNk+UjyzvA76RXQy2KsV9NefHjgQ97c9SZH0iuvC5Fl2ZwQLjucfhiTbKp07aDgQWikis+Pfs5+DbIZj/uw4RbHal9fVJ6eyuOkeqQ6cwoSoLviWF8+Jlwlq0hXr
TFBuFUuWhcWDl/I/BPzyS7N5q6IuxgYMpBOfp2Ye3QuZ7PP0tmvM39e+JOMUuXuYc2FNSwauQhHtSOz988mLi+OQSGDaNeoHSczT5rfu0PjDjzU5iH2puzFKBsBJSEMDh3MopGL+D3md9y0bkxsNREXbcNbZu0xYjim2e+Rt2o1Gh9vDOkZxN8/HgCXnj0J/vYbVI72XxH5mnMKkiRpZFk2SJI0A3gA+LX81N3AUlmWP6qjGC3Y8pzCmZR8xs7dhc6ofNrydtGy+aUofFxtv1WfUH/8Hf8307dNtxgb12IcOy7tILWoYm/NXc3v4mL+RU5knKCTXyfe6fsOwe7BnM0+y9aErQS5BzEifARalX1U96xL+es3cOmFFyzGAt59F69777nGK6yvJuYU9gOdZVn+QJKkbShlLgCevLxnQbDUOsCD5U/2Yum+BJy0Kib1CRcJQahzXo6Vy6yoJJVFQgA4l3OO5aOXV7q2lU+rSgXzBEv61MpzO/q01CqutD/XSwrmR0SyLO9HSRLCDXQM9qKjqH10a5KPwpHFoHWB7k+AV7C1I7JL3Zp0Y3DIYDYnbAYgxD2Eye0msz5uPXlleebrRJnsW+c+ZAgZn32OXFoKKL2aPYYPv8Gr7MP1Hh8lAZ9c64WyLF/z3BXvMQL4HFAD38myPPsa190LrAS6ybJ83WdDtvz4SLgNqSdh/iAwlu8Ud/OHZw6Ck2hnequOZRyjQFdAjyY90Kq17Ly0k7f2vEVqUSo9AnrwQf8PKk1aC9VXcvw42Yt+RDYZ8XnoIVy6dLF2SNdVE4+P1IAbtzipLEmSGqUXw1AgCTggSdIaWZZPX3WdO/A8sO9Wvo9QTxxbWpEQAArT4PwGCLgDLmwFv1bQLMpa0dmlDo07WBz3DezLhns3UGoobZATxTXNkJmJPjkZ2WTEkF5/VhleLymkyLI86zbeuzsQI8vyRTBXWx0LXL1+623gfeCV2/heNi8lrwRHjdo8x/DT3ng+23SeEp2Rh3uF8X8jWja4HaIWnKt45JZ5Dn6fBuUrYej1DAx/t27jqgcS8xPZk7KHCK8IOvt3FgmhBpRFR5P07HNgVP5tXpr+EtrAQJzvuOMGr7R91ZpTuEWBwJU7aJJQ9jhUfANJ6gwEl/eBtrukcC61gLUnUgjwdOKuToE4aSs3ki/VG3lmyRE2nUlDo5KY1DuM+7oG8caqiqWA3/xzgbZNPRjdoWldhm9bujwGR5dAdnljmPABELO5IiEA7PsWBswQjXZuwvak7Ty/5XkMsgGAye0m82KXF60clf0r3L7DnBAAkGUK/9le75PC4Nr8xpIkqVDmLCZV49onUDbRERISUpthVdv+2Gwe/G4veqMyJ/P7kUssm9aLCxmFrDiYhJNWxQPdQ9h4Oo1N5dVTDSaZ73bGolZXzreHE3IadlJwbQT/3gsXtoCDK4T1g2+vavonm5QvodrmHZ9nTggAi08vZmr7qbg5uFkxKttnyMoif916VM5OeIwYgcrV1eK8Y0TzSq+pasweXa8dZ/Ztvvcl4MrlI0HlY5e5A+2AbeWPTZoAayRJGnP1ZLMsy/OAeaBMNN9mXDVi0Z44c0IA2BebzfqTKUxffoxinfIJ4pf9iQxqXbn+katWgyTBlXP8XUPFhB8aR2g5suK459Ow6orN8x0eEI12blKZ0bLEu1E2YjAZLMaOph/lwwMfcqnwEsPChvFy15dxUDfcpdT61FRi770PY1YWAFkLFxK+cqVFgxzX/v3xnvgAOcuWgyzjOXYs7sOGVfl+ZRcuoIuNxaV7d9Qetr9wojo7mm/VASBSkqRwlGQwAaVrGwCyLOcB5ipS5XshXr7R6iNboamintG2cxnmhACQml+Kh5Plj9hRo2JctyD8PR35bFM0xTojj/YK5c47Amo9ZrvT8QHwDoXov8GvDbSz3Y1Btmpiq4m8uftN8/GAoAF8cugTckpzGBsxln5B/Xhuy3PklOUAsPTsUrwdvXmq41PWCtnqc
lf+ak4IoLTdLNyyBbeoKEqOHcOhWTO0/v40efNNGj37LMgyGh8f9MnJZP/0M6aCfDzvuQeXTp3I+PJLMufMBUDl5kbI9wts/hFTrSWF8t3QzwAbUFYyfS/L8ilJkmYBB2VZXlNb37suTOkbzsZTaZTolSQwuJVflR3WOgZ78dn4jvy4Jw4XBw1PD4wgwNOZ8d1CGN/NNh6F2bTQ3sqXcEvujrybALcAdibtJNQjlC+PfklWqfILb1vSNmZ0m2FOCJftS93HUzTcpICp8iPKsvgEUgcOwpiXB2o1/q+9is+DD6LxVu5cTcXFxD0wEUOa8qg49/dVBM/7lqxvvq1428JCMr78kpBvv630/rakNu8UkGV5LbD2qrE3r3FtVG3GUtPuCPJi00sD2HgqlQBPJ4a09ievRM/S/Qkk5ZQASkIY1MofB42KuzoFWjliO1KUpexPUIvyCrcrrSiN3NJcxrUcR3x+vDkhXHY04yguGheKDcXmsVvtw1BfeN17Dzk//6wkAMAhLIziffvMxxiNpH/8CV53343KRfkgWLhjpzkhAGAwkLd6NbLesuGRMcf2q8vWalKo7wK9nHmsT7j52NfNkY0v9mfzmXSctGqiWjZGq65OIVoBgKJMWP4oxO8EF18Y9ZF4ZHQbtiRs4aV/XsJgMiAh8Xj7xytdE+wWzP/6/Y//7fsfGcUZDAgawJMdGnYRZG1gIOFr1pD/559Izk54jh5N/MQHLa6Ri4vRJSZStHMnssmENrjyXb9DYCAuPXpQvK9iC5bXPbb/77laTXZsidjRXI/9+SIc/L7iWOsKL50Vu5pv0d2r7yYmt6KxvIeDB3c2u5Nfzv6CjEyEVwTfDfsOX2dfTLIJnVGHk8bpOu/YcGXOn0/GxxVFHJy7dkGfmGS+O1B5e+Pcpg1Fu3YBoA0OJmzpEiQnJ3IWL6YsNhb3QYPwGDHCKvFDDTbZEYQ6k3ZVQz99EWSeh5w4KM6C1mPAQ0zIV1e+Lt/iuNhQzMtdX+bh1g+TU5ZDu0btUEnKnaxKUomEcB2+jz+O2sOTwn/+wTEiArW3N+nvv28+b8rJwbV3bxo9/TSmgnxcevVC5aCs4Gr0lH3Nz4ikINiO5oMh8YpqJ+5NYcPrkLhXOd7yDjy+CRq3tE58dua+yPv46lhFg8SooChe3PYi8fnxDAweSGuf1qjE481qkSQJ7/H34z3+fgByV66sfI1Wg0vnTnUdWo0TSUGwHf2mK3cHp1eDdzi0uxfWPFNxviwf9s+DOz+2Xox25MkOTxLkHsT+1P209mnN4tOLSSpUWkr+cOoHtCotz3V+zspR2if3ESPJ+m4Burg4ALRNm+IxerR1g6ohIikItqMkR3mElJsAjh5g0le+5qqNVw3N5oTNLDixAL1Jz8NtHjb3Tz6WcYzkwmR6N+2Np6NSBkSSJEY3H83o5qOJzYvlvf3vWbzXzks7RVK4RWo3V8J/XUne2rXIRiOed96J2t3d2mHVCJEUBNux7v8gZpPy59TjsGsOBHSElKPKmMYZuk6xXnxWdjH3Ii9te8ncKvP1na8T6BbI2otrWX5eaZbjrnVn4YiFtPSxfMTWxLUJ7lp3CvQVPcQjvSOv+/2SCpLwcfIRBfSuIWvBArIW/gAmE/r4BPxmvFIvilqKB4qC7Ui6qqFfzkW4fzEMmQl9p8NTu5RS2g3U7uTd5oRw2frY9aw4v8J8XKAvYMGJBRTri1lxfgULTiwguTAZZ40zs/rMwttR2WzVvlF7nuv0HMX6Yr44/AVTN07lm2PfoDPqSC1KZdwf4xj520gGrRjEmgt2vc+0VhTt3UvmV18jl5Qgl5WRvXAhhVu2WDusGiHuFGpAQakerVpVZZVU4SYE94C8Kwrr+kbA+v/Aub9AUoOhFEa8d+3X13MR3hGVxpq4NkHGcll5ni6PR9c/ytnsswDMPzGfJaOW0Nm/M6OajeJC7gVGhI3A39Wfl/95mQ1xGwDYm7KXzJJMSg2l5tcW6Yt4Z
+87DAoeJIroXaH01Kkqxk7jPrhW64jWCZEUboPOYGLGymOsOZaMk1bNs4MieSqqflRKtIqR74OuCC5sBv92EDEYtn+onJONsPcraD26wZa96BnQk0fbPMrPZ39GlmXGNB/DpLaT2JK4heMZx83XtfNtx7wT88zHRfoilp9fzvGM45zIPAEoCaBIX8Sm+E0W32Nd7DrCPMMsxkoMJaQUpRDpcP3HTQ2JS/celcd6Vh6zRyIp3IZlBxJYdTQZgGKdkffXnyWqZWNaB4jNVrfEyRMih4J7E2g+CBL2Vr4m83yDTQoAL3d7mSc7PIlRNponlL8Z8g3Lzi0juTCZYWHDMJqMcMLydUW6InNCuGxd7DoauzQmtaii4XyAawADggZYJJlg92Cae4kPO1dybt+OgPfeI2vePGSTEd/HHsO1e3drh1UjRFK4DadTCiqNnU3NF0nhVq1+Bo7/ovz50ELodlVZBrWjspehgbv6MY67g7tFCQujyUhb37acyjplPj++1XjWxa2zKKXt7+rP1Dum8p8d/6HEUIK71p0Z3WbQxb8LRpORzQmbCfEI4blOz5k3uQkVvO6+C6+777J2GDVOlLm4DetOpPDUz4fNxw5qFc8MimDVkUs4atU8PziCEe3EDtxqKSuA2SGWTXR8I2DAf5S9CVpn6PcSNBtgvRjtSKmhlI3xG8kvy2dY2DD8XPz48dSPfHLoE4yyEV8nX+YPm0+kdyT5unwu5F6gpXdLsdLoNhjz8zHm5uJgI43ArlbdMhciKdymBTtj+XlfPO6OGga18uPTTdHmc2qVxN8v9qdZYzFBd0P6UvigmbJ57bKmnaDP8xC9CfxaQ7cpSnIQbllqUSqJBYnc0fgOHNWO1g6n3sha+AMZn32GXFaGU4c7CP7mG3NZbVtR3aQg7glv05S+4Wx5KYrVz/Qlv9RyY5XRJLPnYtY1XilY0Dop/ZcvU2mhSQdYMQmO/gQbX4dfK1f5FG5OE9cmdGvSTSSEGqRPSSH9ww+Ry5RHc6XHjpM1b76Vo7p1Yk6hBrWpYi6hqjHhGvq+ABFDIO0khPWFn8dZnj/7l9JrwdXXOvEJQhV0CYmVGvNcLn9hj0RSqEF3dQrkYHw2Kw4moVWrmNQ7lMIyA3nFejxdRMOYamnSTvkCZTXSlTROSh9n4YbOZZ/Dw8GDALfrz2mZZBMrz69kb8peWvm04uE2D+OsEY/orlaweTN5q9egadQI38enoG3aFABd0iXUPt6ofX0tWni6Dx5krVBvm5hTqAXFOgMbT6Ux49fj6AwmnLVqvn6oM1Et/awdmn2J3Q4/3w8GpZMdg99UJpuFa8rX5TNt4zROZp1EQuKBVg/wao9Xr3n93CNz+fZ4RXvIoaFD+STqk2te3xAVbNlK0r//bT7WNm1Ks7/+IuWNN8j/6y8AnHv0QO3qiiEjA8/R/8LnkUesFe41iX4KVuTioGH2urPoDMotZYneyP/WnhFJ4WaF94cXTkDcDmWi2a+1tSOyeUvOLOFk1kkAZGSWnF3C2IixtPFtw/mc83xw4APi8uIYFDKIl7q+VKmExeaEzRTri2+4Cskkmyg1lDaI1Up5f1j+jPTJyWR9v8CcEABK9u2j6Qfv4zlmTF2HV+NEUgBKdEY2nUlDq1YxqJUfDprbm383mmQyC8ssxtLyy65xddWKygwk5ZTQvLErmoZc896tsWjJeROSC5OrHGvp3ZJnNz9LcpFyfunZpThrnGnk3IiUohTzte4O7mhv0Bt7a8JW3tn7Dukl6fRp2of3+79v3khXH2n9Kn+YMxWXVBrTJSRWGrNHDfi3jSKnSMfwz7bz7NIjPPnTIe79ejeleuONX3iVxOxiJszbQ/PX1vLAvL0MamX5D+nuToHVfq/1J1Po8b/NDP9sO/0/2MrZ1Pwbv0hoEFKLUpm+bTojfh3BzN0zKdQVWpwfFjbM4tjT0ZOeAT2JL4g3J4TL9iTv4YXOL5jnENSSmuldpqNVVSQFnVHH79G/8
8XhLziZeZJifTGv7XyN9JJ0AHYl72Lukbm18Ve1GT6TJ6O9Yu+B94MP4nXfvUjaK5KnWo37kPqxsbLB3ymsOJRIQnax+fjEpTz+Pp3G6A5N0RlM1b5reGXlMfZezAZgf1w2nUO8eHlYC44m5tEj3IfH+oRV6330RhOv/36SwjJleWtyXinv/nWGxVPqR12Vm1aQCrHlj48uT0A3YC//8zLHMo4B8Gv0r+hNel7v8Tpzjsxhf+p+2vi2YWavmfwd/zcyMo2cGrHy/ErubHYnHg4eFi06W/q0JLEgkTqwLKQAACAASURBVHa+7dCqtTzd4Wnu8LOsQjt923T+SfoHgAUnFzCj6wwK9ZaJ6Ez2mVr+W1uX1t+f5n/9SfGRI2gaNcaxWTgAwd99R/b33yObTPg88ghOrVoBoE9PR9bpcAgKsmbYt6zBJ4ViXeW7guTcEsZ9s5sDcTm0auLOR+M60C5QuT3OK9bj4qhGe9UjncMJuRbHRxJz+e3ffaoVw8WMQlYeSsJZq2Z4O3+yinQW5+Oyiq7xynru4jZYMl6pjgow6A3o/4pVQ7KmIn2ROSFctjt5N7P3z+b3mN8BOJ9zntSiVJ7q8BRTNkzBICsfLv68+Cfv9H2Ht/e8TUZJBl38uxDhGcHMPTPN73Wp8BKrx65GrVKq/SbmJ5oTAijzCFsTt9LIuRGZJZnm8R4B9f8Di6TVVqpt5NqjO649LMdSZ80i55dlYDLh2r8fQV98gcrJvnpfN/ikcE+nIL7bEWv+ZO7n7sjOmEwOxOUAcDa1gBeWHWXFtF48s/Qwu2Ky8HV14K2xbfnXHU3N79MlxNtio1qnYC/mbonmaGIePZv5MKl3GKUGE6/+doL1J1MI8XHh7bva0cTDidFzdlJUnpyW7k+gS6g3h+JzzO81om2TuvhR2J5tsysSAsD2j6DHU+DYMHeIu2hcCHQL5FLhJfNYpFekxS9uUCqgejt6mxMCwLmcc7hr3dl470bydHn4OvsydeNUi9fF58f/f/bOOzyKan/jn63ZbHrvCRAIhN5770V6V8SCIkUEvfrDLla4glIUlCKi0lHpRYqA9E6AkISEFEjvm7abbJnfHxN2M1muyr0giLzP4yNz5uzMmcnuec/5lvdLbH4sDbwbUGYsQym3nx40Sg1fdv+ST898yo3iG/QM7ckLjV+4y0/690Dx/v3kLV8h7hSefgqljw8Fa9dZz5f+doTCn3/G8/HH7+Mo7xz/eFII9dKyfVpHNp29iVopZ0yrUB774oikT0J2CfP2xnEsQZz080oreG3TJTpH+OCqEe2Kn45ozGs/RnE2uYDmoR64aVXM23sNgP0xWaQV6lHIZGyPEu2613NKmbrmPKNahlgJAURz0eSu4UT4OXM1o5gudbx5sfs/VLLYoJMem8rF//6hpCCTyfiow0e8fuR1ssqyqO1emzfavMGs47PIN+Rb+wU5B+GkcrL7fEJhAm8fe5v0knQ6BHXASyNNAlTIFBjMBkZuH0lsfiy13WvTNbgrh1IPAaCWq3m6wdM08G7Ad/2+u6fP+qDDcO0aqS9Ntyatpb/6Gl4Tn7frZ0xJsWsrT0xCt30bChdX3IcNReHufs/Heyf4x5MCQE1vJ/6vbz3rceuanuy6bJMTbhjkSny21I6qN5pJzC5BZzBRpDfSrZ4v6ye2A8BktlDvnT2S/lsupFHDW/pDLSgzor+NU9vHRcPsdjX+18f6+6PFM7C7irkocqAtm7ksHxw94Fb5w+RjEL0Z3IKh5TP2iW8PCVr6t+SX4b9QUF6At6M3AG+0eYPpv04nvTQdT40n77V7Dz8nP/bd2IeuXCTWLsFdWHJxCQXl4g70aNpR+tXsR4hLCDeLb6KQKXix2YssOr/IWmAnoTABpUzJkh5LuFl8k87BnQl2+Xvaye8GypOSECqMaOpGUHrkqDSLWRCwGMqRaTQIBtvu1rla0Z3y+HiSRo1G0IvRS4UbN1Jz21bkavVf8gx/Bo9I4Tb4c
HBDzBaB4wl5NAxy4+OhDdl5KYPTSbbVmI+LA3Or7B78XB3YPKUDge6OKOQyvJzVkjBUXxcNLcM8uFDF9+ChVTGxcy0OxGSTVih+SZqEuNtFLv0jUFEm6hvF7QGfCOg7B9pMBNcAiN8HvvWhxdOQHSvqIeXEgGc4jPgGSrJF38OtCmQx2+D5h6M04u2gkCushABQz7Meu4btIq0kjQCnAGtI6Y4hOzicehhvR2/8nPwYunWo5DrJumS2D9nO1byr+Dn54av1ZdmlZZI+sQWxdAjq8I+WzhYsFtJffZWiXbsB0LZti8e4J+z6aZs1xbV3L3KXLUMo0+PxxON2foiCTZushACiHEbpkSMPVMW2R6RwG3g5O7D0SWni36Su4RSXm9h1OYMQDy0DGgfw9pYr1vNZReV8fyKFV3tHcCIxj8dbh7L44HUqzGJG85sDImkZ5kF2cTm7r2RSw0vL+4MaEuyh5ZeXO7P/ahYaleKu5En8LfHrh3B2pfjv4nRYNxamnYeaXUDpKBKFSgM7ZoiEAJB/HbZMBfdQqFqSMu0cpF+EwKZ/+WPcLyjkCkJdQ9l+fTsnM04S6RnJqLqjGFx7MAAmiwk/rR9ZZVnWz7T0b4lCrqCRTyNy9bkkFCTQ2r+1xEfRyr/V35oQLGVl5K1aRXlMDNp27fAYMwaZ/PbPYyoooOzUKdQ1a6GpG2FtLz1yxEoIAGUnT+LSpw8eTzxBwQbRqew2dAguvXohUygIXbrU7tqC2YxgNiN3sJdpkakdMBcVIddqkSnv/5R8/0fwN4FKIefN/pG82V/Mqt13NcuuT0FpBf0XHeFalmhqalPTkyldw2ka4mHVPlo4phkLq33O2UHJkDvIY3gokXhIelyQBDHbYds0KNeBTC7uHjKrlRTLjr59prODyz0b6oOKZZeW8cWFLwDYdn0b0XnRzO4k1rRWypUs7LaQT05/QkpRCt2Cu/Fi0xcBWHR+ESuvrMQsmKnnUY8uwV24nHuZRt6NeLvt2/ftee4G0me+TvG+fQAU79uPKScH3+nT7fqVnT/PjeeeRygTw9O9p07FZ5r4fozp9gmBpowMnLt3Q+7mhrZ5M5w7dgQgd/lyCjdsRO7sjM9L03Dp3p38tWvJWbgIS1kZrr16ovD0xJwvWh00jRuTu2IF+lOnUHh54f/eu7j27m13v78Sf98lwH1GpzrehHraUvzVCjlaB4WVEABOJeVjsgiPxPD+DPyl8fFoveD0UpEQQCy+c+ADqNFJ2q9mF+j4MlTNqG06Drz+eeUjf47/WXK8O2k3ZUZbDk4D7wYMrDUQOXJ+SfmFr6O+JrEwkeWXl2MWRN9WbEEsER4RHB59mC97fIm/09838s1cUkrxfmkNat3Wrbftm/vlYishAOQtW4a5SMzpcO7aFZljFZFAhQJzfj43JzxH3pIl3HzueQo2baJo925yPvscY2oq5bGxpE6fQenJU2R9+BEWnQ6MRop27cbruecI+PgjghYtRBNZD/2pU+J48/LIeP0NzCX3NwT9nu4UZDJZX2AhoABWCIIwp9r5V4DnABOQAzwrCIK9u/4BhEal4Ocp7Vlz8gY6vZHhLYLYHpVh1696zsEj/Af0eh8KU+DGCXANgkGLYM+b0j4VJdDzfVA7QcpxCG4B/eaKfofpFyHhgOhoDmt3f57hPsNV7UoatnBVrVIrkayIy4/j41MfW4+/jf72ttdJLU69d4P8CyF3UKNwdcWss0WxKX18MOXkYC4qwiHctnC4RQC3IBiNWMrKULi6ogoIIOy7VeSt/BahogKP0aNJnTFD0j9vxQqc2rSVDsBopOiXPVBNdLQ88TqBH30EQP5K6d/AUlaG8eYNFJH3T+frnpGCTCZTAIuBXkAqcEYmk20TBOFqlW4XgJaCIJTJZLLJwKfA6Hs1prsNb2cHpve0hYsq5DJWHk2iwixGJXhoVfSK9Ltfw/t7wcUfnt0jhqGqXUAuh6aXYf8sW5/aPcG3ruhcrg6tJzQea
d/+D8K0ZtOYcXAGFZYKZMjoXaM3U/dPxVHpyLONnuVawTW7z+hNejwcPKxRSQA9w3r+lcO+Z5CpVPj+32tkvDcLTCbkWi3q0DDiu3YDsxnHJk0IWbYUhZsb7iNHknnF5iPUtm5NxptvUnrmLI6NGhHw0UcEL5gPiIRRfaLHIqCpbz+Ru/bsReGmH8FkyxnR1K1L7ldfgUyOpkkT9BcvWs8pfX1xqF377r6IO8Q9k86WyWTtgFmCIPSpPH4DQBCE2f+hfzPgS0EQfjcN+EGXzr6UWsi60zdwUCp4un0NuzDUR7gDCAJc+MEWfdT+xX+kr+BOkKvP5VzWOcwWM68feR2h0gHvqHTky+5f8tze56xtAJ93/ZwarjVYemkp+YZ8BocPtjqnHxYYs7Iov3YNuYsLKWPGSs55T5mC94tTKTt5ktLTpzFlZuFQpzYlJ05QdvSYtZ+mYUNq/rjJepz9+XzyltkitbxemIjC3YOyUycpOXoMuUaD94tT8Xr6aYr37ydn4ULMRcW4PvYYus2brT4FuZcXrj17UPLbEdQhIfjO/D8cGzS4J+/hQZDODgKqygamAr+XDz8B2P075/8WaBzsTuPgBysZ5W8LmQyajxf/e4Q/BW9Hb/rU6MOnZz6VTP56k560kjTeb/8+S6KWUG4qZ2y9sfQK6wXAvC7z7teQ7zlUfn6o/Pwo/vWg3bmK5GRSxj2J/vx5ABzq1MbvrTfJ/VoaQWS4cgWhogJBEMQdyCsvo23RHH10NEKZnrylNoLwmTEDrwnPIlOpsFRUYCktxW3IEFx696Z4/wErIQBY8vJwqBNBwPvv36Onv3M8EI5mmUw2DmgJzP0P5yfKZLKzMpnsbE5Ozl87uD/Azfwy9l/NorDske/gjnFxHazsC2vHiGGkj3DXEOxsn2SWb8hn8cXFZJZmEuERwai6o+7DyO4ftK1bo3CTJjUq/fyshABQHp+AbvMWHJtKw5kdIiNJmzmTuGbNie/UGd327Th36YLPlCkU7dol6Zu3ciUoKvWjnnue9Jmvkz13HokDB2HKzaU6ZOoHKxDlXpJCGhBS5Ti4sk0CmUzWE3gLGCQIwm2LDgiCsEwQhJaCILT08fG5J4P9b/D9iWS6zD3Ic9+fpf2cXzlxPe8PP/MIlYjbDVsmiY7la7vhu8FilvIfwVAECftB93A4Q+8VhtYZStsA0fEpQ8bQ2kP5Nvpba57CqcxTLLqw6H4O8S+HwtmJ0O9W4dKnD9pWrQiYPRt1WJhdP7NOR8AH76Nt1xYUChybNMGxRXOKd+8BiwVzXh7pb75lneAFo1Hy+VvH+qgoyk6ftrUbDJgyMlCF2WS41TVr4tp/wL143P8a99J8dAaoI5PJaiKSwRhAogxV6UdYCvQVBCH7Ho7lrsNgNDN3TxyWyh16WYWZeXvj+Gly+/s7sL8LYndIjyuKIekw1OkDpdngUcP+MzdOwZoRUF4EMgX0+ze0ttebeQTRh7C893KSdEloFBr0Zr1VSfUWYvIebsnr20FTrx7BCxdYj035+eQsWIC5UFQakDk6Yi4uInnMWOROTgR9Ng/Xvn25OWWq9EJGI4a4OLRaLe5jxpD7xRfWU+6jRlKRfPsgSplaTa2ffxZDZeVyXHr0QK59sKrX3TNSEATBJJPJXgR+QQxJXSkIQrRMJvsAOCsIwjZEc5EzsEkmatjcEAThb1HPrtxkoaTCJGnLKylnb3QmP5xMQaNSMKlLOC3CPO7TCB9weN4mjyD3OmyNEAnCvxGM3QBuVZL6DnwgEgKAYIb970OzcaD6Zxeazy7LZkvCFiyChSG1h0hyC2q6idr/ZouZAKcASZW135O8vph9kS0JW3BVu/J45ON/63yF34PS05MaGzdQsHYdgtGIKjCA7Lk2/0rav15F06ABTm1aU/KrTTpFrtVSfPAQqZMmIyDmMqhr18ZSVEThho0UfPc9DnXromnaBMNFUe5cptHgMW4ccicn3AY/uM78exZ9dK/wIEUfT
fz+LHurZDaPbR3C+jM3rdFqGpWcQ692w9/t76Wn/pegvFiUskg+AnIltJ4I574DY5XEnaZPwJAltuMvWkJevPQ6ryXaRPL+gSgwFDBs2zBrfQMPBw9+HPQjXhovTmScoKSihM7BndGqtMTlxzHn9BwxozmkG6+1eg2N0v67eSnnEk/tfsoqve2r9WXH0B3WCm0PMzJmzaJw/QZJW8Ds2bgNGkj255+j27YNpY8Prv36k/PZZ5J+/p98TNZ7syTmJLdhw3Bq0xpTTg4uffqg9PW9rdTFX4EHIfroocfCMc1YeSyJqxlFdKnjw7WsIkn4ssFo4WBcNmNbh/7ni/xT4eACT++A/EQxL6EsF04ukfbJiRPDUc98A2ot1OoiJYXavf7RhACwJ3mPpOBNQXkBOxN3ciztGKcyxUxZP60fa/qvoa5nXb7ta5+wJggCZzLPkGfIo2NQR7Zd3yapxZBdls2xtGMPTf7C78GxYUMKkZKCpkF9ZAoFfq+9ht9rompv3jcr7T5rOH/Bzr9gTE3F7ZOPMcTEkDplKuXx8WiaNCZo3jzUISF213gQ8IgU/gc4qhVM7WZLNPnhRLJdn6pSGI9wG3jWEv+v9RL/nZ9oO+fXENaOEiUuAFRO0PsjuHkKfBuIeQv/cDgo7Fed6SXpVkIAyCrLYuO1jfSv2Z81MWuoMFcwqu4oGvuI0iKvHHqF/TdEOQhPjSe9w+y1d9wcHk4p8upwGzoUfXQ0uh9/QqbV4typE2nTXkIQBLyeeRqPsWKeg1P7dmLIdJVVoOuQwZSeOIExzRZPc6tuc/r/zaQ8XlzQGKIukfneLEJX3iYJ8wHAI1K4ixjZMoS9V7M4Ep+LTAajWoTQPvyfvZL905DL4YkfxQzm/ESo95iY3SxU0aw3loLGHUavvm/DfNDQp0YfVkWvIkmXBECISwiNvBuxPm69pF+uPpcndz1JsbEYgF1Ju9j42EbKzeVWQgAxbFVv0ksqvHUN6UpLvz+0OjwUkCkUBLz3Hv5vvok++iopY8ZYz2W+/wHq8HCcWrdGExlJ4Ly55C1fAWYzns88g1PLloR+s4KcRYuoSE3DtW9fPJ58Ekt5uZUQbkEfHf1XP9qfxiNSuItwUMr5YUIbEnNKcFApCHJ/+G2wdxVe4TD6B9vxya/s+7hXbrlLc0UCSb8INTuJ9ZvV/7zscSeVExse28CvN37FIljoEdoDmUzGkqgl1kldLVfj7ehtJQQAo8XIrqRdt3U2WwQL24Zs42TGSVzULjTzbfaXPc/dgrm4mPK4OBzq1UPhfOeV+mQqFfpz9r7LspOn0DZrhrmwELcBA3AbIA0nVdeoQdDnn1MeH0/mx5+Q9+1KXLp2RdOoEYbLNoXf6nUWHiQ8IoW7AIPRzFubr7AtKg0fZwfeHVifvg0D7vew/v5o9iRc3SrmMgA0HiOqogL89BwkVmaoZl0WHdeDv7w/47zPcFQ6MqCWdHJa2Wcl88/NxyyYmdh4oqSu8y14OXrR3Lc54W7hXNddB8SSnMMjhqNWqOkc3PkvGf/dRsnhw6S+/ApCWRlyrZaghQtx7tTxjq+juY3chCBYiO/WHXNuLpr69Qn+8guUAQEU79mD/tJltK1a4dy1Czenvojxxg0ACjf9iOvgwShcXNBHR+PUuhX+s977n5/zXuFR9NFdwOKDCcz9Jc56rFbKOflGDzydHpwSew80dKmwfQaknoaQtvDYfDEUtSgDDn0i7gZqdYWes0CuAKMePq4WIqn1hv+7/teP/QFEnj6PJ3Y9YSWCNgFtWNJjCdN+ncbx9OMARHhE4Knx5FTGKWq41qClf0tUchUDag2w+hr+rkjo3cc6IQOoa9UifNdO9FFRlMfH49SuHaogW6izWadD7uJy2+I7OYu+IO/bb8FiwWPsWHRbt2IusIkHuvTri8o/gPxvbQ58z+cmkL9C6i9Q1w4nfEe13Jxb9y8qovT4CVRBgTg2agSIxJb/3fegVOD17ASc2v6eQtCfw
6Poo78QF24USI4rTBauphfRsY5YMvFUYh77Y7Ko6e3M8BZBOCgV92OYDy62TIak38R/x/8CW6fC+C2wbjRkiDHeZF4CB1fo8hooNWK1tULbDx/vOvbX/Ydi07VNkp3BqYxTnMk8w9JeS7mUc4kKcwU/XvuRnUk7AUgqSsJgNrB72G4U8r//d9OUIZWwN2Zmkr1gAXmVekYylYrgJUtwqFWT1BkvY7h8GWVgAIGz5+DURmrW8XlpGt6TJ4EgYMrJIX/VKsn58rhrlBw8JGnTbd2GwssLc55N4UBTvz5CRQV5q76j7OwZHBs3weu5CVSk3CBl/Hix3gLgPnYM7iNGcHPyFGsN6LITJ6m1fRvqGjXuwtv5YzwQ2kd/Z1SYLLSs4Slpc1QpaBQkRmvsuJTO6GUnWX4kiTc3X2bqmgt3fA+d3si/98QyYdUZfjiZwt9td/eHSD5W7fgoFCTbCOEWrm6FilK48hM0HiVGLAG4hYjZzY8AQHFFsV1bgaGAAzcOkKhLpI5HHS7nSivYZZRmkKN/sHTF/lu49u8vOXbp1Yv8KiGkgtFI7ldfkTl7ttXOb0rPIH3mTCzl5eQs+oLEQYNJnTaN8qQkZCoVMrUaZWAg6nBp0qVTxw7I1FKLgFyjIWjupygDRROytmVLfF99law5/ybn888p/e0IuV9+ScY775K3fLmVEAAK129At2WrlRBujbf40KG78m7+DB7tFP5LFBmM/GtjFPtjsghw1dC9ni9nkvPxcXagc4QPB+Oy6dvQn++PS9Pd98dkkVaovyMn9OTV5zheqat0IDabIr1REgr7t0dQc0g9Iz128gG1s1hY5xbcAuGrDmKpTgCfSHhyK/jVF81KjwDAoPBBrI9dT4VFFGn0cfRhY9xGLuSICxJPjSct/Vpyo9i20wp2DsZX6yu5jtli/lvuHPzfn4UqKAj9xYs4tmiO2/DhFG3bJulj0esxxUhL6poyM8n98ksxoggov3YNQ2wc4b/soXjvPnRbtqCuHY7SxwdjejrOXbrgNWECCjc3cr+o9GfJZHhPmYKmcRO0rdtQ8ttvoFRiKSpCt3On5H5Fu3fj1K5aQShBQOFpr4KgDrXXaKpITUUfFYVjkyaog+0FEP9bPCKFShjNFo4l5OKgVNC2lieVshsYjGYMRjPuWulqYMG+eGud5nSdgeJyEzte7MjIpSdYdTwZgHqHXfB2lsaRy2Vi6c4/wo28Mpw1SkwWi5UQbmHrxbSHixQGL4afnxd3BgFNxWO1k7j63/kqmPSiuci3AVz7xfa5nBjIugIBje7f2B9A1PWsy+r+q/k5/mecVE5EeEYw87eZ1vP5hny8Hb3pFdaLo2lHqe1em3favoNcJn4vCw2FvHXsLY6kHiHMNYz32r1HS/8HNyTVrNNhyslBHR6OTCZDrtHg89I0SR/Xfn0p2mVT5vd4fCyGS5fEAjiVcKhbl9IqAnYAxps3KVizlqyPbRXrlL6+1PplDznzPiOhR0+wWHDq1hXnDh3RtmqFpm4EGbNmUbRlCwBlJ0+S+uI0VL6+lFerAuc+cgSlR45Y2zSNGuH93HOUX71K8T4xVNh10ECcu3aRjEu3bRvpr78h7ijkcgLnzMZt0N1RCHpECojmmZFfH7fWV25by5PVE9qw5tQN5v4SR0m5iZ6Rfiwa2xStWnxlUamFkmsUG0wsP5JIdrFN6DU2s5gZPfw5nZxPhUncDo5rG4aPi0gURrOFrw9d5/C1HOoFuPByzwgcVAqe++4MJxPzUcplPNepJk5qBaUVZut1/VwfMtkMn7owYL7oW8i4CLtehaHLRF0jhVoUzwttL0YYVUeVGsSPYEOkVyRveb0FwOGbh+3OCwh83vXz2352wfkF/JYq+niSi5J57bfX2DtiLyr5gyXxDJC/Zg3Zn85FKC/HoU5tQpYvR+Xvj6mggIrr19HUr49cqyVwzhy0bdpQdvYsLr164dq7N+Z+/REEgdIjR3GoVxf/N8U6CoaoS9bry
7Rays6ckdzTlJ1N/qpVFKxZY20rPXgI98GD0dSNAMTQ1aqoSE4mcO6nZL43C0tZGTIHB/zeeB3X3r2Rf7OC4j17UAUF4fH445hLSvCaOBGfV15B7uCAKjCQol27KD1xAofISDxGjiR7/gKbicliIXv+gkekcDex6exNKyEAnEzMZ8PZm7y/Pdqqgro/JotvjyVbV+hta3lyLsXmYPZyUuPqaP+jqeHtxMFXu/LLlUzOJOdzOU3HnN2xTO9Rh4UH4vn6sBgxczalgGuZJXSp68PJRFFC2mQR+PpwIi92q83Xh69jsgh4aFW81qfuvXoV9wcWC/z0rOhHAEg8BHtmQlgHkSBA9CfU6S0mrxkqCdnJFxoMvR8jfuCQUZLBd1e/I1+fz8DwgXQK7mQ91z6wPTVca5BclAyIWdDD6wyXfP5yzmWyy7JpF9iOqBypLydXn0tGSQahrg+WXIupoIDsOf+2SkuUxyeQu3gxTu3akf7Gmwjl5chdXQlZshilvz/5q76jIimJ4v0HsBQX4z58OAEffogh+ioKd3fUwUH4zJiOIS6W8qsxyJ2d8X/3HQxxcXb3thTZL1DK4xOgr/hvTf36VCQnW8+Jekn9cO7WDUP0VRwi6mApKSHzo4/FsYwYjrZVKwo2bSLrw48QKipQ+vsTumI5ucuWk/O5jcANly5L/BCA3fH/gkekABSWGe3armUWWwnhFmIyiojJKOJQXA4NAtwY2zqE3VcyCfXU8nT7GgS5O7L6ZApFBlE3JtRTS6/6fjg5KDmVlMcv0aK56cKNQvJLyzmVJK0fcDo5H19Xe9mChkGuHHu9O9dzSmgW4oGj+u9n5/1dlOXaCOEWUs9BTrWawvH7YOJhceegUIk7Ca3Uyf9PRIW5gqf2PGVVQN2dvJuven5FxyAxNl+lUFnNSfEF8eTqc/kq6iueiHyCVv6teO/4e/wc/zMg+hvaBrQloTDBen1/J3+CnIPsb3yfYUxPt9MaqkhJofjgIYRyccduKSoi69O5qENCqEgSfVGCXk/mRx/j2KYNadNeojwmBmQyPMaNw/+tNwl4/31yFi8GARReXng98wwlhw9TkXAdZDI8x4/HbcgQ8r/7zrZal8lw7iwSsVBRgUu/vlSkpmK4dAlVcDDadm1J6N4DmUaD95TJaBo0IHns45grazLoduwgZMVysmbPQagQfUGmzEyy5y+gPDZW8oy67dvxGDuWgtW2zH73UXevd0MBJgAAIABJREFUYNIjUgAGNw1k+ZFEyitNPB5aFc92qMmP51IlZhsvZzUDFh2xksXjbUI5OrM7z3x7mlc2iqurQU0CiPBzQaNS0KOeL+UmCxqVYPU/3MLuK5k0DXEnJc9m/vDQqujb0J8dl2whdS4OStrV8sZNq3r4zEa34OQDXnWkYndh7e2JQukgZj13f+svHd69gtFsZEfiDlG1NLQbTXyaAKJAXb4hHw+Nh9XO/3s4m3lWIokNsP36dsLdwtmZtBMHhQODwgfRPbQ7X1740uqAPpx6mIVdF1oJAUR/g1KmZECtARy6eYhabrV4q81b/5XDuaiiiO+jv7eqsvav1f+PP3QH0NSrhyokBONNW9Vf5y5dKTsjzWMyZWRINIpAJIb8b74RCQFAECj44Qecu3Yh9cVpCHo9AKVHj1JjwwZCv/2WnAULsZQbcOnTB03dCIIXLSRv+QoEiwXPp5/CsUkTKlJSSHnmGUzpGSCX4zXpBTSNGpE21abTlfHGm5jzC6yEAIDZjG7zZoQyqTnUmJaG3NUVqugpyR0d8X3tVTT16lJ2/gLa5s1wGzbsf3qXVfGIFIA6fi78NLk9607fwEGp4Kn2YYR5ObHy6VZ8tvcauaXljGgRzIGYbMnuYcOZm/i6qDmTbDMjbYvKYOMLYWy9mE7P+b8hCAIjWgQT7KHlRr7tDx7mpeWNfpHEZ50hs8iAVq3g/cENeaxxIGXlZjacvYmHVsW07nVw0z54tty7CpkMRn0HO16GrGgI7
w59Z4v+hbVjwFzpp+n4CjjcuWTBg4p/Hf4XB2+KWdkrr6xkfrf5hLmE8crhV0jSJRHkHMSnnT/9w2QyL0d7fS2NQsPw7cOt4anrYtcxpPYQKyEAmCwm6/2rosxUxvxu8//j/YoqijiQcgClXEnPsJ44Kh0pMBSwPm49efo8Hqv1GE19mzLtwDTOZ4ulLvck76HEWHLXSoAKRiMVKSkEf7WEvK+XYkxNxaVvHzyfeoqyC+cp2X/A2td1wAAU7u4SmQl1eDiWUnt/VPG+/VZCAMBspmjXLkoOHKAiRYwkLN69h7BV36Jt2ZLCnzdTcugQeV9/jSogkMKNG0VCALBYyFu+AvfhUlMdgoApK9Pu3urQMDs5DNe+fXGIiCB1+nSo3BX5vDQNuYMD7iNG4D5iBBVpaRRu3IhTx453JQrpESlUomGQGx8PlUaxtKnlxcZJtpCx6qt9QRBILzTYXWtvdBZrTtnC/TaeTeXF7rX5/ngyRQYTXk5qZg1sQP1AV47M7EZcZjGhXlpcNeLkP6pVCKNaPZiyuncNpXlQnAG+9UUxPL8GMGGvtE94d5h+UUxs842EgCb3Z6z3AGklaZIJWUBgbcxaTBaTVdwurSSNd469w9YhW3/3WnU96zK8znB+iv8JgCDnILQqrSRf4WbxTfL09uVim/o25WLORau5SIaMoXX+s58mT5/H6B2jrWU9V15ZyQ/9fuCpPU9Zx73p2iY+6fiJlRBuYev1rXeFFIp//ZW06TNE05FCgf+s9/CYNxdTQQHl8fEEzJlD/rLlGKKjcWrXFs+nnxYXHnI5xQf241CjBt7TXqLieoIkVFXh6YlT+3YUrpeKCQoV5VZCAMBspvDHn5CpVdbCO+XxCaS9/DLqmjWlgzWbUQXbm95cBjyGMSOT4r3id15dsyYej4/FY8xocpcsofx6Is5du+L51Hhkcjm19+2l7MxZNJH1cKhtizzM/XopOQtsleS8X3oJnymT/9tXCzwihTvC851qMXXteetOdESLYIY2C2bTuVRrm4uDEq2D/VZbJZdz6s2eJOaWUGGyUFZhxmi2oFLIaRj0z5AltuLkV7DvXTBXiGajJzfbhO4sZmnOgWsgNBkj/Xx2jJjApvWGpo+DxhV0aXDuW1ECo/l4MaLpAYZSpkSGDAHb1lOlUHE176qkX6IukZi8GL658g25+lwGhw+2Ttpx+XHE5MfQ0q8ls9rP4onIJ8gz5NHCrwUrLq+wu2cT3yakl6Zz6OYhADoGdaR/zf50Du7M2ti1ZJdl0zmos50AXrIumRx9Dk19m7IlYYuVEAASChNYeWWllRBAFNQ7cOMAKrkKo8Vm8/fU3B3/T/rM122+BLOZzA8+xFJUTPaCBWA04lCnDiErVqD09kK3ZQuZH36EtlUrvCc+j/fE5ylPSKBo1y7UoSEEzv8c3ZYtKN098Jr4fGXN5P4U7doFgLZ1a1z69qNg9RrJGORurpQePy5pM2Vl4TFuHGUnT1rb1LXD8ZwwAXNBIQXr1iFTq/GePBlt40ZoFy1Ef/kKluIitK1bI1OK07H/u+8iCALF+/aRs2gRTm3b4dS2DS69e5Hz+ecU/3oQda2a+M2cSc4iaZ3t3C+//J9J4ZH20R3icqqOg3HZ1PF1pk8Df+RyGXujM1lz6gbOGiWTu4SjVMjov9Dme5DJYOvUDjQOdmf6+gtsvZgOQA0vLRsntcPX5SH1FdwOpbnwWT2oMlnQbBw0fxq2vQg5seIOYehS0LjBodmig9k3UtQ+KsmClf1sJiX/RjB+GyxpK54DUGnhhSPg/WDlciTqEnFVu+LtKMqfVHXwquVqvu71Netj17M3xbZjaunXkkRdIvkGW1DC3M5zySrLYt5ZsWykUqZkXpd59Ajrwe6k3RxNO4q/1p+f4n8izyDuDiI8Ilg7YC0OCgcSCxO5knuFK3lXCHQKZETECByVjnxw8gO2JmxFhowx9cYws/VMPj3zKT9cFZVrg5yD6Bnak++ufid5r
kmNJ/H1pa8lbWPqjsHb0ZvFFxcjIOCqdmV57+XU96rP8bTjbEvchrfGmyfrP4mfk98dvceYepH2jdVqG3g8PhZkcknYqPfUqWhbNOfGxBfAJAaDuPTtS/CC+RRs3EjB6jXIHDV4T56MOiwMi8GAOS8fS5EO3U7RhARiJFHw0q8pXL+Bwo0brddXBgRQe/8+inbsoGjPL6iCAnHp3RtN3boo3NwQzGYMsbEUbtiITKnA44kncAgPp3j/frJmz8GUm4vboEH4v/M22Z99LjqyK+H/3rsY09LIq6KppAoLxZhSReqlEpGxt6+9/We1jx6Rwl3Ewdhs/r0nlvzSClqEeZBVZMAiiDuMAY0DOH+jgGFLpKuLqd3Cea1Pvfs04j9AUYZYE/lurrozomBpNfXNsI6ijpGuyhe80Uhw9oMTVZRPA5qCf0O4UK2eQofpcGyhtK3Tq9Djnbs37qxoKC+B4FaiuesOUFxRzNQDU7mQfQGFTEFj78YEugTSI6QHTionUopT6BTUiWCXYHTlOj498ylnM8/S0Lsh3UO78/qR1yXX6xXWi+PpxymtUrq0jkcdBocPthIFQOegznQJ6cLJjJNcK7iGr9aXl5q9hK5cx7Rfp1l3KY19GvNk5JO89ttrkvt80P4D3j3+rqRtcPhgDt48SFGFWCs7wCmAnwb9xKzjs6xk5qnxZHW/1YS4hpBSlEJKUQoKmYK5Z+aSpEvCgk3CIdg5mG1Dt91RDkR8126YMm02eZlWa+egderQgbIzZ6yRPAAKb2809SMp/e2IpG/A7NlkvPGGrUGlotbOHWR98CGlR48CIhH4vf0W5fHxFGzchDk7G4fISJQ+PpSdOoW6Rg3cBg/GtW8fVIGBGLOyuTlxIuVxccjUanxf/RfOnTuTOGQogkE0OctdXAhbvZrkkSMl4/SZMZ3cJV9J2lRhocg1jpRXD4+VyyWSGMjlRF69fa2GR4J49wC6MiMf77rKmeQCmoa489aASGvGcm5JOZNWn7NGMO2+ksnsYY0kpTjzSirsrplbbN/2QOCXt8TymIJFXLEbyyGkFQxaJFZIi9oAlzeBawB0+hd41IDiTHHCFgTRrON2mzBGv0b2FdZqdYWDH0n7pZ6F6pE3GRfB5zYE6uB6mzaX2z9XRRmc/16MbKo/SIxyMurht3mQchyCW0CXmWLS3M5XIGojKJS2JDn/xvDUdnB0v/31b4M1MWu4kC1KTJgFMxdyLnAh5wI7E3fyQfsPGBkxki0JW4i+Ek1r/9Z83NGWPZtSlGJ3vSDnIMpN5ZK2MmMZm+M3S9qOpB2hdUBr9qXss15ryv4pNPZpLDFbXcq5xEn3k1RHdK795FJqLOXHgT+yPXE7armaQbUH4aJ24bOun3Eu6xy5+lwsFguTD0wmT5/HwPCBvNziZfr/3F9SNvQWUktSOZd1jrYBbX/vFUrg89qrZPzrVeux1+TJFK5ba3PwAi69e2OIjsZclRScbl9vo+yMNIsZo5HCdeuthABgyslBHxVF0Y6dmLOzASiPiUEdHETwksWkTn2R7E8/Jfuzz/Cf9R6G6GjrBC5UVJA1dx7G7GwrIQBYiospWL9eMvkDlEVdAoXUBC1TqnCoGyEhBbmbG47Nm1N60OabcqqW+fzf4BEp3AHe3HyZnZfFL15Sbin5pRV896yoqng+pcBKCLdw/HqehBRa1fDAz9WBrCLxBy2XwdDmD178NxlR0hW6oTIxJvkIbJ4ErZ6HzRNt568fggn7YFlnmwnn1Fcw+Ti4VJO4lstFH8KhOVCQAg2GiNeLWislirAOUJYH+VXksLVe0H4aXNttG1OtrtD+JVH+IrXyx+1RU6zFAGA0iH1dKk0U68ZAUmWG76mvxSpu8b+IRAFw47i4a/FvZNuRVInYIfMSnFsFHWf8uXeJ6OT9T9iSsIVLuZf48Zoot/DjtR+5WXyTiY3F9+vj6MPkJpNZfmk5JsFEA68GTGg4geKKYqtjGWB03dFWX8EtaJQaz
mRIs3GLjcVUmO0XIm0D2kqup5ApGBExgqPpRyWKq4/VeowA5wDr+C5mX+SdY++gN+kZXXc0rfxb0WtTL2uU07rYdTipnG5LCLdwp76GrHeltQhy588n6LN5FP96EGNGOq79++PcsQNmnc6W9KVQ4DP9JRTu7pQeP2EzH/Xpg7ZFC3Q/SwlV4e1td19zXj6mSkK4BUNsHDmffW6b7M1mcuZ9hkNkNROX0T4XCkATWQ+Zg4M1rwLAqWULHCMjyV1SWbNcJsN74vNo27alIiUFQ9QlMX9i0gtkz54juV7pIfvs9TvFQ0UKh+KyuZyqo124l51y6d3AwTjpF+K3+BwsFgG5XEaDIDcUchnmKjGrTYJFB/LFm4W8uimKhOwSmgS50bmOD0azhVEtQ2hb6wEs11k9P6Aqbp4Cx2rvVncDji+0EQKIE/qljaKj+ORX4sq70ytQp5e4q+jziShb4VEp9DXqe9jxCmRfhdo9oc9HYoRSXoKYv6Bxh4ELRfPRi+fg0gaRcOoPEVfyz+wWicVQBK0mgMoRLq6D3TOhXCfKZPT6wEYIAAhwZgWkSaNkiNku7h7+E4oz/vO526BHaA+2Xd9223OuDq5sSdgiadsYt5H2ge1548gbJBcl09i7Md/3+x6tSkuOPoell5ZSz7Meb7d5m/jCeJr7NsdB6UDXkK5czbuKwSxOUFOaTKHEWMLhNNszK2QKJjSaQEx+DCVGMYu/d1hvZDIZM1vNZNt10ZTTu0Zv9t/Yz8iIkaSXppOvz6dNQBuicqOIyo1iZJ2ROCgdeH7v89b7nck8w4zmMyRhrwCJhYn4OvqSrZf+fgCG1xlOhEfEHb1PS2lptQYLaS+/gudTTxHw8UekTZ9B1gcfglyO64ABOHXqiLZFCyylpVjK9NT86SdKDh1CHRqCY8uWGFNTcX3sMYp27UKmVOL57DN4jnuCwvXrbTkQKhUe48ZhiImh/JotqdKpfXtR9K4KzMXFOHfrKnE4KwMD8J44kbLjJzBcFYMJHFu2wH3IEJTe3mT9+9+YcnJxGzgQx+bNUbi4oG3TBsPVqwgWC7nLl2NZsBCPMWMIWbKE/B9Wk/3xJ7d5ORb7tjvEQ0MK836J48uDlVmY++DjoQ15oo29suD/ggg/Fy7etGke1fZx5kxyPom5pXSO8GH20EbM2RNLkd5Ir/p+PN4mFEEQmLH+AsmVSWpRaTr83DQsG3/vBMaKDUaWHk4kJkOs6fBUuxrI5bI/f4GaXcDBTZxMqyOwObhVj4WWiQlodgPJhH1V7PrrTsOUU3B5Ixz5XHQ21+wMY9aCSyAENhXJI6AxqF3A0QO6vg7RWyCwmUgWRj3smAGxO8WJvyhN9CnsmG5b2cfugGHLxH6myhXcjeNwdqU41iqmE9RO4BEKmVWkpN2CIbQtxFcLkQWQKaDhiD/xEm3oHtqdTzp+ws/xP5Orz7XKTbiqXXmh8QucyzonCR91VjlbCQHgUu4llkQtoX/N/rx59E1rv56hPXm77duM2zWO1JJUAJr5NuPxeo8T5BLEldwrVJgraOXfijOZZ3BSOdG3Rl9i8mP4ovsXZJRmkFKUworLK9ibsheFTMEnHT/By9GLSfsmYRLE1XTbgLZ82OFDhm0bZh3nT9d+4oXGL1gJ4RZuFt+0izpq6tuUZxs9y0cnPyJJl0SX4C70rdmXUJdQ6nrePX9V/urVKHx8KLllTrFYKNq5E7dRI8meO88a/qmpX5/Q71ZRtGs313v0FCUlAgIIW7sGVUAAOV98QWL/Aahr18alRw8QLLgOHAQy8HvvPfKXLcMQfw3nDh3x+7/XUHp5krvEVjrWtW9fPMePRwYU7dqNKigIp86dKdz0I37vvougL8NSXk7hjz8S27wFDjVrEjh7Dg4RdbjxzLMUbtggXmfAALynv0Riv/5gFpNoc+bPR+njQ/430iI+dxMPBSkYzRaWH0mUtC0+mHDXSeHjoQ2ZvPo8N/LLCHTTUMfPmdHLxNWAWinnu
2das35iGyb9cI7dVzK5eLOQDwc3tBLCLVxKvXs6JVVxa9cyY/1FDsSKq7IDsdnkl1bwr9538ONzdIent4t29uJM0BeIq/WgFjDka3EiTTworuJlctGn0Pp5iFoHuZWrKI+aSCZfAIsJLq6Bo1WE2JJ+g1NL4fpBSKm04aYcFZ26Tj7wS6UDMHY7pJ+HkDbipA+inX/fu6IGUlXn840TcGKJjRBuQXdTDFc9XxnVoXKCji+LobHrHxef08EVBnwuklVBiug30XqKobMaN2j5jOhbuUPU96rPwRsHifCIYHan2eSU5dDKvxXOamemNp3KnNOiGUApU/Jc4+d448gbks9fzbtKUXmRpG3/jf0EuwRbCQHgQvYFnox8kjeOvGH1R7g7uLPxsY2surLKaiJSyBQs6r6Irde3YhbECccsmFlwfgGRnpFWQgA4mXGSH67+ICGuoooiyX1v4dbzzTs7z6rDNC5yHCqFik0DNwEQlR3F4ouLCXIO4tVWr+KkusPa2tUijawwm6X5BJUo2X/ASggAhqtXyV+zlrxly2ySEhkZ5H/zDXJnF3SbRVOSMS0Ni05H0KKF3Hj2WVHmQqHA+4UXCFn6Nfnf/0BCn74IJhPOPXsiUymROThQdvYccc2a4zZwIGE/fE/usuVkvG4LFgj4+GP0UVHWBLvy+HjSpk/HY/yTksS1op07Ufr7WwnhFkpPnrCT97ibeChIAbCz52fqxAnhYFw2s7ZFk16oZ0CjAGYPa/xfawc1CHTj0KtdySo2IJfJaDfbljVZYbKw5FACZotAYq5IAhk6Ax/suEpdfxfiMm0/qLa1/nvTVoZOz8/n01DKZQxvEYy3swNbL6bx8c4YCsoqeKxRoJUQbuH7Eyl3RgogJooNXyFOvI4e4o9QVmW3MfU0pF8QI4Ru5Rjc0iUSLFDvMdvkXRWK20SZZEXbCOEWrm4BRTUdqNid4k6iOtLO2bdZTODsDyVVMkfr9BJ3FY1HiSay2j1tPo9XYsT8B+8IW9b0oEXif7kJoo+lohTkd/6TOXTjENMO2qScN17byNknzlqlI56IfIK2AW25mneVFn4tCHQO5IerP0hyFtr4t7GGl96CUqbEUJ34gNOZpyUO6sLyQnYk7mB3sk062iyYWRW9Cl213aCuXIfyNs/orLbPJI/0jGRM3TFsvLYRi2ChQ2AHhtUZhkapoU+NPrd9F1vit/DOcdvucev1rZwbd84qVf9noAoNxXibyd+pcyfcBw1Et2mTtU2mVqMKDLTra0y9aRexVJGWhilHWmhIf/EiuUuWiIQAYDaT+9VXqCMiyPrEZr4p2b+fwAULyJg50+ofKNy0CWVQIHkrV0qumbdsGTIH6XfblJNDxXXpwhZA7mL/3p1at8acm2eXJwFiJNb/ioei8lq50WzXZhHEqmfT1l4gJa8Mo1lgy8V0vjwYT1aRgZk/XmL4V8dZfFCcyM0WgZVHk3jm29N8uieWYoPIxEm5pSzcH8/3J5IpKTchl8sIcHPEbBHsBPMMRrNk8ge4kV/G56Oa0K6WF26OKvo38qdfwwDWn75BVpHtB73nSgYfbL/K9qh0a2W1taduMHrpCWasv0BiTgkZOj39Fx5h7i9xzN4dy6AvjhKfXcy/NkaRXVyO0Syw+aJ9gfZbz3JHOLsS5taGf9eEdWPFCdFUAaeWieUyr/wMwS1thACg1ooTbpMx4sTacIQYWopMNLu0fgHaThZNQ1UROVAknqpwD7NvU2rEiVzS5gitnqt2TRk0HAbjfoTwHuIqv/P/QbtK/ZkaHcXcCBd/uP4rbJ8uRlp5hYvjPv4lfBYJ8xvB8cXwTS8xMe7yRvhukCjWdweYeWSm5NhkMTH3zFxArF1gtBgJdw9nYPhAAp3FCWxe53m0D2yPh4MHfWv05c02bzKx8UTUchspjqs/jpF1R0rafB19aeBlX3D+P0261dVSh9UZxvgG49EobLkzvcJ6Mb7+eOp42Eqe1vWoS58afXir7VvsG
7GPXUN38XWvr9Eo7XNu8g35ZJWK/qYPTnwgOWe0GFl8YfFtx/afcDtC8J35fwQvWoS2VSuC5n+OtmVLnDp2JGTFctyGDBb1g25BocBj9Gg0DRtKruHatx+aCOniSRUcjDG7WkU6QaDshP2EXHLggMRhDKC/cNHOzi+YzTg2lyYIKv38cB8xXBLuLHdxwWP0aPzefEOsIa1S4T56NG5DhxL8xSJ8XnnFbgzVie6/wUOxU3BQ3X7lfyQ+l5Jyk6TtfEohv107y+U0cYV0LqWACpMFiyDwxa+iT+JgXA5XM4qY2bcew5YcR19JOmtP3WDHtI4oFXIC3R3pGenH/irVm55sV4NDcdn8fN42Mbeu4UmDQDfWTRRD7l7bFMULq8VJxVGlYO3zbfjtWi7z91eaXY7BlXQdtX2ceXOzbSt5Kimfsa1DKaii6JquM7D6RAqm6uxUDXfgTRChSxWL21SaFYjbBScWixXPotaJbRdWiw7mFs+Ik+qtBLOBC8RdxplvxInUwU3UNarRSTQ1Ra2DRsMh77po3mn2JET0FX0Dv34sFtRx9hedwgYdrB1lCwftOhOaPSG2n/9eJI2OM8SqbW0miolvZqO4Szn/AxSmQP3B0Hqi6BxeOwpSTohkNmgRZFyCjU/anjtuD3R7A/ZWEdzba7PhA+I7ubxJDF39kygz2f9Q96bsJbYglnNZ5/DUePJ227fpFdYLXbmOfSn7cFQ6sqj7Ihwqd0upxam4qF3YMXQHJzJOUNOtJs18myEIAmsGrGFz/Ga0Ki2j647G3cGdVdGruK4TV7eeGk+ejHySfH0+2xO3A6L56Kn6T9ExqCM13WoSlRNFE58mDK8zHIVcwdYhWzl48yCBToF0DhbzSjYM2MCRtCPIZXLqetTl4M2D1HavbfULpJeksztpNy5qFwbUGoCTyok5p+ewPnY9ZsFMj9AeGAX7Bcqe5D282PxFu/Y7gduwYcg1GgSzGdd+/XDt109yvsbaNeStWoVQpsd99GgcGzUi5Ksl5CxZQkViEs7duuI5fjwuvXqS9tJ0yuPjUQYEEDj7E4yZWZRWylkAKP39ce3fn8INGyX3cO7ejeK9eyXEoG3ZAk2d2pKkM89nnsZt4EDMhTpKfv0VdXg4Ae+9i2PTpoQsX0bhho3IHR3xfPZZlB4eeI4fj8fjjyNYLMjVagyxscgdHfGe+LxEUvtu4aEgBZVCTvd6vvxazWzyWOMAdl3OoNhgI4a6/s6sqlYic8+VTOvEfwuH4nLwdXGQtMdmFnM0IZfYzGI2n0/Dy1nN5C7hVJgt1PDS8suVTPQVJrrX8+V6Tgk1vJwQBIG+C36jb0N/hjQNYtM5mx1WbzSz9HAiZ5KlEto/nEihbU2piSlDZ5AU8LmFmt5OqJVyaxEfAFeN0irfDdCxjn143e8iO9ZGCNYBXJRWPQMxNDMvEWIqI2vSz8PG8dD7IzHG/xbSzsGLp2H1cJvPQe0Cz/8qVk77rJ7o1PasDb1mQe1eou8h8RC0mSSSjX9jMSpoYVNw8ob+n4pO72VdRN8GgGsQPH8Qvu1rC29NOSaS0bU9kCBWsiLpsBhaq6q21U49LdZt+CM4+/5xnz9Akb6Ic3pxcZBvyOfto28T7hbOhL0TrOGbkdGRrOm/hrln57I+dj0CApGekSzvvZzU4lSGbRtGQkECbQPa8kmnT/DUeLIudh1HUsX8hBERI5DL5DT3a86upF009mlMu8B2ZJVl0dq/NRmlGRxLP8bwOsMZVXcU57POM2n/JIorihkeMZwnIp/gUs4lhm8bznXdddoGtGV2p9nEF8Tz2ObHrFFGLzV7iZ5hPRm7c6w1oW5D3AZea/Uaa2JsGcUHbhzgdlDJ/nfBx4Jt2ylctgxTYSFujz2G//uzMOfnkzV7DoboaLRtWuP3+uvInZ0pWL1GNAGFheE9ZQoqP19KT54kbfoMZGo1AbNno/L3QxAECtdvwJSfh9ekSRguXULp64tzt65UJCbi+dwEd
D/+hGCxiNFGJ07iOmAAZadPY8rNxalDe8qvJ6LQOuL3zjuY83JRh9Wg9PQpSo8ew23IEIIXzMeUm0v2vM/IePc9nNq1JXDObFAqyZ43j5L9B1DXrInf6zNRBQaS/Myz6M+J3xu3wXenqE51PBSkADClay1OJeZRWmFGIYeXukfzz70JAAAWr0lEQVTQqY4PXz3Rgve3R5NW6VOY3iOCn86lUVxlBxHmpaWwzChRMXVzVKG5zQ7kaHwuK45W6rxkQXR6EZsmtWXQl8cwGMWJWSmXsXlKByavOUdqgRjaGJtZfFszjt5otruPRqUg1MsJsG1b5TIY2zqEg7HZpBWK16zt68yoViEEe2j5ZHcMeSUVjGgRzDPtw3h61RnSCw00DnZj+fg7dIyGtBIn7aoF4Gv3gORj0ogkjbsY1VMVBclitFBVmPRiWGquLZSPimI4+424c7h1zfwEcRWeEQW/zbX1rT8ETOW2BLeCJFE9tecsGyGAuNs4tkCa7wDiRF/d73DjxG0K9MhEZ/q5VdJm9zBx13HrmZs9yZ1AjlySxQugUCigCu+WmcpYE7NGEs8fkx/D2ti1rItdJ2lbfXU1O5N2WvMfTmScYM7pOdT1qMuiC6IWzrH0YzT3bc4HHT5g7I6xFBvFv2Vt99os6bGEZ355xpp/0My3GfO6zGPS/knoTeJ3K/pENO4O7nx29jNrv5MZJ/n09KdklmVKwk6XXVpGTlmOJMP6WsE1fr1hW13/Hpr4/u9Ch7lVymXqNm/GIbwWxYcOoT8r/t11qakIRiOayPpk//vfgFgmU3/xIgGffMyNCc9ZHbrFBw9Sa8d2bk54zlqDAYWC0G9Xor9wkbTplTkqcjmBcz9FMBrJeN0WGODUoQPBi78kecRIq0NY7uZGzS1bSB41EnOO+DcuOXgQ2VdLyF/5rbW6W/m1a1jK9Cjc3Sn4XpQXMaanc3PyFDyeHGclBADd1tuHOf+veGhIYcaGKGvtA7MFjibkML1nHTrW8WbfK9Isv1mDGvD2livojWaCPRz5v771KDIYeebbM+j0RlQKGW8PiKRFmAdbL6aj04t/2FY1PEjJk8ZI6/RGfjiRYiUEECumrTmVYiWEW4i6qaN9uJe15rJMBuP/v70zD4+qOhv478xCSEISskMWEAgmJAExYU2hiFJZlF0RtEJbW1txA0XFr1jx6/epQEVpvypirYgLtthWsGURrPIJsgjIHpagIPuSkAAhDFlO/ziTydxMgAxODJH39zzzcDn3zjvvfTNz33vPu5zuLckvPs8Tf9vsSagY16ct/TKasfqrfHYcOY3Dphj/o2vJSIhg0bieLN5yBIdd0S+zGSGNHPRJj6dPelX/mIff+5K8Y0bP1V8V8MbKr/llrza1N2bjCIhubS7OYO6orx0A5WVmRTQwAd8bJ5nYgvdFOKIFJGbB1vetMiOv8f2c8tKqIrRKTuyuoW5ggQlee1NabF1/oZJGoWBzWnsrVdZC7PVqb5CQZVpx5/27yil1vsdkJ53cZ5yYzWHiI2tfrXrfuULjZLr83PezL0BCaAIHiq2ZOj0Se/DRN1UZMfEh8TUGc4+d9c3t33tqr09B3Objm9lTuMcytuHYBuZsm+NxCGAa2M3YMMNSkPblsS95e/vbHodQydK9Sy3HgUmPDXYEW8Zc5S5LhXQl6VHpNLI1sjiQUEcoxWXW39CglMDf8Z79cqPHIVRS/NkKn+wk186dFL73F0uGjz571rNKm4fycgrnzePMx16OrqKCE6+8gj3MWlFfvHIlJ+cmWzKEKoqKKHjjDY9DqKRo/gKf5T5Pf/oJzljr02jpgQO4cq2L7dQV3wuncK603OcC7F1PUJ3h2Un0SY/n4MkSUpuFYXfn8K968kY27S8iJa6JZx3lZY/0YvG2I0SGOLk5vRm//3g3S3Orfqh2myIzsSlgbUyVGh9GY6fN4ixS4prw9MAM5q3fz/6Cs/TLbE52SxNMvS6pKev2FdAxuSkZCabob
dHDPdlx5DQxTYI8+oQ3dl60rfbpc6Us2HTIMvb26n3+OYXdS6scApg5/SVPmthAmxtNVW/LHFOYlphtUjnzlkJsOzNXH59hnip2/stkEPUYD53ugU3vmSkmMB1Of/CwKYY7utXLcP1N8LfI66LXOMI9feR1Z6Rs5gK+c1HVsSEx5nNCoq1dWHs9YbKR/vEr83kJ18PQmaan08MbTXpt05Ym1gCmZ1Jvd1xhsTVIDJhKaD+cwuGzvsVu5yvOMyZ9DMu+WUZyWDITOk0g2BHMX3f+1VNUlhyWzM8yf8aCPQsodFV9nwe1GcSOgh2eOgaArPgsCl2FlhXTQhwhBDl8V/KrXlsApoiuOu2i2pFbkGv5nOz4bNKj0z0ptAB9Wvbh7vS7Wfj1Qk9PpPTodG5tY6qfX9vyGq4yF6PSRvkE3QHe2PoGWfFZPuP+oJxOy0U4pGsXXHm7LQ3jglJTcURFWtdgDgrC2eoaH3nOGtYlsIWF+aSC6nMubAnVnLnDgSPOt26nUQvf360zKRFH8+ZmIaBKPVu3wZmY6ClyA7CFhxMxdAhFCxZ4Ate2kBAqAhBYrk6dOgWlVD9gBmAH/qS1fr7a/iBgDpAN5AN3aK33BuKzS8t971yKSkppEuTAblNEBDuJCHZy9nwZNqVo7LQT0shB9zbRnC+r4IyrjCZBDmLDgri7W0u01pwqKePeXq3ZuL+QFXknCGlk58Eb2zL0+kSW7zrGwi0m/bFb6yiGZSUSFuzkmQXbOO0qIzMhnHt/2JrgRnZGd78GMOsxFJ0tJSLESWqzMFKbhfno2a65+bF661nJ+bIKzpdX0CSo6s9oxzeF+1CR70Xgomx8x3dspzudMfZa86qogJJCM79/l1fAraTQOIJR78KZYyZjSNlMjOKni0yaquuUCS4HhcOoubDsGTixE9r2NTGEVj+EuXeapwFlh14TTbbQkU2Q+0/zvl6Pm/TRe5fD5vfMRT99qOl51O0+aD8CTh8yjur8GVN7cc9HVa25SwrNOYREQaY7A8d1xqTMOoKqskC+sS7ADpgnCT8orx6fwaSN/uGmPzChs+nh4yp3UV5RzvuD3ufDPR8S7AhmYOuBOOwOZvebzetbXqfQVciwtsPoENuB3/X6Hb9d/Vt2ndxFTkIOD3Z8kJOuk+Tm55rV02wOxmePJzs+m/l58z0X69TIVEanj2blwZUe5xAXEsctrW7BVebi9a2vU1pRSo/EHgxKGUSXhC48v+Z5dhfuJichh3FZ4wh1hhIfEs+KgytoG9mWoSlDKdNlfDD4Az7a9xFhjcK4ueXNFJcW06VZF7o27+o575qcwtYTW33G/CVq4hOcnjuX8uMniBgyhIjBQ2iclsahJyZSdvgwjdq0Ie7xx1B2OyUbN1F66BDK6STu0UcIGzCAM//+t+fJIqxvXyIG3krJhvWcXmziaPaYGJqOuAOAwnerpvOajhpJcGYmZ9et92T+RI2+m4hhwzi95CNPr6LQHj0IHzCA8/v2mTbcWhPUNoWmI+4gtHMXDk2cSPnJkziTkoib8Cj2yEjO79tHyfr12GNiaPb0b2ickUHyzFc4+e5cbCHBRN1zD3uH+1dIWRvqrEuqUsoO7AJ+BBwAvgBGaa23ex0zFuigtf6VUmokMFRrfcfF5NbUJbX4XCkZk32rTzMSwnn5rixCGjm4/90NrP26gLiwIJ4b1p4bUuOY9MEW5q07gNNu474b2vDQTW15a/U+pi7ewRlXGf0zmzF9REe2HSpi/F828U3BWTITw/njnVmcdZXz1PytrNt3kvhwIzM5MoTpS3exdPtRnHYbY29ow897tuZPK75i1v9/RbGrjP6ZzXlhxHU1ygyEnmPfXsfR076xi73P31L7P970DDjlW5hEh5Ew+P9M8PaD+80xSZ3h9tlm/7yfmEyg8CQY8jK06A4LHjRxAmeIqU7OecCkfH76vHkCaX97zTIHvmSyg1bPNM6gUmZcBiyeCNv+7r/My9HTOxPJm8m1L
0Bs/2b7Gsef6vYUI1JH8OqmV83FuLyUIW2HMKnrJFYeWsnkzydzvOQ4neI7Ma3XNEpKS3h0+aPkFuSS1CSJ53o+R1pUGr9e8WuWfbOMUGcoD3R8gLSoNJbvX87cnXMpLS+lb6u+pEelc7j4MEu+XsLxc8fJiM4gIzoDh83B2iNrySvMI6lJEpO6TSI5LJkZG2Z4ZI7LGhcwPb3bgldiw8amMZtqsFDN1Ng6OzqaVq/NwtmiJYcee4wzn36KPSKC2McfI6xnTwreetuzpnLEHSOIGDQI167dHH/xRcrz8wn9wQ+I/uW9VJw5w/EXX8K1ezeNWrUi+p6fgcPJqYULKf7sM2zh4YT374+tcWNKjx41i+xUVBAxdAjBnTtTduQoBbNnU56fT0hODlF33UVFyVnyX53lkRn32ARUSCgFs2dTvHy50XPCo4R27szJ9//GyTlzoKKCpqNGEvOr+yhetYqjzz7r0TNx+guUHT/OwfGP4NrtO4V6xbbOVkp1ByZrrfu6//8kgNb6Oa9jlriPWaWUcgBHgFh9EaVqcgrr9xxh+Gs15473ujaWuLAgS9ZPRLCTJ/unMfHvWyzHzvxxFmPf2WCpP3iiXyrvfbHfspayPzJf/XEW91WT+XjfVP6yru5leuOXU5h8kUV/+k81LSq8i8LShwDamrnTpJmZNqo+/TLyXVM9XCuZmCI2b5k9H4FFj19aZr8psOLFwOrpTQCcgk3ZmN5rOuM+tTbXe6b7M7yw/gXP3T3A0JShHCs5xsqDKz1jyWHJDEsZxowvZwRU5vC2w3lpw0tWmTdMZ9wngZNZnS1jtlxwX3VqdApAUFoaTW7oRf7MqhiQcjpJmDqFg+OtOf0J06Zy+DdPW5bejBx9N+e2bqNkw4Z6kjmVg+PHX5ZMb76tU6jL6aNEwDsadgDoeqFjtNZlSqkiIBqwRGOUUvcC9wK0aNGC6jzw7oWLibYfPsWJM9Z51aKSUp80UIAVefk+BWmbDxT5XGj9kflZ3glfmQf9kLnv8mXWCfvXWi+0YGIM1TlzxNyNVyevhrREf2TuX+s7XpPMA18EXs8AU6Er+PyQbxHU+mPrLRdagB0FO3yCzvtP72dbvrW9dSBkVp/OqdAVfH4wsDLrAteOHdirzeXr0lKKvRrTVVK8Zq11LWbAtT2Xc7nWC6pfMlevqbVMR5w1kKxLSyleteqyZQaSBlHRrLWepbXupLXuFBvrG8BZ9dSF74J7psTQI8Wap58UGcyAzOaWMbtNMTwrkeBq6aE3pMZ6up1W0sMfmdcn+cjs7YfM/v7ITK59j//LJu0WiKnWMqN1b9PC2pvYNEgbYB2zOaHjXebfupaZOiDwMgNMsCOYwSmDsSvr3/LmljeTEGptzdAtoRvdE7pbxtrHtKdnUs8GKbMuCOnWjSY5OZYxW1gY4QMHWlu0YJrN2aOtHYpDcroT2t2quz8yI26tvczQHOuYLSyMiEGDLltmIKnLJ4WDgHe4Pck9VtMxB9zTRxGYgHNAGJaVyNMDMwhy2DhXWs7S7UdpFRvKU7emk9YsnGcGZfDm53sJctp5+KYUrm8RyZ/GdGLakp3kF7u4PTuZEZ2SyWkTw+QF29h26BQ5KdEemSVuma29ZE4emM6cVfsIctp56MYUrm9pZE5dspOCYhe3ZV1cZq30bHlhPZ/5cBvLcq13aiH+ljRPLvKdQopsZZrBZQ4zaw0sesL0Cmrbx1Qfgwn27l5mis36T4GYtqY6et0bJoOo93+ZSuCR78Anz5p01MuRWbjf1BI0DjdZQjXJbD/cdFsNhJ6HN/raxw/W3L6GrvOsD8ntY9rzUNZDdIjtwLRe05i5aSaucpOh0yu5F81CmzHti2nsPbWX3sm9GXvdWM5XnEehWHN4De2i2/Fk1ydJCE3g6NmjzM+bT1TjqCte5ssbX/axjz9TRxci7Ed9iJ80CUdMDOUFJyn654c44+KJe2wCIdnZJEx5nvzXXkOXVxD105/QpHs3kme+w
rEpUzl/8CDhffsS84tfUH76NEf/xzS1C76uwyVlnpg1C9wyQ7t3r5J54ADh/fpdlkxvPS0ya9Dz1MJFlzaOn9RlTMGBCTTfhLn4fwHcqbXe5nXM/UB7r0DzMK31iIvJvZKX4xQEQbhSqfeYgjtG8ACwBJMp+Wet9Tal1H8D67TWC4DXgbeUUnlAATCyrvQRBEEQLk2d1ilorRcCC6uN/cZr+xxwe13qIAiCINSeBhFoFgRBEL4bxCkIgiAIHsQpCIIgCB7EKQiCIAgexCkIgiAIHsQpCIIgCB7EKQiCIAge6qyiua5QSh0H/GtoXz/EUK2xn/CtEHsGDrFlYGko9myptfZtHleNBucUGgpKqXW1KSkXaofYM3CILQPL982eMn0kCIIgeBCnIAiCIHgQp1B3zKpvBb5niD0Dh9gysHyv7CkxBUEQBMGDPCkIgiAIHsQp1BKlVLJS6hOl1Hal1Dal1MPu8Sil1FKl1G73v5HucaWU+r1SKk8ptVkpleUe76iUWuWWsVkpdUd9nld9EUB79lZKbfR6nVNKDanPc/uuuQxbprm/gy6l1IRLybnaCKA9U6t9N08ppcbV13nVGq21vGrxApoDWe7tMMyqcunAVGCie3wiMMW9PQBYBCigG7DGPX4t0Na9nQAcBprW9/k1VHtWkxmFWawppL7P7wq3ZRzQGfhfYMKl5NT3+TVUe1aTaQeOYGoF6v0cL/aSJ4VaorU+rLXe4N4+DeQCicBg4E33YW8ClXepg4E52rAaaKqUaq613qW13u2Wcwg4BlyyoOT7RqDsWU3sbcAirfXZOj+BKwh/bam1Pqa1/gIoraWcq4pA2bMaNwF7tNZXfOGtOIXLQCl1DXA9sAaI11ofdu86AsS7txOB/V5vO0C1H5hSqgvQCNhTh+pe8QTKnpjlXOfWmaINgFra0l85Vy2BsicN6LspTsFPlFJNgL8B47TWp7z3afOcWKt0Lvdd7lvAT7XWFQFXtIEQYHu2x6wJflUSQFteUM7VRADt2QgYBMwLuJJ1gDgFP1BKOTFfkne01n93Dx+tnMZw/3vMPX4QSPZ6e5J7DKVUOPAv4NfuqZCrkkDZ080I4B9a64s9wn9v8dOW/sq56giUPd30BzZorY8GXtPAI06hliilFPA6kKu1nu61awEwxr09BpjvNT7anTXTDSjSWh923zX8AzM//v53pP4VR6Ds6fW+UTSQx/NAcxm29FfOVUWg7OlFw/pu1neku6G8gB6Yx8XNwEb3awAQDXwM7AaWAVHu4xXwR0y8YAvQyT3+Y0xAaqPXq2N9n19Dtad73zWYpwZbfZ9XA7FlM0xM5hRQ6N4Ov5Cc+j6/hmpP975QIB+IqO/zqu1LKpoFQRAEDzJ9JAiCIHgQpyAIgiB4EKcgCIIgeBCnIAiCIHgQpyAIgiB4EKcgCBdBKXXmEvuvUUpt9VPmbKXUbd9OM0GoG8QpCIIgCB7EKQhCLVBKNVFKfayU2qCU2qKUGuy126GUekcplauUel8pFeJ+T7ZSarlSar1SakkNXV0F4YpDnIIg1I5zwFCtdRbQG3jB3Q4BIBV4WWvdDlPVOtbdO+cPwG1a62zgz5h++4JwReOobwUEoYGggGeVUj8EKjBtuytbJ+/XWq90b78NPAQsBjKBpW7fYccsqCQIVzTiFAShdtyFWQwpW2tdqpTaCzR276veK0ZjnMg2rXX3705FQfj2yPSRINSOCOCY2yH0Blp67WuhlKq8+N8JrAB2ArGV40opp1Iq4zvVWBAuA3EKglA73gE6KaW2AKOBHV77dgL3K6VygUjgFa31eczyoFOUUpswnTZzvmOdBcFvpEuqIAiC4EGeFARBEAQP4hQEQRAED+IUBEEQBA/iFARBEAQP4hQEQRAED+IUBEEQBA/iFARBEAQP4hQEQRAED/8BOMxP15knfu0AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 2\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 3\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 4\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 5\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 6\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 7\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 8\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 9\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic 10\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "for i in range(0, model.num_topics()):\n", + " print(\"Topic {}\".format(i + 1))\n", + " sns.swarmplot(data=df, x='label', y=\"Topic {}\".format(i + 1))\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Topic Inference (Unseen Document)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try to figure out what topics are mentioned in a previously unseen document." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scan Order in Gibbs Sampling: Models in Which it\n", + "Matters and Bounds on How Much\n", + "Bryan He, Christopher De Sa, Ioannis Mitliagkas, and Christopher RĂ©\n", + "Stanford University\n", + "{bryanhe,cdesa,imit,chrismre}@stanford.edu\n", + "\n", + "Abstract\n", + "Gibbs sampling is a Markov Chain Monte Carlo sampling technique that iteratively\n", + "samples variables from their conditional distributions. There are two common scan\n", + "orders for the variables: random scan and systematic scan. Due to the benefits\n", + "of locality in hardware, systematic s...\n" + ] + } + ], + "source": [ + "doc = metapy.index.Document()\n", + "with open('data/6589-scan-order-in-gibbs-sampling-models-in-which-it-matters-and-bounds-on-how-much.txt') as f:\n", + " doc.content(f.read())\n", + "print(\"{}...\".format(doc.content()[0:500]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We first need to transform the unseen document into the same term-id space used by the topic model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "dvec = fidx.tokenize(doc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "...and then we can create an inferencer on top of our topic model output to infer the topic coverage for this new document:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r", + " > Loading topic term probabilities: [==> ] 10% ETA 00:00:00 \r", + " > Loading topic term probabilities: [=======================] 100% ETA 00:00:00 \n" + ] + } + ], + "source": [ + "inferencer = metapy.topics.GibbsInferencer('lda-pgibbs-nips.phi.bin', alpha=0.1)\n", + "props = inferencer.infer(dvec, max_iters=100, rng_seed=42)\n", + "print(props)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification with Topic Features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The topic proportion vectors are also often used as input to a classifier. In our case, since we see some differences between the years 2002 and 2017 in terms of topical coverage, let's see if we can learn to separate documents that were written in 2002 from documents that were written in 2017 on the basis of their topic proportions alone." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# First, create a lightweight view for shuffling\n", + "shuffled_view = metapy.classify.MulticlassDatasetView(dset)\n", + "shuffled_view.shuffle()\n", + "\n", + "# this dataset will use unigram words as features\n", + "words_dset = metapy.classify.MulticlassDataset(\n", + " [doc for doc in shuffled_view if dset.label(doc) == \"2002\" or dset.label(doc) == \"2017\"],\n", + " dset.total_features(),\n", + " lambda doc: metapy.learn.FeatureVector(doc.weights),\n", + " lambda doc: dset.label(doc)\n", + ")\n", + "\n", + "# this dataset will use topic proportions as features\n", + "topic_dset = metapy.classify.MulticlassDataset(\n", + " [doc for doc in shuffled_view if dset.label(doc) == \"2002\" or dset.label(doc) == \"2017\"],\n", + " model.num_topics(),\n", + " lambda doc: metapy.learn.FeatureVector((i, model.topic_probability(doc.id, i)) for i in range(0, model.num_topics())),\n", + " lambda doc: dset.label(doc)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll use a 50/50 training/test split setup." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "words_train = words_dset[0:int(len(words_dset)/2)]\n", + "words_test = words_dset[int(len(words_dset)/2):]\n", + "\n", + "topics_train = topic_dset[0:int(len(topic_dset)/2)]\n", + "topics_test = topic_dset[int(len(topic_dset)/2):]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Words:\n", + "\n", + " 2002 2017 \n", + " ------------------\n", + " 2002 | \u001b[1m0.883\u001b[22m 0.117 \n", + " 2017 | 0.0392 \u001b[1m0.961\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "2002 0.883 0.883 0.883 0.251 \n", + "2017 0.961 0.961 0.961 0.749 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.941\u001b[22m \u001b[1m0.941\u001b[22m \u001b[1m0.941\u001b[22m \n", + "------------------------------------------------------------\n", + "443 predictions attempted, overall accuracy: 0.941\n", + "\n", + "======\n", + "Topics:\n", + "\n", + " 2002 2017 \n", + " ------------------\n", + " 2002 | \u001b[1m0.613\u001b[22m 0.387 \n", + " 2017 | 0.0753 \u001b[1m0.925\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "2002 0.667 0.731 0.613 0.251 \n", + "2017 0.9 0.877 0.925 0.749 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m 
\u001b[1m0.844\u001b[22m \u001b[1m0.841\u001b[22m \u001b[1m0.847\u001b[22m \n", + "------------------------------------------------------------\n", + "443 predictions attempted, overall accuracy: 0.847\n", + "\n" + ] + } + ], + "source": [ + "def make_linear_svm(training):\n", + " return metapy.classify.OneVsAll(training, metapy.classify.SGD, loss_id='hinge')\n", + "\n", + "words_sgd = make_linear_svm(words_train)\n", + "topics_sgd = make_linear_svm(topics_train)\n", + "\n", + "print(\"Words:\")\n", + "mtrx = words_sgd.test(words_test)\n", + "print(mtrx)\n", + "mtrx.print_stats()\n", + "\n", + "print(\"======\")\n", + "print(\"Topics:\")\n", + "mtrx = topics_sgd.test(topics_test)\n", + "print(mtrx)\n", + "mtrx.print_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While we don't beat unigram words, we still do very well for a model that is only using 10 features compared to the tens of thousands used by the words model:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "66479" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fidx.unique_terms()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also try a straight multiclass classification problem: given a document, predict the year from the topic proportions alone." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "topic_dset = metapy.classify.MulticlassDataset(\n", + " [doc for doc in shuffled_view],\n", + " model.num_topics(),\n", + " lambda doc: metapy.learn.FeatureVector((i, model.topic_probability(doc.id, i)) for i in range(0, model.num_topics())),\n", + " lambda doc: dset.label(doc)\n", + ")\n", + "\n", + "words_train = shuffled_view[0:int(len(shuffled_view)/2)]\n", + "words_test = shuffled_view[int(len(shuffled_view)/2):]\n", + "\n", + "topics_train = topic_dset[0:int(len(topic_dset)/2)]\n", + "topics_test = topic_dset[int(len(topic_dset)/2):]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Words:\n", + "\n", + " 2002 2007 2012 2017 \n", + " ------------------------------------\n", + " 2002 | \u001b[1m0.667\u001b[22m 0.176 0.0926 0.0648 \n", + " 2007 | 0.314 \u001b[1m0.195\u001b[22m 0.398 0.0932 \n", + " 2012 | 0.107 0.16 \u001b[1m0.487\u001b[22m 0.246 \n", + " 2017 | 0.0217 0.00619 0.0774 \u001b[1m0.895\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "2002 0.59 0.529 0.667 0.147 \n", + "2007 0.24 0.311 0.195 0.16 \n", + "2012 0.506 0.526 0.487 0.254 \n", + "2017 0.855 0.819 0.895 0.439 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.633\u001b[22m \u001b[1m0.62\u001b[22m \u001b[1m0.645\u001b[22m \n", + "------------------------------------------------------------\n", + "736 predictions attempted, overall accuracy: 0.645\n", + "\n", + "========\n", + "Topics:\n", + "\n", + " 2002 2007 2012 2017 \n", + " 
------------------------------------\n", + " 2002 | \u001b[1m0.102\u001b[22m 0.148 0.352 0.398 \n", + " 2007 | 0.161 \u001b[1m0.153\u001b[22m 0.297 0.39 \n", + " 2012 | 0.0695 0.107 \u001b[1m0.203\u001b[22m 0.62 \n", + " 2017 | 0.0031 0.031 0.0495 \u001b[1m0.916\u001b[22m \n", + "\n", + "\n", + "------------------------------------------------------------\n", + "\u001b[1mClass\u001b[22m \u001b[1mF1 Score\u001b[22m \u001b[1mPrecision\u001b[22m \u001b[1mRecall\u001b[22m \u001b[1mClass Dist\u001b[22m \n", + "------------------------------------------------------------\n", + "2002 0.145 0.25 0.102 0.147 \n", + "2007 0.198 0.281 0.153 0.16 \n", + "2012 0.242 0.299 0.203 0.254 \n", + "2017 0.718 0.591 0.916 0.439 \n", + "------------------------------------------------------------\n", + "\u001b[1mTotal\u001b[22m \u001b[1m0.452\u001b[22m \u001b[1m0.417\u001b[22m \u001b[1m0.493\u001b[22m \n", + "------------------------------------------------------------\n", + "736 predictions attempted, overall accuracy: 0.493\n", + "\n" + ] + } + ], + "source": [ + "words_svm = make_linear_svm(words_train)\n", + "topics_svm = make_linear_svm(topics_train)\n", + "\n", + "words_mtrx = words_svm.test(words_test)\n", + "topics_mtrx = topics_svm.test(topics_test)\n", + "\n", + "print(\"Words:\")\n", + "print(words_mtrx)\n", + "words_mtrx.print_stats()\n", + "\n", + "print(\"========\")\n", + "print(\"Topics:\")\n", + "print(topics_mtrx)\n", + "topics_mtrx.print_stats()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is quite a bit harder!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}