From f9cffa229fdea34de3da05a4e4d67717e3a59f40 Mon Sep 17 00:00:00 2001 From: Ernest W Durbin III Date: Sun, 1 Mar 2020 12:23:17 -0500 Subject: [PATCH] init --- linehaul/__init__.py | 11 ++ linehaul/events/__init__.py | 11 ++ linehaul/events/parser.py | 235 +++++++++++++++++++++++++ linehaul/logging.py | 19 +++ linehaul/ua/__init__.py | 16 ++ linehaul/ua/datastructures.py | 66 +++++++ linehaul/ua/impl.py | 178 +++++++++++++++++++ linehaul/ua/parser.py | 312 ++++++++++++++++++++++++++++++++++ main.py | 71 ++++++++ requirements.txt | 4 + 10 files changed, 923 insertions(+) create mode 100644 linehaul/__init__.py create mode 100644 linehaul/events/__init__.py create mode 100644 linehaul/events/parser.py create mode 100644 linehaul/logging.py create mode 100644 linehaul/ua/__init__.py create mode 100644 linehaul/ua/datastructures.py create mode 100644 linehaul/ua/impl.py create mode 100644 linehaul/ua/parser.py create mode 100644 main.py create mode 100644 requirements.txt diff --git a/linehaul/__init__.py b/linehaul/__init__.py new file mode 100644 index 0000000..164f68b --- /dev/null +++ b/linehaul/__init__.py @@ -0,0 +1,11 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/linehaul/events/__init__.py b/linehaul/events/__init__.py new file mode 100644 index 0000000..164f68b --- /dev/null +++ b/linehaul/events/__init__.py @@ -0,0 +1,11 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/linehaul/events/parser.py b/linehaul/events/parser.py new file mode 100644 index 0000000..f4ea876 --- /dev/null +++ b/linehaul/events/parser.py @@ -0,0 +1,235 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import enum
import logging
import posixpath

from typing import Optional

import arrow
import attr
import attr.validators
import cattr

from pyparsing import Literal as L, Word, Optional as OptionalItem
from pyparsing import printables as _printables, restOfLine
from pyparsing import ParseException

from linehaul.ua import UserAgent, parser as user_agents


logger = logging.getLogger(__name__)


# Private cattr converter used to structure parsed events. The hook below tells
# it how to build an ``arrow.Arrow`` from the raw timestamp string: the first
# five and last four characters are sliced off before parsing with the
# "DD MMM YYYY HH:mm:ss" format.
# NOTE(review): presumably the slice drops a leading weekday prefix such as
# "Sat, " and a trailing " GMT" suffix -- confirm against real log lines.
_cattr = cattr.Converter()
_cattr.register_structure_hook(
    arrow.Arrow, lambda d, t: arrow.get(d[5:-4], "DD MMM YYYY HH:mm:ss")
)


class UnparseableEvent(Exception):
    """Raised when a log line does not match any known message grammar."""

    pass


class NullValue:
    """Marker type for fields whose logged value is the literal ``(null)``."""

    pass


# Deliberately rebinds the class name to a single instance, so the rest of the
# module can compare against it by identity (``value is NullValue``).
NullValue = NullValue()


# Characters allowed inside a single field: pyparsing's printables plus space
# and tab, minus the "|" and "@" characters used as delimiters below.
printables = "".join(set(_printables + " " + "\t") - {"|", "@"})

# Field separator; suppressed so it does not appear in the parse results.
PIPE = L("|").suppress()

# Separator following the numeric version header; also suppressed.
AT = L("@").suppress()

# The literal "(null)" is replaced in the results by the NullValue sentinel.
NULL = L("(null)")
NULL.setParseAction(lambda s, l, t: NullValue)

# Each field below follows the same three-step pattern. The rebinding matters:
# ``setResultsName`` returns a copy of the element, while ``setName`` labels
# that copy in place.
TIMESTAMP = Word(printables)
TIMESTAMP = TIMESTAMP.setResultsName("timestamp")
TIMESTAMP.setName("Timestamp")

COUNTRY_CODE = Word(printables)
COUNTRY_CODE = COUNTRY_CODE.setResultsName("country_code")
COUNTRY_CODE.setName("Country Code")

URL = Word(printables)
URL = URL.setResultsName("url")
URL.setName("URL")

# timestamp | [country code] | url -- the country code may be empty.
REQUEST = TIMESTAMP + PIPE + OptionalItem(COUNTRY_CODE) + PIPE + URL

# NULL is tried first, so a literal "(null)" becomes the sentinel rather than
# an ordinary word.
PROJECT_NAME = NULL | Word(printables)
PROJECT_NAME = PROJECT_NAME.setResultsName("project_name")
PROJECT_NAME.setName("Project Name")

VERSION = NULL | Word(printables)
VERSION = VERSION.setResultsName("version")
VERSION.setName("Version")

# Either "(null)" or one of a closed set of distribution type literals.
PACKAGE_TYPE = NULL | (
    L("sdist")
    | L("bdist_wheel")
    | L("bdist_dmg")
    | L("bdist_dumb")
    | L("bdist_egg")
    | L("bdist_msi")
    | L("bdist_rpm")
    | L("bdist_wininst")
)
PACKAGE_TYPE = PACKAGE_TYPE.setResultsName("package_type")
PACKAGE_TYPE.setName("Package Type")

# project name | version | package type
PROJECT = PROJECT_NAME + PIPE + VERSION + PIPE + PACKAGE_TYPE

TLS_PROTOCOL = NULL | Word(printables)
TLS_PROTOCOL = TLS_PROTOCOL.setResultsName("tls_protocol")
TLS_PROTOCOL.setName("TLS Protocol")

+TLS_CIPHER = NULL | Word(printables) +TLS_CIPHER = TLS_CIPHER.setResultsName("tls_cipher") +TLS_CIPHER.setName("TLS Cipher") + +TLS = TLS_PROTOCOL + PIPE + TLS_CIPHER + +USER_AGENT = restOfLine +USER_AGENT = USER_AGENT.setResultsName("user_agent") +USER_AGENT.setName("UserAgent") + +V1_HEADER = OptionalItem(L("1").suppress() + AT) + +MESSAGE_v1 = V1_HEADER + REQUEST + PIPE + PROJECT + PIPE + USER_AGENT +MESSAGE_v1.leaveWhitespace() + +V2_HEADER = L("2").suppress() + AT + +MESSAGE_v2 = V2_HEADER + REQUEST + PIPE + TLS + PIPE + PROJECT + PIPE + USER_AGENT +MESSAGE_v2.leaveWhitespace() + +V3_HEADER = L("download") +MESSAGE_v3 = V3_HEADER + PIPE + REQUEST + PIPE + TLS + PIPE + PROJECT + PIPE + USER_AGENT + +SIMPLE_HEADER = L("simple") +MESSAGE_SIMPLE = SIMPLE_HEADER + PIPE + REQUEST + PIPE + TLS + PIPE + PIPE + PIPE + PIPE + USER_AGENT + +MESSAGE = MESSAGE_SIMPLE | MESSAGE_v3 | MESSAGE_v2 | MESSAGE_v1 + + +@enum.unique +class PackageType(enum.Enum): + bdist_dmg = "bdist_dmg" + bdist_dumb = "bdist_dumb" + bdist_egg = "bdist_egg" + bdist_msi = "bdist_msi" + bdist_rpm = "bdist_rpm" + bdist_wheel = "bdist_wheel" + bdist_wininst = "bdist_wininst" + sdist = "sdist" + + +@attr.s(slots=True, frozen=True) +class File: + + filename = attr.ib(validator=attr.validators.instance_of(str)) + project = attr.ib(validator=attr.validators.instance_of(str)) + version = attr.ib(validator=attr.validators.instance_of(str)) + type = attr.ib(type=PackageType) + + +@attr.s(slots=True, frozen=True) +class Download: + + timestamp = attr.ib(type=arrow.Arrow) + url = attr.ib(validator=attr.validators.instance_of(str)) + file = attr.ib(type=File) + tls_protocol = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(str)), + ) + tls_cipher = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(str)), + ) + country_code = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(str)), + ) + 
details = attr.ib(type=Optional[UserAgent], default=None) + + +@attr.s(slots=True, frozen=True) +class Simple: + + timestamp = attr.ib(type=arrow.Arrow) + url = attr.ib(validator=attr.validators.instance_of(str)) + project = attr.ib(validator=attr.validators.instance_of(str)) + tls_protocol = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(str)), + ) + tls_cipher = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(str)), + ) + country_code = attr.ib( + default=None, + validator=attr.validators.optional(attr.validators.instance_of(str)), + ) + details = attr.ib(type=Optional[UserAgent], default=None) + + +def _value_or_none(value): + if value is NullValue or value == "": + return None + else: + return value + + +def parse(message): + try: + parsed = MESSAGE.parseString(message, parseAll=True) + except ParseException as exc: + raise UnparseableEvent("{!r} {}".format(message, exc)) from None + + data = {} + data["timestamp"] = parsed.timestamp + data["tls_protocol"] = _value_or_none(parsed.tls_protocol) + data["tls_cipher"] = _value_or_none(parsed.tls_cipher) + data["country_code"] = _value_or_none(parsed.country_code) + data["url"] = parsed.url + data["file"] = {} + data["file"]["filename"] = posixpath.basename(parsed.url) + data["file"]["project"] = _value_or_none(parsed.project_name) + data["file"]["version"] = _value_or_none(parsed.version) + data["file"]["type"] = _value_or_none(parsed.package_type) + + if parsed[0] == 'download': + result = _cattr.structure(data, Download) + elif parsed[0] == 'simple': + data["project"] = parsed.url.split('/')[2] + result = _cattr.structure(data, Simple) + else: + result = _cattr.structure(data, Download) + + try: + ua = user_agents.parse(parsed.user_agent) + if ua is None: + return # Ignored user agents mean we'll skip trying to log this event + except user_agents.UnknownUserAgentError: + print(f"Unknown User agent: {parsed.user_agent}") + else: + 
result = attr.evolve(result, details=ua) + + return result diff --git a/linehaul/logging.py b/linehaul/logging.py new file mode 100644 index 0000000..377a5ed --- /dev/null +++ b/linehaul/logging.py @@ -0,0 +1,19 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging as _logging + + +SPEW = 5 + + +_logging.addLevelName(SPEW, "SPEW") diff --git a/linehaul/ua/__init__.py b/linehaul/ua/__init__.py new file mode 100644 index 0000000..7beff50 --- /dev/null +++ b/linehaul/ua/__init__.py @@ -0,0 +1,16 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from linehaul.ua.datastructures import UserAgent + + +__all__ = ["UserAgent"] diff --git a/linehaul/ua/datastructures.py b/linehaul/ua/datastructures.py new file mode 100644 index 0000000..6314428 --- /dev/null +++ b/linehaul/ua/datastructures.py @@ -0,0 +1,66 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import attr + + +@attr.s(slots=True, frozen=True) +class Installer: + + name = attr.ib(type=Optional[str], default=None) + version = attr.ib(type=Optional[str], default=None) + + +@attr.s(slots=True, frozen=True) +class Implementation: + + name = attr.ib(type=Optional[str], default=None) + version = attr.ib(type=Optional[str], default=None) + + +@attr.s(slots=True, frozen=True) +class LibC: + + lib = attr.ib(type=Optional[str], default=None) + version = attr.ib(type=Optional[str], default=None) + + +@attr.s(slots=True, frozen=True) +class Distro: + + name = attr.ib(type=Optional[str], default=None) + version = attr.ib(type=Optional[str], default=None) + id = attr.ib(type=Optional[str], default=None) + libc = attr.ib(type=Optional[LibC], default=None) + + +@attr.s(slots=True, frozen=True) +class System: + + name = attr.ib(type=Optional[str], default=None) + release = attr.ib(type=Optional[str], default=None) + + +@attr.s(slots=True, frozen=True) +class UserAgent: + + installer = attr.ib(type=Optional[Installer], default=None) + python = attr.ib(type=Optional[str], default=None) + implementation = attr.ib(type=Optional[Implementation], default=None) + distro = attr.ib(type=Optional[Distro], default=None) + system = attr.ib(type=Optional[System], default=None) + cpu = attr.ib(type=Optional[str], default=None) + openssl_version = attr.ib(type=Optional[str], default=None) + setuptools_version = attr.ib(type=Optional[str], default=None) + ci = attr.ib(type=Optional[bool], default=None) diff --git a/linehaul/ua/impl.py 
b/linehaul/ua/impl.py new file mode 100644 index 0000000..2af8c32 --- /dev/null +++ b/linehaul/ua/impl.py @@ -0,0 +1,178 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +import collections +import logging +import random +import re + + +logger = logging.getLogger(__name__) + + +class UnableToParse(Exception): + pass + + +class UserAgentParser(metaclass=abc.ABCMeta): + @property + @abc.abstractmethod + def name(self): + """ + Returns the name of this parser, useful for things like logging etc. + """ + + @abc.abstractmethod + def __call__(self, ua): + """ + Actually parses the user agent, returning a dictionary containing all of the + relevant parsed information. If this method is unable to parse the user agent + then it can raise a ``UnableToParse`` exception to indicate that it can't parse + the given UA. 
+ """ + + +class CallbackUserAgentParser(UserAgentParser): + def __init__(self, callback, *, name=None): + if name is None: + name = callback.__name__ + + self._callback = callback + self._name = name + + @property + def name(self): + return self._name + + def __call__(self, ua): + return self._callback(ua) + + +def ua_parser(fn): + return CallbackUserAgentParser(fn) + + +class RegexUserAgentParser(UserAgentParser): + def __init__(self, regexes, handler, *, name=None): + if name is None: + name = handler.__name__ + + self._regexes = [ + re.compile(regex) if isinstance(regex, str) else regex for regex in regexes + ] + self._handler = handler + self._name = name + + @property + def name(self): + return self._name + + def __call__(self, user_agent): + for regex in self._regexes: + matched = regex.search(user_agent) + + # If we've matched this particuar regex, then we'll break the loop here and + # go onto finishing parsing. + if matched is not None: + break + else: + # None of our regexes matched. + raise UnableToParse + + # We need to build up the args, and kwargs of our function, we call any unnamed + # group an arg, and pass them in, in order, and we call any named group a kwarg + # and we pass them in by name. + group_to_name = {v: k for k, v in matched.re.groupindex.items()} + args, kwargs = [], {} + for i, value in enumerate(matched.groups(), start=1): + name = group_to_name.get(i) + if name is not None: + kwargs[name] = value + else: + args.append(value) + + # Finally, we'll call our handler with our parsed arguments, and return whatever + # result it gives us. + return self._handler(*args, **kwargs) + + +def regex_ua_parser(*regexes): + def deco(fn): + return RegexUserAgentParser(regexes, fn) + + return deco + + +class ParserSet: + def __init__(self): + self._parsers = [] + + self._optimize_every = 1000000 + # Set the first optimize in to a reduced amount to get some basic optimization + # done early. 
+ self._optimize_in = self._optimize_every * 0.25 + self._counts = collections.Counter() + + def register(self, parser, *, _randomize=True): + self._parsers.append(parser) + + # The use of random.shuffle here is a bit quirkly, it doesn't actually help us + # at runtime in any way. What it *does* do, is make it more likely that any + # ordering dependence in registered parsers shows up as test failures instead + # of being hard to find bugs in production. + # This does make registering a parser more heavy-weight than recorded (through + # minorly so), but this shouldn't matter since in our usage registerin is only + # done at the module level anyways. + if _randomize: + random.shuffle(self._parsers) + + return parser + + def _optimize(self): + # We're going to sort our list in place, using the value of how many times + # a parser function has been used as the parser for a user agent to put the + # most commonly used parsed first. + self._parsers.sort(key=lambda p: self._counts[p], reverse=True) + + # Reduce our recorded counts just to keep the size of our counts in checks. + # This will also implicitly act as a decay so that historical data is less + # relevant than new data. + self._counts.subtract({k: int(v * 0.5) for k, v in self._counts.items()}) + + # Reset our marker + self._optimize_in = self._optimize_every + + def __call__(self, user_agent): + # Decrement our counter for how long until we will implicitly call optimize + # on our ParserSet, and check to see if it's time to optimize or not. + self._optimize_in -= 1 + if self._optimize_in <= 0: + self._optimize() + + # Actually go through our registered parsers and try to use them to parse. + for parser in self._parsers: + try: + parsed = parser(user_agent) + + # Record a "hit" for this parser. 
+ self._counts[parser] += 1 + + return parsed + except UnableToParse: + pass + except Exception: + logger.error( + "Error parsing %r as a %s.", user_agent, parser.name, exc_info=True + ) + + raise UnableToParse diff --git a/linehaul/ua/parser.py b/linehaul/ua/parser.py new file mode 100644 index 0000000..f220614 --- /dev/null +++ b/linehaul/ua/parser.py @@ -0,0 +1,312 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import re + +import cattr +import packaging.version + +from packaging.specifiers import SpecifierSet + +from linehaul.ua.datastructures import UserAgent +from linehaul.ua.impl import ParserSet, UnableToParse, ua_parser, regex_ua_parser + + +logger = logging.getLogger(__name__) + + +class UnknownUserAgentError(ValueError): + pass + + +# Note: This is a ParserSet, not a ParserList, parsers that have been registered with +# it may be called in any order. That means that all of our parsers need to be +# ordering independent. +_parser = ParserSet() + + +@_parser.register +@ua_parser +def Pip6UserAgent(user_agent): + # We're only concerned about pip user agents. + if not user_agent.startswith("pip/"): + raise UnableToParse + + # This format was brand new in pip 6.0, so we'll need to restrict it + # to only versions of pip newer than that. 
+ version_str = user_agent.split()[0].split("/", 1)[1] + version = packaging.version.parse(version_str) + if version not in SpecifierSet(">=6", prereleases=True): + raise UnableToParse + + try: + return json.loads(user_agent.split(maxsplit=1)[1]) + except (json.JSONDecodeError, UnicodeDecodeError, IndexError): + raise UnableToParse from None + + +@_parser.register +@regex_ua_parser( + ( + r"^pip/(?P\S+) (?P\S+)/(?P\S+) " + r"(?P\S+)/(?P\S+)$" + ) +) +def Pip1_4UserAgent(*, version, impl_name, impl_version, system_name, system_release): + # This format was brand new in pip 1.4, and went away in pip 6.0, so + # we'll need to restrict it to only versions of pip between 1.4 and 6.0. + if version not in SpecifierSet(">=1.4,<6", prereleases=True): + raise UnableToParse + + data = {"installer": {"name": "pip", "version": version}} + + if impl_name.lower() != "unknown": + data.setdefault("implementation", {})["name"] = impl_name + + if impl_version.lower() != "unknown": + data.setdefault("implementation", {})["version"] = impl_version + + if system_name.lower() != "unknown": + data.setdefault("system", {})["name"] = system_name + + if system_release.lower() != "unknown": + data.setdefault("system", {})["release"] = system_release + + if impl_name.lower() == "cpython": + data["python"] = impl_version + + return data + + +@_parser.register +@regex_ua_parser(r"^Python-urllib/(?P\d\.\d) distribute/(?P\S+)$") +def DistributeUserAgent(*, python, version): + return {"installer": {"name": "distribute", "version": version}, "python": python} + + +@_parser.register +@regex_ua_parser( + r"^Python-urllib/(?P\d\.\d) setuptools/(?P\S+)$", + r"^setuptools/(?P\S+) Python-urllib/(?P\d\.\d)$", +) +def SetuptoolsUserAgent(*, python, version): + return {"installer": {"name": "setuptools", "version": version}, "python": python} + + +@_parser.register +@regex_ua_parser(r"pex/(?P\S+)$") +def PexUserAgent(*, version): + return {"installer": {"name": "pex", "version": version}} + + 
# NOTE: every pattern below uses a *named* group. RegexUserAgentParser passes
# named groups to the handler as keyword arguments, so each group name must
# match the handler's keyword-only parameter exactly.


@_parser.register
@regex_ua_parser(r"^conda/(?P<version>\S+)(?: .+)?$")
def CondaUserAgent(*, version):
    """Handle ``conda/<version>``, optionally followed by further text."""
    return {"installer": {"name": "conda", "version": version}}


@_parser.register
@regex_ua_parser(r"^Bazel/(?:release\s+)?(?P<version>.+)$")
def BazelUserAgent(*, version):
    """Handle ``Bazel/<version>``, skipping an optional ``release`` prefix."""
    return {"installer": {"name": "Bazel", "version": version}}


@_parser.register
@regex_ua_parser(r"^bandersnatch/(?P<version>\S+) \(.+\)$")
def BandersnatchUserAgent(*, version):
    """Handle ``bandersnatch/<version> (<anything>)``."""
    return {"installer": {"name": "bandersnatch", "version": version}}


@_parser.register
@regex_ua_parser(r"devpi-server/(?P<version>\S+) \(.+\)$")
def DevPIUserAgent(*, version):
    """Handle ``devpi-server/<version> (<anything>)``.

    The pattern is not anchored at the start, so the token may be preceded by
    other text.
    """
    return {"installer": {"name": "devpi", "version": version}}


@_parser.register
@regex_ua_parser(r"^z3c\.pypimirror/(?P<version>\S+)$")
def Z3CPyPIMirrorUserAgent(*, version):
    """Handle ``z3c.pypimirror/<version>``."""
    return {"installer": {"name": "z3c.pypimirror", "version": version}}


@_parser.register
@regex_ua_parser(r"^Artifactory/(?P<version>\S+)$")
def ArtifactoryUserAgent(*, version):
    """Handle ``Artifactory/<version>``."""
    return {"installer": {"name": "Artifactory", "version": version}}


@_parser.register
@regex_ua_parser(r"^Nexus/(?P<version>\S+)")
def NexusUserAgent(*, version):
    """Handle ``Nexus/<version>``; trailing text after the version is allowed."""
    return {"installer": {"name": "Nexus", "version": version}}


@_parser.register
@regex_ua_parser(r"^pep381client(?:-proxy)?/(?P<version>\S+)$")
def PEP381ClientUserAgent(*, version):
    """Handle ``pep381client/<version>`` and ``pep381client-proxy/<version>``."""
    return {"installer": {"name": "pep381client", "version": version}}


# TODO: We should probably consider not parsing this specially, and moving it to
#       just the same as we treat browsers, since we don't really know anything
#       about it-- including whether or not the version of Python mentioned is
#       the one they're going to install it into or not. The one real sticking
#       point is that before pip 1.4, pip just used the default urllib2 UA, so
#       right now we're counting pip 1.4 in here... but pip 1.4 usage is probably
#       low enough not to worry about that any more.
# NOTE: group names in the patterns below must match the keyword-only
# parameters of the decorated handler, because RegexUserAgentParser passes
# named groups to the handler as keyword arguments.
@_parser.register
@regex_ua_parser(r"^Python-urllib/(?P<python>\d\.\d)$")
def URLLib2UserAgent(*, python):
    """Handle a bare ``Python-urllib/<X.Y>`` user agent."""
    return {"python": python}


# TODO: We should probably consider not parsing this specially, and moving it to
#       just the same as we treat browsers, since we don't really know anything
#       about it and the version of requests isn't very useful in general.
@_parser.register
@regex_ua_parser(r"^python-requests/(?P<version>\S+)(?: .+)?$")
def RequestsUserAgent(*, version):
    """Handle ``python-requests/<version>``, optionally followed by more text."""
    return {"installer": {"name": "requests", "version": version}}


@_parser.register
@regex_ua_parser(
    (
        r"^Homebrew/(?P<version>\S+) "
        r"\(Macintosh; Intel (?:Mac OS X|macOS) (?P<osx_version>[^)]+)\)(?: .+)?$"
    )
)
def HomebrewUserAgent(*, version, osx_version):
    """Handle ``Homebrew/<version> (Macintosh; Intel Mac OS X|macOS <ver>)``.

    The operating system version from the parenthesised section is reported
    as the distro version.
    """
    return {
        "installer": {"name": "Homebrew", "version": version},
        "distro": {"name": "OS X", "version": osx_version},
    }


# TODO: It would be nice to maybe break more of these apart to try and get more insight
#       into the OSs that people are installing packages into (similar to Homebrew).
@_parser.register
@regex_ua_parser(
    re.compile(
        r"""
        (?:
            ^fetch\ libfetch/\S+$ |
            ^libfetch/\S+$ |
            ^OpenBSD\ ftp$ |
            ^MacPorts/? |
            ^NetBSD-ftp/ |
            ^slapt-get |
            ^pypi-install/ |
            ^slackrepo$ |
            ^PTXdist |
            ^GARstow/ |
            ^xbps/
        )
        """,
        re.VERBOSE,
    )
)
def OSUserAgent():
    """Group a fixed list of fetch/packaging tool user agents as ``OS``.

    The pattern has no capture groups, so the handler takes no arguments.
    """
    return {"installer": {"name": "OS"}}


@_parser.register
@regex_ua_parser(
    re.compile(
        r"""
        ^
        (?:
            Mozilla |
            Safari |
            wget |
            curl |
            Opera |
            aria2 |
            AndroidDownloadManager |
            com\.apple\.WebKit\.Networking/ |
            FDM\ \S+ |
            URL/Emacs |
            Firefox/ |
            UCWEB |
            Links |
            ^okhttp |
            ^Apache-HttpClient
        )
        (?:/|$)
        """,
        re.IGNORECASE | re.VERBOSE,
    )
)
def BrowserUserAgent():
    """Group browser and generic HTTP client user agents as ``Browser``.

    Matching is case-insensitive; the listed prefix must be followed by ``/``
    or the end of the string.
    """
    return {"installer": {"name": "Browser"}}


# TODO: It would be kind of nice to implement this as just another parser, that returns
#       None instead of a dictionary.
However given that there is no inherent ordering +# in a ParserSet, and we want this to always go last (just incase an ignore +# pattern is overlly broad) we can't do that. It would be nice to make it possible +# to register a parser with an explicit location in the parser set. +_ignore_re = re.compile( + r""" + (?: + ^Datadog\ Agent/ | + ^\(null\)$ | + ^WordPress/ | + ^Chef\ (?:Client|Knife)/ | + ^Ruby$ | + ^Slackbot-LinkExpanding | + ^TextualInlineMedia/ | + ^WeeChat/ | + ^Download\ Master$ | + ^Java/ | + ^Go\ \d\.\d\ package\ http$ | + ^Go-http-client/ | + ^GNU\ Guile$ | + ^github-olee$ | + ^YisouSpider$ | + ^Apache\ Ant/ | + ^Salt/ | + ^ansible-httpget$ | + ^ltx71\ -\ \(http://ltx71.com/\) | + ^Scrapy/ | + ^spectool/ | + Nutch | + ^AWSBrewLinkChecker/ | + ^Y!J-ASR/ | + ^NSIS_Inetc\ \(Mozilla\)$ | + ^Debian\ uscan | + ^Pingdom\.com_bot_version_\d+\.\d+_\(https?://www.pingdom.com/\)$ | + ^MauiBot\ \(crawler\.feedback\+dc@gmail\.com\)$ + ) + """, + re.VERBOSE, +) + + +def parse(user_agent): + try: + return cattr.structure(_parser(user_agent), UserAgent) + except UnableToParse: + # If we were not able to parse the user agent, then we have two options, we can + # either raise an `UnknownUserAgentError` or we can return None to explicitly + # say that we opted not to parse this user agent. To determine which option we + # pick we'll match against a regex of UAs to ignore, if we match then we'll + # return a None to indicate to our caller that we couldn't parse this UA, but + # that it was an expected inability to parse. Otherwise we'll raise an + # `UnknownUserAgentError` to indicate that it as an unexpected inability to + # parse. 
+ if _ignore_re.search(user_agent) is not None: + return None + + raise UnknownUserAgentError from None diff --git a/main.py b/main.py new file mode 100644 index 0000000..7d527e3 --- /dev/null +++ b/main.py @@ -0,0 +1,71 @@ +import arrow +import cattr + +import os +import json +import gzip +import sys + +from collections import defaultdict +from contextlib import ExitStack +from pathlib import Path + +from linehaul.events.parser import parse, Download, Simple + +_cattr = cattr.Converter() +_cattr.register_unstructure_hook(arrow.Arrow, lambda o: o.format('YYYY-MM-DD HH:mm:ss ZZ')) + +input_file = sys.argv[1] +identifier = os.path.basename(input_file).split('-')[-1].split('.')[0] + +class OutputFiles(defaultdict): + + def __init__(self, stack, *args, **kwargs): + self.stack = stack + super(OutputFiles, self).__init__(*args, **kwargs) + + def __missing__(self, key): + Path(os.path.dirname(key)).mkdir(parents=True, exist_ok=True) + ret = self[key] = self.stack.enter_context(open(key, 'wb')) + return ret + +prefix = { + Simple.__name__: 'simple_requests', + Download.__name__: 'downloads', +} + +with ExitStack() as stack: + f = stack.enter_context(gzip.open(input_file, 'rt')) + output_files = OutputFiles(stack) + for line in f: + try: + res = parse(line) + if res is not None: + partition = res.timestamp.format('YYYYMMDD') + output_files[f'results/{prefix[res.__class__.__name__]}/{partition}/{identifier}.json'].write(json.dumps(_cattr.unstructure(res)).encode() + b'\n') + else: + output_files[f'results/unprocessed/{identifier}.txt'].write(line.encode() + b'\n') + except Exception as e: + output_files[f'results/unprocessed/{identifier}.txt'].write(line.encode() + b'\n') + + +#with open('downloads-result.json', 'wb') as wf: +# with gzip.open('logs/downloads/2020/02/29/22/00/2020-02-29T22:00:00.000-RR11GaIPOBYjuohiUdWt.log.gz', 'rt') as f: +# for line in f: +# try: +# res = parse(line) +# if res is not None: +# wf.write(json.dumps(_cattr.unstructure(res)).encode() + b'\n') 
+# except: +# print(line) + +#with open('simple-result.json', 'wb') as wf: +# with gzip.open('logs/simple/2020/02/29/22/00/2020-02-29T22:00:00.000-J6jH0weiN3a7yBa6zZY-.log.gz', 'rt') as f: +# for line in f: +# try: +# res = parse(line) +# if res is not None: +# wf.write(json.dumps(_cattr.unstructure(res)).encode() + b'\n') +# except Exception as e: +# print(e) +# print(line) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4306e9e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +arrow==0.15.5 +cattrs==1.0.0 +packaging==20.1 +pyparsing==2.4.6