Skip to content

Commit d2cac08

Browse files
authored
Merge pull request #2545 from tim427/develop
Added support for JSON containing multiple events
2 parents 9c71a9c + 729182a commit d2cac08

File tree

8 files changed

+190
-23
lines changed

8 files changed

+190
-23
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o
2323
- New parameter `stop_retry_limit` to gracefully handle stopping bots which take longer to shutdown (PR#2598 by Lukas Heindl, fixes #2595).
2424
- `intelmq.lib.datatypes`: Remove unneeded Dict39 alias (PR#2639 by Nakul Rajpal, fixes #2635)
2525
- `intelmq.lib.mixins.http`: Only set HTTP header 'Authorization' if username or password are set and are not both empty string as they are by default in the Manager (fixes #2590, PR#2634 by Sebastian Wagner).
26+
- `intelmq.lib.message.MessageFactory.from_dict`: No longer modifies the dict parameter by adding the `__type` field, and raises an error when the type is not determinable (PR#2545 by Sebastian Wagner).
2627

2728
### Development
2829

@@ -49,6 +50,9 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o
4950
#### Parsers
5051
- `intelmq.bots.parsers.cymru.parser_cap_program`: Add mapping for TOR and ipv6-icmp protocol (PR#2621 by Mikk Margus Möll).
5152
- Remove `intelmq.bots.collectors.blueliv` as it is obsolete with the removed collector (PR#2632 by Sebastian Wagner).
53+
- `intelmq.bots.parsers.json.parser`:
54+
- Support data containing lists of JSON Events (PR#2545 by Tim de Boer).
55+
- Add default `classification.type` with value `undetermined` if input data has no classification itself (PR#2545 by Sebastian Wagner).
5256

5357
#### Experts
5458
- `intelmq.bots.experts.asn_lookup.expert`:

docs/user/bots.md

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1925,12 +1925,69 @@ also <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>). Defaults to `htm
19251925

19261926
---
19271927

1928-
### JSON (TODO) <div id="intelmq.bots.parsers.json.parser" />
1928+
### JSON <div id="intelmq.bots.parsers.json.parser" />
19291929

1930-
TODO
1930+
Parses JSON events that are already in IntelMQ format.
1931+
If the input data does not contain the field `classification.type`, it is set to `undetermined`.
1932+
1933+
Supports multiple different modes:
1934+
1935+
#### Input data is one event
1936+
Example:
1937+
```json
1938+
{ INTELMQ data... }
1939+
```
1940+
or:
1941+
```
1942+
{
1943+
INTELMQ data...
1944+
}
1945+
```
1946+
1947+
Configuration:
1948+
* `splitlines`: False
1949+
* `multiple_events`: False
1950+
1951+
#### Input data is in JSON stream format
1952+
Example:
1953+
```json
1954+
{ INTELMQ data... }
1955+
{ INTELMQ data... }
1956+
{ INTELMQ data... }
1957+
```
1958+
1959+
Configuration:
1960+
* `splitlines`: True
1961+
* `multiple_events`: False
1962+
1963+
#### Input data is a list of events
1964+
Example:
1965+
```json
1966+
[
1967+
{ INTELMQ data... },
1968+
{ INTELMQ data... },
1969+
...
1970+
]
1971+
```
1972+
1973+
Configuration:
1974+
* `splitlines`: False
1975+
* `multiple_events`: True
1976+
1977+
#### Configuration
19311978

19321979
**Module:** `intelmq.bots.parsers.json.parser`
19331980

1981+
**Parameters:**
1982+
1983+
**`splitlines`**
1984+
1985+
(optional, boolean) When the input file contains one JSON dictionary per line, set this to `true`. Defaults to `false`.
1986+
1987+
**`multiple_events`**
1988+
1989+
(optional, boolean) When the input file contains a JSON list of dictionaries, set this to `true`. Defaults to `false`.
1990+
19341991
---
19351992

19361993
### Key=Value Parser <div id="intelmq.bots.parsers.key_value.parser" />

intelmq/bots/parsers/json/parser.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,48 @@
1-
# SPDX-FileCopyrightText: 2016 by Bundesamt für Sicherheit in der Informationstechnik
1+
# SPDX-FileCopyrightText: 2016 by Bundesamt für Sicherheit in der Informationstechnik, 2016-2021 nic.at GmbH, 2024 Tim de Boer, 2025 Institute for Common Good Technology
22
#
33
# SPDX-License-Identifier: AGPL-3.0-or-later
44
"""
55
JSON Parser Bot
66
Retrieves a base64 encoded JSON-String from raw and converts it into an
77
event.
8-
9-
Copyright (C) 2016 by Bundesamt für Sicherheit in der Informationstechnik
10-
Software engineering by Intevation GmbH
118
"""
129
from intelmq.lib.bot import ParserBot
1310
from intelmq.lib.message import MessageFactory
1411
from intelmq.lib.utils import base64_decode
12+
from json import loads as json_loads, dumps as json_dumps
1513

1614

1715
class JSONParserBot(ParserBot):
1816
"""Parse IntelMQ-JSON data"""
19-
splitlines = False
17+
splitlines: bool = False
18+
multiple_events: bool = False
19+
20+
def init(self):
21+
if self.multiple_events and self.splitlines:
22+
raise ValueError("Modes 'splitlines' and 'multiple_events' are not possible at the same time. Please use either one.")
2023

2124
def process(self):
2225
report = self.receive_message()
23-
if self.splitlines:
24-
lines = base64_decode(report['raw']).splitlines()
26+
if self.multiple_events:
27+
lines = json_loads(base64_decode(report["raw"]))
28+
elif self.splitlines:
29+
lines = base64_decode(report["raw"]).splitlines()
2530
else:
26-
lines = [base64_decode(report['raw'])]
31+
lines = [base64_decode(report["raw"])]
2732

2833
for line in lines:
29-
new_event = MessageFactory.unserialize(line,
30-
harmonization=self.harmonization,
31-
default_type='Event')
3234
event = self.new_event(report)
33-
event.update(new_event)
34-
if 'raw' not in event:
35-
event['raw'] = line
35+
if self.multiple_events:
36+
event.update(MessageFactory.from_dict(line,
37+
harmonization=self.harmonization,
38+
default_type="Event"))
39+
event["raw"] = json_dumps(line, sort_keys=True)
40+
else:
41+
event.update(MessageFactory.unserialize(line,
42+
harmonization=self.harmonization,
43+
default_type="Event"))
44+
event.add('raw', line, overwrite=False)
45+
event.add("classification.type", "undetermined", overwrite=False) # set to undetermined if input has no classification
3646
self.send_message(event)
3747
self.acknowledge_message()
3848

intelmq/lib/message.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,19 @@ def from_dict(message: dict, harmonization=None,
4949
MessageFactory.unserialize
5050
MessageFactory.serialize
5151
"""
52-
if default_type and "__type" not in message:
53-
message["__type"] = default_type
52+
if not default_type and '__type' not in message:
53+
raise ValueError("Message type could not be determined. Input message misses '__type' and parameter 'default_type' not given.")
5454
try:
55-
class_reference = getattr(intelmq.lib.message, message["__type"])
55+
class_reference = getattr(intelmq.lib.message, message.get("__type", default_type))
5656
except AttributeError:
5757
raise exceptions.InvalidArgument('__type',
5858
got=message["__type"],
5959
expected=VALID_MESSSAGE_TYPES,
6060
docs=HARMONIZATION_CONF_FILE)
6161
# don't modify the parameter
6262
message_copy = message.copy()
63+
if default_type and "__type" not in message_copy:
64+
message_copy["__type"] = default_type
6365
del message_copy["__type"]
6466
return class_reference(message_copy, auto=True, harmonization=harmonization)
6567

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
[
2+
{
3+
"extra.dataset_collections": "0",
4+
"extra.dataset_files": "1",
5+
"extra.dataset_infected": "false",
6+
"extra.dataset_ransom": "null",
7+
"extra.dataset_rows": "0",
8+
"extra.dataset_size": "301",
9+
"protocol.application": "https",
10+
"protocol.transport": "tcp",
11+
"source.asn": 12345689,
12+
"source.fqdn": "fqdn-example-1.tld",
13+
"source.geolocation.cc": "NL",
14+
"source.geolocation.city": "Enschede",
15+
"source.geolocation.latitude": 52.0000000000000,
16+
"source.geolocation.longitude": 6.0000000000000,
17+
"source.geolocation.region": "Overijssel",
18+
"source.ip": "127.1.2.1",
19+
"source.network": "127.1.0.0/16",
20+
"source.port": 80,
21+
"time.source": "2024-12-16T02:08:06+00:00"
22+
},
23+
{
24+
"extra.dataset_collections": "0",
25+
"extra.dataset_files": "1",
26+
"extra.dataset_infected": "false",
27+
"extra.dataset_ransom": "null",
28+
"extra.dataset_rows": "0",
29+
"extra.dataset_size": "615",
30+
"extra.os_name": "Ubuntu",
31+
"extra.software": "Apache",
32+
"extra.tag": "rescan",
33+
"extra.version": "2.4.58",
34+
"protocol.application": "https",
35+
"protocol.transport": "tcp",
36+
"source.asn": 12345689,
37+
"source.fqdn": "fqdn-example-2.tld",
38+
"source.geolocation.cc": "NL",
39+
"source.geolocation.city": "Eindhoven",
40+
"source.geolocation.latitude": 51.0000000000000,
41+
"source.geolocation.longitude": 5.0000000000000,
42+
"source.geolocation.region": "North Brabant",
43+
"source.ip": "127.1.2.2",
44+
"source.network": "127.1.0.0/16",
45+
"source.port": 443,
46+
"time.source": "2024-12-16T02:08:12+00:00"
47+
},
48+
{
49+
"extra.dataset_collections": "0",
50+
"extra.dataset_files": "1",
51+
"extra.dataset_infected": "false",
52+
"extra.dataset_ransom": "null",
53+
"extra.dataset_rows": "0",
54+
"extra.dataset_size": "421",
55+
"protocol.application": "http",
56+
"protocol.transport": "tcp",
57+
"source.asn": 12345689,
58+
"source.geolocation.cc": "NL",
59+
"source.geolocation.city": "Enschede",
60+
"source.geolocation.latitude": 52.0000000000000,
61+
"source.geolocation.longitude": 6.0000000000000,
62+
"source.geolocation.region": "Overijssel",
63+
"source.ip": "127.1.2.3",
64+
"source.network": "127.1.0.0/16",
65+
"source.port": 9000,
66+
"time.source": "2024-12-15T21:09:49+00:00"
67+
}
68+
]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
SPDX-FileCopyrightText: 2024 Tim de Boer
2+
SPDX-License-Identifier: AGPL-3.0-or-later

intelmq/tests/bots/parsers/json/test_parser.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import base64
77
import os
88
import unittest
9+
from json import loads as json_loads, dumps as json_dumps
910

1011
import intelmq.lib.test as test
1112
from intelmq.bots.parsers.json.parser import JSONParserBot
@@ -51,6 +52,21 @@
5152
NO_DEFAULT_EVENT = MULTILINE_EVENTS[1].copy()
5253
NO_DEFAULT_EVENT['raw'] = base64.b64encode(b'{"source.ip": "127.0.0.2", "classification.type": "c2-server"}\n').decode()
5354

55+
with open(os.path.join(os.path.dirname(__file__), 'ncscnl.json'), 'rb') as fh:
56+
NCSCNL_FILE = fh.read()
57+
NCSCNL_RAW = base64.b64encode(NCSCNL_FILE).decode()
58+
NCSC_EVENTS = json_loads(NCSCNL_FILE)
59+
for i, event in enumerate(NCSC_EVENTS):
60+
NCSC_EVENTS[i]['raw'] = base64.b64encode(json_dumps(event, sort_keys=True).encode()).decode()
61+
NCSC_EVENTS[i]['classification.type'] = 'undetermined'
62+
NCSC_EVENTS[i]['feed.name'] = 'NCSC.NL'
63+
NCSC_EVENTS[i]['__type'] = 'Event'
64+
65+
NCSCNL_REPORT = {"feed.name": "NCSC.NL",
66+
"raw": NCSCNL_RAW,
67+
"__type": "Report",
68+
}
69+
5470

5571
class TestJSONParserBot(test.BotTestCase, unittest.TestCase):
5672
"""
@@ -70,8 +86,7 @@ def test_oneline_report(self):
7086
def test_multiline_report(self):
7187
""" Test if correct Event has been produced. """
7288
self.input_message = MULTILINE_REPORT
73-
self.sysconfig = {"splitlines": True}
74-
self.run_bot()
89+
self.run_bot(parameters={"splitlines": True})
7590
self.assertMessageEqual(0, MULTILINE_EVENTS[0])
7691
self.assertMessageEqual(1, MULTILINE_EVENTS[1])
7792

@@ -81,6 +96,14 @@ def test_default_event(self):
8196
self.run_bot()
8297
self.assertMessageEqual(0, NO_DEFAULT_EVENT)
8398

99+
def test_ncscnl(self):
100+
""" A file containing a list of events (not per line) """
101+
self.input_message = NCSCNL_REPORT
102+
self.run_bot(parameters={'multiple_events': True})
103+
self.assertMessageEqual(0, NCSC_EVENTS[0])
104+
self.assertMessageEqual(1, NCSC_EVENTS[1])
105+
self.assertMessageEqual(2, NCSC_EVENTS[2])
106+
84107

85108
if __name__ == '__main__': # pragma: no cover
86109
unittest.main()

intelmq/tests/lib/test_bot_library_mode.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"destination.ip": "192.0.43.8", # iana.org.
3333
"time.observation": "2015-01-01T00:00:00+00:00",
3434
}
35+
EXAMPLE_IP_OUTPUT = MessageFactory.from_dict(EXAMPLE_IP_INPUT, default_type='Event') # adds __type = Event
3536

3637

3738
class BrokenInitExpertBot(ExpertBot):
@@ -130,15 +131,15 @@ def test_bot_multi_message():
130131

131132
def test_bot_raises_and_second_message():
132133
"""
133-
The first message raises an error and the second message
134+
The first message raises an error and the second message is processed correctly
134135
This test is based on an issue where the exception-raising message was not cleared from the internal message store of the Bot/Pipeline instance and thus re-used on the second run
135136
"""
136137
raises_on_first_run = RaisesOnFirstRunExpertBot('raises', settings=BotLibSettings)
137138
with raises(ValueError):
138139
raises_on_first_run.process_message(EXAMPLE_DATA_URL)
139140
queues = raises_on_first_run.process_message(EXAMPLE_IP_INPUT)
140141
assert len(queues['output']) == 1
141-
assertMessageEqual(queues['output'][0], EXAMPLE_IP_INPUT)
142+
assertMessageEqual(queues['output'][0], EXAMPLE_IP_OUTPUT)
142143

143144

144145
if __name__ == '__main__': # pragma: no cover

0 commit comments

Comments
 (0)