Skip to content

Commit 982c25d

Browse files
committed
Added pruning by duplicate id.
1 parent 1703199 commit 982c25d

File tree

1 file changed

+39
-0
lines changed

1 file changed

+39
-0
lines changed

scripts/XMLPruning.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import os

from lxml import etree
2+
3+
4+
class XMLPruning:
    """Remove duplicate elements (same ``id`` attribute) from an XML document.

    The document is parsed once when the object is constructed. After
    pruning, the resulting tree is written to a file named like the source
    document with a ``new-`` prefix on the file name.
    """

    def __init__(self, xml_document_location):
        """Parse the document and keep its root element.

        Args:
            xml_document_location: Path of the XML document to load.
        """
        self.xml_document_location = xml_document_location
        self.root = self.get_root()

    def get_root(self) -> "etree._Element":
        """Parse ``self.xml_document_location`` and return its root element."""
        tree = etree.parse(self.xml_document_location)
        return tree.getroot()

    def remove_duplicates_by_id(self, element_type):
        """Drop every ``element_type`` element whose ``id`` was already seen.

        The first element carrying a given ``id`` is kept; later ones are
        removed from their parent. Elements without an ``id`` attribute are
        left untouched. The pruned tree is then written to disk.

        Args:
            element_type: Tag name of the elements to de-duplicate.

        Returns:
            The set of distinct ``id`` values that were encountered.
        """
        visited = set()

        # Snapshot the matches first: removing elements from the tree while
        # the iterator is still walking it gives undefined results.
        for element in list(self.root.iter(element_type)):
            if 'id' in element.attrib:
                self.check_visited_id(element, visited)

        self.write_to_file()
        return visited

    def check_visited_id(self, element, visited):
        """Remove ``element`` if its ``id`` is in ``visited``, else record it.

        Args:
            element: Element that carries an ``id`` attribute.
            visited: Set of ids seen so far; updated in place.
        """
        current = element.get('id')
        if current in visited:
            print("Removing element with id " + current)
            element.getparent().remove(element)
        else:
            visited.add(current)

    def _output_path(self):
        """Return the output path: source file name prefixed with ``new-``.

        The prefix is applied to the file name only, so a source document
        given with a directory part is written into that same directory.
        """
        directory, file_name = os.path.split(self.xml_document_location)
        return os.path.join(directory, "new-" + file_name)

    def write_to_file(self):
        """Serialize the current tree, pretty-printed, to the output file."""
        with open(self._output_path(), 'wb') as doc:
            doc.write(etree.tostring(self.root, pretty_print=True))
34+
35+
36+
if __name__ == "__main__":
    # Usage example: prune duplicated TransportStation elements and report
    # how many distinct ids were found.
    pruner = XMLPruning("statii-ratt-format.xml")
    unique_ids = pruner.remove_duplicates_by_id("TransportStation")
    print(len(unique_ids))

0 commit comments

Comments
 (0)