-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert.py
executable file
·439 lines (410 loc) · 13 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
# convert antville helma to wordpress import/export XML
# author leo sauermann
# date 4.11.2017
# license: https://opensource.org/licenses/MIT because its so short and easy
# doc see README.md
# Doc good to have when coding
# https://codex.wordpress.org/Importing_Content
import sys
from datetime import *
from collections import Counter
from collections import defaultdict
import cachehtmlimages
###
### -------------------- A LOT OF CONFIGURATION --------------------
###
# Wrap Body in CDATA section. True/false
cfg_wrapbodywithcdata = True
# Cache for downloaded images
cfg_localprefix='/2017/imgcache/'
cfg_cachedir='cachedir'
cfg_enablecache=True
cfg_channelboilerplate='''<channel>
<title>Export file</title>
<link>http://www.example.com/blog</link>
<description>Just another WordPress site</description>
<pubDate>Fri, 20 Oct 2017 21:08:14 +0000</pubDate>
<language>en-US</language>
<wp:wxr_version>1.2</wp:wxr_version>
<wp:base_site_url>http://www.example.com/blog</wp:base_site_url>
<wp:base_blog_url>http://www.example.com/blog</wp:base_blog_url>
<wp:author><wp:author_id>1</wp:author_id><wp:author_login><![CDATA[leobard]]></wp:author_login><wp:author_email><![CDATA[[email protected]]]></wp:author_email><wp:author_display_name><![CDATA[leobard]]></wp:author_display_name><wp:author_first_name><![CDATA[]]></wp:author_first_name><wp:author_last_name><![CDATA[]]></wp:author_last_name></wp:author>
'''
cfg_wpposttype='post'
# add the TITLE(s) of blogpost(s) here if you want to get an XML file with just these posts
cfg_importonly=[]
spammers = []
spammerwithcomments = defaultdict(list)
whitelist = {'leobard': 68, 'Chris Mark': 5, 'lucasswonn': 5, 'Donmoss': 4, 'humtum': 3, 'peter andersson': 3, 'antheque': 2, 'Tony Lara': 2, 'Guenther (guest)': 2, 'Martincrow': 2, 'darrensy': 2, 'Lars Zapf (guest)': 2, 'danja': 2, 'sherman': 2, 'Fernandro': 2, 'kollektivtraeumer': 1, 'Max Völkel (guest)': 1, 'jip': 1, 'Anja Jentzsch (guest)': 1, 'BrettspielBrowser': 1, 'Jeen (guest)': 1, 'heimwege (guest)': 1, 'Richard Cyganiak (guest)': 1, 'kabsi (guest)': 1, 'Gunnar (guest)': 1, 'tim finin (guest)': 1, 'D.E.R. Kabsi (guest)': 1, 'Arne (guest)': 1, 'Gunnar Grimnes (guest)': 1, 'typekey:Dan Brickley': 1, 'xamde': 1, 'guenthernoack': 1, 'Dan Connolly (guest)': 1, 'Heimwege (guest)': 1,
'020200 (guest)':1, "anonymous (guest)":1, 'Sramana Mitra (guest)':1,
"karl (guest)":1,
"020200":1,
"Laurens Holst (guest)":1,
"Webmaster (guest)":1,
"Bernhard (guest)":1,
"komm.rip":1,
"Damian (guest)":1,
"Jan (guest)":1,
"smi":1,
"Laurent Szyster (guest)":1,
"Frank (guest)":1,
"pavel (guest)":1,
"simsplayah":1,
"reubenk":1,
"Andreas (guest)":1,
"Dave Brondsema (guest)":1,
"jariep":1,
"laurentszyster":1,
"Dominik (guest)":1,
"Maggi (guest)":1,
"SiENcE (guest)":1,
"Ed Davies (guest)":1,
"Pavel9":1,
"Martin (guest)":1,
"D.E.R.":1,
"feedbuzz":1,
}
blacklist = {
"musicindance",
"richardphillips8416",
"myspainholidays",
"Alstair",
"samsameer",
"aliyog",
"enthusiast225",
"rosepdtr",
"hyunnonu",
"boulevardsg",
"flywithme",
"bankhing",
"Martink23",
"rodhildred",
"julioman",
"Ben67",
"adam434",
"arnoldcarrington",
"Bojobycy",
"jonz",
"John_smith",
"ashish9454",
"milos01011984",
"nanziecane",
"jmorris14",
"denzie",
"Caouette",
"recadolee",
"dmolvik",
"smithbell",
"rockyhenry",
"tsonga1198",
"AwningsForHomes",
"buntagna",
"kumar",
"larrzz0jh",
"stenlyanson",
"thomasfernandus",
"jansenkoe (guest)",
"tradetuber",
"plantar exercises",
"jeanalbert2018",
"mercymac19",
"benostill",
"pele092131",
"pdambc163",
"granicefer11",
"laneejakona",
"catarina09183",
"nike2636",
"thestars",
"longma",
"artedavida",
"jolina",
"johntlr",
"stevensegal60",
"Latos354",
"amhash",
"razibjohn",
"anna9871",
"geogia987",
"JackWitson",
"heartofqueen",
"voansumer",
"tqbcl36",
"zakoment",
"mliege",
"erwinhector",
"miltonm103",
"flashwd",
"Caouette1",
"NatalieWilliam99",
"fyodorsteven",
"johnluv",
"lisaok",
"mario98121",
"jonathan200",
"mahmoud321",
"Anna Marie",
"davidchair34",
"macmercy",
"juz2maryu",
"hamza123",
"lhc67",
"alat kesehatan",
"pollockshaun",
"fgermanyer",
"speednews.it (guest)",
"davidguez",
"netbrainiac",
"Kerson",
"isha",
"jackdiamond",
"jameskeaton",
"adelaidetaylor",
"alinsmith",
"JamesRobert",
"yuzz3n",
"yoyo7",
"JonathanAquino",
"reigna",
"christina12",
"bista675",
"raffyprince",
"alex46 (guest)",
"ssasoo",
"sinthia_jaman",
"evawagner87 ",
"authority216",
"jamezlee",
"nursebebo",
"Swampthing (guest)",
"starplayer",
"usabta",
"Supreme",
"Bratish175",
"helentarlow",
"sdfdsf (guest)",
"golfwholesaler18",
"Marry444",
"beckham1291",
"johnrock",
"zinlosho",
"patrickbest",
"DavidJohn",
"george22","thomasdelange","george22","alton100","Roberdro", "miketyson986", "miketyson986","jerry.nic","alec7334","jauhis", "jessiejames","esmi25","mortage","alton100", "miketyson986",'susanbell84','danleyvila','Rainer (guest)','nabihahayat','wify','sarahjames786','Samii', "jesika",
"johndecoza",
"Yuichi Junigida",
"vanessa198012","alex0012",
"attain98",
"bellegardner",
"celinedion201982","asim",
"123akshay",
"geraldrss",
"sads",
"swatbolish",
"Alex11",
"katebroad21",
"StevenJ",
"jessica098",
"Leetony2598",
"Emma Lue",
"lam",
"PSteve",
"rsweeting123",
"mortage35",
"coach001",
"Allysonrock3",
"fdgdf (guest)",
"smackshown",
"NathalieD",
"rihanna09413",
"lindasmithsons",
"Jared H.",
"Duagiibii",
"keantommy",
"Amandavarley",
"seenathkumar",
"joshkurtis",
"mandybeck",
"Larah",
"lacomunicacion",
"SonicB",
"JorkRodan",
"yesyes",
"jhonmgt745",
"reactor67",
"ethak",
"fenny",
"tintly",
"alijutt8",
"mendadesantos",
"ladykippy",
"mammer",
"waxx7",
}
# replace these image filenames in the body because they lacked the extension #hack
# TODO: this hack could be replaced by making the cachehtmlimages.py evaluate the file-ending, the content-type, and doing the right thing when downloading. Would take more than 60 minutes though to code and this was 5 mintues.
cfg_rplc={
'http---leobard.twoday.net-getfile-name-IMG_8468':'http---leobard.twoday.net-getfile-name-IMG_8468.jpg',
'http---leobard.twoday.net-getfile-name-IMG_8468':'http---leobard.twoday.net-getfile-name-IMG_8468.jpg',
'http---wordle.net-thumb-wrdl-216762-leobards_delicious':'http---wordle.net-thumb-wrdl-216762-leobards_delicious.jpg',
'http---www.w3.org-Icons-w3c_home':'http---www.w3.org-Icons-w3c_home.png',
'http---www.flickr.com-images-buddyicon.jpg-91744463-N00':'http---www.flickr.com-images-buddyicon.jpg-91744463-N00.jpg',
'http---static.flickr.com-23-buddyicons-96533977-N00.jpg-1123189502':'http---static.flickr.com-23-buddyicons-96533977-N00.jpg-1123189502.jpg',
'http---bugs.kde.org-attachment.cgi-id-11622-action-view':'http---bugs.kde.org-attachment.cgi-id-11622-action-view.jpg',
'http---ep.yimg.com-ca-I-yhst-81571646212247_2099_760690':'http---ep.yimg.com-ca-I-yhst-81571646212247_2099_760690.gif',
'http---ep.yimg.com-ca-I-yhst-81571646212247_2101_64972':'http---ep.yimg.com-ca-I-yhst-81571646212247_2101_64972.gif',
}
###
### -------------------- CONFIGURATION ENDS, THE REST IS CODE --------------------
###
# remove funny characters in a body and fix image filenames where we dont have an extension #hack
def sanitizebody(body):
for i,ii in cfg_rplc.items():
body=body.replace(i,ii)
body=body.replace('', ' ')
return body
def nicename(s):
return s.lower().replace(' ','-')
filename = sys.argv[1]
with open(filename) as fp:
contents = fp.read()
print ('<?xml version="1.0" encoding="UTF-8" ?>')
print ('<!-- generator="Helma-Antville to wordpress Converter" created="'+datetime.now().strftime('%Y-%m-%d %H:%M:%S')+'" -->')
print ('''<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.2/"
>''')
print (cfg_channelboilerplate)
print ('<generator>https://www.leobard.net/2017/helma-to-wordpress</generator>')
for entry in contents.split('\n-----\n--------\n'):
# do something with entry
if len(entry)==0:
continue
headerAndBody = entry.split('\n-----\nBODY:\n')
header = headerAndBody[0]
try:
body = headerAndBody[1]
except Exception as e:
print (e)
print (headerAndBody)
raise (e)
# parse headers
headeritems={}
for headerline in header.split('\n'):
try:
key,value = headerline.split(': ',1)
try:
if (key=='DATE'):
value = datetime.strptime(value, '%m/%d/%Y %I:%M:%S %p')
finally:
headeritems[key]=value
except Exception as e:
print (e)
print (headerline)
raise (e)
if (len(cfg_importonly) > 0) and (not(headeritems['TITLE'] in cfg_importonly)):
continue
# print item
print (' <item>')
print (' <title><![CDATA['+headeritems['TITLE']+']]></title>')
# Tue, 08 Feb 2011 00:00:00 +0000
print (' <pubDate>'+headeritems['DATE'].strftime('%a, %d %b %Y %H:%M:%S +0000')+'</pubDate>')
print (' <dc:creator><![CDATA['+headeritems['AUTHOR']+']]></dc:creator>')
isodate = headeritems['DATE'].strftime('%Y-%m-%d %H:%M:%S')
print (' <wp:post_date><![CDATA['+isodate+']]></wp:post_date>')
print (' <wp:post_date_gmt><![CDATA['+isodate+']]></wp:post_date_gmt>')
print (' <wp:post_type><![CDATA['+cfg_wpposttype+']]></wp:post_type>')
# nicename
print (' <category domain="category" nicename="'+nicename(headeritems['CATEGORY'])+'"><![CDATA['+headeritems['CATEGORY']+']]></category>')
bodyAndComments = body.split('\n-----\nCOMMENT:\n')
if not (bodyAndComments) or (len(bodyAndComments)==1):
body = bodyAndComments[0]
comments = []
else:
body = bodyAndComments[0]
comments = bodyAndComments[1:]
# BODY: cache images?
if cfg_enablecache:
try:
body = cachehtmlimages.localifyHtmlImages(body, cfg_localprefix, cfg_cachedir)
except Exception as e:
sys.stderr.write(str(e)+'\n')
sys.stderr.write('This was during cachehtmlimages of item with date '+isodate+'. Continuing...\n')
# BODY: sanitize XML
body = sanitizebody(body)
print (' <content:encoded>')
if (cfg_wrapbodywithcdata):
print ('<![CDATA[')
print (body)
if (cfg_wrapbodywithcdata):
print (']]>')
print (' </content:encoded>')
# THERE ARE COMMENTS. TURN THEM INTO HTML
if len(comments)>0:
tt = ''
for comment in comments:
commentheaders = {}
commentlines = comment.split('\n')
if (commentlines[0]=='REPLY:'):
reply=1
i=1
else:
reply=0
i=0
while i<len(commentlines):
try:
key,value = commentlines[i].split(': ',1)
except Exception as e:
sys.stderr.write(str(e))
sys.stderr.write('\n'+commentlines[i]+'\n')
raise e
if key=='AUTHOR':
author=value
if key=='DATE':
# unlike with the item, I transform the comment date directly to ISO as I don't need it in any other format
try:
m2 = datetime.strptime(value, '%m/%d/%Y %I:%M:%S %p')
value = m2.strftime('%Y-%m-%d %H:%M:%S')
except Exception as e:
sys.stderr.write(str(e))
sys.stderr.write('\n'+commentlines[i]+'\n')
# DATE also marks the end of the header of the comment, the rest is content-body
commentbody = '\n'.join(commentlines[(i+1):])
i = len(commentlines) # exit this loop
commentheaders[key]=value
i += 1
if author in blacklist:
continue
# Spam detection: Read the comment and find hrefs to collect a list of possible spammers
spamham = commentbody.lower()
if ('href' in spamham):
if not author in whitelist:
# make commentbody easier to read to judge if spam
spamham = spamham.replace('\n', ' ')
spamham = spamham[spamham.index('href'):]
spammers.append(author)
spammerwithcomments[author].append(spamham)
continue
# print the comment
print (' <wp:comment>')
print (' <wp:comment_author><![CDATA['+commentheaders['AUTHOR']+']]></wp:comment_author>')
print (' <wp:comment_author_url><![CDATA['+commentheaders['EMAIL']+']]></wp:comment_author_url>')
print (' <wp:comment_date><![CDATA['+commentheaders['DATE']+']]></wp:comment_date>')
print (' <wp:comment_date_gmt><![CDATA['+commentheaders['DATE']+']]></wp:comment_date_gmt>')
print (' <wp:comment_content><![CDATA['+sanitizebody(commentbody)+']]></wp:comment_content>')
print (' <wp:comment_approved><![CDATA[1]]></wp:comment_approved>')
print (' </wp:comment>')
print (' </item>')
# break
print ('</channel>')
print ('</rss>')
# Print out the stuff about people who posted <a href> but are not in whitelist or blacklist
for k,l in spammerwithcomments.items():
sys.stderr.write('"'+k+'", \n')
for i in l:
sys.stderr.write(' '+i[:110]+'\n')
if len(spammers) > 0:
sys.stderr.write(str(Counter(spammers)))