-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathomicsBot.py
More file actions
executable file
·376 lines (340 loc) · 14.4 KB
/
omicsBot.py
File metadata and controls
executable file
·376 lines (340 loc) · 14.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
#!/usr/bin/env python3
"""A bot for handling predatory journal titles: adding redirects and hatnotes.
It expects an input file starting with a few configuration lines, then
a single line '---' and then titles, one per line (as in omicsLists/).
For example:
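target = Title of the page to which the created redirects will point
category = Category in which to place the redirects (without 'Category:')
publisher = Title of page of the predatory publisher to mention in hatnotes
anchor = Optional; unless false/no/0/empty, redirects get a '#X' anchor
lang = Optional; unless false/no/0/empty, title lines are 'lang;Title'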
---
Journal of Foos
Journal of Bar: International Research
...
"""
import logging
import re
import sys
from typing import List, Optional
import pywikibot
import pywikibot.data.api
from pywikibot import Site
from utils import initLimits, trySaving
from abbrevIsoBot import state
# We share the state (with computed ISO-4 abbrevs) with abbrevIsoBot.
# main() passes this path to state.loadOrInitState() and state.saveState().
STATE_FILE_NAME = 'abbrevIsoBot/abbrevBotState.json'
def main() -> None:
    """Execute the bot.

    Expects exactly one command-line argument: the path of a list file.
    Non-blank lines before the '---' separator are handed to `Config`;
    every non-blank line after it is treated as a journal title (or as
    'lang;title' when the config enables `lang`) and passed to
    `doOmicsRedirects` and, if a publisher is configured, to
    `doOmicsHatnotes`.  The shared abbreviation state is loaded before
    and saved after processing.

    Raises:
        ValueError: if `lang` is enabled and a title line is not of the
            form 'lang;title'.
    """
    logging.basicConfig(level=logging.WARNING)
    if len(sys.argv) != 2:
        print(f'Usage: {sys.argv[0]} filename.txt')
        return
    filename = sys.argv[1]

    # Initialize pywikibot.
    assert Site().code == 'en'
    initLimits(
        editsLimits={'create': 600, 'talk': 600, 'fix': 600, 'hatnote': 0},
        brfaNumber=6,
        onlySimulateEdits=False,
        botTrial=False
    )
    state.loadOrInitState(STATE_FILE_NAME)

    # Read the file once, inside a context manager, so that the handle is
    # always closed (the line count is only used for progress output).
    with open(filename) as f:
        rawLines = f.readlines()
    numLines = sum(1 for rawLine in rawLines if rawLine.rstrip())

    configEnd: Optional[int] = None
    configLines: List[str] = []
    for i, line in enumerate(rawLines):
        line = line.strip()
        if not line:
            continue
        if configEnd is None:
            print(f'Config line {i}/{numLines} \t [{filename}]')
            if line == '---':
                # End of the config section: parse and validate it now.
                configEnd = i
                config = Config(configLines)
            else:
                configLines.append(line)
        else:
            print(f'Title line {i - configEnd}/{numLines - configEnd} \t '
                  f'[{filename}]')
            title = line
            if config.lang:
                parts = [part.strip() for part in line.split(';')]
                if len(parts) != 2:
                    raise ValueError(
                        f'Expected "lang;title" but got "{line}".')
                lang, title = parts
                doOmicsRedirects(title, config, lang)
            else:
                doOmicsRedirects(title, config)
            if config.publisher:
                # Pass the bare title, not the raw 'lang;title' line.
                doOmicsHatnotes(title, config.publisher)
        sys.stdout.flush()
    state.saveState(STATE_FILE_NAME)
class Config:
    """Configuration read from the list file.

    Attributes:
        rTarget: title of the page the redirects point to (required).
        rCat: category for the redirects, without 'Category:' (required).
        publisher: optional title of the publisher page used in hatnotes.
        anchor: whether redirects should contain an anchor.
        lang: whether each title line is given with a language prefix.
    """

    # Values (compared case-insensitively) that switch a boolean option off;
    # any other value switches it on.
    _FALSE_VALUES = ('false', 'no', '0', '')

    def __init__(self, lines: List[str]):
        """Parse the config part of the input file and check sanity.

        Args:
            lines: the non-blank, stripped 'key = value' lines that
                precede the '---' separator.

        Raises:
            ValueError: if a line contains no '='.
            Exception: on an unknown key, a missing target or category,
                or when the target, category or publisher page fails
                the existence/type checks below.
        """
        self.rTarget: str
        self.rCat: str
        self.publisher: Optional[str] = None
        self.anchor: bool = False  # Whether redirects should contain anchor.
        # (Anchors are guessed trivially, as the first character).
        self.lang: bool = False  # Whether each title is given with language.
        # (The format of each line is then like "ger;Journal of Foo").

        rTarget: Optional[str] = None
        rCat: Optional[str] = None
        for line in lines:
            key, value = self._splitConfigLine(line)
            if key == 'target':
                rTarget = value
            elif key == 'category':
                rCat = value
            elif key == 'publisher':
                self.publisher = value
            elif key == 'anchor':
                self.anchor = self._parseBool(value)
            elif key == 'lang':
                self.lang = self._parseBool(value)
            else:
                raise Exception(f'Unrecognized configuration key "{key}".')
        if not rTarget:
            raise Exception('No target configured!')
        self.rTarget = rTarget
        if not rCat:
            raise Exception('No category configured!')
        self.rCat = rCat

        # Check on-wiki that the configured pages are usable.
        targetPage = pywikibot.Page(Site(), rTarget)
        if (not targetPage.exists()
                or targetPage.isRedirectPage()
                or targetPage.isCategoryRedirect()
                or targetPage.isDisambig()):
            raise Exception(f'Target [[{rTarget}]] does not exists '
                            f'or is a redirect.')
        catPage = pywikibot.Page(Site(), 'Category:' + rCat)
        if (not catPage.exists()
                or not catPage.is_categorypage()
                or catPage.isCategoryRedirect()):
            raise Exception(f'[[Category:{rCat}]] does not exist '
                            f'or is not category or is redirect.')
        if self.publisher:
            pubPage = pywikibot.Page(Site(), self.publisher)
            if not pubPage.exists():
                raise Exception(f'Publisher [[{self.publisher}]] does not '
                                f'exist.')
        print(f'Redirect target = [[{self.rTarget}]]')
        print(f'Redirect cat = [[Category:{self.rCat}]]')
        print(f'Redirect publisher = [[{self.publisher}]]')
        print(f'Anchor = {"true" if self.anchor else "false"}')
        print(f'Lang = {"true" if self.lang else "false"}')

    @staticmethod
    def _splitConfigLine(line: str) -> tuple:
        """Split 'key = value' at the first '=' into stripped (key, value).

        Only the first '=' separates key from value, so a value may
        itself contain '='.  Raises ValueError when there is no '='.
        """
        key, separator, value = line.partition('=')
        if not separator:
            raise ValueError(
                f'Malformed configuration line (no "="): "{line}".')
        return key.strip(), value.strip()

    @classmethod
    def _parseBool(cls, value: str) -> bool:
        """Return False for 'false', 'no', '0' or empty; True otherwise."""
        return value.lower() not in cls._FALSE_VALUES
def doOmicsRedirects(title: str,
                     config: Config,
                     lang: Optional[str] = None) -> None:
    """Create redirects for given OMICS journal.

    Builds the set of redirect titles derived from `title` (the plain
    title, 'and'/'&' variants, the variant without a leading 'The', and
    the ISO-4 abbreviation with and without dots), then creates or fixes
    each of them via `createOrFixOmicsRedirect`.  Nothing is saved when
    any variant is reported 'unfixable' or when the title is skipped.

    Args:
        title: journal title; a '(journal)' disambiguator is stripped
            and re-added to the non-ISO-4 redirect titles.
        config: parsed list-file configuration.
        lang: optional language code(s) forwarded to the abbreviation
            state; 'ger' is widened to 'ger,eng,fra,lat'.
    """
    # If [[title]] exists, add '(journal)', unless its a redirect
    # (either one we did, maybe to be fixed, or an unexpected one we'll skip).
    addJournal = False
    if '(journal)' in title:
        title = title.replace('(journal)', '').strip()
        addJournal = True
    if '(' in title:
        print(f'Skip: [[{title}]] has unexpected disambuig.')
        # Actually skip, as the message says and as every other skip does.
        return
    page = pywikibot.Page(Site(), title)
    if page.exists() and not page.isRedirectPage():
        addJournal = True
        if 'journal' in title.lower():
            print(f'Skip: [[{title}]] already exists, '
                  'title already has "journal".')
            return
        for cat in page.categories():
            if 'journal' in cat.title().lower():
                print(f'Skip: [[{title}]] already exists, '
                      'has category containing "journal".')
                return

    # Set of redirect pages to create, together with their type.
    rTitles = {(title, 'plain')}

    # Handle 'and' vs '&' variant.
    if ' and ' in title:
        rTitles.add((title.replace(' and ', ' & '), 'and'))
    elif ' & ' in title and 'Acta' not in title:
        rTitles.add((title.replace(' & ', ' and '), 'and'))

    # Handle variant without 'The' at the beginning.
    if title.startswith('The '):
        # Drop only the leading article, not later occurrences of 'The '.
        rTitle = title[len('The '):]
        rTitles.add((rTitle, 'the'))
        if ' and ' in rTitle:
            rTitles.add((rTitle.replace(' and ', ' & '), 'theand'))
        elif ' & ' in rTitle:
            if not lang or 'eng' in lang:
                rTitles.add((rTitle.replace(' & ', ' and '), 'theand'))

    # Handle ISO-4 abbreviated variants.
    state.saveTitleToAbbrev(title)
    if lang == 'ger':
        lang = 'ger,eng,fra,lat'
    if lang:
        state.saveTitleToAbbrev(title, lang)
    try:
        cAbbrev = state.getAbbrev(title, lang or 'all')
    except state.NotComputedYetError as err:
        print(err.message)
        return
    if cAbbrev != title:
        rTitles.add((cAbbrev, 'iso4'))
        rTitles.add((cAbbrev.replace('.', ''), 'iso4'))

    # Final page titles: '(journal)' goes on everything but ISO-4 variants.
    # Computed once so the dry run and the real run see the same list.
    redirects = [
        (rTitle + ' (journal)' if addJournal and rType != 'iso4' else rTitle,
         rType)
        for (rTitle, rType) in rTitles
    ]

    # Skip if any of the redirect variants exists and is unfixable.
    for (rTitle, rType) in redirects:
        r = createOrFixOmicsRedirect(rTitle, rType, config, tryOnly=True)
        if r == 'unfixable':
            print(f'Skip: [[{title}]] unfixable.')
            return

    # Create or replace the redirects.
    for (rTitle, rType) in redirects:
        createOrFixOmicsRedirect(rTitle, rType, config, tryOnly=False)
def doOmicsHatnotes(title: str, publisher: str) -> None:
    """Create hatnotes for given OMICS journal.

    When `title` ends with one of the known misleading suffixes, the
    title without that suffix is looked up.  If that page exists and is
    in a category whose name contains 'journal', it gets the hatnote
    (unless it is a redirect); if it exists but has no such category,
    the '<title> (journal)' page is tried instead.
    """
    # Suffixes that predatory titles append to an established title.
    misleadingSuffixes = (
        ': Open Access',
        '-Open Access',
        ': An Indian Journal',
        ': Current Research',
        ': Advances and Applications',
        ': Development and Therapy',
        ': Evidence and Research',
        ': Research and Reviews',
        ': Research and Reports',
        ': Targets and Therapy',
    )
    candidates = [title[:-len(suffix)].strip()
                  for suffix in misleadingSuffixes
                  if title.endswith(suffix)]
    # As before, the last matching suffix decides the base title.
    baseTitle = candidates[-1] if candidates else ''
    if not baseTitle:
        return

    basePage = pywikibot.Page(Site(), baseTitle)
    if not basePage.exists():
        return

    inJournalCategory = any('journal' in cat.title().lower()
                            for cat in basePage.categories())
    if inJournalCategory:
        if not basePage.isRedirectPage():
            addOmicsHatnote(baseTitle, title, publisher)
        return

    # The plain title is about something else; try the disambiguated page.
    journalTitle = baseTitle + ' (journal)'
    journalPage = pywikibot.Page(Site(), journalTitle)
    if journalPage.exists() and not journalPage.isRedirectPage():
        addOmicsHatnote(journalTitle, title, publisher)
def addOmicsHatnote(aTitle: str, title: str, publisher: str) -> None:
    """Add hatnote to [[aTitle]] about confusion risk with OMICS [[title]].

    Nothing is saved when the page text already contains a
    {{Confused|...}} or {{confused|...}} template; otherwise the hatnote
    is prepended to the page text and saved under the 'hatnote' limit.
    """
    page = pywikibot.Page(Site(), aTitle)
    existingText = page.text
    templateMarkers = ('{{Confused|', '{{confused|')
    if any(marker in existingText for marker in templateMarkers):
        print(f'Skip: {{{{confused}}}} hatnote already on [[{aTitle}]]')
        return

    print(f'Adding hatnote to [[{aTitle}]]')
    hatnote = (f'{{{{Confused|text=[[{title}]],'
               f' published by the [[{publisher}]]}}}}\n')
    newText = hatnote + existingText
    trySaving(page, newText, overwrite=True, limitType='hatnote',
              summary='Add hatnote to predatory journal clone.')
def createOrFixOmicsRedirect(title: str, rType: str,
                             config: Config, tryOnly: bool) -> str:
    """Attempt to create or fix redirect from [[title]] to [[target]].

    We return 'create' if non-existing, 'done' if basically equal to what we
    would add, 'fix' if exists but looks fixable, 'unfixable' otherwise.
    A non-existing page of type 'uniso4' is left alone and yields 'ignore'.
    Also create talk page with {{WPJournals}} when non-existing.

    Args:
        title: title of the redirect page.
        rType: kind of redirect ('plain', 'iso4', 'uniso4', or another
            variant tag); 'plain' adds the category and a talk page,
            'iso4' adds {{R from ISO 4}}.
        config: parsed list-file configuration (target, category, anchor).
        tryOnly: when True, nothing is saved; only the verdict is returned.
    """
    def withoutWhitespace(text: str) -> str:
        # The 4th positional argument of re.sub is `count`, not `flags`:
        # passing re.M there would limit the replacement to 8 matches.
        return re.sub(r'\s', '', text).strip()

    rText = '#REDIRECT[[' + config.rTarget + ']]\n'
    rCat = '[[Category:' + config.rCat + ']]\n' if config.rCat else ''
    rIsoCat = '{{R from ISO 4}}\n'
    talkContent = '{{WPJournals|class=redirect}}'

    # Sort key: move a leading 'The' to the end and spell out '&'.
    rSortTitle = title
    if rSortTitle.startswith('The ') and '(' not in title:
        # Remove only the leading article, not later occurrences.
        rSortTitle = rSortTitle[len('The '):] + ', The'
    if ' & ' in rSortTitle:
        rSortTitle = rSortTitle.replace(' & ', ' and ')
    if config.anchor:
        # The anchor is guessed as the first character of the sort key.
        rText = '#REDIRECT[[' + config.rTarget + '#' + rSortTitle[0] + ']]\n'

    rNewContent = rText
    if rSortTitle != title:
        rNewContent += '{{DEFAULTSORT:' + rSortTitle + '}}\n'
    if rType == 'plain':
        rNewContent += rCat
    if rType == 'iso4':
        rNewContent += rIsoCat

    rPage = pywikibot.Page(Site(), title)
    rTalkPage = rPage.toggleTalkPage()
    if not rPage.exists():
        if rType == 'uniso4':
            return 'ignore'
        if not tryOnly:
            print(f'Creating redirect from: [[{title}]].')
            trySaving(rPage, rNewContent,
                      'Create redirect from journal to publisher.',
                      overwrite=False, limitType='create')
            if rType == 'plain' and not rTalkPage.exists():
                trySaving(rTalkPage, talkContent,
                          'Mark new redirect into {{WPJournals}}.',
                          overwrite=False, limitType='talk')
        return 'create'

    # If rPage exists, check if we would add basically the same.
    text = rPage.text
    if withoutWhitespace(text) == withoutWhitespace(rNewContent):
        if not tryOnly:
            if rTalkPage.exists():
                print(f'Done: [[{title}]].')
            elif rType == 'plain':
                print(f'Done, but creating talk page: [[{title}]].')
                trySaving(rTalkPage, talkContent,
                          'Mark redirect into {{WPJournals}}.',
                          overwrite=False, limitType='talk')
        return 'done'

    # If rPage exists but not the same, check if it is a fixable case:
    # apart from our category, the ISO-4 tag, DEFAULTSORT, a one-character
    # link anchor and whitespace, it must be just the redirect itself.
    if rCat:
        text = text.replace(rCat.strip(), '')
    text = text.replace(rIsoCat.strip(), '')
    text = re.sub(r'\{\{DEFAULTSORT:[^\}]*\}\}', '', text)
    # Strip link anchors and whitespace before comparing
    anchorRegex = r'(' + re.escape(config.rTarget) + r')\#.'
    textStripped = withoutWhitespace(re.sub(anchorRegex, r'\1', text))
    rTextStripped = withoutWhitespace(re.sub(anchorRegex, r'\1', rText))
    if textStripped != rTextStripped:
        print(f'Not fixable: [[{title}]] (type={rType}).')
        print('---IS-------------')
        print(rPage.text)
        print('---SHOULD BE------')
        print(rNewContent)
        print('==================')
        return 'unfixable'

    # If it is fixable, fix it.
    if not tryOnly:
        if rType == 'uniso4':
            print(f'Removing iso4 tag from: [[{title}]].')
        print(f'Fixing redirect from: [[{title}]] (type={rType}).')
        print('---WAS------------')
        print(rPage.text)
        print('---WILL BE--------')
        print(rNewContent)
        print('==================')
        trySaving(rPage, rNewContent,
                  'Fix redirect from journal to publisher.',
                  overwrite=True, limitType='fix')
        if rType == 'plain' and not rTalkPage.exists():
            trySaving(rTalkPage, talkContent,
                      'Fix redirect from journal to publisher.',
                      overwrite=False, limitType='talk')
    return 'fix'
# Run the bot only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()