Wikipedia:Dugnadskontor/Feil anførselstegn (fikse med bot eller lage kategori)/check for quotes.py
Dette er et pywikibot-script som kan brukes for å søke igjennom en XML-dump etter feil type anførselstegn:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
check_for_quotes.py - a quick script checking for quoting-signs in wikitext from XML-dump on nowiki
"""
#
# Copyright (C) 2016 Stig Meireles Johansen (stigmjATgmail.com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys
import re
import urllib
import pywikibot
from pywikibot import xmlreader
from pywikibot import textlib
# Compiled patterns that are run against the wikitext of every checked page.
# Group 1 of each match is the whole quoted span (delimiters included); that
# is the text collected and listed in the report further down.
checklist = [
# Matches: one character other than "=", optional whitespace, then an opening
# quote character (named group "first"), a run of characters containing none
# of the listed quote characters nor "=", ">" or a newline, and finally the
# same quote character again (back-reference to "first").
re.compile(u'[^=]\s*((?P<first>[\"\“\”\‘\’\`\´])([^\"\“\”\‘\’\`\´\=\n\>]*)(?P=first))', re.I), # «"», «“», «”», «‘», «’», «`» or «´»
]
def escape(s):
    """Escape HTML-special characters.

    Replaces ``&``, ``<`` and ``>`` in *s* with the corresponding HTML
    entities so the text can be placed inside element content.

    :param s: text to escape
    :return: a copy of *s* with ``&``, ``<`` and ``>`` replaced by
        ``&amp;``, ``&lt;`` and ``&gt;``
    """
    # "&" must be handled first: doing it after "<" / ">" would re-escape
    # the ampersands of the entities that were just inserted.
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    return s
# Percent-encoding helper for the article URL. Python 3 moved
# ``urllib.quote`` to ``urllib.parse.quote``; fall back to the Python 2
# location so the script keeps working on either interpreter.
try:
    from urllib.parse import quote as _quote_url
except ImportError:  # Python 2
    from urllib import quote as _quote_url

# Emit the HTML preamble; each reported article becomes one <li> of the <ol>.
pywikibot.output("<!DOCTYPE html>\n<html lang=\"nb\" dir=\"ltr\">\n<head>\n<meta charset=\"UTF-8\">\n<title>Artikler med «\"», «“», «”», «‘», «’», «`» eller «´»</title>\n</head>\n<body>\n<ol>")

# The path of the XML dump is taken from the first command-line argument.
for page in xmlreader.XmlDump(sys.argv[1]).parse():
    # Only pages whose namespace is "0" and that are not redirects are checked.
    if page.ns != "0" or page.isredirect:
        continue

    # NOTE(review): textlib.unescape presumably decodes HTML entities in the
    # dump text before matching -- confirm against the installed pywikibot.
    pagetext = textlib.unescape(page.text)
    title = "%s" % (page.title)

    # Collect every distinct quoted span (group 1) found by any pattern.
    findings = set()
    for checkR in checklist:
        for check in checkR.finditer(pagetext):
            findings.add(check.group(1))

    if findings:
        # The title is percent-encoded for the URL and HTML-escaped for the
        # link text; findings are sorted so the report is reproducible.
        pywikibot.output(
            '<li><a href="https://no.wikipedia.org/wiki/%s">%s</a><small> - %s</small></li>'
            % (
                _quote_url(title.encode('utf-8')),
                escape(title),
                escape(", ".join(sorted(findings))),
            )
        )

# Close the list and the document.
pywikibot.output("</ol>\n</body>\n</html>\n")