Hopp til innhold

Wikipedia:Dugnadskontor/Feil anførselstegn (fikse med bot eller lage kategori)/check for quotes.py

Fra Wikipedia, den frie encyklopedi

Dette er et pywikibot-script som kan brukes for å søke igjennom en XML-dump etter feil type anførselstegn:

# -*- coding: utf-8  -*-
# check_for_quotes.py - a quick script checking for quoting-signs in wikitext from XML-dump on nowiki

#    Copyright (C) 2016  Stig Meireles Johansen (stigmjATgmail.com)
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys
import re
import urllib

import pywikibot
from pywikibot import xmlreader
from pywikibot import textlib

# Compiled patterns used to flag suspicious quoting in article wikitext.
#
# The single pattern matches a quote character (named group "first"), a run of
# text containing no quote character, "=", newline or ">", and then the SAME
# quote character again (back-reference to "first").  The match must be
# preceded by one character that is not "=" plus optional whitespace, so
# attribute-style values such as name="value" are not reported.
# Group 1 is the whole quoted snippet including its quote characters;
# group 3 is the text between them.
# NOTE(review): because the closing character must equal the opening one,
# a mixed pair such as “…” is not matched - confirm that this is intended.
checklist = [
    re.compile(
        u'[^=]\\s*((?P<first>["“”‘’`´])([^"“”‘’`´=\\n>]*)(?P=first))',
        re.I,
    ),  # «"», «“», «”», «‘», «’», «`» or «´»
]

def escape(s):
    """Escape HTML-special characters.

    Replaces "&", "<" and ">" in *s* with their HTML entities and returns
    the resulting string.
    """
    # The ampersand must be handled first: replacing it after "<" and ">"
    # would re-escape the "&" of the freshly produced "&lt;" / "&gt;".
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    return s

# quote() lives in urllib.parse on Python 3 and directly in urllib on Python 2;
# import it locally so the report loop works on either interpreter.
try:
    from urllib.parse import quote as url_quote
except ImportError:
    from urllib import quote as url_quote

# Emit the HTML preamble; every article with a match becomes one <li> below.
pywikibot.output("<!DOCTYPE html>\n<html lang=\"nb\" dir=\"ltr\">\n<head>\n<meta charset=\"UTF-8\">\n<title>Artikler med «\"», «“», «”», «‘», «’», «`» eller «´»</title>\n</head>\n<body>\n<ol>")

# sys.argv[1] is the path of the XML dump to scan.
for page in xmlreader.XmlDump(sys.argv[1]).parse():
    # Only main-namespace pages that are not redirects are checked.
    if page.ns != "0" or page.isredirect:
        continue
    pagetext = textlib.unescape(page.text)
    title = page.title
    # Distinct quoted snippets found on this page (group 1 of each match is
    # the snippet including its quote characters).
    findings = set()
    for checkR in checklist:
        for check in checkR.finditer(pagetext):
            findings.add(check.group(1))
    if findings:
        # The title is percent-encoded for the URL and HTML-escaped for the
        # link text; snippets are sorted so the report is deterministic.
        pywikibot.output('<li><a href="https://no.wikipedia.org/wiki/%s">%s</a><small> - %s</small></li>' % (url_quote(title.encode('utf-8')), escape(title), escape(", ".join(sorted(findings)))))

# Close the elements opened by the preamble so the report is a complete
# HTML document.
pywikibot.output("</ol>\n</body>\n</html>")