Modul:Squash
Moduldokumentasjon
[opprett]
--- Squashing repeated parts of sentences
-- This is a proof of concept for the spec extension, and is not meant to be
-- the final version.
-- © John Erling Blad, Creative Commons by Attribution 3.0
-- Module table; every function and setting below is attached to it and it is
-- returned at the end of the file.
local squash = {}
-- Set of tokens that stop the collection of shared tokens in squash.extract.
-- Looked up by squash.isStopWord after lower-casing the token.
-- NOTE(review): the entries look like Norwegian cardinal and ordinal number
-- words; confirm the list is as complete as intended.
squash.stopWords = {
['en']=true, ['én']=true, ['ein']=true,
['ei']=true, ['éi']=true,
['et']=true, ['ét']=true, ['eit']=true, ['ett']=true, ['første']=true,
['to']=true, ['andre']=true,
['tre']=true, ['tredje']=true,
['fire']=true, ['fjerde']=true,
['fem']=true, ['femte']=true,
['seks']=true, ['sjette']=true,
['sju']=true, ['syv']=true, ['syvende']=true,
['åtte']=true, ['åttende']=true,
['ni']=true, ['niende']=true,
['ti']=true, ['tiende']=true
}
-- Patterns tested by squash.isStopPattern: a token starting with digits
-- followed by letters, or with digits followed by a period.
squash.stopPatterns = { "^%d+%a+", "^%d+%." }
-- Token patterns used by squash.tokenize, iterated in this order; each one is
-- anchored with "^" so it only matches at the position being scanned.
squash.patterns = { "^[-%a]+", "^%d+%a+", "^%d+%.", "^%b{}", "^%b[]", "^%b()", "^%b<>", "^%p+", "^%s+" }
-- Separator used by squash.list between all items except the last one.
squash._comma = ', '
-- Separator used by squash.list in front of the last item.
squash._and = ' og '
--- Tell whether a token is one of the configured stop words.
-- The token is lower-cased once with the Unicode-aware mw.ustring.lower before
-- it is looked up, so the test is case-insensitive with respect to the
-- lower-case keys of squash.stopWords.
-- @param token string to test
-- @return the entry stored in squash.stopWords for the lower-cased token
--   (true for every built-in word), or false when there is no such entry
squash.isStopWord = function( token )
    -- `or false` turns a missing entry into a real boolean, matching the
    -- boolean result of squash.isStopPattern.
    return squash.stopWords[mw.ustring.lower( token )] or false
end
--- Check a token against every entry of squash.stopPatterns.
-- @param token string to test
-- @return true as soon as one of the patterns matches the token, false when
--   none of them does
squash.isStopPattern = function( token )
    for _, candidate in ipairs( squash.stopPatterns ) do
        if mw.ustring.match( token, candidate ) then
            return true
        end
    end
    return false
end
--- Split a string into a list of consecutive fragments (tokens).
-- At each position the patterns in squash.patterns are tried in order and the
-- first one that matches supplies the next fragment. Only when none of them
-- matches is the fallback "run of non-whitespace characters" used. Scanning
-- ends when nothing matches at the current position, which for these patterns
-- means the end of the string has been reached.
-- Positions and lengths are counted in characters (mw.ustring), not bytes.
-- @param str string to split
-- @return array of fragments in the order they occur in str
squash.tokenize = function( str )
    local fragments = {}
    local start = 1
    while true do
        local fragment = nil
        -- Try every ordinary pattern before giving up; the fallback must not
        -- run until the whole list has failed, otherwise the later patterns
        -- (digits, balanced delimiters, punctuation) could never be selected.
        for _, pattern in ipairs( squash.patterns ) do
            fragment = mw.ustring.match( str, pattern, start )
            if fragment then
                break
            end
        end
        if not fragment then
            -- this is a fallback if the ordinary patterns fail
            fragment = mw.ustring.match( str, "^[^%s]+", start )
        end
        if not fragment then
            break
        end
        -- Every pattern requires at least one character, so the position
        -- always advances and the loop terminates.
        start = start + mw.ustring.len( fragment )
        fragments[#fragments + 1] = fragment
    end
    return fragments
end
--- Collect the leading tokens that all token lists have in common.
-- Walks the first list from its start and stops at the first token that is a
-- stop word, matches a stop pattern, or is not found at the same index in
-- every list.
-- @param tokens array of token arrays
-- @return array with the shared leading tokens; empty when tokens is empty
squash.extract = function( tokens )
    local collected = {}
    local first = tokens[1]
    if not first then
        -- Nothing to compare; avoids calling ipairs on nil.
        return collected
    end
    local total = #tokens
    for i, token in ipairs( first ) do
        if squash.isStopWord( token ) or squash.isStopPattern( token ) then
            break
        end
        local shared = true
        for j = 1, total do
            -- A list shorter than i yields nil here, which also counts as a
            -- mismatch.
            if tokens[j][i] ~= token then
                shared = false
                break
            end
        end
        if not shared then
            break
        end
        collected[#collected + 1] = token
    end
    return collected
end
--- Build a new array holding the items of a sequence in opposite order.
-- The input table is left untouched.
-- @param sequence array to reverse
-- @return new array with the same items, last item first
squash.reverse = function( sequence )
    local total = #sequence
    local flipped = {}
    for position = 1, total do
        local item = sequence[position]
        if item == nil then
            -- Stop at the first missing entry, just like ipairs would.
            break
        end
        flipped[total - position + 1] = item
    end
    return flipped
end
--- Join the items of a sequence into one enumeration string.
-- All items but the last are separated by squash._comma, and the last item is
-- attached with squash._and. The input table is not modified.
-- @param sequence array of strings
-- @return '' for an empty sequence, the single item (as a string) for a
--   one-item sequence, otherwise the joined enumeration
squash.list = function( sequence )
    local count = #sequence
    if count == 0 then
        return ''
    end
    if count == 1 then
        -- Return a string rather than the table itself so that callers can
        -- concatenate the result directly.
        return tostring( sequence[1] )
    end
    -- Join items 1..count-1 without removing the last item from the caller's
    -- table.
    local leading = table.concat( sequence, squash._comma, 1, count - 1 )
    return leading .. squash._and .. sequence[count]
end
--- Find the text shared at the start and at the end of all given strings.
-- Each string is tokenized with squash.tokenize. The shared head is the run of
-- leading tokens returned by squash.extract. The shared tail is found the same
-- way on the reversed token lists, but only among the tokens that are not
-- already part of the head, so head and tail never cover the same token twice.
-- @param ... the strings to compare, or a single table holding them
-- @return head string, tail string (both '' when no strings are given)
squash.analysis = function( ... )
    local strings = { ... }
    if 1 == #strings and type( strings[1] ) == 'table' then
        strings = strings[1]
    end
    local tokenized = {}
    for i, str in ipairs( strings ) do
        tokenized[i] = squash.tokenize( str )
    end
    if #tokenized == 0 then
        -- No input: nothing can be shared.
        return '', ''
    end
    local head = squash.extract( tokenized )
    local headCount = #head
    local reversed = {}
    for i, list in ipairs( tokenized ) do
        -- Reverse only the tokens after the head, so the tail search cannot
        -- claim tokens the head has already consumed.
        local rest = {}
        for j = #list, headCount + 1, -1 do
            rest[#rest + 1] = list[j]
        end
        reversed[i] = rest
    end
    local tail = squash.reverse( squash.extract( reversed ) )
    return table.concat( head, '' ), table.concat( tail, '' )
end
--- Merge several strings into one by writing their shared head and tail once.
-- The head and tail come from squash.analysis. What remains of each string
-- between them is gathered, every remainder except the first gets its first
-- letter lower-cased by the content language object, and the remainders are
-- joined with squash.list.
-- @param ... the strings to merge, or a single table holding them
-- @return head, joined remainders and tail concatenated
squash.synthesis = function( ... )
    local strings = { ... }
    if #strings == 1 and type( strings[1] ) == 'table' then
        strings = strings[1]
    end
    local head, tail = squash.analysis( strings )
    local lang = mw.getContentLanguage()
    -- These two values are the same for every string, so compute them once.
    local firstChar = mw.ustring.len( head ) + 1
    local tailLength = mw.ustring.len( tail )
    local fragments = {}
    for i, str in ipairs( strings ) do
        local middle = mw.ustring.sub( str, firstChar, mw.ustring.len( str ) - tailLength )
        if i == 1 then
            fragments[i] = middle
        else
            fragments[i] = lang:lcfirst( middle )
        end
    end
    return head .. squash.list( fragments ) .. tail
end
--- Entry point taking a frame object: squash the positional arguments.
-- The positional arguments are first copied into a plain table. frame.args is
-- backed by a metatable, so the length operator is not reliable on it; the
-- copy gives a correct argument count.
-- @param frame frame object whose positional args are the strings to merge
-- @return the object returned by mw.message.newRawMessage, built from the
--   synthesized text with the argument count as its first parameter
squash.run = function( frame )
    local args = {}
    for i, value in ipairs( frame.args ) do
        args[i] = value
    end
    return mw.message.newRawMessage( squash.synthesis( args ), #args )
end
return squash