Modul:Squash

Fra Wikipedia, den frie encyklopedi
Moduldokumentasjon
--- Squashing repeated parts of sentences
-- This is a proof of concept for the spec extension, and is not meant to be
-- the final version.
-- © John Erling Blad, Creative Commons by Attribution 3.0

-- Module table; every public function and setting is a field of it.
local squash = {}

-- Set of lower-case words (Norwegian cardinal and ordinal numerals from one
-- to ten, including accented and variant spellings) that end the search for
-- common tokens. Each word is a key mapped to `true`.
-- NOTE(review): 'sjuende' (ordinal of 'sju') is not listed while 'syvende'
-- is; confirm whether that is intentional.
squash.stopWords = {}
for _, word in ipairs( {
	-- 1
	'en', 'én', 'ein',
	'ei', 'éi',
	'et', 'ét', 'eit', 'ett', 'første',
	-- 2
	'to', 'andre',
	-- 3
	'tre', 'tredje',
	-- 4
	'fire', 'fjerde',
	-- 5
	'fem', 'femte',
	-- 6
	'seks', 'sjette',
	-- 7
	'sju', 'syv', 'syvende',
	-- 8
	'åtte', 'åttende',
	-- 9
	'ni', 'niende',
	-- 10
	'ti', 'tiende',
} ) do
	squash.stopWords[word] = true
end

-- A token matching any of these patterns ends the search for common tokens
-- (see squash.isStopPattern).
squash.stopPatterns = {
	"^%d+%a+", -- digits directly followed by letters
	"^%d+%.",  -- digits directly followed by a period
}

-- Token patterns, tried in the listed order by squash.tokenize.
squash.patterns = {
	"^[-%a]+", -- letters and hyphens
	"^%d+%a+", -- digits directly followed by letters
	"^%d+%.",  -- digits directly followed by a period
	"^%b{}",   -- balanced curly braces
	"^%b[]",   -- balanced square brackets
	"^%b()",   -- balanced parentheses
	"^%b<>",   -- balanced angle brackets
	"^%p+",    -- run of punctuation
	"^%s+",    -- run of whitespace
}

-- Separator placed between the items of an enumeration, except before the
-- last item.
squash._comma = ", "

-- Separator placed before the last item of an enumeration.
squash._and = " og "

--- Check whether a token is one of the configured stop words.
-- The lookup is case-insensitive: the token is lower-cased with the
-- Unicode-aware `mw.ustring.lower` before it is used as a key.
-- @param token string to test
-- @return the value stored in `squash.stopWords` for the lower-cased token
-- (`true` for every word in the stock list), or `nil` when it is not listed
squash.isStopWord = function( token )
	-- lower-casing is idempotent, so one pass is enough
	return squash.stopWords[mw.ustring.lower( token )]
end

--- Test whether a token matches any of the stop patterns.
-- @param token string to test
-- @return `true` as soon as one pattern in `squash.stopPatterns` matches the
-- token, `false` when none of them does
squash.isStopPattern = function( token )
	for _, stopPattern in ipairs( squash.stopPatterns ) do
		if mw.ustring.match( token, stopPattern ) then
			return true
		end
	end
	return false
end

--- Split a string into a sequence of tokens.
-- At each position the patterns in `squash.patterns` are tried in order and
-- the first non-empty match becomes the next token. Only when none of them
-- matches is the run of non-whitespace characters at that position taken
-- instead. Tokenizing ends when nothing matches at the current position.
-- @param str string to split
-- @return sequence of token strings, in the order they occur in `str`
squash.tokenize = function( str )
	local fragments = {}
	local start = 1
	repeat
		local fragment = nil
		for _, pattern in ipairs( squash.patterns ) do
			local candidate = mw.ustring.match( str, pattern, start )
			-- an empty match would never advance `start`, so skip it
			if candidate and candidate ~= '' then
				fragment = candidate
				break
			end
		end
		if not fragment then
			-- Fallback, used only after every ordinary pattern has failed.
			-- Trying it earlier would shadow the later patterns, e.g. the
			-- balanced-bracket ones, and split "(a b)" at the space.
			fragment = mw.ustring.match( str, "^[^%s]+", start )
		end
		if fragment then
			-- advance by characters, not bytes
			start = start + mw.ustring.len( fragment )
			table.insert( fragments, fragment )
		end
	until not fragment
	return fragments
end

--- Collect the leading tokens that all token lists have in common.
-- Walks the first list from its start and stops at the first token that is
-- a stop word, matches a stop pattern, or differs from the token at the same
-- position in any of the other lists.
-- @param tokens sequence of token lists; may be empty
-- @return new sequence holding the shared leading tokens (possibly empty)
squash.extract = function( tokens )
	local collected = {}
	local first = tokens[1]
	if not first then
		-- no token lists at all, hence nothing in common
		return collected
	end
	for i, token in ipairs( first ) do
		if squash.isStopWord( token ) or squash.isStopPattern( token ) then
			break
		end
		-- the first list trivially agrees with itself, so start at 2
		for j = 2, #tokens do
			if tokens[j][i] ~= token then
				return collected
			end
		end
		table.insert( collected, token )
	end
	return collected
end

--- Build a new sequence holding the items of a sequence in opposite order.
-- The input sequence is left untouched.
-- @param sequence sequence to mirror
-- @return new sequence with the last item of `sequence` first
squash.reverse = function( sequence )
	local mirrored = {}
	local total = #sequence
	for position, item in ipairs( sequence ) do
		mirrored[total - position + 1] = item
	end
	return mirrored
end

--- Join a sequence of strings into a natural-language enumeration.
-- All items but the last are joined with `squash._comma`, and the last item
-- is attached with `squash._and` (with the stock separators: `a, b og c`).
-- @param sequence sequence of strings; it is not modified
-- @return string; the empty string for an empty sequence and the sole item
-- for a one-item sequence
squash.list = function( sequence )
	local count = #sequence
	if count <= 1 then
		-- always hand back a string so callers can concatenate the result
		return table.concat( sequence )
	end
	local leading = table.concat( sequence, squash._comma, 1, count - 1 )
	return leading .. squash._and .. sequence[count]
end
	
--- Find the common head and the common tail of a set of strings.
-- The strings may be given as separate arguments or as a single table.
-- Each string is tokenized; the head is the run of leading tokens shared by
-- all strings and the tail is the run of trailing tokens shared by all
-- strings, both as determined by `squash.extract`.
-- @param ... the strings, or one table holding them
-- @return head as a string
-- @return tail as a string
squash.analysis = function( ... )
	local strings = { ... }
	if 1 == #strings and type( strings[1] ) == 'table' then
		strings = strings[1]
	end

	local tokenized = {}
	for i, str in ipairs( strings ) do
		tokenized[i] = squash.tokenize( str )
	end
	if #tokenized == 0 then
		-- nothing to compare
		return '', ''
	end

	local head = squash.extract( tokenized )

	-- Only tokens after the common head may take part in the tail, so that
	-- head and tail never cover the same tokens of a string (which would
	-- otherwise happen for identical strings, or when one string is fully
	-- covered by the head). The remainders are stored back to front so the
	-- tail can be found as a common prefix.
	local reversed = {}
	for i, list in ipairs( tokenized ) do
		local rest = {}
		for j = #list, #head + 1, -1 do
			rest[#rest + 1] = list[j]
		end
		reversed[i] = rest
	end

	local tail = squash.reverse( squash.extract( reversed ) )

	return table.concat( head, '' ), table.concat( tail, '' )
end

--- Merge several strings into one where the shared head and tail occur once.
-- The strings may be given as separate arguments or as a single table. The
-- head and tail reported by `squash.analysis` are cut from every string, the
-- remaining middle parts are enumerated with `squash.list`, and the result
-- is wrapped in the head and tail again. Every middle part except the first
-- gets its first letter lower-cased using the content language.
-- @param ... the strings, or one table holding them
-- @return head, enumerated middle parts and tail concatenated
squash.synthesis = function( ... )
	local strings = { ... }
	if #strings == 1 and type( strings[1] ) == 'table' then
		strings = strings[1]
	end

	local head, tail = squash.analysis( strings )
	-- character positions, identical for every string
	local firstChar = mw.ustring.len( head ) + 1
	local tailLength = mw.ustring.len( tail )
	local contentLanguage = mw.getContentLanguage()

	local fragments = {}
	for index, str in ipairs( strings ) do
		local middle = mw.ustring.sub( str, firstChar, mw.ustring.len( str ) - tailLength )
		if index == 1 then
			fragments[index] = middle
		else
			fragments[index] = contentLanguage:lcfirst( middle )
		end
	end

	return head .. squash.list( fragments ) .. tail
end

--- Entry point taking a frame object.
-- Squashes the positional arguments of the frame and wraps the result in a
-- raw message, with the number of positional arguments as message parameter.
-- @param frame frame object whose `args` hold the strings to squash
-- @return the message object created by `mw.message.newRawMessage`
squash.run = function( frame )
	-- frame.args is backed by a metatable and the length operator is not
	-- supported on it, so copy the positional arguments into a plain
	-- sequence and count that instead.
	local strings = {}
	for _, value in ipairs( frame.args ) do
		strings[#strings + 1] = value
	end
	return mw.message.newRawMessage( squash.synthesis( strings ), #strings )
end

-- Export the module table.
return squash