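-- Modul:Vorlage:Personendaten/plugin
--
-- Source page: German Wikipedia ("aus Wikipedia, der freien Enzyklopädie").
-- The remaining header lines are navigation text of the wiki page, not code:
--    Zur Navigation springen Zur Suche springen
--    Vorlagenprogrammierung Diskussionen Lua Test Unterseiten
--    Modul Deutsch English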


-- Version identifier of this module (a date string);
-- it is returned unchanged by p.failsafe().
local Serial = "2017-01-01"
--[=[
Vorlage:Personendaten/plugin
Helper code for the template 'Personendaten': it decides whether potential
roman numerals in an article's lemma and in the NAME field are numerals.
]=]



-- Forward declaration: RomanNumeralInPD (next function) refers to this local
-- before the function value is assigned to it further down in the file.
local HandleRomanNumeral

local function RomanNumeralInPD( Lemma, NAME, KURZBESCHREIBUNG, GEBOREN )
	-- Condense the analysis done by HandleRomanNumeral into two flags.
	-- Parameters:
	--    Lemma, NAME, KURZBESCHREIBUNG, GEBOREN
	--       -- passed through unchanged to HandleRomanNumeral
	-- Returns:
	--    1 -- true: NAME= is probably set up wrongly
	--    2 -- true: replace the roman numeral by its numeric value
	--         (provisional: only the first PRN found in NAME is consulted)

	-- Only the NAME table (2nd result) and the error flag (4th result)
	-- are needed here; the lemma table and 'success' are ignored.
	local _, numeralsInName, _, nameSuspicious =
	      HandleRomanNumeral( Lemma, NAME, KURZBESCHREIBUNG, GEBOREN )

	local replaceNumeral = false
	local firstInName    = numeralsInName[1]
	if firstInName then
		replaceNumeral = firstInName.isRN
	end

	return nameSuspicious, replaceNumeral
end -- RomanNumeralInPD()

-- Value that HandleRomanNumeral copies into 'test' while it is running,
-- i.e. whether collected error messages of that function get printed.
local testHandleRomanNumeral = true

-- 'PRN'  means 'potential roman numeral' everywhere.
--        A PRN is any nonempty sequence of the 'digits'
--            'I', 'V', 'X', 'L'
--        which is delimited
--            at the left  by blank, '(', '.', or '/',     i.e. [ (./],
--            at the right by blank, ')', '.', ',' or '/', i.e. [ )./,],
--        where the beginning and the end of the string are treated
--        as blanks.
-- 'RN'   means 'roman numeral' everywhere.
-- 'name' means the value of the NAME field
--        in the template 'Personendaten', where appropriate.

local test   = false      -- run in test mode? (messages are only collected and printed then)
local fname  = "?"        -- name of the running function, prefix of printed messages
local errors = {[0] = ""} -- collected messages; index 0 is a sentinel for duplicate detection

local function handleErrors()
	-- Flush the collected messages.
	-- In test mode the messages are sorted and each distinct message is
	-- printed once, prefixed by the current function name 'fname'.
	-- In any case the message table 'errors' is reset afterwards.
	if test then
		table.sort( errors )
		-- The sentinel at index 0 serves as predecessor of the first entry.
		local previous = errors[0]
		for _, message in ipairs( errors ) do
			if message ~= previous then
				print( fname .. ": " .. message )
			end
			previous = message
		end
	end
	errors = {[0] = ""}
end -- handleErrors

local function wanted( condition, errstring )
	-- Check a condition and record a message if it does not hold.
	-- Parameters:
	--    condition  -- value tested for truth
	--    errstring  -- string, message recorded when the test fails
	-- Returns:
	--    condition, unchanged
	-- Messages are recorded in test mode only.
	if condition or not test then
		return condition
	end
	errors[#errors + 1] = errstring
	return condition
end -- wanted

local function normalize( s )
	-- Remove all blanks at both ends of a string, then frame the rest
	-- with exactly one blank on each side.
	-- Parameter:
	--    s -- string, to handle
	-- Returns:
	--    string, " " .. (s without leading/trailing blanks) .. " "

	-- The lazy capture leaves every trailing blank to the final ' *'.
	local core = s:match( "^ *(.-) *$" )
	return " " .. core .. " "
end -- normalize

local function split( lon )
	-- Collect all PRNs of a lemma or name.
	-- Parameter:
	--    lon -- string, lemma or name
	-- Returns:
	--    table (sequence) with one table per PRN found, in order of
	--    appearance. This function sets 4 components of each entry:
	--       lon   -- string, the string that was searched (as passed in);
	--       ixA   -- number, index of the delimiter left  of the PRN;
	--       ixZ   -- number, index of the delimiter right of the PRN;
	--       prn   -- string, the PRN itself.
	--    The components isRN and value are not set here.
	local found = {}
	local from  = 1
	local left, right, numeral
	repeat
		left, right, numeral = lon:find( "[ (./]([IVXL]+)[. ,)/]", from )
		if left then
			found[#found + 1] =
				{ lon = lon, ixA = left, ixZ = right, prn = numeral }
			-- Resume at the right delimiter: it may serve as the
			-- left delimiter of the next PRN.
			from = right
		end
	until not left
	return found
end -- split

local function year( geb )
	-- Estimate the year of birth. (May be very rough.)
	-- Parameter:
	--    geb -- string, date of birth as given in template 'Personendaten'
	-- Returns:
	--    number, rough year of birth;
	--       the caller only needs 'at most 1810, or later?'.
	--       Any text containing 'Chr' yields 0,
	--       a century such as '19. Jahrhundert' counts as 1900,
	--       and 2000 is the default if no digit is found.
	if geb:find( "Chr" ) then
		return 0
	end

	-- Tried in this order; the first pattern that matches wins.
	local patterns = {
		"[12][0-9][0-9][0-9]",         -- four-digit year
		"[1-9][0-9][0-9]",             -- three-digit year
		"([12][0-9])%.? Jahrhundert",  -- century, two digits
		"([1-9])%.? Jahrhundert",      -- century, one digit
		"([1-9])[0-9]",                -- first digit only: avoid nn*100>1810
		"[0-9]",                       -- any single digit
	}
	local digits
	for _, pattern in ipairs( patterns ) do
		digits = geb:match( pattern )
		if digits then break end
	end

	-- The default prefers 'PRN is not a RN'.
	local result = tonumber( digits or "2000" )

	-- Values below 100 are read as centuries.
	if result < 100 then
		result = 100 * result
	end
	return result
end -- year

local function RomanNumeralToNumber( numeral )
	-- Compute the value of a roman numeral (range I - LXXXIX)
	-- and check its validity.
	-- Parameter:
	--    numeral  -- string, sequence of roman digits,
	--                where roman digits are I, V, X, L.
	-- Returns:
	--    number   -- value accumulated from the part that could be read
	--    Boolean  -- is it a valid roman numeral, i.e. was the whole
	--                string consumed?
	--                (...IIII counts as valid, ...XXXX... does not)
	local pos   = 1  -- index of the next unread character
	local total = 0  -- value read so far

	local function take( token, worth )
		-- Consume 'token' if it is next in the numeral, adding 'worth'.
		-- Returns true if the token has been consumed.
		if numeral:sub( pos, pos + #token - 1 ) ~= token then
			return false
		end
		pos   = pos + #token
		total = total + worth
		return true
	end

	-- Tens: either XL, or an optional L followed by up to three X.
	if not take( "XL", 40 ) then
		take( "L", 50 )
		for _ = 1, 3 do
			take( "X", 10 )
		end
	end

	-- Units: IX, IV, or an optional V followed by up to four I
	-- (four, in order to accept forms like XIIII).
	if not ( take( "IX", 9 ) or take( "IV", 4 ) ) then
		take( "V", 5 )
		for _ = 1, 4 do
			take( "I", 1 )
		end
	end

	return total, ( pos == #numeral + 1 ) -- valid, if numeral is exhausted
end

------------------------------ HandleRomanNumeral (head)

HandleRomanNumeral = function (Lemma, Name, Kurz, Geburt)
	-- Find the potential roman numerals (PRNs) in lemma and NAME and
	-- decide for each of them whether it is a roman numeral (RN).
	-- (Original note: this analysis is meant to help preparing SORTIERUNG.)
	-- Parameters:
	--    Lemma    -- string, lemma of an article in WP-de on a person
	--    Name     -- string, NAME from Personendaten of that article
	--    Kurz     -- string, KURZBESCHREIBUNG from Personendaten
	--    Geburt   -- string, GEBURTSDATUM from Personendaten
	-- Returns:
	--    1  prnLemmTable  -- table of tables for the lemma,
	--                        see above on function split;
	--                        isRN and value are set now
	--    2  prnNameTable  -- table of tables for the name, ditto
	--    3  success       -- Boolean, did function work successfully?
	--    4  errorInName   -- Boolean, 'does NAME seem to be erroneous?'
	--                        (only a first attempt at an answer;
	--                        shouldn't be handled here anyway)
	-- Side effects:
	--    The module-level variables 'fname' and 'test' are replaced while
	--    this function runs and restored before every return;
	--    collected messages are flushed by handleErrors().

	local savedFname = fname
	fname = "HandleRomanNumeral"

	local savedTest  = test
	test = testHandleRomanNumeral

	local prnLemmTable, prnNameTable = {}, {}  -- results
	local success = true

	local errorInName = false  -- only one case checked yet

	-- Messages that may be recorded via wanted().
	local err = {
		argType        = "invalid argument type, string wanted",
		lemmPRNopen    = "PRN in lemma could not be resolved",
		namePRNopen    = "PRN in NAME could not be resolved",
		contradiction  = "lemma and NAME seem to contradict",
		numeralsDiffer = "different numerals in lemma and NAME",
	}

	-- error: argument not of type 'string':
	success = wanted( type(Lemma)  == "string"
	              and type(Name)   == "string"
	              and type(Kurz)   == "string"
	              and type(Geburt) == "string", err.argType )
	          and success
	if not success then  -- hard error, quick return with empty tables
		-- A jump to the common epilogue at the end of the function would
		-- avoid duplicating the following 4 lines
		-- (NOTE(review): 'goto' needs Lua 5.2 or later).
		handleErrors()
		test  = savedTest
		fname = savedFname
		return prnLemmTable, prnNameTable, success, errorInName
	end
	
	-- normalize() frames both strings with exactly one blank, so that
	-- beginning and end of the string act as PRN delimiters.
	local lem = normalize( Lemma )
	local nam = normalize( Name  )
	local krz = Kurz
	local geb = Geburt

	prnLemmTable = split( lem )
	prnNameTable = split( nam )

------------------------------ handlePrn, local in HandleRomanNumeral

	local function handlePrn( prnInfoTable, containingTable )
		-- Decide whether a PRN is a RN.
		-- The rules below are tried in order; the first matching rule
		-- decides. If no rule matches, isRN stays nil (unresolved).
		-- Parameters:
		--    prnInfoTable    -- table with information on the PRN,
		--                       see above on function split;
		--                       on return, value is set and isRN is
		--                       set to true, false, or left nil.
		--    containingTable -- table containing the prnInfoTable; it is
		--                       only compared by identity with prnLemmTable
		--                       and prnNameTable to tell whether a lemma
		--                       or a name is being handled

		local lon = prnInfoTable.lon
		local ixA = prnInfoTable.ixA
		local ixZ = prnInfoTable.ixZ
		local prn = prnInfoTable.prn
		local isRN   -- is it a roman numeral?

		local value, valid = RomanNumeralToNumber( prn )

		-- a RN should be valid, <=75, not at the beginning of lemma/name
		if not valid
			or  value > 75   -- 75 == greatest roman numeral to expect
			or  ixA == 1     -- neither a lemma nor a name starts with a RN
		then
			isRN = false

		-- Handle long PRNs, i.e. those with more than 1 digit;
		-- they are RNs with some exceptions (artist names).
		elseif #prn > 1 then
			-- Long PRNs that are not RNs always seem to contain an 'L';
			-- RNs with an 'L' are >= 40, they are expected only after
			--   'Heinrich' (XL - LXXV),
			--   'Günther'  (XL - XLIV).
			if value >= 40 then
				-- Both comparisons look at the 9 bytes ending at ixA.
				isRN =    lon:sub(ixA - 8, ixA) == "Heinrich "
				       or lon:sub(ixA - 8, ixA) == "Günther " -- 'ü' occupies 2 bytes
			else
				isRN = true
			end

		-- Handle short PRNs, i.e. those with 1 digit only;
		-- the decision here is one between letter or digit.

		-- Handle the case PRN = 'L' first,
		-- only Heinrich L. should have a numeral 'L'.
		elseif prn == "L" then
			isRN = ( lon:sub(ixA - 8, ixZ + 1) == "Heinrich L. " )

		-- Only PRN = 'X', 'V', or 'I' remain to be handled.

		-- A PRN in front of or after an abbreviation is a letter;
		-- mostly that is the abbreviation of a first or middle name.
		-- To avoid problems with multibyte-coded majuscules,
		-- we check the periods near the PRN only.

		-- 'period any period' right of PRN -> letter
		elseif    lon:sub(ixZ    , ixZ    ) == "."
			  and lon:sub(ixZ + 2, ixZ + 2) == "."    then
			isRN = false
		-- 'period blank any period' right of PRN -> letter
		elseif    lon:sub(ixZ    , ixZ + 1) == ". "
			  and lon:sub(ixZ + 3, ixZ + 3) == "."    then
			isRN = false
		-- For 2-byte coded majuscules and for 2-letter abbreviations
		-- such as Th., Ch., etc. the following holds:
		-- 'period blank any any period' right of PRN -> letter.
		-- This test can be misled by 'Jr.', 'Sr.',
		-- but that does no harm, because 'letter' follows from them too.
		-- A little danger remains from 'St.'.
		-- (Abbreviations with 3-byte-coded majuscules (as Ḫ) beside a PRN
		-- are unlikely.)
		elseif    lon:sub(ixZ    , ixZ + 1) == ". "
		      and lon:sub(ixZ + 4, ixZ + 4) == "."    then
			isRN = false

		-- 'period' immediately left of PRN -> letter
		elseif    lon:sub(ixA    , ixA    ) == "."    then
			isRN = false
		-- 'period blank' left of PRN -> letter,
		-- exception: Dutch patronym or matronym abbreviations
		elseif    lon:sub(ixA - 1, ixA    ) == ". "
		      and lon:sub(ixA - 3, ixA    ) ~= "sz. "
		      and lon:sub(ixA - 3, ixA    ) ~= "dr. " then
			isRN = false

		-- Some words make a digit very unlikely:
		-- 'junior', 'senior', 'Jr.', 'Sr.', etc. anywhere in the
		-- lemma or name -> the PRN is a letter.
		-- (But note William Wrigley junior II.!)
		elseif    lon:match( "[ (]junior[ ),]"  )
			   or lon:match( "[ (]senior[ ),]"  )
			   or lon:match(   "%(Junior%)"     )
			   or lon:match(   "%(Senior%)"     )
			   or lon:match( "[ (][Jj]r[ .),]"  )
			   or lon:match( "[ (][Ss]r[ .),]"  )
			   or lon:match( "[ (][Jj]un%."     )
			   or lon:match( "[ (][Ss]en%."     )     then
			isRN = false
		-- ('Junior' and 'Senior' without parentheses are more difficult.)

		-- Some contexts argue for digit:
		-- at the right:
		--    '[period] blank opening parenthesis',
		--    'period slash',
		--    '[period] closing parenthesis';
		-- at the left:
		--    'period slash'.
		elseif    lon:sub(ixZ    , ixZ + 2) == ". ("
		       or lon:sub(ixZ    , ixZ + 1) == " ("
               or lon:sub(ixZ    , ixZ + 1) == "./"
		       or lon:sub(ixZ    , ixZ + 1) == ".)"
		       or lon:sub(ixZ    , ixZ    ) == ")"
		       or lon:sub(ixA - 1, ixA    ) == "./"   then
			isRN = true

		-- Some contexts argue for letter:
		-- at the left:
		--    'comma blank'.
		-- (The exception, where '[period] closing parenthesis' follows,
		-- has been handled already.)
		elseif    lon:sub(ixA - 1, ixA    ) == ", "   then
			isRN = false

		-- A PRN without period at the end of lemma or name
		-- can be digit or letter.
		elseif    lon:sub(ixZ - 2, ixZ    ) == " " .. prn .. " "
		      and ixZ == #lon                         then

			-- An 'X' means letter.
			if     prn == "X" then
				isRN = false

			-- In a multiword lemma or name: digit (with two exceptions).
			elseif lon:match( ". .+ . $" ) then
				isRN = (   lon ~= " Florencia de la V "
			           and lon ~= " Prince Far I "     )
				
			-- In a one-word lemma or name, a 'V' means letter.
			elseif prn == "V" then
				isRN = false

			-- In a one-word name, an 'I' means digit.
			elseif containingTable == prnNameTable then
				isRN = true

			-- In a one-word lemma, an 'I' means letter,
			-- with one exception.
			else
				isRN = ( lon == " Crooked I " )
			end

		-- A name without comma indicates 'digit',
		-- some exceptions are handled already.
		elseif containingTable == prnNameTable
			   and not lon:find( "," )                then
			isRN = true

		-- Some prepositions immediately after the PRN
		-- are used for untitled persons,
		-- but not exclusively, so this may go wrong.
		elseif    lon:sub(ixZ    , ixZ + 4) == ". De "
		       or lon:sub(ixZ    , ixZ + 4) == ". Le "
		       or lon:sub(ixZ    , ixZ + 4) == ". du "
		       or lon:sub(ixZ    , ixZ + 5) == ". Van "  then
			isRN = false

		-- The next decisions will be made for the lemma only.
		elseif containingTable == prnLemmTable        then

			-- If lemma ~= firstName blank PRN[.] blank lastName,
			-- where the names are sequences of nonblanks, PRN is a digit.
			-- (But with one exception: Haskell V. Anderson III .)
			if not lon:match( "^ [^ ]+ [IVX]%.? [^ ]+ $" ) then
				isRN = ( lon ~= " Haskell V. Anderson III " )

			-- If the lemma has the form given above, and the person is
			-- marked as 'US-amerikanisch', PRN is a letter.
			-- (But with one exception: James I. Roosevelt.)
			elseif krz:find( "US%-amerikanisch" )  then
				isRN = ( lon == " James I. Roosevelt " )

			-- Similarly, a 'Patriarch' will be followed by a digit.
			elseif krz:find( "Patriarch" )         then
				isRN = true

			-- The use of roman numerals decreases in the 18th/19th
			-- century, the use of middle names increases.
			-- Hence the following rule, but it can fail:
			else
				isRN = year( geb ) <= 1810
			end

		else
			-- No rule applies (only reachable for a name): isRN stays
			-- nil and is resolved by the caller after all PRNs are handled.
		end

		prnInfoTable.isRN  = isRN
		prnInfoTable.value = value

		return
	end -- handlePrn

------------------------------ HandleRomanNumeral (tail)

	-- Handle found PRNs in lemma and name independently.
	for i, prnInfoTable in ipairs( prnLemmTable ) do
		handlePrn( prnInfoTable, prnLemmTable )
	end
	for i, prnInfoTable in ipairs( prnNameTable ) do
		handlePrn( prnInfoTable, prnNameTable )
	end

	-- Fill missing results for the lemma from those of the name
	-- and vice versa: each PRN of the lemma is associated with the first
	-- PRN of the name that has the same text and no contradicting result.
	-- (The implementation is a very naive one,
	-- but the case of two equal PRNs in lemma or name
	-- is very rare and seems to be restricted to two cases:
	--    equal initials, as in 'Hubert L. L. Busard',
	--    artist names, as 'M.I.K.I'.)
	local usedAlready = 0  -- Index of the PRN in name associated last;
	                       -- avoids associating it with 2 PRNs in lemma
	                       -- in direct succession.
	for i, L in ipairs( prnLemmTable ) do
		for j, N in ipairs( prnNameTable ) do

			if L.prn == N.prn
				and not ( L.isRN and N.isRN == false )
				and not ( N.isRN and L.isRN == false )
				and usedAlready ~= j
			then
				if L.isRN == nil then L.isRN = N.isRN end
				if N.isRN == nil then N.isRN = L.isRN end

				usedAlready = j
				break
			end

		end
	end

	-- Sometimes lemma and name contain different numerals,
	-- mostly but not always erroneously;
	-- if there is only one in both, they can be associated.
	-- (A message seems to be appropriate, but success is not denied.)
	if #prnLemmTable == 1 and #prnNameTable == 1 then
		local L = prnLemmTable[1]
		local N = prnNameTable[1]

		if L.prn ~= N.prn
		   and not ( L.isRN and N.isRN == false )
		   and not ( N.isRN and L.isRN == false )
		then
			if L.isRN == nil then L.isRN = N.isRN end
			if N.isRN == nil then N.isRN = L.isRN end

			wanted( false, err.numeralsDiffer )
			errorInName = true
		end
	end

	-- Maybe the question 'RN or not' couldn't be resolved;
	-- when in doubt, we take conservative action and choose 'not a RN'.
	-- An unresolved PRN makes the function report success == false.
	for j, L in ipairs( prnLemmTable ) do
		success = wanted( L.isRN ~= nil, err.lemmPRNopen )
		          and success
		L.isRN = L.isRN or false
	end

	for j, N in ipairs( prnNameTable ) do
		success = wanted( N.isRN ~= nil, err.namePRNopen )
		          and success
		N.isRN = N.isRN or false
	end

	-- The results for lemma and name may be different;
	-- that need not be an error, but probably it is.
	-- If there is exactly one PRN in both,
	-- this situation is noted, but success is not denied.
	if #prnLemmTable == 1 and #prnNameTable == 1 then
		local L = prnLemmTable[1]
		local N = prnNameTable[1]

		wanted( L.isRN == N.isRN, err.contradiction )
	end

	-- Common epilogue: flush messages, restore module-level state.
	handleErrors()
	test  = savedTest
	fname = savedFname
	return prnLemmTable, prnNameTable, success, errorInName
end -- HandleRomanNumeral



-- Export
local p = {

	-- Entry point for callers: forwards its four string arguments to
	-- RomanNumeralInPD and returns both of its results
	-- (1: NAME probably wrong, 2: replace roman numeral by its value).
	facette = function( Lemma, NAME, KURZBESCHREIBUNG, GEBOREN )
		return RomanNumeralInPD( Lemma, NAME, KURZBESCHREIBUNG, GEBOREN )
	end, -- p.facette()

	-- Returns the version identifier of this module.
	failsafe = function()
		return Serial
	end,
}

return p