Modul:Vorlage:Personendaten/plugin
Zur Navigation springen
Zur Suche springen
Vorlagenprogrammierung | Diskussionen | Lua | Test | Unterseiten | |||
---|---|---|---|---|---|---|---|
Modul | Deutsch | English
|
Modul: | Dokumentation |
-- Version identifier of this module (a date string); returned by p.failsafe().
local Serial = "2017-01-01"
--[=[
Vorlage:Personendaten/plugin
]=]
-- Forward declaration: RomanNumeralInPD() below calls HandleRomanNumeral,
-- whose function value is only assigned further down in this file.
local HandleRomanNumeral
local function RomanNumeralInPD( Lemma, NAME, KURZBESCHREIBUNG, GEBOREN )
    -- Reduce the analysis done by HandleRomanNumeral to two flags.
    -- Parameter:
    --     Lemma, NAME, KURZBESCHREIBUNG, GEBOREN
    --         -- passed through unchanged to HandleRomanNumeral
    -- Returns:
    --     1 -- the 'errorInName' result of HandleRomanNumeral
    --          (true: NAME= is probably set up wrongly)
    --     2 -- 'isRN' of the first PRN found in NAME, or false if
    --          NAME contains no PRN
    --          (true: replace the roman numeral by its numeric value)
    -- Provisional: only the first PRN of NAME is looked at; the lemma
    -- table and the success flag are ignored.
    local _, nameEntries, _, nameSuspicious =
        HandleRomanNumeral( Lemma, NAME, KURZBESCHREIBUNG, GEBOREN )
    if #nameEntries > 0 then
        return nameSuspicious, nameEntries[1].isRN
    end
    return nameSuspicious, false
end -- RomanNumeralInPD()
-- Value that HandleRomanNumeral copies into 'test' while it is running.
local testHandleRomanNumeral = true
-- Terminology used throughout this file:
-- 'PRN' means 'potential roman numeral'.
-- A PRN is any nonempty sequence of the 'digits'
-- 'I', 'V', 'X', 'L'
-- which is delimited
-- at the left by blank, '(', '.', or '/', i.e. [ (./],
-- at the right by blank, ')', '.', ',' or '/', i.e. [ )./,],
-- where the beginning and the end of the string are treated as blanks.
-- 'RN' means 'roman numeral'.
-- 'name' means the value of the NAME field
-- in the template 'Personendaten', where appropriate.
local test = false -- test mode flag; messages are collected/printed only if true
local fname = "?" -- name of the running function, prefixed to printed messages
local errors = {[0] = ""} -- collected messages; index 0 holds an empty sentinel
local function handleErrors()
    -- Print collected error messages, then clear message table 'errors'.
    -- Messages are printed only in test mode ('test' is true). They are
    -- sorted first, so that equal messages become neighbours and each
    -- distinct message is printed once, prefixed with 'fname'.
    -- Parameter: none (works on the file-level 'test', 'fname', 'errors').
    -- Returns: nothing.
    if test then
        -- Scribunto removes the global 'print' while a module is executed;
        -- calling it there would raise "attempt to call a nil value".
        -- Fall back to mw.log in that environment; plain Lua keeps 'print'.
        local emit = print or ( mw and mw.log )
        table.sort( errors )
        for i, errstring in ipairs( errors ) do
            -- errors[0] is the "" sentinel, so i == 1 needs no special case.
            if emit and errstring ~= errors[i - 1] then
                emit( fname .. ": " .. errstring )
            end
        end
    end
    errors = {[0] = ""}
end -- handleErrors
local function wanted( condition, errstring )
    -- Pass a condition through and, in test mode, record a message
    -- when the condition does not hold.
    -- Parameter:
    --     condition -- value checked for truth
    --     errstring -- string, message appended to 'errors' if the
    --                  condition is false or nil and 'test' is set
    -- Returns:
    --     condition, unchanged
    if condition or not test then
        return condition
    end
    errors[#errors + 1] = errstring
    return condition
end -- wanted
local function normalize( s )
    -- Frame a string with exactly one blank on each side.
    -- All blanks (only the character ' ') at the beginning and at the
    -- end are removed first; blanks inside the string are kept.
    -- Parameter:
    --     s -- string, to handle
    -- Returns:
    --     string, " " .. stripped s .. " "
    local core = s:match( "^ *(.-) *$" )
    return " " .. core .. " "
end -- normalize
local function split( lon )
    -- Collect every PRN occurring in a lemma or name.
    -- Parameter:
    --     lon -- string, lemma or name (callers pass it normalized)
    -- Returns:
    --     sequence of tables, one per PRN, in order of occurrence.
    --     Each table is created with these fields:
    --         lon -- string, the searched string itself;
    --         ixA -- number, index of the delimiter left of the PRN;
    --         ixZ -- number, index of the delimiter right of the PRN;
    --         prn -- string, the PRN.
    --     The fields 'isRN' and 'value' are not set here; they are
    --     filled in later by the caller.
    local found = {}
    local left, right, numeral = lon:find( "[ (./]([IVXL]+)[. ,)/]", 1 )
    while left do
        found[#found + 1] = {
            lon = lon,
            ixA = left,
            ixZ = right,
            prn = numeral,
        }
        -- Resume at the right delimiter: it may serve as the left
        -- delimiter of the next PRN.
        left, right, numeral = lon:find( "[ (./]([IVXL]+)[. ,)/]", right )
    end
    return found
end -- split
local function year( geb )
    -- Estimate a birth year from a free-text date. (May be very rough.)
    -- Parameter:
    --     geb -- string, field GEBURTSDATUM from template 'Personendaten'
    -- Returns:
    --     number, rough birth year; the only question that matters to
    --     the caller is 'less/equal than 1810 or greater?'.
    --     Any text containing "Chr" yields 0, a century such as
    --     '19. Jahrhundert' counts as 1900, and a text without any
    --     digit yields 2000 (which prefers 'PRN is not a RN').
    if geb:find( "Chr" ) then
        return 0
    end
    -- Tried in this order; the first pattern that matches wins.
    local attempts = {
        "[12][0-9][0-9][0-9]",          -- four-digit year
        "[1-9][0-9][0-9]",              -- three-digit year
        "([12][0-9])%.? Jahrhundert",   -- two-digit century
        "([1-9])%.? Jahrhundert",       -- one-digit century
        "([1-9])[0-9]",                 -- first digit only: avoid nn*100>1810
        "[0-9]",                        -- any single digit
    }
    local digits
    for _, pattern in ipairs( attempts ) do
        digits = geb:match( pattern )
        if digits then
            break
        end
    end
    local estimate = tonumber( digits or "2000" )
    -- Values below 100 are taken as a century count.
    if estimate < 100 then
        estimate = estimate * 100
    end
    return estimate
end -- year
local function RomanNumeralToNumber( numeral )
    -- Evaluate a roman numeral in the range I - LXXXIX and tell
    -- whether it is well-formed.
    -- Parameter:
    --     numeral -- string, nonempty sequence of roman digits,
    --                where roman digits are I, V, X, L.
    -- Returns:
    --     number  -- sum of the parts read from the left; this is the
    --                numeral's value if the second result is true
    --     Boolean -- true if the whole string was consumed, i.e. it is
    --                a valid roman numeral
    --                (...IIII counts as valid, ...XXXX... does not)
    local total = 0
    local pos = 1

    -- Consume 'token' at the current position, if it is there.
    local function take( token, worth )
        if numeral:sub( pos, pos + #token - 1 ) == token then
            pos = pos + #token
            total = total + worth
            return true
        end
        return false
    end

    -- Tens: either XL, or an optional L followed by up to three X.
    if not take( "XL", 40 ) then
        take( "L", 50 )
        for _ = 1, 3 do
            take( "X", 10 )
        end
    end
    -- Units: IX, or IV, or an optional V followed by up to four I
    -- (four, so that forms like XIIII are accepted).
    if not ( take( "IX", 9 ) or take( "IV", 4 ) ) then
        take( "V", 5 )
        for _ = 1, 4 do
            take( "I", 1 )
        end
    end
    return total, ( pos == #numeral + 1 ) -- valid, if numeral is exhausted
end -- RomanNumeralToNumber
------------------------------ HandleRomanNumeral (head)
HandleRomanNumeral = function (Lemma, Name, Kurz, Geburt)
-- Analyze potential roman numerals (PRNs) in Personendaten:
-- find them in lemma and NAME and decide for each one whether it is
-- meant as a roman numeral (RN) or as a letter.
-- While running, the file-level variables 'fname' and 'test' are
-- overwritten; both are restored before every return below.
-- Parameter:
-- Lemma -- string, lemma of an article in WP-de on a person
-- Name -- string, NAME from Personendaten of that article
-- Kurz -- string, KURZBESCHREIBUNG from Personendaten
-- Geburt -- string, GEBURTSDATUM from Personendaten
-- Returns:
-- 1 prnLemmTable -- table of tables for the lemma,
-- see above on function split;
-- isRN and value are set now
-- 2 prnNameTable -- table of tables for the name, ditto
-- 3 success -- Boolean, did function work successfully?
-- (false if an argument is not a string or
-- if a PRN could not be resolved)
-- 4 errorInName -- Boolean, 'NAME seems to be erroneous?'
-- (only a first attempt at answering this;
-- shouldn't be handled here anyway)
-- If an argument is not a string, both tables are returned empty.
local savedFname = fname
fname = "HandleRomanNumeral"
local savedTest = test
test = testHandleRomanNumeral
local prnLemmTable, prnNameTable = {}, {} -- results
local success = true
local errorInName = false -- only one case checked yet
-- Messages handed to wanted(); they are only recorded in test mode.
local err = {
argType = "invalid argument type, string wanted",
lemmPRNopen = "PRN in lemma could not be resolved",
namePRNopen = "PRN in NAME could not be resolved",
contradiction = "lemma and NAME seem to contradict",
numeralsDiffer = "different numerals in lemma and NAME",
}
-- error: argument not of type 'string':
success = wanted( type(Lemma) == "string"
and type(Name) == "string"
and type(Kurz) == "string"
and type(Geburt) == "string", err.argType )
and success
if not success then -- hard error, quick return
-- goto ::retour:: -- would be better than the following 4 lines
handleErrors()
test = savedTest
fname = savedFname
return prnLemmTable, prnNameTable, success, errorInName
end
-- Lemma and name get exactly one blank at each end, so that the
-- delimiter checks below need no special cases at the string ends.
local lem = normalize( Lemma )
local nam = normalize( Name )
local krz = Kurz
local geb = Geburt
prnLemmTable = split( lem )
prnNameTable = split( nam )
------------------------------ handlePrn, local in HandleRomanNumeral
local function handlePrn( prnInfoTable, containingTable )
-- Decide whether a PRN is a RN.
-- The branches below form one ordered chain of heuristics: the first
-- one that applies decides, so their order matters.
-- Parameters:
-- prnInfoTable -- table with information on the PRN,
-- see above on function split;
-- on return, 'value' is always set;
-- 'isRN' is set unless no rule applied,
-- in which case it stays nil.
-- containingTable -- table containing the prnInfoTable; it is only
-- compared with prnLemmTable / prnNameTable to
-- tell whether a lemma or a name is handled
local lon = prnInfoTable.lon
local ixA = prnInfoTable.ixA
local ixZ = prnInfoTable.ixZ
local prn = prnInfoTable.prn
local isRN -- is it a roman numeral?
local value, valid = RomanNumeralToNumber( prn )
-- a RN should be valid, <=75, not at the beginning of lemma/name
if not valid
or value > 75 -- 75 == greatest roman numeral to expect
or ixA == 1 -- neither a lemma nor a name starts with a RN
then
isRN = false
-- Handle long PRNs, i.e. those with more than 1 digit;
-- they are RNs with some exceptions (artist names).
elseif #prn > 1 then
-- Long PRNs that are not RNs always seem to contain an 'L';
-- RNs with an 'L' are >= 40, and they are expected only after
-- 'Heinrich' (XL - LXXV),
-- 'Günther' (XL - XLIV).
if value >= 40 then
-- Both literals are 9 bytes long and end with the blank at ixA.
isRN = lon:sub(ixA - 8, ixA) == "Heinrich "
or lon:sub(ixA - 8, ixA) == "Günther " -- 'ü' counts as 2 bytes
else
isRN = true
end
-- Handle short PRNs, i.e. those with 1 digit only;
-- the decision here is between letter and digit.
-- Handle the case PRN = 'L' first,
-- only Heinrich L. should have a numeral 'L'.
elseif prn == "L" then
isRN = ( lon:sub(ixA - 8, ixZ + 1) == "Heinrich L. " )
-- Only PRN = 'X', 'V', or 'I' remain to be handled.
-- A PRN in front of or after an abbreviation is a letter;
-- mostly that is the abbreviation of a first or middle name.
-- To avoid problems with multibyte-coded majuscules,
-- we check the periods near the PRN only.
-- 'period any period' right of PRN -> letter
elseif lon:sub(ixZ , ixZ ) == "."
and lon:sub(ixZ + 2, ixZ + 2) == "." then
isRN = false
-- 'period blank any period' right of PRN -> letter
elseif lon:sub(ixZ , ixZ + 1) == ". "
and lon:sub(ixZ + 3, ixZ + 3) == "." then
isRN = false
-- For 2-byte coded majuscules and for 2-letter abbreviations
-- such as Th., Ch., etc. the rule is
-- 'period blank any any period' right of PRN -> letter.
-- This test can be misled by 'Jr.', 'Sr.',
-- but that does no harm, because those imply 'letter' too.
-- A little danger remains from 'St.'.
-- (Abbreviations with 3-byte-coded majuscules (such as Ḫ) beside a PRN
-- are unlikely.)
elseif lon:sub(ixZ , ixZ + 1) == ". "
and lon:sub(ixZ + 4, ixZ + 4) == "." then
isRN = false
-- 'period' immediately left of PRN -> letter
elseif lon:sub(ixA , ixA ) == "." then
isRN = false
-- 'period blank' left of PRN -> letter,
-- exception: Dutch patronym or matronym abbreviations
elseif lon:sub(ixA - 1, ixA ) == ". "
and lon:sub(ixA - 3, ixA ) ~= "sz. "
and lon:sub(ixA - 3, ixA ) ~= "dr. " then
isRN = false
-- Some words make a digit very unlikely:
-- 'junior', 'senior', 'Jr.', 'Sr.', etc. anywhere in the
-- lemma/name -> letter.
-- (But note William Wrigley junior II.!)
elseif lon:match( "[ (]junior[ ),]" )
or lon:match( "[ (]senior[ ),]" )
or lon:match( "%(Junior%)" )
or lon:match( "%(Senior%)" )
or lon:match( "[ (][Jj]r[ .),]" )
or lon:match( "[ (][Ss]r[ .),]" )
or lon:match( "[ (][Jj]un%." )
or lon:match( "[ (][Ss]en%." ) then
isRN = false
-- ('Junior' and 'Senior' are more difficult.)
-- Some contexts argue for digit:
-- at the right:
-- '[period] blank openingParenthesis',
-- 'period slash',
-- '[period] closingParenthesis';
-- at the left:
-- 'period slash'.
elseif lon:sub(ixZ , ixZ + 2) == ". ("
or lon:sub(ixZ , ixZ + 1) == " ("
or lon:sub(ixZ , ixZ + 1) == "./"
or lon:sub(ixZ , ixZ + 1) == ".)"
or lon:sub(ixZ , ixZ ) == ")"
or lon:sub(ixA - 1, ixA ) == "./" then
isRN = true
-- Some contexts argue for letter:
-- at the left:
-- 'comma blank'.
-- (The exception, where '[period] closingParenthesis' follows,
-- has been handled already.)
elseif lon:sub(ixA - 1, ixA ) == ", " then
isRN = false
-- A PRN without period at the end of lemma or name
-- can be digit or letter.
elseif lon:sub(ixZ - 2, ixZ ) == " " .. prn .. " "
and ixZ == #lon then
-- An 'X' means letter.
if prn == "X" then
isRN = false
-- In a multiword lemma or name: digit (with two exceptions).
elseif lon:match( ". .+ . $" ) then
isRN = ( lon ~= " Florencia de la V "
and lon ~= " Prince Far I " )
-- In a one-word lemma or name, a 'V' means letter.
elseif prn == "V" then
isRN = false
-- In a one-word name, an 'I' means digit.
elseif containingTable == prnNameTable then
isRN = true
-- In a one-word lemma, an 'I' means letter,
-- with one exception.
else
isRN = ( lon == " Crooked I " )
end
-- A name without comma indicates 'digit',
-- some exceptions are handled already.
elseif containingTable == prnNameTable
and not lon:find( "," ) then
isRN = true
-- Some prepositions immediately after the PRN
-- are used for untitled persons,
-- but not exclusively, so this may go wrong.
elseif lon:sub(ixZ , ixZ + 4) == ". De "
or lon:sub(ixZ , ixZ + 4) == ". Le "
or lon:sub(ixZ , ixZ + 4) == ". du "
or lon:sub(ixZ , ixZ + 5) == ". Van " then
isRN = false
-- The next decisions are made for the lemma only.
elseif containingTable == prnLemmTable then
-- If lemma ~= firstName blank PRN[.] blank lastName,
-- where the names are sequences of nonblanks, PRN is a digit.
-- (But with one exception: Haskell V. Anderson III .)
if not lon:match( "^ [^ ]+ [IVX]%.? [^ ]+ $" ) then
isRN = ( lon ~= " Haskell V. Anderson III " )
-- If the lemma has the form given above and the person
-- is marked as 'US-amerikanisch', PRN is a letter.
-- (But with one exception: James I. Roosevelt.)
elseif krz:find( "US%-amerikanisch" ) then
isRN = ( lon == " James I. Roosevelt " )
-- Similarly, a 'Patriarch' will be followed by a digit.
elseif krz:find( "Patriarch" ) then
isRN = true
-- The use of roman numerals decreases in the 18th/19th century,
-- while the use of middle names increases.
-- So decide by birth year; this can fail:
else
isRN = year( geb ) <= 1810
end
else
-- No rule applied: isRN stays nil (unresolved); the caller
-- tries to fill it in from the counterpart in lemma/name.
end
prnInfoTable.isRN = isRN
prnInfoTable.value = value
return
end -- handlePrn
------------------------------ HandleRomanNumeral (tail)
-- Handle found PRNs in lemma and name independently.
for i, prnInfoTable in ipairs( prnLemmTable ) do
handlePrn( prnInfoTable, prnLemmTable )
end
for i, prnInfoTable in ipairs( prnNameTable ) do
handlePrn( prnInfoTable, prnNameTable )
end
-- Fill missing results for the lemma from those of the name and
-- vice versa.
-- (The implementation is a very naive one,
-- but the case of two equal PRNs in lemma or name
-- is very rare and seems to be restricted to two cases:
-- equal initials, as in 'Hubert L. L. Busard',
-- artist names, as 'M.I.K.I'.)
local usedAlready = 0 -- Avoid association of a PRN in name
-- with 2 PRNs in lemma.
-- (Only the most recently used name index
-- is remembered.)
for i, L in ipairs( prnLemmTable ) do
for j, N in ipairs( prnNameTable ) do
-- Associate equal PRNs, unless one side says 'RN' and the other
-- explicitly says 'not a RN'.
if L.prn == N.prn
and not ( L.isRN and N.isRN == false )
and not ( N.isRN and L.isRN == false )
and usedAlready ~= j
then
if L.isRN == nil then L.isRN = N.isRN end
if N.isRN == nil then N.isRN = L.isRN end
usedAlready = j
break
end
end
end
-- Sometimes lemma and name contain different numerals,
-- mostly but not always erroneously;
-- if there is only one in both, they can be associated.
-- (A message seems to be appropriate, but success is not denied.)
if #prnLemmTable == 1 and #prnNameTable == 1 then
local L = prnLemmTable[1]
local N = prnNameTable[1]
if L.prn ~= N.prn
and not ( L.isRN and N.isRN == false )
and not ( N.isRN and L.isRN == false )
then
if L.isRN == nil then L.isRN = N.isRN end
if N.isRN == nil then N.isRN = L.isRN end
wanted( false, err.numeralsDiffer )
errorInName = true
end
end
-- Maybe the question 'RN or not' could not be resolved;
-- in doubt, we take conservative action and choose 'not a RN'.
-- An unresolved PRN also sets 'success' to false.
for j, L in ipairs( prnLemmTable ) do
success = wanted( L.isRN ~= nil, err.lemmPRNopen )
and success
L.isRN = L.isRN or false
end
for j, N in ipairs( prnNameTable ) do
success = wanted( N.isRN ~= nil, err.namePRNopen )
and success
N.isRN = N.isRN or false
end
-- The results for lemma and name may be different;
-- that need not be an error, but probably it is.
-- If there is exactly one PRN in both,
-- this situation is noted, but success is not denied.
if #prnLemmTable == 1 and #prnNameTable == 1 then
local L = prnLemmTable[1]
local N = prnNameTable[1]
wanted( L.isRN == N.isRN, err.contradiction )
end
-- ::retour::
-- Common exit: print/clear collected messages and restore the
-- file-level 'test' and 'fname' saved at entry.
handleErrors()
test = savedTest
fname = savedFname
return prnLemmTable, prnNameTable, success, errorInName
end -- HandleRomanNumeral
-- Export
-- Table returned as this module's interface.
local p = {}
-- Entry point: forwards its four arguments to RomanNumeralInPD and
-- returns that function's two results unchanged
-- (1: NAME= probably set up wrongly, 2: replace roman numeral by number).
function p.facette( Lemma, NAME, KURZBESCHREIBUNG, GEBOREN )
return RomanNumeralInPD( Lemma, NAME, KURZBESCHREIBUNG, GEBOREN )
end -- p.facette()
-- Returns the version string 'Serial' of this module.
function p.failsafe()
return Serial
end
return p