-- Module:Sandbox/DePiep/uchar
--
-- From WikiProjectMed

-- todo split consist Char and Args
-- todo cwith double dotcircle 230/239, 233, 234
-- thought: option "special notes", listing: "whitespace, control, combining, NaC, .."
require( 'strict' )
local p = {}
local getArgs      = require( 'Module:Arguments' ).getArgs
local uChar_data   = mw.loadData( 'Module:Sandbox/DePiep/uchar/data' )
local uData        = require('Module:Unicode data')
local uData_helper = require('Module:Sandbox/DePiep/uchar-helper')
local uBaseConvert = require('Module:BaseConvert')
local yesno        = require('Module:Yesno')
local str          = require('Module:String')
local plaintext    = require('Module:Plain text')
--- local tabletools   = require('Module:TableTools')
-- Module-level state shared between the functions below.
-- ERRstatus: last error text; set by convertHexInToHex0x and appended to the
-- error output of p.main.
local ERRstatus    = ''
-- tUchar: table of arguments and character properties; (re)assigned by p.main
-- from getArgsAndProps().
local tUchar       = {}

local DOTTED_CIRCLE      = '◌' -- U+25CC
local NBSP               = ' ' -- U+00A0  
local LEFT_TO_RIGHT_MARK = '‎' -- U+200E LEFT-TO-RIGHT MARK (‎) 
local DEFAULT_IMAGE_SIZE = '21px'
local WS_BLUE            = 'lightblue'

-- Test helper: builds a <span id="testH"> containing a <big> element with
-- the WS_BLUE background that holds the wikitext s.
-- @param s  wikitext to place inside the <big> element
-- @return   the span rendered to a string
local function testH( s )
	local span = mw.html.create( 'span' )
	span:attr( 'id', 'testH' )

	-- :tag() returns the new child node; style and fill that child
	local big = span:tag( 'big' )
	big:css( 'background', WS_BLUE )
	big:wikitext( s )

	return tostring( span )
end

-- Wraps tChar.uChar in a <span id="testH"> at 150% font size; when
-- tChar.uIsWhitespace is exactly true the span also gets the WS_BLUE background.
-- @param tChar  table; fields read: uChar, uIsWhitespace
-- @return       the span rendered to a string
local function addStyles( tChar )
	local span = mw.html.create( 'span' )

	span:attr( 'id', 'testH' )
	span:css( 'font-size', '150%' )
	span:wikitext( tChar.uChar )

	if tChar.uIsWhitespace == true then
		-- make otherwise invisible whitespace visible
		span:css( 'background', WS_BLUE )
	end

	return tostring( span )
end

-- Entry point for testing testH(): renders the first positional argument.
-- @param frame  frame object (arguments are read through getArgs)
function p.testH( frame )
	local firstArg = getArgs( frame )[1]
	return testH( firstArg )
end

-- Demo of the mw.html builder: a full-width <div id="testdiv"> holding the
-- text 'Some text' followed by an <hr> child.
-- Expected output (per the original note):
--   <div id="testdiv" style="width:100%;">Some text<hr /></div>
-- @param frame  unused
function p.testFromDoc(frame)
	local box = mw.html.create( 'div' )
	box:attr( 'id', 'testdiv' )
	box:css( 'width', '100%' )
	box:wikitext( 'Some text' )
	box:tag( 'hr' )
	return tostring( box )
end


-- FORMATTERS ===== ===== ===== ===== ===== ===== ===== =====
-- Placeholder formatter: does not wrap anything yet.
-- @param s        string to be wrapped
-- @param arg      unused
-- @param val      unused
-- @param divspan  must be 'div' or 'span'
-- @return s and an empty report string; nil when divspan is neither
--         'div' nor 'span'
local function inTag( s, arg, val, divspan )
	local isKnownTag = ( divspan == 'div' ) or ( divspan == 'span' )
	if not isKnownTag then
		return nil -- ERR
	end

	return s, ''
end

-- nil-safe wrapper around mw.text.decode.
-- @param s  string, or nil
-- @return   nil for nil input, otherwise the result of mw.text.decode( s )
local function decodeString( s )
	if s ~= nil then
		return mw.text.decode( s )
	end
	return nil
end

-- Format string in <code> tag / from m:str find word
-- replaces whitespace by single nbsp ( keep untrimmed ws visible )
-- Wraps s in a <code> tag; every run of whitespace becomes one '&nbsp;'
-- so that untrimmed whitespace stays visible.
-- @param s  string, or nil
-- @return   '' for nil input, otherwise '<code>…</code>'
local function inCode( s )
	if s == nil then
		return ''
	end
	-- the local keeps only the first result of gsub (the string, not the count)
	local visible = string.gsub( s, '%s+', '&nbsp;' )
	return string.format( '<code>%s</code>', visible )
end

-- Use mono font-family ( from: Template:Mono )
-- Wraps s in a monospace <span>; every run of whitespace becomes one '&nbsp;'.
-- @param s  string, or nil (treated as '')
-- @return   the span markup as a string
local function inMono( s )
	local text = s
	if text == nil then
		text = ''
	end
	-- the local keeps only the first result of gsub (the string, not the count)
	local visible = string.gsub( text, '%s+', '&nbsp;' )
	local openTag = '<span class="monospaced" style="font-family: monospace, monospace;">'
	return openTag .. visible .. '</span>'
end

-- Wraps s in a <span class="smallcaps-smaller"> with font-size 85%.
-- The 'xxxvariant' property in the style attribute is kept as found
-- (presumably a deliberately disabled font-variant — confirm before changing).
-- @param s  string, or nil
-- @return   '' for nil or empty input, otherwise the span markup
local function inSmallcaps( s )
	if s ~= nil and s ~= '' then
		return '<span class="smallcaps-smaller" style="font-size:85%; xxxvariant: small-caps;">' .. s .. '</span>'
	end
	return ''
end

-- Builds an external wikilink to fileformat.info; which of the two link
-- kinds is produced depends on whether uHexBare0x is given.
-- @param uHexBare0x  hex digits without '0x' (lower-cased for the URL), or nil
-- @param uHexFormat  link label for the character page, e.g. 'U+00AD'
-- @param sGenCat     General Category code, used only when uHexBare0x is nil
-- @return            '[url label]' wikitext
local function xlLinkFileFormat( uHexBare0x, uHexFormat, sGenCat )
	if uHexBare0x == nil then
		-- GenCat list, for example gencat "Nd":
		-- https://www.fileformat.info/info/unicode/category/Nd/list.htm
		local catLink = '[https://www.fileformat.info/info/unicode/category/' .. sGenCat .. '/list.htm ff.info ' 
		return catLink .. sGenCat .. ']'
	end

	-- Character data page:
	-- https://www.fileformat.info/info/unicode/char/00ad/index.htm (or "/ad/"); no 0x no uc
	local charLink = '[https://www.fileformat.info/info/unicode/char/' .. string.lower( uHexBare0x ) .. '/index.htm ff.info ' 
	return charLink .. uHexFormat .. ']'
end

-- UHEX HANDLERS & FORMATTERS ----- ----- ----- ----- ----- ----- ----- ----- ----- 
-- Formats a hex code point into the normal form "U+00A9": a leading '0x' and
-- leading zeros are removed, then the digits are left-padded to at least four.
-- @param uHex0x  hex string, optionally prefixed with '0x'
-- @param uLink   any non-nil value appends a placeholder suffix (link
--                formatting is still a todo)
-- @return        the 'U+hhhh' string
local function formatUhex( uHex0x, uLink )
	local digits = string.gsub( uHex0x, '^0x', '' )
	digits = string.gsub( digits, '^0*', '' )

	-- keep all significant digits, but never fewer than four
	local width = math.max( #digits, 4 )
	local formatted = 'U+' .. string.sub( '0000' .. digits, -width )

	if uLink == nil then
		return formatted
	end
	return formatted .. '_[todo: fmt Uhex_link_U+]'
end

-- Formats a General Category code as '<mono code>=<first data field>', using
-- the entry found in uChar_data.tGenCat.
-- @param sGenCat  General Category code (key into uChar_data.tGenCat)
-- @param fmt      unused
-- @return         the formatted string, or '' when the code has no entry
local function formatGenCat( sGenCat, fmt )
	local entry = uChar_data.tGenCat[sGenCat]
	if entry ~= nil then
		return inMono( sGenCat ) .. '=' .. entry[1]
	end
	return ''
end

-- Formats table ( array ) using concat
-- replace space by nbsp ( keep untrimmed sp )
-- in monospace font-family
-- Joins an array with '; ', turns whitespace runs into nbsp, and returns the
-- result in a monospace span between '<' and '>'.
-- @param t  array of strings/numbers, or nil
-- @return   '<?>' for nil input, otherwise the formatted list
local function formatTablelist( t ) -- unused?
	if t == nil then return '<?>' end

	local joined = table.concat( t, '; ' )
	-- string.gsub returns (string, count). The extra parentheses keep only the
	-- string, so the replacement count is not passed on as the second
	-- argument of mw.text.decode.
	local decoded = mw.text.decode( ( string.gsub( joined, '%s+', '&nbsp;' ) ) )

	return '<' .. inMono( decoded ) .. '>'
end

-- Decides which prefix character is shown before the character.
--   cwith absent or yes-like : DOTTED_CIRCLE if is_combining is true, else ''
--   cwith no-like            : '' (prefix suppressed)
--   cwith anything else      : the cleaned cwith text itself
-- todo need 4-way logic for cwith
-- @param is_combining  only the exact value true selects the default prefix
-- @param cWith         raw |cwith= value, or nil
-- @return              prefix string, and a debug report string
local function formatCombiningChar( is_combining, cWith )
	cWith = decodeString( cWith )
	local rprt = 'is_combi: ' .. tostring( is_combining ) .. '; cwith: ' .. tostring( cWith )

	-- strip wikicode; but save NBSP -- todo improve, test
	if cWith ~= nil then
		-- NBSP is swapped for the text 'NBSP' before the plain-text call and
		-- restored afterwards
		local shielded = string.gsub( cWith, NBSP, 'NBSP' )
		local stripped = plaintext._main( shielded, false )
		cWith = string.gsub( stripped, 'NBSP', NBSP)
	end

	local uCombWith = yesno( cWith ) -- y/n/nil (3-way logic; 'foo' == nil)

	if ( cWith ~= nil ) and ( uCombWith ~= true ) then
		if uCombWith == false then -- explicitly false, so suppress
			return '', rprt .. '_false, suppress'
		end
		-- use character provided by cwith
		return cWith, rprt .. '_cleanchar: ' .. tostring( cWith )
	end

	-- default: per is_combining
	rprt = rprt .. '_dflt non-combi = none'
	if is_combining == true then
		return DOTTED_CIRCLE, rprt .. '_dflt'
	end
	return '', rprt
end

-- READ & PROCESS ==== ====== ===== ===== ===== ===== ===== =====  
-- Normalises a hex code point given in any accepted form ('A9', '0xA9',
-- 'U+00A9', 'u+00a9', with or without whitespace) and validates it.
-- On failure the reason is stored in the module-level ERRstatus.
-- @param uHexAnyform  hex input; a non-string value is converted with tostring
-- @return uHex0x      '0x' .. upper-case hex digits (leading zeros kept)
-- @return uHexNum     the code point as a number (0 .. 0x10FFFF)
-- @return uHexBare0x  upper-case hex digits without prefix
-- @return uHexFormat  normal form 'U+hhhh' (see formatUhex)
--         A single nil is returned instead when the input is missing, is not
--         hexadecimal, or is out of range.
local function convertHexInToHex0x( uHexAnyform )
	if ( uHexAnyform == nil ) or ( uHexAnyform == '' ) then
		ERRstatus ='ERR convertHexInToHex0x: no uHex input'
		return nil
	end

	-- tostring(): the decoder is only ever handed a string, also when a
	-- caller passes a number
	local uHexBare0x = decodeString( tostring( uHexAnyform ) )
	uHexBare0x = string.gsub( uHexBare0x, '%s', '' )
	-- Upper-case BEFORE stripping prefixes, so 'u+' and '0X' are accepted too
	uHexBare0x = string.upper( uHexBare0x )
	uHexBare0x = string.gsub( uHexBare0x, '^U%+', '' )
	uHexBare0x = string.gsub( uHexBare0x, '^0X', '' )

	-- Accept hex digits only; this also rules out signs, dots and exponents
	-- that a bare tonumber( '0x' .. s ) could let through.
	local uHexNum
	if string.find( uHexBare0x, '^%x+$' ) then
		uHexNum = tonumber( uHexBare0x, 16 )
	end
	if uHexNum == nil then
		-- report the offending input (the parsed number is always nil here)
		ERRstatus ='ERR convertHexInToHex0x: uHex is not hex: >' .. uHexBare0x .. '<'
		return nil
	end

	local uHex0x = '0x' .. uHexBare0x
	if ( uHexNum < 0 ) or ( uHexNum > 0x10FFFF ) then
		ERRstatus ='ERR convertHexInToHex0x: uHex out of U+ range: ' .. uHex0x
		return nil
	end

	local uHexFormat = formatUhex( uHex0x )

	return uHex0x, uHexNum, uHexBare0x, uHexFormat
end

-- Converts a hex value to base 10 through Module:BaseConvert.
-- @param uHex0x  hex value as handed to the converter's n field, or nil
-- @return        nil for nil input, otherwise the converter's (first) result
local function convertHexToDec( uHex0x )
	if uHex0x ~= nil then
		-- parentheses: pass on exactly one value
		return ( uBaseConvert.convert( { from = 16, base = 10, n = uHex0x } ) )
	end
	return nil
end

-- Converts a decimal code point to hex through Module:BaseConvert.
-- @param uDec  decimal value as number or numeric string, or nil
-- @return      nil when uDec is nil or not a decimal number; otherwise
--              whatever the converter returns for base 16
local function convertDecToHex( uDec )
	if uDec == nil then return nil end

	local nDec = uDec
	if type( nDec ) ~= 'number' then
		nDec = tonumber( nDec, 10 )
	end
	-- Not a number (e.g. |dec=abc or a blank value): stop here instead of
	-- handing n = nil to the converter.
	if nDec == nil then return nil end

	return uBaseConvert.convert( {n = nDec, base = 16, from = 10} )
end

-- GET DATA ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== 
-- Looks up the Unicode block of a code point.
-- @param uHexNum  code point as a number
-- @return         the result of uData.lookup_block (no longer discarded in
--                 favour of a placeholder string)
local function getBlock( uHexNum )
	return uData.lookup_block( uHexNum )
end

-- Returns '<plane number>: <plane name>' for a code point; the name is read
-- from uChar_data.tPlanes, indexed by plane number.
-- @param uHexNum  code point as a number
local function getPlane( uHexNum )
	-- each plane spans 0x10000 code points
	local planeNo = math.floor( uHexNum / 0x10000 )
	local planeName = uChar_data.tPlanes[planeNo]
	return planeNo .. ': ' .. planeName
end

-- Canonical Combining Class (CCC) lookup through the -helper module.
-- todo: 239 (230), 233, 234 = between spacing chars.
-- @param uHex0x  code point identifier, passed unchanged to
--                uData_helper.lookup_combiningclass
-- @return        the looked-up value, or '' when the lookup gives nil/false
local function getCombiningClass( uHex0x )
	local found = uData_helper.lookup_combiningclass( uHex0x ) -- new -helper function
	return found or ''
end

-- Lists the named character references of a code point.
-- Data comes from Module:Numcr2namecr, keyed by decimal value, e.g.
--   [168]='&amp;uml;, &amp;die;, &amp;Dot;, &amp;DoubleDot;'
-- @param uDec  decimal code point (number or numeric string)
-- @param fmt   unused
-- @return      nil when the data has no entry; otherwise the names, each as
--              monospace '&amp;name;', joined by '&nbsp;&#x20;'
local function getNamedEntities( uDec, fmt )
	local tNamedEntitiesData = mw.loadData( 'Module:Numcr2namecr' )

	local sNameList = tNamedEntitiesData[tonumber(uDec)]
	if sNameList == nil then return nil end
	sNameList = decodeString( sNameList ) -- has literal '&amp;' in source

	-- the text between '&' and the next ';'
	local patstring = '%f[^&][^%;]+%f[%;]'
	local tNames = {}

	-- ask for the 1st, 2nd, ... match; at most 21 requests
	for nth = 1, 21 do
		local hitWord = str._match( sNameList, patstring, 1, nth, false, '' )
		hitWord = mw.text.trim( hitWord )
		if hitWord == '' then
			-- no more hits in the string
			break
		end
		table.insert( tNames, inMono( '&amp;' .. hitWord .. ';' ) )
	end

	return table.concat( tNames, '&nbsp;&#x20;' ) -- double spaced
end

-- Collects the aliases of a code point from Module:Unicode data/aliases and
-- groups them by alias type.
-- demo 0x002118 = weier
-- @param uHex  code point, used as key into the aliases data
-- @return      nil when there is no entry; otherwise a table with the five
--              array fields abbreviation, alternate, control, correction,
--              figment (each possibly empty)
local function getAliases( uHex )
	local tAllAliases = mw.loadData( 'Module:Unicode data/aliases' )

	local tCPalias = tAllAliases[uHex]
	if tCPalias == nil then return nil end

	-- one (initially empty) list per alias type
	local tAlias5 = {}
	tAlias5.abbreviation = {}
	tAlias5.alternate    = {}
	tAlias5.control      = {}
	tAlias5.correction   = {}
	tAlias5.figment      = {}

	for _, entry in ipairs( tCPalias ) do
		-- entry[1] = alias type ( 1/5 ), entry[2] = the alias itself
		if type( entry ) == 'table' then
			local bucket = tAlias5[entry[1]]
			table.insert( bucket, entry[2] )
		end
	end
	return tAlias5
end

-- Resolves a script code to the name stored in the aliases table of
-- Module:Unicode data/scripts.
-- @param sScriptISO  script code (key into the aliases table), or nil
-- @return            nil for nil input; the alias when found; else '_unk'
local function getScriptName( sScriptISO )
	local UDscripts = mw.loadData( 'Module:Unicode data/scripts' )
	if sScriptISO == nil then
		return nil
	end
	return UDscripts.aliases[sScriptISO] or '_unk'
end

-- Formats the grouped alias table (see getAliases) as one report line:
--   '<br/>ALIASES: ' followed by ' <type>: a1; a2' for each non-empty type.
-- Types are listed in sorted key order, so the output does not depend on the
-- unspecified iteration order of pairs().
-- @param t5Alias  table of arrays keyed by alias type, or nil
-- @param fmt      unused (only the report format exists)
-- @return         nil for nil input, otherwise the report string
local function formatAlias5( t5Alias, fmt )
	if t5Alias == nil then return nil end

	-- collect the keys and put them in a stable order
	local keys = {}
	for k in pairs( t5Alias ) do
		keys[#keys + 1] = k
	end
	table.sort( keys, function( a, b ) return tostring( a ) < tostring( b ) end )

	-- fmt = report
	local parts = { '<br/>ALIASES: ' }
	for _, k in ipairs( keys ) do
		local v = t5Alias[k]
		if #v > 0 then
			parts[#parts + 1] = ' ' .. k .. ': ' .. table.concat( v, '; ' )
		end
	end
	return table.concat( parts )
end

-- 1. PARSE INCOMING ARGS
-- 2. READ PROPERTIES
-- Reads the template arguments, normalises the character ID and looks up the
-- character's properties.
--
-- ID input: |1= or |hex= (hex), |dec= (decimal), |char= (the character).
-- When more than one is given, hex overrides char, which overrides dec
-- (todo: err msg, prio, conflictcheck). A parameter that is present but
-- empty ('') counts as absent, so it cannot mask another ID input.
--
-- @param origArgs  argument table (as returned by getArgs)
-- @return          table of normalised arguments and properties. When no
--                  valid hex value could be derived, only 'rprtOrigIDs' is
--                  set and 'uHex0x' is nil (the reason is in ERRstatus).
local function getArgsAndProps( origArgs )
	local tNewArgs = {}

	-- '' -> nil; every other value is passed through
	local function nonBlank( v )
		if v == '' then return nil end
		return v
	end

-- PART 1 READ & NORMALISE ORIG ARGUMENTS
-- HEX DEC CHAR
	local inHex  = nonBlank( origArgs[1] ) or nonBlank( origArgs['hex'] ) -- todo: split for check?
	local inDec  = nonBlank( origArgs['dec'] )
	local inChar = nonBlank( decodeString( origArgs['char'] ) )

	-- Debug report. The inputs are counted explicitly: '#' is not reliable on
	-- a table that can have holes.
	local nInputs = 0
	local sValues = ''
	local function note( v )
		if v ~= nil then
			nInputs = nInputs + 1
			sValues = sValues .. ' ' .. tostring( v ) .. ';;'
		end
	end
	note( inHex )
	note( inDec )
	note( inChar )
	local rprt = 'R-t0:0 R-t2:' .. nInputs .. sValues

	-- The base input. It stays nil when no ID was given, so that
	-- convertHexInToHex0x can report 'no uHex input'.
	local uHexIn
	if inDec ~= nil then
		uHexIn = convertDecToHex( inDec )
		rprt = rprt .. ' dec;'
	end
	if inChar ~= nil then
		uHexIn = convertDecToHex( mw.ustring.codepoint( inChar ) )
		rprt = rprt .. ' char;'
	end
	if inHex ~= nil then
		uHexIn = inHex
		rprt = rprt .. ' hex;'
	end

	-- REPORT todo: what if >1 input?: err msg, prio, conflictcheck
	tNewArgs['rprtOrigIDs'] = ' |ID in: #t4=' .. nInputs .. ':>' .. rprt .. tostring( uHexIn ) .. '<| '

	-- returns: uHex0x, uHexNum, uHexBare0x, uHexFormat
	tNewArgs['uHex0x'], tNewArgs['uHexNum'], tNewArgs['uHexBare0x'], tNewArgs['uHexFormat'] = convertHexInToHex0x( uHexIn )
	if tNewArgs['uHex0x'] == nil then  -- ERROR
		-- shortcut to error #1: no uHex (valid 0x) input
		return tNewArgs
	end

	-- local shortcuts only
	local uHex0x  = tNewArgs['uHex0x']
	local uHexNum = tNewArgs['uHexNum']

-- DEC
	tNewArgs['uDec'] = convertHexToDec( uHex0x )

-- OTHER ORIG ARGS
	tNewArgs['uNameLink'] = origArgs['link'] or origArgs['nlink'] -- old nlink = depr paramname
	tNewArgs['format']    = origArgs['format'] or ''
	tNewArgs['cwith']     = decodeString( origArgs['cwith'] )

	tNewArgs['uSize']     = origArgs['size']
	tNewArgs['uImage']    = origArgs['image']

	tNewArgs['html']      = origArgs['html'] -- depr?
	tNewArgs['ulink']     = origArgs['ulink'] -- old ulink = depr?

-- test notice
	tNewArgs['test']      = origArgs['test'] or ''

-- PART 2 READ & USE PROPERTIES
-- ASSIGNED, GenCat, Control, Char
	tNewArgs['uIsAssigned'] = uData.is_assigned( uHexNum )

	if tNewArgs['uIsAssigned'] == true then
		tNewArgs['uGenCat'] = uData.lookup_category( uHexNum )
		-- Use the bare hex digits: uHex0x already carries a '0x' prefix,
		-- which would give a malformed '&#x0x…;' character reference.
		tNewArgs['uChar']   = mw.text.decode( '&#x' .. tNewArgs['uHexBare0x'] .. ';' )
	else
		tNewArgs['uGenCat'] = 'Xx' -- todo not assigned == <reserved>?
		tNewArgs['uChar']   = 'ERR_not_assg' -- ERROR
	end

	tNewArgs['uBlock'] = uData.lookup_block( uHexNum )
	tNewArgs['uPlane'] = getPlane( uHexNum )

-- CONTROL: GenCat 'Cc' (assuming this is 1:1); shown as replacement character
	tNewArgs['uIsControl'] = ( tNewArgs['uGenCat'] == 'Cc' )
	if tNewArgs['uIsControl'] then
		tNewArgs['uChar'] = '&#xFFFD;' -- '?' placeholder
	end

-- NAME, ALIASES
	tNewArgs['uName']   = uData.lookup_name( uHexNum )
	tNewArgs['Aliases'] = getAliases( uHexNum ) -- table5

-- PROPS Script, Latin, WS
	tNewArgs['uIsLatin']      = uData.is_Latin( tostring( tNewArgs['uChar'] ) )
	tNewArgs['uScript']       = uData.lookup_script( uHexNum )
	tNewArgs['uScriptName']   = getScriptName( tNewArgs['uScript'] )
	tNewArgs['uIsWhitespace'] = uData.is_whitespace( uHexNum )

-- PROPS rtl
	tNewArgs['uIsRtl']        = uData.is_rtl( tostring( tNewArgs['uChar'] ) )

-- PROPS2 COMBINING PREFIX Combining/cwith/dottedcircle, CCC
	tNewArgs['uIsCombining']    = uData.is_combining( uHexNum ) or false
	-- the class is looked up once, for every character (combining or not)
	tNewArgs['uCombiningClass'] = getCombiningClass( uHexNum )
	tNewArgs['uCharPrefix'], tNewArgs['uCwithReport'] = formatCombiningChar( tNewArgs['uIsCombining'], tNewArgs['cwith'] )

-- CHAR SUFFIX; rtl
	if tNewArgs['uIsRtl'] == true then
		tNewArgs['uCharSuffix'] = LEFT_TO_RIGHT_MARK
	else
		tNewArgs['uCharSuffix'] = ''
	end

-- PROPS3: NamedEntities (uDec is the decimal value computed above)
	tNewArgs['NamedEntities'] = getNamedEntities( tNewArgs['uDec'] )

	return tNewArgs
end

-- Entry point for calls from other modules; not implemented yet.
-- @param args  unused
-- @return      a fixed placeholder string
function p._main ( args )
	local placeholder = '_todo _main'
	return placeholder
end

-- Template entry point: renders the character plus a multi-line debug report
-- of its properties.
-- Arguments are read untrimmed and with blank values kept. The result of
-- getArgsAndProps is stored in the module-level tUchar.
-- @param frame  frame object
-- @return       wikitext; a short error line when no valid hex input exists
function p.main ( frame )
	local origArgs = getArgs( frame, { trim=false, removeBlanks=false } )

	tUchar = getArgsAndProps( origArgs )
	local c = tUchar -- short alias for the same table

	if c['uHex0x'] == nil then
		return ' >' .. ( origArgs[1] or '?' ) .. '< ERR hexIn ' .. ERRstatus ..  ' ' .. (c['rprtOrigIDs'] or 'unk1')
	end

	-- REPORT RPRT
	local out = formatUhex( c['uHex0x'] )

	-- string together & css format: cwith, rtl
	c.uChar = c['uCharPrefix'] .. c.uChar .. c['uCharSuffix']
	local cssChar = addStyles( c )

	-- an image, when given, is shown instead of the styled character
	if c['uImage'] == nil then
		out = out .. ' ' .. cssChar .. ' '
	else
		out = out .. ' [[file:' .. c['uImage'] .. '|' .. ( c['uSize'] or DEFAULT_IMAGE_SIZE ) .. ']] '
	end

	out = out .. inSmallcaps( c['uName'] )

	-- line: test notice, input report, hex, dec, external link, GenCat
	local idLine = '<br/>[testing: ' .. c['test'] .. ']' .. (c['rprtOrigIDs'] or '?') .. '&rarr; '
		.. c['uHex0x'] .. ' [' .. c['uDec'] .. '<sub>dec</sub>]'.. '; (' .. xlLinkFileFormat( c['uHexBare0x'], c['uHexFormat'] ) .. ') '
		.. 'GC: ' .. formatGenCat( c['uGenCat'] ) .. ' (' .. xlLinkFileFormat( nil, nil, c['uGenCat'] ) .. ')'

	-- line: assigned, whitespace
	local assignedLine = '<br/>ASSIG: ' .. tostring( c['uIsAssigned'] )  .. '; '
		.. 'WS: '.. tostring( c['uIsWhitespace'] )

	-- line: block, plane
	local blockLine = '<br/>BLK: ' .. c['uBlock'] .. '; PLANE: ' .. c['uPlane'] .. '; '

	-- line: script, rtl
	local scriptLine = '<br/>SC: ' .. c['uScript'] .. '=' .. c['uScriptName'] .. '; RTLsuffix:' .. tostring( c['uIsRtl'] )   .. '; '

	-- line: combining prefix, cwith report, CCC
	local combiLine = '<br/>COMBI PREFIX: >' .. c['uCharPrefix'] .. '<; ' .. c['uCwithReport']
		.. '; CCC class:' .. ( c['uCombiningClass'] or '-' )

	out = out .. idLine .. assignedLine .. blockLine .. scriptLine .. combiLine

	if c['NamedEntities'] ~= nil then
		out = out .. '<br/>NAMED ENTITIES: ' .. c['NamedEntities']
	end

	if c['Aliases'] ~= nil then
		out = out .. formatAlias5( c['Aliases'], 'report' )
	end

	return out
end

-- Test entry point: returns the code points at positions 1 to 2 of |char=
-- (mw.ustring.codepoint returns one value per position).
-- @param frame  frame object; reads frame.args['char'] directly
function p.test(frame)
	local input = frame.args['char']
	return mw.ustring.codepoint( input, 1, 2 )
end

-- Test entry point for getScriptName(): looks up the first positional
-- argument.
-- @param frame  frame object; reads frame.args[1] directly
function p.testScriptName( frame )
	local scriptCode = frame.args[1]
	return getScriptName( scriptCode )
end

return p