Module:Sandbox/Trappist the monk/Emoji data make

From WikiProjectMed
Jump to navigation Jump to search

--[[

this module reads html of https://unicode.org/Public/emoji/latest/emoji-test.txt and creates a data table suitable
for Module:Emoji

1. open https://unicode.org/Public/emoji/latest/emoji-test.txt
2. view page source
3. select and copy the whole html text to clipboard
4. paste into this module's doc page inside the <!-- --> comment markup
5. save
6. copy the rendered table from the module documentation and paste it over the existing table in Module:Emoji/data

{{#invoke:Sandbox/Trappist the monk/Emoji data make|main}}

Is ~/annotations/americas.html the best source?  What about:
	https://www.unicode.org/emoji/charts/full-emoji-list.html (takes a very long time to load) – currently v15.1
		but: the html source (view source) loads relatively quickly
			but: that source is much much much 'longer than the maximum of 2,048 kilobytes'
	https://unicode.org/Public/emoji/15.1/emoji-sequences.txt; simple text is good but doesn't provide names for each code
	https://unicode.org/Public/emoji/15.1/emoji-test.txt; simple text is good; appears to provide names;
		there are duplicates qualified with FE0F as the last subcode; what to do about them? names appear to be
		the same so drop the duplicates? date and version can be read from the source
	
]]

require ('strict');


--[[--------------------------< R E N D E R _ O U T P U T >----------------------------------------------------

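render the lua table emoji_hex_from_name_t{} that this module creates; the table is wrapped in <syntaxhighlight>
tags and its opening line carries a comment with the version and timestamp of the source data.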
													
]]

local function render_output (frame, out_t, timestamp, version)
	-- frame: object whose preprocess() method expands the finished wikitext
	-- out_t: sequence of already formatted table-entry lines; modified in place (opening line is
	--        inserted at the front, closing line is appended at the back)
	-- timestamp, version: strings written into the comment on the opening line
	-- returns: whatever frame:preprocess() returns for the newline-joined lines

	out_t[#out_t + 1] = '\t}</syntaxhighlight>';								-- closing brace of the lua table and closing tag

	local opening_line = '<syntaxhighlight lang="lua">local emoji_hex_from_name_t = {'	-- opening tag and start of the lua table
		.. string.rep ('\t', 13)												-- tabs that position the version/timestamp comment
		.. '-- v.' .. version													-- version prefix and the version
		.. '; ' .. timestamp;													-- separator and the timestamp

	table.insert (out_t, 1, opening_line);										-- opening line goes ahead of all entries

	local wikitext = table.concat (out_t, '\n');								-- one line per sequence member
	return frame:preprocess (wikitext);
end


--[[--------------------------< M A I N >----------------------------------------------------------------------
]]

local function main (frame)
	-- Entry point for {{#invoke:Sandbox/Trappist the monk/Emoji data make|main}}.
	--
	-- Reads the content of this module's /doc page (where the text of emoji-test.txt has been pasted),
	-- extracts the hex code point sequence, the emoji, and the emoji name from every data line, and
	-- returns a rendered lua table that maps normalized emoji names to hex sequences.
	--
	-- frame: the frame object; supplies getTitle() and preprocess()
	-- returns: the value returned by render_output() (preprocessed wikitext)
	-- raises: error() with a descriptive message when the /doc page cannot be read or when the
	--         '# Date:' or '# Version:' lines cannot be found in its content

	local page_title = frame:getTitle() .. '/doc';
	local title_object_t = mw.title.new (page_title);							-- get the title object for the doc page of this module

	local content = title_object_t and title_object_t:getContent();			-- get the content of that page; nil when the page cannot be read
	if not content then
		error ('unable to read the content of ' .. page_title, 0);
	end

	local timestamp = content:match ('# Date: (%d%d%d%d%-%d%d%-%d%d, %d%d:%d%d:%d%d) GMT');	-- get parts of the timestamp
	if not timestamp then
		error ('\'# Date:\' line not found in ' .. page_title, 0);
	end
	timestamp = timestamp:gsub (',%s+', 'T');									-- 'YYYY-MM-DD, hh:mm:ss' -> 'YYYY-MM-DDThh:mm:ss'

	local version = content:match ('# Version: (%d+%.%d+)');
	if not version then
		error ('\'# Version:\' line not found in ' .. page_title, 0);
	end

	local curly_quotes_t = {													-- replacements for typographic quotes; built once, not per line
		['“'] = '"',															-- replace “” (U+201C & U+201D) with " (U+0022)
		['”'] = '"',
		['‘'] = "\\'",															-- replace ‘’ (U+2018 & U+2019) with escaped ' (U+0027 typewriter apostrophe)
		['’'] = "\\'",
	};

	local data_t = {};															-- raw data extracted from the source goes here indexed by emoji hex value(s)

	-- NOTE(review): data_t is keyed by hex, so different hex sequences that share a name each produce
	-- an output line with the same key; confirm whether that is acceptable for Module:Emoji/data

	for line in content:gmatch ('([%x ]+;[^\n\r]+)[\n\r]+') do					-- a data line: hex strings and spaces, a semicolon, then the rest of the line
		local name = line:match ('E%d+%.%d+ (.+)');								-- get emoji name; follows the 'E<major>.<minor> ' marker

		if name then															-- skip lines that look like data lines but carry no name
			local hex = line:match ('[%x ]+');									-- one or more hexadecimal strings separated by space characters
			hex = mw.text.trim (hex);											-- remove extraneous whitespace
			hex = hex:gsub (' +FE0F$', '');										-- remove trailing u+FE0F
			hex = hex:lower();													-- down case

			local emoji = line:match ('# +([^ ]+) ');							-- get the emoji itself; used in the trailing comment of the entry

			name = name:gsub ("'", "\\'");										-- escape ' (U+0027); must happen before curly quotes are converted so that they are not escaped twice
			name = mw.ustring.gsub (name, '[“”‘’]', curly_quotes_t);			-- replace typographic quotes
			name = name:gsub (' +', '_');										-- replace runs of space characters with a single underscore
			name = name:lower();												-- down case

			data_t[hex] = {name, emoji};										-- add to the base data list
		end
	end

	local function tabs (hex, info_t)											-- local function to calculate number of tabs needed between end of entry and column 80 comment
		local length = 14 + mw.ustring.len (info_t[1]) + string.len (hex);		-- length of table entry; ustring.len() because there are some multibyte characters
		local white_space = 80 - length;										-- comments begin at column 80
		return math.max (1, math.ceil (white_space / 4));						-- tabs are four columns wide; minimum of 1 (for long entries)
	end

	local out_t = {};															-- prettified list goes here

	for hex, info_t in pairs (data_t) do										-- spin through data_t and make a prettified list
		table.insert (out_t, table.concat ({
			'\t[\'',															-- indent one tab space; open index
			info_t[1],															-- add emoji name as index
			'\'] = \'',															-- close index; add assignment operator; open hex value
			hex,																-- add emoji hex value
			'\',',																-- close hex value
			string.rep ('\t', tabs (hex, info_t)),								-- add enough tabs to get to column 80
			'-- ',																-- start a comment
			info_t[2] or '',													-- and add the emoji (when there is one)
		}));
	end

	table.sort (out_t);															-- ascending sort; pairs() order is unspecified

	return render_output (frame, out_t, timestamp, version);					-- make a big string and done
end


--[[--------------------------< E X P O R T S >----------------------------------------------------------------
]]

local exports_t = {main = main};												-- functions that are reachable via {{#invoke:}}

return exports_t;