وحدة:Language/name/data/iana data extraction tool

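This is a crude tool that reads a local copy of an IANA language-subtag-registry file and extracts the information necessary to create the data tables held by the Module:Language/data/iana languages, Module:Language/data/iana scripts, and Module:Language/data/iana regions modules; it also emits a fourth table of variants.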

The tool skips any record that contains one or more of the strings 'Deprecated', 'Preferred-Value', or 'Private use'.

At this writing, the tool extracts only the subtag code and description(s) from language, script, region, and variant records; for variant records it also extracts the prefix(es).

Usage

To use this tool:

  1. open a blank sandbox page and paste this {{#invoke:}} into it at the top line:
    {{#invoke:Language/name/data/iana_data_extraction_tool|iana_extract}}
  2. Go to the current language-subtag-registry file (or any of the files held by archive.org). Copy the whole (or just as much as you need) and paste it into the sandbox page below the {{#invoke:}}.
  3. Click Show preview
  4. wait
  5. get result

There is some crude error checking that will insert an error message in the output. No guarantees that such messaging will be helpful. Search for the word 'error' in the tool's output.




require('Module:No globals');
local p = {};


--[=[------------------------< G E T _ V A R I A N T _ P A R T S >---------------------------------------------

We get a record that looks more-or-less like this:
	%%\n
	Type: variant\n
	Subtag: bohoric\n
	Description: Slovene in Bohorič alphabet\n
	Added: 2012-06-27\n
	Prefix: sl\n

Each line is terminated with a \n character.

Type, for this function can only be 'variant'

Subtag is the code of Type

Prefix is a language code to which this variant applies; one language code per Prefix line.  There can be
more than one prefix line.

Description associates Subtag with a proper name or names; one name per Description line.  There can be more
than one Description line and Description lines can wrap to the next line.  When they do, the first two
characters of the continuation line are spaces.

Comments: lines can also be continued so once in a Comments line (which is otherwise ignored) all further
continuations in the record are also ignored.  This is a crude mechanism to prevent comment continuations
from being concatenated onto the end of descriptions and relies on Description lines occurring in the record
before the Comments line.

Records with Deprecated dates or Preferred-Value codes are ignored as are private use codes.

]=]

local function get_variant_parts (record)
	-- Parse one 'variant' record from the IANA language-subtag-registry.
	--
	-- record: the text between two %% delimiters; fields start in column one and continuation
	--         lines start with two spaces.
	--
	-- Returns:
	--   'skip' (a single value) when the record contains 'Deprecated', 'Preferred-Value', or 'Private use';
	--   otherwise three values:
	--     code         - the subtag, or nil when no Subtag line was found
	--     prefixes     - comma-separated list of lowercased, double-quoted prefixes ('' when none)
	--     descriptions - comma-separated list of double-quoted descriptions ('' when none); backslashes
	--                    and double quotes inside a description are backslash-escaped so that the
	--                    result can be pasted into Lua source

	if string.find (record, 'Deprecated') or string.find (record, 'Preferred%-Value') or string.find (record, 'Private use') then
		return 'skip';
	end

	local code;
	local descriptions = {};													-- raw (unquoted, unescaped) description text
	local prefixes = {};
	local in_comments = false;

	for line in string.gmatch (record, '[^\n]+') do							-- every non-empty line; the last line need not be \n terminated
		local subtag = string.match (line, '^Subtag: ([%a%d]+)');			-- field names are anchored so that field-like text inside
		local description = string.match (line, '^Description: (.+)');		-- a continuation line is not mistaken for a field
		local prefix = string.match (line, '^Prefix: (.+)');
		local continuation = string.match (line, '^  (.+)');

		if subtag then
			code = subtag;
		elseif description then
			table.insert (descriptions, description);
		elseif prefix then
			table.insert (prefixes, '"' .. prefix:lower() .. '"');
		elseif string.find (line, '^Comments: .+') then
			in_comments = true;													-- from here on, continuation lines belong to the comment
		elseif continuation and not in_comments and descriptions[1] then		-- descriptions[1] guard: nothing to continue before the first Description
			descriptions[#descriptions] = descriptions[#descriptions] .. ' ' .. continuation;
		end
	end

	for i, description in ipairs (descriptions) do								-- escape and quote only after continuations have been joined
		description = string.gsub (description, '\\', '\\\\');					-- backslashes first so the escapes added next are not doubled
		description = string.gsub (description, '"', '\\"');					-- in case description contains quote marks (see 1959acad)
		descriptions[i] = '"' .. description .. '"';
	end

	return code, table.concat (prefixes, ', '), table.concat (descriptions, ', ');
end


--[=[------------------------< G E T _ L A N G _ S C R I P T _ R E G I O N _ P A R T S >-----------------------

We get a record that looks more-or-less like this:
	%%\n
	Type: language\n
	Subtag: aa\n
	Description: Afar\n
	Added: 2005-10-16\n

	
Each line is terminated with a \n character.

Type, for our purposes can be 'language', 'script', or 'region'

Subtag is the code of Type

Description associates Subtag with a proper name or names; one name per Description line.  There can be more
than one Description line and Description lines can wrap to the next line.  When they do, the first two
characters of the continuation line are spaces.

Comments: lines can also be continued so once in a Comments line (which is otherwise ignored) all further
continuations in the record are also ignored.  This is a crude mechanism to prevent comment continuations
from being concatenated onto the end of descriptions and relies on Description lines occurring in the record
before the Comments line.

Records with Deprecated dates or Preferred-Value codes are ignored as are private use codes.

]=]

local function get_lang_script_region_parts (record)
	-- Parse one 'language', 'script', or 'region' record from the IANA language-subtag-registry.
	--
	-- record: the text between two %% delimiters; fields start in column one and continuation
	--         lines start with two spaces.
	--
	-- Returns:
	--   'skip' (a single value) when the record contains 'Deprecated', 'Preferred-Value', or 'Private use';
	--   otherwise two values:
	--     code         - the subtag, or nil when no Subtag line was found
	--     descriptions - comma-separated list of double-quoted descriptions ('' when none); backslashes
	--                    and double quotes inside a description are backslash-escaped so that the
	--                    result can be pasted into Lua source

	if string.find (record, 'Deprecated') or string.find (record, 'Preferred%-Value') or string.find (record, 'Private use') then
		return 'skip';
	end

	local code;
	local descriptions = {};													-- raw (unquoted, unescaped) description text
	local in_comments = false;

	for line in string.gmatch (record, '[^\n]+') do							-- every non-empty line; the last line need not be \n terminated
		local subtag = string.match (line, '^Subtag: ([%a%d]+)');			-- field names are anchored so that field-like text inside
		local description = string.match (line, '^Description: (.+)');		-- a continuation line is not mistaken for a field
		local continuation = string.match (line, '^  (.+)');

		if subtag then
			code = subtag;
		elseif description then
			table.insert (descriptions, description);
		elseif string.find (line, '^Comments: .+') then
			in_comments = true;													-- from here on, continuation lines belong to the comment
		elseif continuation and not in_comments and descriptions[1] then		-- descriptions[1] guard: nothing to continue before the first Description
			descriptions[#descriptions] = descriptions[#descriptions] .. ' ' .. continuation;
		end
	end

	for i, description in ipairs (descriptions) do								-- escape and quote only after continuations have been joined
		description = string.gsub (description, '\\', '\\\\');					-- backslashes first so the escapes added next are not doubled
		description = string.gsub (description, '"', '\\"');					-- a bare quote mark would end the emitted Lua string early
		descriptions[i] = '"' .. description .. '"';
	end

	return code, table.concat (descriptions, ', ');
end


--[=[------------------------< I A N A _ E X T R A C T >-------------------------------------------------------

read a local copy of the IANA language-subtag-registry file and from it build tables to replace the tables in:
	[[Module:Language/data/iana languages]]
	[[Module:Language/data/iana scripts]]
	[[Module:Language/data/iana regions]]

current language-subtag-registry file can be found at: http://www.iana.org/assignments/language-subtag-registry
archive.org has copies of previous versions see: https://web.archive.org/web/*/http://www.iana.org/assignments/language-subtag-registry

]=]

function p.iana_extract (frame)
	-- Entry point for {{#invoke:}}.  Reads the unparsed content of the current page, splits it into
	-- %% delimited records, and returns wikitext (inside <pre>...</pre>) holding four Lua table
	-- constructors: languages, scripts, regions, and variants, each preceded by the File-Date line.
	--
	-- frame is not used.
	--
	-- Problems are reported in the output rather than raised; search the output for the word 'error'.

	local page = mw.title.getCurrentTitle();									-- get a page object for this page
	local content = page:getContent();											-- get unparsed content

	if not content then															-- nothing to read; without this guard content:match() would raise an error
		return '<br /><pre>-- error: unable to read the content of this page</pre>';
	end

	local file_date = content:match ('(File%-Date: %d%d%d%d%-%d%d%-%d%d)')		-- get the file date line from this version of the source file
		or 'error: File-Date not found';										-- a partial paste may omit it; nil here would break the concatenation below

	local type_order = {'language', 'script', 'region', 'variant'};			-- record types are tested in this order
	local entries = {															-- one sequence of table-entry strings per record type
		language = {},
		script = {},
		region = {},
		variant = {},
		};

	for record in string.gmatch (content, '%%%%([^%%]+)') do				-- get a %% delimited 'record' from the file; leave off the delimiters
		local record_type;

		for _, candidate in ipairs (type_order) do								-- first record type named in the record wins
			if string.find (record, 'Type: ' .. candidate, 1, true) then
				record_type = candidate;
				break;
			end
		end

		if record_type then														-- other record types (extlang, grandfathered, ...) are ignored
			local code;
			local descriptions;
			local prefixes;														-- used for language variants only

			if 'variant' == record_type then									-- if a variant record
				code, prefixes, descriptions = get_variant_parts (record);		-- get the code, prefix(es), and description(s)
			else
				code, descriptions = get_lang_script_region_parts (record);	-- get the code and description(s)
			end

			if not code then													-- code should never be nil, but inserting an error entry in the final output can be helpful
				table.insert (entries[record_type], "[\"error\"] = {" .. record .. "}");
			elseif 'skip' ~= code then											-- 'skip' records produce no output at all
				if 'variant' == record_type then
					table.insert (entries[record_type],
						table.concat ({
							"[\"",
							code,
							"\"] = {<br />&#9;&#9;[\"descriptions\"] = {",
							descriptions,
							"},<br />&#9;&#9;[\"prefixes\"] = {",
							prefixes,
							"},<br />&#9;&#9;}"
							})
						);
				else
					table.insert (entries[record_type], "[\"" .. code .. "\"] = {" .. descriptions .. "}");	-- make table entries
				end
			end
		end
	end

	local output = {"<br /><pre>"};												-- make pretty output
	for _, record_type in ipairs (type_order) do
		table.insert (output, "-- " .. file_date .. "<br />return {<br />&#9;" .. table.concat (entries[record_type], ',<br />&#9;') .. "<br />&#9;}<br />");
	end
	table.insert (output, "</pre>");

	return table.concat (output);
end

return p;