Module:CsvUtils

From Against the Storm Official Wiki
Revision as of 03:44, 6 November 2023 by Aeredor (talk | contribs) (Commented out the part where single quotes get encoded into HTML characters. this is breaking building names like the Alchemist's Hut)

Overview

The CsvUtils module loads, processes, and returns data that is stored in CSV format in wiki templates. This module also provides a parsing method to convert well-structured CSV data into a Lua table.

Requirements

The CSV data must be well structured. The header row cannot have empty names, and all lines must end with a comma.

There can be documentation in the template, and this module will automatically remove it, provided it is enclosed in a <noinclude> block. The CSV data itself must be wrapped in a <pre> block in order to protect line breaks. The <pre> block will also be removed.

Usage

The standard way of using this module is a call like the following:

dataTable, headerLookupTable = CsvUtils.extractTables(DATA_TEMPLATE_NAME)

This is a shortcut method for the other two: extractCSV and luaTableFromCSV. The extractCSV method loads the template and removes all surrounding documentation. It returns a very long string of CSV data, including newlines. The luaTableFromCSV method takes that string and parses it, splitting it into a header row and data rows, and each row into fields. It returns the data itself and a lookup table of header names, which allows constant-time lookup of a column's index in the data table from the string name of its header. For example, if the column with the header "constructionTime" is the tenth column, and a given data row stores 45 (seconds) for constructionTime, then:

headerLookupTable["constructionTime"] = 10
dataTable[2][rowIndex][10] = 45

The data table has the following structure:

dataTable = {
   [1] = { 
        [1] = "header1Name",
        [2] = "header2Name",
        [3] = "header3Name",
        ...
    },
    [2] = {
        [1] = {
            dataRow1field1,
            dataRow1field2,
            dataRow1field3,
            ...
        },
        [2] = {
            dataRow2field1,
            ...
        },
        [3] = {
            dataRow3field1,
            ...
        },
        ...
    }
}

The header lookup table has the following structure:

headerLookupTable = {
    ["header1Name"] = 1,
    ["header2Name"] = 2,
    ["header3Name"] = 3,
    ...
}

---
-- Module for loading, processing, and returning data stored in CSV format in
-- wiki templates. There can be documentation in the template that will be
-- automatically removed, provided it is enclosed in a <noinclude> block. The
-- data must be wrapped in <pre> to protect line breaks. The <pre> block will
-- also be removed. This module also provides a parsing method to convert
-- well-structured CSV data into a Lua table.
-- @module CsvUtils
local CsvUtils = {}

---
-- Loads the raw CSV data from a wiki page (usually a template). Everything
-- except the raw data is removed from the page content: anything enclosed in
-- <noinclude> tags is dropped, and the data is then taken from inside the
-- first <pre> block (the <pre> tags protect line breaks in the raw data and
-- are not part of the result). Raises an error if the page cannot be found,
-- has no content, cannot be preprocessed, or does not contain a non-empty
-- <pre> block.
--
-- @param dataPageName (string) The name of the page containing the CSV data, 
-- including the namespace, like this: 'Template:Workshops_Recipes_csv'
-- @return string The raw CSV data.
--
function CsvUtils.extractCSV(dataPageName)

	-- Used in every error message; tostring keeps the message readable even
	-- when the caller passed something that is not a string.
	local pageLabel = tostring(dataPageName)

	-- Load the page and verify something returned correctly. If the page was
	-- not found, then the variable will be nil or the page won't exist.
	local csvPage = mw.title.new(dataPageName)
	if not csvPage or not csvPage.exists then
		error("Could not find data page: " .. pageLabel)
	end

	-- Get the content of the page, or it will be nil if there is no page. We
	-- can also expect there's a problem if there's an empty string.
	local pageContent = csvPage:getContent()
	if not pageContent or pageContent == "" then
		error("Data page content does not exist: " .. pageLabel)
	end

	-- Process the content of the full wikipage, expanding templates and
	-- applying wikimarkup. If it didn't work or returns an empty string, then
	-- there's a problem.
	-- NOTE(review): the preprocessed text is only validated here; the
	-- stripping below works on the raw page content, exactly as before.
	-- Confirm whether this call is still wanted.
	local frame = mw.getCurrentFrame()
	local preprocessedText = frame:preprocess(pageContent)
	if not preprocessedText or preprocessedText == "" then
		error("Data page content could not be preprocessed " .. pageLabel)
	end

	-- Remove everything within and including the <noinclude> tags. If this
	-- results in an empty string, there was a problem. Since we're still
	-- dealing with mediawiki content, use mw.ustring instead of Lua string
	-- library. Only the first return value (the new string) is kept.
	local trimmedText = mw.ustring.gsub(pageContent, "<noinclude>.-</noinclude>", "")
	if trimmedText == "" then
		error("Trimming the data page content resulted in no csv data left: " .. pageLabel)
	end

	-- Extract the data from the enclosing <pre> block, leaving only the csv
	-- data. match returns nil when there is no <pre>...</pre> pair at all, so
	-- that case is reported separately instead of handing nil to the caller.
	local rawCSV = trimmedText:match("<pre>(.-)</pre>")
	if rawCSV == nil then
		error("Could not find a <pre> block containing csv data on data page: " .. pageLabel)
	end
	if rawCSV == "" then
		error("Trimming the data page content resulted in no csv data left: " .. pageLabel)
	end

	return rawCSV
end -- extractCSV function



---
-- Converts a CSV data string into a Lua table. The CSV data should have a
-- header row with field names (no empty names). Each following line is one
-- record and must end with a comma; empty cells are allowed in records.
--
-- The first return value is a table with two entries: [1] is the list of
-- header names and [2] is the list of rows, where each row is a list of field
-- strings indexed by column number. The second return value maps each header
-- name to its column number, so a field can be found by name like this:
-- luaTable[2][rowNumber][headerLookup["id"]]
--
-- Fields are returned as encoded by encodeCSV, so quoted commas, quoted
-- newlines and double quotes appear as their replacement codes.
--
-- Raises an error if the input is not a string, if no header fields or no
-- data rows are found, or if a data line yields no fields.
--
-- @function luaTableFromCSV
-- @param csvString (string) The CSV data as a string.
-- @return table A Lua table containing the header row and the data rows.
-- @return table A lookup table from header name to column index.
-- 
function CsvUtils.luaTableFromCSV(csvString)

	if type(csvString) ~= "string" then
		error("The csv data must be a string, but got: " .. type(csvString))
	end

	-- header row, data rows (table of rows), and the name -> index lookup
	local header = {}
	local rows = {}
	local headerLookup = {} -- stores header indices for constant time lookup

	-- Start by cleaning (encoding) the string up so it can be processed without
	-- getting confused by any special characters like commas or newlines within
	-- quotations. encodeCSV is defined further down in this module as a global
	-- function, so the name is resolved when this function runs.
	local encodedString = encodeCSV(csvString)
	if encodedString == "" then
		error("Encoding the csv data string removed all information.")
	end

	-- Separate the csv data into lines/rows by gmatch'ing on carriage returns
	-- and/or newlines. The pattern only yields non-empty lines.
	for line in encodedString:gmatch("[^\r\n]+") do

		-- We only build the header once; then its length will be > 0. The
		-- first line is the header; everything else goes into the rows table.
		if #header == 0 then
			-- Separate the header row into fields by gmatch'ing on commas. We
			-- can use plus here because there are no empty header cells.
			for field in line:gmatch("([^,]+)") do
				local column = #header + 1
				header[column] = field
				headerLookup[field] = column
			end

			if #header == 0 then
				error("Adding fields was not successful to header row.")
			end

		else
			-- Build each row separately as a new table.
			local row = {}

			-- Separate the line into fields by gmatch'ing on commas again. Use
			-- an asterisk here instead of a plus because there are empty cells
			-- with zero length. The rows must end in a comma: only text that
			-- is followed by a comma is captured as a field.
			for field in line:gmatch("([^,]*),") do
				row[#row + 1] = field
			end

			-- Report the number of the data row that failed, not a field index.
			if #row == 0 then
				error("Adding fields was not successful to row number: " .. (#rows + 1))
			end
			rows[#rows + 1] = row
		end
	end

	if #rows == 0 then
		error("There are no rows to add to the table.")
	end

	-- Header first, then the rows, matching the documented layout.
	return { header, rows }, headerLookup
end -- luaTableFromCSV function



---
-- Prepares raw CSV text so that it can be split on newlines and commas. Each
-- double-quoted span is first handed to encodeQuotations, and afterwards every
-- double quote in the text is replaced by its HTML character code (&quot;).
-- Apostrophes are left untouched; the code that used to encode them is kept
-- below, commented out.
--
-- @param stringToEncode The input string to be processed.
-- @return The modified string with special characters replaced.
--
function encodeCSV(stringToEncode)

	-- Shortest-match capture of each quoted span, bounding quotation marks
	-- included; the span is replaced by whatever encodeQuotations returns.
	local quotedSpansHandled = stringToEncode:gsub('(".-")', encodeQuotations)

	-- With the insides of the quotes dealt with, swap the double quotes
	-- themselves for their HTML code.
	local withoutDoubleQuotes = quotedSpansHandled:gsub('"', "&quot;")
	if #withoutDoubleQuotes == 0 then
		error("Encoding double quotation marks resulted in no data left.")
	end

	-- Disabled: sub single quotes for their HTML code.
	--withoutDoubleQuotes = withoutDoubleQuotes:gsub("'", "&#39;")
	--if withoutDoubleQuotes == "" then
	--	error("Encoding single quotation marks resulted in no data left.")
	--end

	return withoutDoubleQuotes
end



---
-- Helper used as the replacement callback for quoted spans. Runs of carriage
-- returns and/or newlines inside the given text are replaced with &#10; and
-- every comma is replaced with &comma;, so that neither is later mistaken for
-- a record or field separator in the csv data string.
--
-- @param stringToEncode The input string to be processed.
-- @return The modified string with newlines and commas replaced.
function encodeQuotations(stringToEncode)

	-- First pass: each run of line-break characters becomes one code.
	local withoutNewlines = stringToEncode:gsub('[\r\n]+', '&#10;')
	if #withoutNewlines == 0 then
		error("Encoding newlines inside quoted descriptions resulted in no data left.")
	end

	-- Second pass: commas within the matched quotation.
	local withoutCommas = withoutNewlines:gsub(",", "&comma;")
	if #withoutCommas == 0 then
		error("Encoding commas inside quoted descriptions resulted in no data left.")
	end

	return withoutCommas
end

-- Return the module table that holds the functions attached to CsvUtils above.
return CsvUtils