booksql/bin/parse_ics

88 lines
4.7 KiB
Plaintext
Executable File

#!/usr/bin/jq -sRf
# RFC 5545 (iCalendar) + RFC 6350 (vCard) parser
# ~lucidiot, 2023
# iCalendar (.ics) files contain a series of components delimited by BEGIN:<type> and END:<type>.
# Each component can have properties. Each property has a name and a value.
# Each property may have some optional key/value parameters:
# THING;PARAM1=VALUE1;PARAM2=VALUE2:CONTENT
# Parameters may have a list of values instead of just one parameter value:
# THING;PARAM=VAL1,VAL2,VAL3,"VAL4,WITH,COMMAS":CONTENT
# vCard (.vcf) files have a similar syntax, but with different quoting and escaping rules,
# and RFC 6868 adds a layer of complexity with a second escape character, ^.
# For these reasons, we will not parse parameters here, and they are just one string in the output.
# Main parser pipeline.
# Input:  the whole file as one raw string (the script is run with -sR).
# Output: the root component as an object:
#           { "_type": "<type>",
#             "<property>": { "<raw parameters or empty string>": ["value", ...] },
#             "_components": [ <nested components with the same shape> ] }
# Errors: raised with error() on a malformed content line, on a first line that is
#         not BEGIN, on an END whose type does not match the open component, and on
#         any line that follows the END of the root component.
#
# Remove one trailing line break (LF or CRLF); otherwise the split below would
# hand us a final empty line.
rtrimstr("\n")
| rtrimstr("\r")
# Lines are supposed to end after 75 characters. Adding a space at the beginning of the next line
# means that the next line is really just part of the previous line, so we remove those extra
# line breaks to merge every line. vCard also allows tabs and not just spaces.
| gsub("\r?\n[ \t]"; "")
# Iterate on each line.
| reduce split("\n")[] as $item (
    # Initial state of the parser
    {
      # Placeholder for the root component of this file.
      # The _type will be filled in with the type specified in BEGIN:<type>.
      # "_" is not an allowed character in property names, so we can use it for our own purposes.
      "root": {"_type": null},
      # Path within this state where the parser is currently inserting new properties.
      # This is used to keep track of where we are in the hierarchy when parsing nested components.
      "current_path": ["root"]
    };
    . as $state
    | (
        $item
        # Parse a whole line as { name: "...", params: "..." (or null), value: "..." }
        #  - params: a run of quoted strings and/or non-colon characters. Trying the
        #    quoted-string alternative first lets a ':' inside double quotes stay part of
        #    the parameters instead of being mistaken for the name/value separator.
        #    A stray unbalanced quote is still accepted as an ordinary character.
        #  - value: any non-control character, plus horizontal tab, which the content-line
        #    grammar allows inside values. A trailing CR left over from CRLF line endings
        #    is matched outside of the value.
        # Without the `// error(...)` fallback, a non-matching line would make capture emit
        # nothing, so this reducer step would produce no new state and the parse result
        # would be silently lost or corrupted (the exact outcome depends on the jq version).
        | (
            capture("^(?'name'[a-zA-Z0-9-]+)(?:;(?'params'(?:\"[^\"]*\"|[^:])+))?:(?'value'(?:[^[:cntrl:]]|\t)*)\r?$")
            // error("Malformed content line: \($item | tojson)")
          )
      ) as {$name, $params, $value}
    # Property names should be case-insensitive, we will use lowercase everywhere
    | ($name | ascii_downcase) as $name
    | $state
    | if .current_path[0] != "root" then
        # If we get any line after an `END:` that was meant for the root component,
        # the current_path will be set to []. We should not allow parsing anything else.
        error("Unexpected end of root component")
      elif getpath([.current_path[], "_type"]) == null then
        # When the type was not yet filled in, we are expecting a BEGIN for the root component.
        if $name == "begin" then
          setpath([.current_path[], "_type"]; $value)
        else
          error("Expected BEGIN, got \($name)")
        end
      elif $name == "begin" then
        # This BEGIN: declares a nested component.
        # When we are somewhere other than the root component, we will never get a `null` type
        # because we can set it as soon as we get here; so we know that the above branch only
        # runs for the root component and we are always working with nested components.
        # We will nest components under an array called `_components`.
        # We therefore add to our paths the `_components` key, and then the index of this new
        # component. The length of the array matches the last index of the array + 1, so this
        # will append our new component at the end of the list.
        .current_path += ["_components", ((getpath(.current_path)._components // []) | length)]
        # Add the new component now at our new path with the type from the BEGIN:<type>.
        | setpath(.current_path; {"_type": $value})
      elif $name == "end" then
        # Handle an END by checking that its type matches the type we are working with now,
        # and going back up in the structure by removing the array index and the `_components` from the path.
        # For the root component this leaves an empty path, which the first branch above detects.
        if $value == getpath([.current_path[], "_type"]) then
          .current_path |= .[:-2]
        else
          error("Unexpected end of \($value) component while in a \(getpath([.current_path[], "_type"])) component")
        end
      else
        # This is not any special case, so we will just set a property.
        # Since some properties could have multiple values for the same set of parameters
        # and multiple sets of parameters for the same name, we structure the output like this:
        # { "name": {"parameters": ["value1", "value2"] } }
        # When there are no parameters, we will use the "" key.
        setpath(
          [.current_path[], $name, ($params // "")];
          (getpath([.current_path[], $name, ($params // "")]) // []) + [$value]
        )
      end
  )
# Return the parsed calendar from our parser's state.
| .root