# itsb/jq/helpers.jq

# Split a URL string into {scheme, netloc, path, params, query, fragment},
# in the spirit of Python's urllib.parse.urlparse.
# Input: a string. Output: an object with one key per named capture group below.
# The `;params` suffix is only recognised on the last path segment.
def urlparse:
    (
        "^"
        # scheme, terminated by ":"
        + "(?:(?<scheme>[^:/?#]+):)?"
        # authority, introduced by "//"
        + "(?://(?<netloc>[^/?#]*))?"
        # path, optionally followed by ";params" on its last segment
        + "(?:(?<path>(?:[^?#]+/)?[^?#;]*)(?:;(?<params>[^?#/]*))?)?"
        # query, introduced by "?"
        + "(?:\\?(?<query>[^#]*))?"
        # fragment, introduced by "#"
        + "(?:#(?<fragment>.*))?"
        + "$"
    ) as $pattern
    | capture($pattern);

# Split a URL string into {scheme, netloc, path, query, fragment},
# in the spirit of Python's urllib.parse.urlsplit.
# Unlike urlparse, a ";params" suffix is left inside the path.
# Input: a string. Output: an object with one key per named capture group below.
def urlsplit:
    (
        "^"
        # scheme, terminated by ":"
        + "(?:(?<scheme>[^:/?#]+):)?"
        # authority, introduced by "//"
        + "(?://(?<netloc>[^/?#]*))?"
        # path: everything up to "?" or "#"
        + "(?<path>(?:[^?#]+/)?[^?#]*)?"
        # query, introduced by "?"
        + "(?:\\?(?<query>[^#]*))?"
        # fragment, introduced by "#"
        + "(?:#(?<fragment>.*))?"
        + "$"
    ) as $pattern
    | capture($pattern);

# Reverse operation of either urlparse or urlsplit.
# Input: an object with any of {scheme, netloc, path, params, query, fragment};
# missing or null components are omitted. Output: the recomposed URL string.
#
# Separator rules:
#   - scheme + netloc              -> "scheme://netloc..."
#   - netloc without scheme        -> "//netloc..." (protocol-relative URL)
#   - scheme, no netloc, and a non-empty path that does not start with "/"
#                                  -> "scheme:path" (e.g. mailto:, tel:, urn:)
#   - any other URL with a scheme  -> "scheme://..." (e.g. file:///etc/hosts)
def urlunparse:
    (.path // "") as $path
    # A scheme with no authority and a rootless path is an opaque URL: it takes
    # ":" only, never "://".
    | (.scheme and (.netloc|not) and $path != "" and ($path|startswith("/")|not)) as $opaque
    | (
        if .scheme then .scheme + (if $opaque then ":" else "://" end)
        # Keep the "//" marker so the authority is not re-read as a relative path.
        elif .netloc then "//"
        else "" end
    )
    + (.netloc // "")
    + $path
    + (if .params then ";" + .params else "" end)
    + (if .query then "?" + .query else "" end)
    + (if .fragment then "#" + .fragment else "" end);

# Resolve a possibly relative URI into an absolute URI.
# Input: a URL string, or an object as produced by urlsplit/urlparse.
# base:  a filter yielding the base URL, as a string or an already parsed object.
# Output: the input unchanged when it already has a scheme; otherwise the
#         resolved URL as a string (built with urlunparse).
# Limitation: "." and ".." path segments are not normalised.
def urlresolve(base):
    (if type == "string" then urlsplit else . end) as $parsed
    # There is a scheme: this is an absolute URL
    | if $parsed.scheme then . else (
        base|(if type == "string" then urlsplit else . end) as $parsedbase
        | $parsed
        # No scheme but a domain: use the base's scheme
        | if .netloc then (
            .scheme = $parsedbase.scheme
        # No scheme and no domain, but a non-empty path: resolve the relative path.
        # An empty capture may be reported as "" or as null; "" is truthy in jq,
        # so the path is compared explicitly instead of being tested for truthiness.
        ) elif (.path // "") != "" then (
            .scheme = $parsedbase.scheme
            | .netloc = $parsedbase.netloc
            # When the path does not start with a slash, make it relative to the base's path
            # by removing the filename from the base's path and appending the path
            | if .path|startswith("/")|not then (
                # The base path may be null (e.g. "http://example.com"); treat it as empty
                # so that split does not fail.
                .path = ((($parsedbase.path // "")|split("/")[:-1]|join("/")) + "/" + .path)
            ) else . end
        # Only a query and/or a fragment: keep the whole base path
        ) elif (.query // .fragment) then (
            .scheme = $parsedbase.scheme
            | .netloc = $parsedbase.netloc
            | .path = $parsedbase.path
            # A fragment-only reference also inherits the base's query
            | if .query == null then .query = $parsedbase.query else . end
        ) else . end
        | urlunparse
    ) end;

# Minimal Chinese numeral parser, intended for the numbers found in Chinese dates.
# Input: a string of Chinese and/or Arabic digits, optionally containing 十 (ten).
# Output: a number.
def parse_chinese_number:
    {
        "零": "0",
        "〇": "0",
        "一": "1",
        "二": "2",
        "三": "3",
        "四": "4",
        "五": "5",
        "六": "6",
        "七": "7",
        "八": "8",
        "九": "9",
        # 十 contributes no digit of its own; its effect is applied afterwards.
        "十": ""
    } as $digits
    | . as $text
    # Translate character by character, leaving unknown characters untouched.
    | [($text / "")[] | $digits[.] // .]
    | join("")
    # A lone 十 translates to an empty string: treat it as 1 so it becomes 10 below.
    | (if . == "" then 1 else . end | tonumber) as $value
    # Digit-by-digit translation is right when 十 sits between two digits
    # (二十八 -> "28"). Otherwise it is corrected here:
    #   trailing 十 (二十 -> 2) is multiplied by ten,
    #   leading 十 (十八 -> 8) has ten added.
    | if ($text | endswith("十")) then $value * 10
      elif ($text | startswith("十")) then $value + 10
      else $value end;

# Parse a Traditional or Simplified Chinese date (<year>年<month>月<day>日) into a Unix timestamp.
# Input: a string containing such a date, with Arabic or Chinese digits.
# Output: the timestamp of midnight UTC on that day.
def parse_chinese_date:
    # One numeric field: Arabic digits, Chinese digits, or 十.
    "[0123456789零〇一二三四五六七八九十]+" as $num
    | capture("(?<year>\($num))年(?<month>\($num))月(?<day>\($num))日")
    | map_values(parse_chinese_number)
    # Years below 1900 are assumed to be era years offset by 1911 (the Republic of China
    # calendar offset), on the assumption that no real date before 1900 is ever parsed.
    # The heuristic stops working once the era year itself reaches 1900, i.e. in 3811.
    | (if .year < 1900 then .year + 1911 else .year end) as $year
    | "\($year)-\(.month)-\(.day)T00:00:00Z"
    | fromdateiso8601;