diff --git a/CHANGELOG.gmi b/CHANGELOG.gmi index 490d094..582abab 100644 --- a/CHANGELOG.gmi +++ b/CHANGELOG.gmi @@ -4,6 +4,17 @@ All notable changes to this project will be documented in this file. => https://keepachangelog.com/en/1.0.0/ The format is based on keep a Changelog. => https://semver.org/spec/v2.0.0.html And this project adheres to Semantic Versioning. +## [1.2.0] - 2021-03-19 +* Removes "^" to disable text decoration line-wise. +* CSS is no longer incorporated in the HTML page. +* Perform sanity checks against unauthorized file access. +* Properly close tags when the page exists in a non-null mode. +* Split HTML generation in two: parsing and translating. +* Create classes to handle gemtext parsing and translating. +* Create class to generate back gemtext (for future test cases). +* Fix: 404 doesn't occur for an empty file. +* Page 404 fully generated by HtmGem itself. + ## [1.1.0] - 2021-03-14 * File download when using "source" as a style. * Improves the regex. diff --git a/index.gmi b/index.gmi index 7998350..7367289 100644 --- a/index.gmi +++ b/index.gmi @@ -118,8 +118,7 @@ La décoration du texte, qui interprête le **gras** par exemple, ne fait pas pa ### Désactiver la décoration du texte On peut : -* commencer la ligne par **^**, -* faire un bloc non décoré avec **^^^**, +* désactiver et activer la décoration du texte avec une ligne **^^^**, * ajouter ce qui suit à la **réécriture** d’URL : > &textDecoration=0 diff --git a/index.php b/index.php index 57af85c..a551626 100644 --- a/index.php +++ b/index.php @@ -1,305 +1,72 @@ "); } -?> - - - -Installation de HtmGem - - - - -\n\n"; - die(); + $t = new \htmgem\GemTextTranslate_html(@file_get_contents("index.gmi")); + echo $t->getFullHtml(); + exit(); } -######################################## /Installation page + +$documentRoot = $_SERVER['DOCUMENT_ROOT']; # Removes the headling and trailling slashes, to be sure there's not any. $filePath = rtrim($_SERVER['DOCUMENT_ROOT'], "/")."/".ltrim($url, "/"); -$fileContents = @file_get_contents($filePath); +switch(true) { + case false: + case !realPath($filePath): + case !preg_match("/\.gmi$/", $url): # not finishing by .gmi + case strpos($filePath, $documentRoot)!==0: # not in web directory + $go404 = true; + // Says 404 even if the file exists to not give any information. + break; + default: + $go404 = false; +} - -######################################## 404 page -if (!file_exists($filePath) || !preg_match("/\.gmi$/", $url)) { +/* 404 page + */ +if ($go404) { error_log("HtmGem: 404 $url $filePath"); - http_response_code(404); ?> - - - - - - - - $url Recharger 🔄 - -=> / +=> .. 🔄 🔄 EOF; -echo translateGemToHtml($text404); -echo "\n"; -die(); + $t = new \htmgem\GemTextTranslate_html($page404); + echo $t->getFullHtml(); + exit(); } -######################################## /404 page +# to false only if textDecoration=0 in the URL +$textDecoration = "0" != @$_REQUEST['textDecoration']; + +$fileContents = @file_get_contents($filePath); # Removes the Byte Order Mark $fileContents = preg_replace("/\xEF\xBB\xBF/", "", $fileContents); -/** - * Replaces markups things like __underlined__ to underlined. - * @param $instruction the characters to replace, ex. _ - * @param $markup the markup to replace to, ex. "u" to get - * @param &$text where to replace. +/* CSS and special style management */ -function markupPreg($instruction, $markup, &$text) { - $output = $text; - - # Replaces couples "__word__" into "word". - $output = mb_ereg_replace("${instruction}(.+?)${instruction}", "<{$markup}>\\1", $output); - - # Replaces a remaining __ into "" to the end of the line. - $output = mb_ereg_replace("${instruction}(.+)?", "<{$markup}>\\1", $output); - - $text = $output; -} - - -/** - * Adds text attributes sucj as underline, bold, … to $line - * @param $line the line to process - */ -function addTextAttributes(&$line) { - global $textDecoration; - if (!$textDecoration) return; - markupPreg("__", "u", $line); - markupPreg("\*\*", "strong", $line); - markupPreg("//", "em", $line); - markupPreg("~~", "del", $line); -} - -/** - * Prepares the raw text to be displayed in HTML environment: - * * Escapes the HTML entities yet contained in the Gemtext. - * * Puts thin unbrakable spaces before some characters. - * @param $text1, $text2 texts to process - */ -function htmlPrepare(&$text) { - $text = htmlspecialchars($text, ENT_HTML5|ENT_NOQUOTES, "UTF-8", false); - $text = mb_ereg_replace("\ ([?!:;»€$])", NARROW_NO_BREAK_SPACE."\\1", $text); - $text = mb_ereg_replace("([«])\ ", "\\1".NARROW_NO_BREAK_SPACE, $text); # Espace fine insécable - - # Warning: using a monospace font editor may not display dashes as they should be! - # Adds no-break spaces to stick the (EM/EN dashes) to words : aaaaaa – bb – ccccc ==> aaaaaa –$bb$– ccccc - $text = mb_ereg_replace("([".DASHES."]) ([^".DASHES.".]+) ([".DASHES."])", "\\1".NARROW_NO_BREAK_SPACE."\\2".NARROW_NO_BREAK_SPACE."\\3", $text); - - # Adds no-break space to stick the (EM/EN dashes) to words : aaaaaa – bb. ==> aaaaaa –$bb. - $text = mb_ereg_replace("([—–]) ([^.]+)\.", "\\1".NARROW_NO_BREAK_SPACE."\\2.", $text); -} - -function translateGemToHtml($fileContents) { - $fileLines = preg_split("/\n/", $fileContents); - if (empty($fileLines[-1])) array_pop($fileLines); # Don't output a last empty line - ob_start(); - $mode = null; - $mode_textAttributes = true; - foreach ($fileLines as $line) { - $reDoCount = 0; - $mode_textAttributes_temp = false; - while (true) { - if ($reDoCount>2) { - error_log("HtmGem: Too many loops, mode == '$mode'"); - $mode = null; - $reDoCount = 0; - break; - } - $reDoCount += 1; - $line1 = substr($line, 0, 1); // $line can be modified - $line2 = substr($line, 0, 2); // in the meantime. - $line3 = substr($line, 0, 3); - if (is_null($mode)) { - if (empty($line)) { - echo "

 

\n"; - } elseif ('^^^' == $line3) { - $mode_textAttributes = !$mode_textAttributes; - } elseif ('^' == $line1 and !$mode_textAttributes_temp) { - if (preg_match("/^\^\s*(.+)$/", $line, $parts)) { - $line = $parts[1]; - $mode_textAttributes_temp = true; - } else { - $mode = "raw"; - } - continue; - } elseif ("#" == $line1) { - if (preg_match("/^(#{1,3})\s*(.+)/", $line, $sharps)) { - $h_level = strlen($sharps[1]); - $text = $sharps[2]; - htmlPrepare($text); - switch ($h_level) { - case 1: echo "

".$text."

\n"; break; - case 2: echo "

".$text."

\n"; break; - case 3: echo "

".$text."

\n"; break; - } - } else { - $mode = "raw"; - continue; - } - } elseif ("=>" == $line2) { - if (preg_match("/^=>\s*([^\s]+)(?:\s+(.*))?$/", $line, $linkParts)) { - $url_link = $linkParts[1]; - $url_label = @$linkParts[2]; - preg_match("/^([^:]+):/", $url_link, $matches); - $url_protocol = @$matches[1]; - if (empty($url_protocol)) $url_protocol = "local"; - if (empty(trim($url_label))) { - $url_label = $url_link; - } else { - // the label is humain-made, apply formatting - htmlPrepare($url_label); - if ($mode_textAttributes xor $mode_textAttributes_temp) addTextAttributes($url_label); - } - echo "

$url_label

\n"; - } else { - $mode = "raw"; - continue; - } - } elseif ("```" == $line3) { - preg_match("/^```\s*(.*)$/", $line, $matches); - $alt_text = trim($matches[1]); - if (empty($alt_text)) { - echo "
\n";
-                    } else {
-                        echo "
\n";
-                    }
-                    $mode="pre";
-                } elseif (">" == $line1) {
-                    echo "
\n"; - $mode = "quote"; - continue; - } elseif ("*" == $line1) { - echo "
    \n"; - $mode = "ul"; - continue; - } else { - $mode = "raw"; - continue; - } - } else { - if ("raw"==$mode) { - if (empty($line)) { - $line = " "; - } else { - htmlPrepare($line); - if ($mode_textAttributes xor $mode_textAttributes_temp) - addTextAttributes($line); - } - echo "

    $line

    \n"; - $mode = null; - } elseif ("pre"==$mode) { - if ("```" == $line3) { - echo "
\n"; - $mode = null; - } else { - htmlPrepare($line); - echo $line."\n"; - } - } elseif ("quote"==$mode) { - if (">" == $line1) { - preg_match("/^>\s*(.*)$/", $line, $quoteParts); - $quote = $quoteParts[1]; - if (empty($quote)) - echo "

 

\n"; - else { - htmlPrepare($quote); - if ($mode_textAttributes xor $mode_textAttributes_temp) - addTextAttributes($line); - echo "

".$quote."

\n"; - } - } else { - echo "\n"; - $mode = null; - continue; - } - } elseif ("ul"==$mode) { - if ("*" == $line1) { - preg_match("/^\*\s*(.*)$/", $line, $ulParts); - $li = $ulParts[1]; - if (empty($li)) { - echo "
  •  \n"; - } else { - htmlPrepare($li); - if ($mode_textAttributes xor $mode_textAttributes_temp) addTextAttributes($li); - echo "
  • ".$li."\n"; - } - } else { - echo "\n"; - $mode = null; - continue; - } - } else { - die("Unexpected mode: $mode!"); - } - } - break; // exits the while(true) as no continue occured - } // while(true) - } - $html = ob_get_contents(); - ob_clean(); - return $html; -} - -# Gets the page title: the first occurrence with # at the line start -mb_ereg("#\s*([^\n]+)\n", $fileContents, $matches); -$page_title = @$matches[1]; - -###################################### CSS Management -/** -* if &style=source displays the source directly and stops. -* if there's a filename.css besides filename.gmi, use the css and stops. -* if &style= then embbed the default style, and stops. -* if &style= then use htmgem/word.css -* if &style=/… then use the … as as stylesheet. -**/ +$style = @$_REQUEST['style']; if ("source" == $style) { $basename = basename($filePath); header("Cache-Control: public"); @@ -307,9 +74,12 @@ if ("source" == $style) { header("Content-Type: text/plain"); header("Content-Transfer-Encoding: binary"); header('Content-Length: ' . filesize($filePath)); - readfile($filePath); + echo $fileContents; exit(); } elseif ("pre" == $style) { + # Gets the page title: the first occurrence with # at the line start + mb_ereg("#\s*([^\n]+)\n", $fileContents, $matches); + $page_title = @$matches[1]; $fileContents = htmlspecialchars($fileContents, ENT_HTML5|ENT_NOQUOTES, "UTF-8", false); echo << @@ -318,52 +88,32 @@ if ("source" == $style) { $page_title -
    $fileContents
    +
    +$fileContents
    EOL; -} else { + exit(); +} + +$t = new \htmgem\GemTextTranslate_html($fileContents, $textDecoration); +if ("none" == $style) { + $t->addCss(""); +} elseif ("/" == @$style[0]) { + $t->addCss($style); +} elseif (empty($style)) { $parts = pathinfo($filePath); $localCss = $parts["filename"].".css"; $localCssFilePath = $parts["dirname"]."/".$localCss; if (file_exists($localCssFilePath)) { # Warning, using htmhem.php?url=… will make $localCss not found # as the path is relative to htmgem.php and not / ! - $cssContent = ""; - } else { - if (empty($style)) { - $cssContent = - "\n"; - } else { - if ("none" == $style) { - $cssContent = ""; - } else { - if ("/" == $style[0]) - $href = $style; - else - $href = "/htmgem/css/$style.css"; - $cssContent = ""; - } - } + $t->addCss($localCss); } - echo << - - -$page_title - -$cssContent - - -EOL; - - echo "\n".translateGemToHtml($fileContents); - echo "\n\n"; +} else { #TODO: regex check for $style + $t->addCss("/htmgem/css/$style.css"); } - -ob_end_flush(); +echo $t->getFullHtml(); ?> diff --git a/lib-htmgem.php b/lib-htmgem.php new file mode 100644 index 0000000..cf2c3cc --- /dev/null +++ b/lib-htmgem.php @@ -0,0 +1,350 @@ +1) { + die("HtmGem: Too many loops, mode == '$mode'"); + } + $reDoCount += 1; + $line1 = substr($line, 0, 1); // $line can be modified + $line2 = substr($line, 0, 2); // in the meantime. + $line3 = substr($line, 0, 3); + if (is_null($mode)) { + if ('^^^' == $line3) { + yield array("mode" => "^^^"); + } elseif ("#" == $line1) { + preg_match("/^(#{1,3})\s*(.+)/", $line, $matches); + yield array("mode" => $matches[1], "title" => trim($matches[2])); + } elseif ("=>" == $line2) { + preg_match("/^=>\s*([^\s]+)(?:\s+(.*))?$/", $line, $matches); + yield array("mode" => "=>", "link" => trim($matches[1]), "text" => trim(@$matches[2])); + } elseif ("```" == $line3) { + preg_match("/^```\s*(.*)$/", $line, $matches); + $current = array("mode" => "```", "alt" => trim($matches[1]), "texts" => array()); + $mode="```"; + } elseif (">" == $line1) { + preg_match("/^>\s*(.*)$/", $line, $matches); + $current = array("mode" => ">", "texts" => array(trim($matches[1]))); + $mode = ">"; + } elseif ("*" == $line1) { + preg_match("/^\*\s*(.*)$/", $line, $matches); + $current = array("mode" => "*", "texts" => array(trim($matches[1]))); + $mode = "*"; + } else { + // text_line + yield array("mode"=>"", "text" => trim($line)); + } + } else { + if ("```"==$mode) { + if ("```" == $line3) { + yield $current; + $current = array(); + $mode = null; + } else { + $current["texts"] []= $line; // No trim() as it’s a preformated text! + } + } elseif (">"==$mode) { + if (">" == $line1) { + preg_match("/^>\s*(.*)$/", $line, $matches); + $current["texts"] []= trim($matches[1]); + } else { + yield $current; + $current = array(); + $mode = null; + continue; + } + } elseif ("*"==$mode) { + if ("*" == $line1) { + preg_match("/^\*\s*(.*)$/", $line, $matches); + $current["texts"] []= trim($matches[1]); + } else { + yield $current; + $current = array(); + $mode = null; + continue; + } + } else { + die("Unexpected mode: $mode!"); + } + } + break; // exits the while(true) as no continue occured + } // while(true) + }// foreach + if ($current) yield $current; # File ends before the block. +} // gemtextParser + + +/** + * Translates the internal format into a gemtext. + * Uses cases: + * + * - test suites + * - serialisation easier with a text content + * - normalization (trimming spaces for instance) + */ +class GemtextTranslate_gemtext { + + function __construct($parsedGemtext) { + $this->parsedGemtext = $parsedGemtext; + $this->translate(); + } + + protected function translate() { + ob_start(); + foreach ($this->parsedGemtext as $node) { + $mode = $node["mode"]; + switch($mode) { + case "": + echo $node["text"]."\n"; + break; + case "*": + foreach ($node["texts"] as $text) { + echo "* $text\n"; + } + break; + case "```": + print("```\n"); + foreach ($node["texts"] as $text) { + echo "$text\n"; + } + print("```\n"); + break; + case ">": + foreach ($node["texts"] as $text) { + echo "> $text\n"; + } + break; + case "=>": + $linkText = $node["text"]; + if (!empty($linkText)) $linkText = " $linkText"; + print("=> ".$node["link"].$linkText."\n"); + break; + case "#": + case "##": + case "###": + print("$mode ".$node["title"]."\n"); + break; + case "^^^": + print("^^^\n"); + break; + default: + die("Unknown mode: '{$node["mode"]}'\n"); + } + } + + $this->translatedGemtext = ob_get_contents(); + ob_end_clean(); + } + + public function __toString() { + return $this->translatedGemtext; + } +} // GemtextTranslate_gemtext + + +/** + * Translates the internal format to HTML + */ +class GemtextTranslate_html { + + protected $cssList = array(); + protected $pageTitle = ""; + public $translatedGemtext; + + function __construct($parsedGemtext, $textDecorationEnabled=true) { + if (empty($parsedGemtext)) + $parsedGemtext = ""; + elseif (is_string($parsedGemtext)) + // to delete the last empty line,

     

    in HTML + $parsedGemtext = rtrim($parsedGemtext); + // The text must be parsed + $parsedGemtext = gemtextParser($parsedGemtext); + $this->parsedGemtext = $parsedGemtext; + $this->translate($textDecorationEnabled); + } + + function addCss($css) { + $this->cssList []= $css; + } + + const NARROW_NO_BREAK_SPACE = " "; + const DASHES + ="‒" # U+2012 Figure Dash + ."–" # U+2013 En Dash + ."—" # U+2014 Em Dash + ."⸺" # U+2E3A Two-Em Dash + ."⸻" # U+2E3B Three-Em Dash (Three times larger than a single char) + ; + + /** + * Replaces markups things like __underlined__ to underlined. + * @param $instruction the characters to replace, ex. _ + * @param $markup the markup to replace to, ex. "u" to get + * @param &$text where to replace. + */ + protected static function markupPreg($instruction, $markup, &$text) { + $output = $text; + + # Replaces couples "__word__" into "word". + $output = mb_ereg_replace("${instruction}(.+?)${instruction}", "<{$markup}>\\1", $output); + + # Replaces a remaining __ into "" to the end of the line. + $output = mb_ereg_replace("${instruction}(.+)?", "<{$markup}>\\1", $output); + + $text = $output; + } + + /** + * Adds text attributes sucj as underline, bold, … to $line + * @param $line the line to process + */ + protected static function addTextDecoration(&$line) { + self::markupPreg("__", "u", $line); + self::markupPreg("\*\*", "strong", $line); + self::markupPreg("//", "em", $line); + self::markupPreg("~~", "del", $line); + } + + /** + * Prepares the raw text to be displayed in HTML environment: + * * Escapes the HTML entities yet contained in the Gemtext. + * * Puts thin unbrakable spaces before some characters. + * @param $text1, $text2 texts to process + */ + protected static function htmlPrepare(&$text) { + if (empty($text)) { + $text = " "; + } else { + $text = htmlspecialchars($text, ENT_HTML5|ENT_NOQUOTES, "UTF-8", false); + $text = mb_ereg_replace("\ ([?!:;»€$])", self::NARROW_NO_BREAK_SPACE."\\1", $text); + $text = mb_ereg_replace("([«])\ ", "\\1".self::NARROW_NO_BREAK_SPACE, $text); # Espace fine insécable + + # Warning: using a monospace font editor may not display dashes as they should be! + # Adds no-break spaces to stick the (EM/EN dashes) to words : aaaaaa – bb – ccccc ==> aaaaaa –$bb$– ccccc + $text = mb_ereg_replace("([".self::DASHES."]) ([^".self::DASHES.".]+) ([".self::DASHES."])", "\\1".self::NARROW_NO_BREAK_SPACE."\\2".self::NARROW_NO_BREAK_SPACE."\\3", $text); + + # Adds no-break space to stick the (EM/EN dashes) to words : aaaaaa – bb. ==> aaaaaa –$bb. + $text = mb_ereg_replace("([—–]) ([^.]+)\.", "\\1".self::NARROW_NO_BREAK_SPACE."\\2.", $text); + } + } + + public function translate($textDecoration=true) { + ob_start(); + foreach ($this->parsedGemtext as $node) { + $mode = $node["mode"]; + switch($mode) { + case "": + $text = $node["text"]; + self::htmlPrepare($text); + if ($textDecoration) self::addTextDecoration($text); + echo "

    $text

    \n"; + break; + case "*": + echo "
      \n"; + foreach ($node["texts"] as $text) { + self::htmlPrepare($text); + if ($textDecoration) self::addTextDecoration($text); + print("
    • $text\n"); + } + echo "
    \n"; + break; + case "```": + $text = implode("\n", $node["texts"]); + self::htmlPrepare($text); + echo "
    \n$text\n
    \n"; + break; + case ">": + $text = implode("\n", $node["texts"]); + self::htmlPrepare($text); + if ($textDecoration) self::addTextDecoration($text); + echo "
    \n$text\n
    \n"; + break; + case "=>": + $link = $node["link"]; + $linkText = $node["text"]; + if (empty($linkText)) { + $linkText = $link; + self::htmlPrepare($linkText); + } else { + self::htmlPrepare($linkText); + if ($textDecoration) self::addTextDecoration($linkText); + } + preg_match("/^([^:]+):/", $link, $matches); + $protocol = @$matches[1]; + if (empty($protocol)) $protocol = "local"; + echo "

    $linkText

    \n"; + break; + case "#": + $title = $node["title"]; + self::htmlPrepare($title); + if (empty($this->pageTitle)) $this->pageTitle = $title; + echo "

    $title

    \n"; + break; + case "##": + $title = $node["title"]; + self::htmlPrepare($title); + echo "

    $title

    \n"; + break; + case "###": + $title = $node["title"]; + self::htmlPrepare($title); + echo "

    $title

    \n"; + break; + case "^^^": + $this->textDecorationEnabled = !$this->textDecorationEnabled; + break; + default: + die("Unknown mode: '{$node["mode"]}'\n"); + } + } + + $this->translatedGemtext = ob_get_contents(); + ob_end_clean(); + } + + function getFullHtml() { + if (!$this->cssList) + $css = array("/htmgem/css/htmgem.css"); + else + $css = $this->cssList; + echo << + + +{$this->pageTitle} + +EOL; + foreach ($css as $c) { + echo "\n"; + } + echo << +\n +EOL; + echo $this->translatedGemtext; + echo "\n\n"; + } + + public function __toString() { + return $this->translatedGemtext; + } +} // GemTextTranslate_html + +?> diff --git a/tutogemtext.gmi b/tutogemtext.gmi index a0d6f1c..32f03c7 100644 --- a/tutogemtext.gmi +++ b/tutogemtext.gmi @@ -53,9 +53,11 @@ Il ne peut exister qu’un lien par ligne. Et la ligne est dédiée à ça ! Voi La décoration du texte ne fait pas partie des spécifications de Gemini. -^ Cette **ligne** utilise la //décoration du texte// qui peut ~~barrer~~ ou __souligner__ des mots. +Cette *​*ligne*​* utilise la /​/décoration du texte/​/ qui peut ~​~barrer~​~ ou _​_souligner_​_ des mots. Cette **ligne** utilise la //décoration du texte// qui peut ~~barrer~~ ou __souligner__ des mots. +On peut désactiver et activer la décoration du texte avec **^^^** sur une ligne. + Et voilà ! Vous savez tout ce qui est à savoir 🥳 Les spécifications complètes sont sur le site de Gemini : => https://gemini.circumlunar.space/docs/specification.gmi