From 859b0aad811d730377f48abe90169b1603f95966 Mon Sep 17 00:00:00 2001 From: Christophe HENRY Date: Tue, 16 Mar 2021 20:54:10 +0100 Subject: [PATCH] Deep refactoring: parsing, translating, classes * Removes "^" to disable text decoration line-wise. * Split HTML generation in two: parsing and translating. * Create class to handle gemtext parsing. * Create class to translate to HTML. * Create class to generate back gemtext (for future test cases). * Uses generators to parse then translate. * Fix: 404 doesn't occur for an empty file. * Page 404 fully generated by HtmGem itself. * CSS is no longer incorporated in the HTML page. * Handle CSS inclusion by addCss() calls. --- index.gmi | 3 +- index.php | 147 ++++++--------- lib-htmgem.php | 467 +++++++++++++++++++++++++++++++----------------- tutogemtext.gmi | 4 +- 4 files changed, 353 insertions(+), 268 deletions(-) diff --git a/index.gmi b/index.gmi index ba08644..5be4e06 100644 --- a/index.gmi +++ b/index.gmi @@ -118,8 +118,7 @@ La décoration du texte, qui interprête le **gras** par exemple, ne fait pas pa ### Désactiver la décoration du texte On peut : -* commencer la ligne par **^**, -* faire un bloc non décoré avec **^^^**, +* désactiver et activer la décoration du texte avec une ligne **^^^**, * ajouter ce qui suit à la **réécriture** d’URL : > &textDecoration=0 diff --git a/index.php b/index.php index cac10d5..5832956 100644 --- a/index.php +++ b/index.php @@ -1,91 +1,61 @@ "); } -?> - - - -Installation de HtmGem - - - - -\n\n"; - die(); + $t = new \htmgem\GemTextTranslate_html(@file_get_contents("index.gmi")); + echo $t->getFullHtml(); + exit(); } -######################################## /Installation page + + +# to false only if textDecoration=0 in the URL +$textDecoration = "0" != @$_REQUEST['textDecoration']; # Removes the headling and trailling slashes, to be sure there's not any. $filePath = rtrim($_SERVER['DOCUMENT_ROOT'], "/")."/".ltrim($url, "/"); -$fileContents = @file_get_contents($filePath); - - -######################################## 404 page -if (empty($fileContents)) { +/* 404 page + */ +if (!file_exists($filePath)) { error_log("HtmGem: 404 $url $filePath"); - http_response_code(404); ?> - - - - - - - - $url Recharger 🔄 - => / EOF; -echo translateGemToHtml($text404); -echo "\n"; -die(); + $t = new \htmgem\GemTextTranslate_html($page404); + echo $t->getFullHtml(); + exit(); } -######################################## /404 page + +$fileContents = @file_get_contents($filePath); # Removes the Byte Order Mark $fileContents = preg_replace("/\xEF\xBB\xBF/", "", $fileContents); -# Gets the page title: the first occurrence with # at the line start -mb_ereg("#\s*([^\n]+)\n", $fileContents, $matches); -$page_title = @$matches[1]; -###################################### CSS Management -/** -* if &style=source displays the source directly and stops. -* if there's a filename.css besides filename.gmi, use the css and stops. -* if &style= then embbed the default style, and stops. -* if &style= then use htmgem/word.css -* if &style=/… then use the … as as stylesheet. -**/ +/* CSS and special style management + */ +$style = @$_REQUEST['style']; if ("source" == $style) { $basename = basename($filePath); header("Cache-Control: public"); @@ -93,9 +63,12 @@ if ("source" == $style) { header("Content-Type: text/plain"); header("Content-Transfer-Encoding: binary"); header('Content-Length: ' . filesize($filePath)); - readfile($filePath); + echo $fileContents; exit(); } elseif ("pre" == $style) { + # Gets the page title: the first occurrence with # at the line start + mb_ereg("#\s*([^\n]+)\n", $fileContents, $matches); + $page_title = @$matches[1]; $fileContents = htmlspecialchars($fileContents, ENT_HTML5|ENT_NOQUOTES, "UTF-8", false); echo << @@ -104,52 +77,32 @@ if ("source" == $style) { $page_title -
$fileContents
+
+$fileContents
EOL; + exit(); } else { - $parts = pathinfo($filePath); - $localCss = $parts["filename"].".css"; - $localCssFilePath = $parts["dirname"]."/".$localCss; - if (file_exists($localCssFilePath)) { - # Warning, using htmhem.php?url=… will make $localCss not found - # as the path is relative to htmgem.php and not / ! - $cssContent = ""; - } else { - if (empty($style)) { - $cssContent = - "\n"; - } else { - if ("none" == $style) { - $cssContent = ""; - } else { - if ("/" == $style[0]) - $href = $style; - else - $href = "/htmgem/css/$style.css"; - $cssContent = ""; - } + $t = new \htmgem\GemTextTranslate_html($fileContents); + if ("none" == $style) { + $t->addCss(""); + } elseif ("/" == @$style[0]) { + $t->addCss($style); + } elseif (empty($style)) { + $parts = pathinfo($filePath); + $localCss = $parts["filename"].".css"; + $localCssFilePath = $parts["dirname"]."/".$localCss; + if (file_exists($localCssFilePath)) { + # Warning, using htmhem.php?url=… will make $localCss not found + # as the path is relative to htmgem.php and not / ! + $t->addCss($localCss); } + } else { #TODO: regex check for $style + $t->addCss("/htmgem/css/$style.css"); } - echo << - - -$page_title - -$cssContent - - -EOL; - - echo "\n".translateGemToHtml($fileContents); - echo "\n\n"; } - -ob_end_flush(); +echo $t->getFullHtml(); ?> diff --git a/lib-htmgem.php b/lib-htmgem.php index 0711a93..1b40b39 100644 --- a/lib-htmgem.php +++ b/lib-htmgem.php @@ -5,203 +5,78 @@ namespace htmgem; mb_internal_encoding("UTF-8"); mb_regex_encoding("UTF-8"); -$style = @$_REQUEST['style']; - -define("NARROW_NO_BREAK_SPACE", " "); -define("DASHES" - ,"‒" # U+2012 Figure Dash - ."–" # U+2013 En Dash - ."—" # U+2014 Em Dash - ."⸺" # U+2E3A Two-Em Dash - ."⸻" # U+2E3B Three-Em Dash (Three times larger than a single char) -); - /** - * Replaces markups things like __underlined__ to underlined. - * @param $instruction the characters to replace, ex. _ - * @param $markup the markup to replace to, ex. "u" to get - * @param &$text where to replace. + * Parses the gemtext and generates the internal format version + * @param str $fileContents the gemtext to parse */ -function markupPreg($instruction, $markup, &$text) { - $output = $text; - - # Replaces couples "__word__" into "word". - $output = mb_ereg_replace("${instruction}(.+?)${instruction}", "<{$markup}>\\1", $output); - - # Replaces a remaining __ into "" to the end of the line. - $output = mb_ereg_replace("${instruction}(.+)?", "<{$markup}>\\1", $output); - - $text = $output; -} - -/** - * Adds text attributes sucj as underline, bold, … to $line - * @param $line the line to process - */ -function addTextAttributes(&$line) { - global $textDecoration; - if (!$textDecoration) return; - markupPreg("__", "u", $line); - markupPreg("\*\*", "strong", $line); - markupPreg("//", "em", $line); - markupPreg("~~", "del", $line); -} - -/** - * Prepares the raw text to be displayed in HTML environment: - * * Escapes the HTML entities yet contained in the Gemtext. - * * Puts thin unbrakable spaces before some characters. - * @param $text1, $text2 texts to process - */ -function htmlPrepare(&$text) { - $text = htmlspecialchars($text, ENT_HTML5|ENT_NOQUOTES, "UTF-8", false); - $text = mb_ereg_replace("\ ([?!:;»€$])", NARROW_NO_BREAK_SPACE."\\1", $text); - $text = mb_ereg_replace("([«])\ ", "\\1".NARROW_NO_BREAK_SPACE, $text); # Espace fine insécable - - # Warning: using a monospace font editor may not display dashes as they should be! - # Adds no-break spaces to stick the (EM/EN dashes) to words : aaaaaa – bb – ccccc ==> aaaaaa –$bb$– ccccc - $text = mb_ereg_replace("([".DASHES."]) ([^".DASHES.".]+) ([".DASHES."])", "\\1".NARROW_NO_BREAK_SPACE."\\2".NARROW_NO_BREAK_SPACE."\\3", $text); - - # Adds no-break space to stick the (EM/EN dashes) to words : aaaaaa – bb. ==> aaaaaa –$bb. - $text = mb_ereg_replace("([—–]) ([^.]+)\.", "\\1".NARROW_NO_BREAK_SPACE."\\2.", $text); -} - -function translateGemToHtml($fileContents) { - $fileLines = preg_split("/\n/", $fileContents); - if (empty($fileLines[-1])) array_pop($fileLines); # Don't output a last empty line - ob_start(); +function gemtextParser($fileContents) { + $fileLines = explode("\n", $fileContents); $mode = null; - $mode_textAttributes = true; + $current = array(); foreach ($fileLines as $line) { $reDoCount = 0; $mode_textAttributes_temp = false; while (true) { - if ($reDoCount>2) { - error_log("HtmGem: Too many loops, mode == '$mode'"); - $mode = null; - $reDoCount = 0; - break; + /* The continue instruction is used to make another turn when there is a transition + * between two modes. */ + if ($reDoCount>1) { + die("HtmGem: Too many loops, mode == '$mode'"); } $reDoCount += 1; $line1 = substr($line, 0, 1); // $line can be modified $line2 = substr($line, 0, 2); // in the meantime. $line3 = substr($line, 0, 3); if (is_null($mode)) { - if (empty($line)) { - echo "

 

\n"; - } elseif ('^^^' == $line3) { - $mode_textAttributes = !$mode_textAttributes; - } elseif ('^' == $line1 and !$mode_textAttributes_temp) { - if (preg_match("/^\^\s*(.+)$/", $line, $parts)) { - $line = $parts[1]; - $mode_textAttributes_temp = true; - } else { - $mode = "raw"; - } - continue; + if ('^^^' == $line3) { + yield array("mode" => "^^^"); } elseif ("#" == $line1) { - if (preg_match("/^(#{1,3})\s*(.+)/", $line, $sharps)) { - $h_level = strlen($sharps[1]); - $text = $sharps[2]; - htmlPrepare($text); - switch ($h_level) { - case 1: echo "

".$text."

\n"; break; - case 2: echo "

".$text."

\n"; break; - case 3: echo "

".$text."

\n"; break; - } - } else { - $mode = "raw"; - continue; - } + preg_match("/^(#{1,3})\s*(.+)/", $line, $matches); + yield array("mode" => $matches[1], "title" => trim($matches[2])); } elseif ("=>" == $line2) { - if (preg_match("/^=>\s*([^\s]+)(?:\s+(.*))?$/", $line, $linkParts)) { - $url_link = $linkParts[1]; - $url_label = @$linkParts[2]; - preg_match("/^([^:]+):/", $url_link, $matches); - $url_protocol = @$matches[1]; - if (empty($url_protocol)) $url_protocol = "local"; - if (empty(trim($url_label))) { - $url_label = $url_link; - } else { - // the label is humain-made, apply formatting - htmlPrepare($url_label); - if ($mode_textAttributes xor $mode_textAttributes_temp) addTextAttributes($url_label); - } - echo "

$url_label

\n"; - } else { - $mode = "raw"; - continue; - } + preg_match("/^=>\s*([^\s]+)(?:\s+(.*))?$/", $line, $matches); + yield array("mode" => "=>", "link" => trim($matches[1]), "text" => trim(@$matches[2])); } elseif ("```" == $line3) { preg_match("/^```\s*(.*)$/", $line, $matches); - $alt_text = trim($matches[1]); - if (empty($alt_text)) { - echo "
\n";
-                    } else {
-                        echo "
\n";
-                    }
-                    $mode="pre";
+                    $current = array("mode" => "```", "alt" => trim($matches[1]), "texts" => array());
+                    $mode="```";
                 } elseif (">" == $line1) {
-                    echo "
\n"; - $mode = "quote"; - continue; + preg_match("/^>\s*(.*)$/", $line, $matches); + $current = array("mode" => ">", "texts" => array(trim($matches[1]))); + $mode = ">"; } elseif ("*" == $line1) { - echo "
    \n"; - $mode = "ul"; - continue; + preg_match("/^\*\s*(.*)$/", $line, $matches); + $current = array("mode" => "*", "texts" => array(trim($matches[1]))); + $mode = "*"; } else { - $mode = "raw"; - continue; + // text_line + yield array("mode"=>"", "text" => trim($line)); } } else { - if ("raw"==$mode) { - if (empty($line)) { - $line = " "; - } else { - htmlPrepare($line); - if ($mode_textAttributes xor $mode_textAttributes_temp) - addTextAttributes($line); - } - echo "

    $line

    \n"; - $mode = null; - } elseif ("pre"==$mode) { + if ("```"==$mode) { if ("```" == $line3) { - echo "
\n"; + yield $current; + $current = array(); $mode = null; } else { - htmlPrepare($line); - echo $line."\n"; + $current["texts"] []= $line; // No trim() as it’s a preformated text! } - } elseif ("quote"==$mode) { + } elseif (">"==$mode) { if (">" == $line1) { - preg_match("/^>\s*(.*)$/", $line, $quoteParts); - $quote = $quoteParts[1]; - if (empty($quote)) - echo "

 

\n"; - else { - htmlPrepare($quote); - if ($mode_textAttributes xor $mode_textAttributes_temp) - addTextAttributes($line); - echo "

".$quote."

\n"; - } + preg_match("/^>\s*(.*)$/", $line, $matches); + $current["texts"] []= trim($matches[1]); } else { - echo "\n"; + yield $current; + $current = array(); $mode = null; continue; } - } elseif ("ul"==$mode) { + } elseif ("*"==$mode) { if ("*" == $line1) { - preg_match("/^\*\s*(.*)$/", $line, $ulParts); - $li = $ulParts[1]; - if (empty($li)) { - echo "
  •  \n"; - } else { - htmlPrepare($li); - if ($mode_textAttributes xor $mode_textAttributes_temp) addTextAttributes($li); - echo "
  • ".$li."\n"; - } + preg_match("/^\*\s*(.*)$/", $line, $matches); + $current["texts"] []= trim($matches[1]); } else { - echo "\n"; + yield $current; + $current = array(); $mode = null; continue; } @@ -211,10 +86,266 @@ function translateGemToHtml($fileContents) { } break; // exits the while(true) as no continue occured } // while(true) + }// foreach + if ($current) yield $current; # File ends before the block. +} // gemtextParser + + +/** + * Translates the internal format into a gemtext. + * Uses cases: + * + * - test suites + * - serialisation easier with a text content + * - normalization (trimming spaces for instance) + */ +class GemtextTranslate_gemtext { + + function __construct($parsedGemtext) { + $this->parsedGemtext = $parsedGemtext; + $this->translate(); } - $html = ob_get_contents(); - ob_clean(); - return $html; -} + + protected function translate() { + ob_start(); + foreach ($this->parsedGemtext as $node) { + $mode = $node["mode"]; + switch($mode) { + case "": + echo $node["text"]."\n"; + break; + case "*": + foreach ($node["texts"] as $text) { + echo "* $text\n"; + } + break; + case "```": + print("```\n"); + foreach ($node["texts"] as $text) { + echo "$text\n"; + } + print("```\n"); + break; + case ">": + foreach ($node["texts"] as $text) { + echo "> $text\n"; + } + break; + case "=>": + $linkText = $node["text"]; + if (!empty($linkText)) $linkText = " $linkText"; + print("=> ".$node["link"].$linkText."\n"); + break; + case "#": + case "##": + case "###": + print("$mode ".$node["title"]."\n"); + break; + case "^^^": + print("^^^\n"); + break; + default: + die("Unknown mode: '{$node["mode"]}'\n"); + } + } + + $this->translatedGemtext = ob_get_contents(); + ob_end_clean(); + } + + public function __toString() { + return $this->translatedGemtext; + } +} // GemtextTranslate_gemtext + + +/** + * Translates the internal format to HTML + */ +class GemtextTranslate_html { + + protected $cssList = array(); + protected $pageTitle = ""; + public $translatedGemtext; + + function __construct($parsedGemtext, $textDecorationEnabled=true) { + if (empty($parsedGemtext)) + // So translate() will do no loop. + $parsedGemtext = array(); + elseif (is_string($parsedGemtext)) + // to delete the last empty line,

     

    in HTML + $parsedGemtext = rtrim($parsedGemtext); + // The text must be parsed + $parsedGemtext = gemtextParser($parsedGemtext); + $this->parsedGemtext = $parsedGemtext; + $this->translate($textDecorationEnabled); + } + + function addCss($css) { + $this->cssList []= $css; + } + + const NARROW_NO_BREAK_SPACE = " "; + const DASHES + ="‒" # U+2012 Figure Dash + ."–" # U+2013 En Dash + ."—" # U+2014 Em Dash + ."⸺" # U+2E3A Two-Em Dash + ."⸻" # U+2E3B Three-Em Dash (Three times larger than a single char) + ; + + /** + * Replaces markups things like __underlined__ to underlined. + * @param $instruction the characters to replace, ex. _ + * @param $markup the markup to replace to, ex. "u" to get + * @param &$text where to replace. + */ + protected static function markupPreg($instruction, $markup, &$text) { + $output = $text; + + # Replaces couples "__word__" into "word". + $output = mb_ereg_replace("${instruction}(.+?)${instruction}", "<{$markup}>\\1", $output); + + # Replaces a remaining __ into "" to the end of the line. + $output = mb_ereg_replace("${instruction}(.+)?", "<{$markup}>\\1", $output); + + $text = $output; + } + + /** + * Adds text attributes sucj as underline, bold, … to $line + * @param $line the line to process + */ + protected static function addTextDecoration(&$line) { + self::markupPreg("__", "u", $line); + self::markupPreg("\*\*", "strong", $line); + self::markupPreg("//", "em", $line); + self::markupPreg("~~", "del", $line); + } + + /** + * Prepares the raw text to be displayed in HTML environment: + * * Escapes the HTML entities yet contained in the Gemtext. + * * Puts thin unbrakable spaces before some characters. + * @param $text1, $text2 texts to process + */ + protected static function htmlPrepare(&$text) { + if (empty($text)) { + $text = " "; + } else { + $text = htmlspecialchars($text, ENT_HTML5|ENT_NOQUOTES, "UTF-8", false); + $text = mb_ereg_replace("\ ([?!:;»€$])", self::NARROW_NO_BREAK_SPACE."\\1", $text); + $text = mb_ereg_replace("([«])\ ", "\\1".self::NARROW_NO_BREAK_SPACE, $text); # Espace fine insécable + + # Warning: using a monospace font editor may not display dashes as they should be! + # Adds no-break spaces to stick the (EM/EN dashes) to words : aaaaaa – bb – ccccc ==> aaaaaa –$bb$– ccccc + $text = mb_ereg_replace("([".self::DASHES."]) ([^".self::DASHES.".]+) ([".self::DASHES."])", "\\1".self::NARROW_NO_BREAK_SPACE."\\2".self::NARROW_NO_BREAK_SPACE."\\3", $text); + + # Adds no-break space to stick the (EM/EN dashes) to words : aaaaaa – bb. ==> aaaaaa –$bb. + $text = mb_ereg_replace("([—–]) ([^.]+)\.", "\\1".self::NARROW_NO_BREAK_SPACE."\\2.", $text); + } + } + + public function translate($textDecoration=true) { + ob_start(); + foreach ($this->parsedGemtext as $node) { + $mode = $node["mode"]; + switch($mode) { + case "": + $text = $node["text"]; + self::htmlPrepare($text); + if ($textDecoration) self::addTextDecoration($text); + echo "

    $text

    \n"; + break; + case "*": + echo "
      \n"; + foreach ($node["texts"] as $text) { + self::htmlPrepare($text); + if ($textDecoration) self::addTextDecoration($text); + print("
    • $text\n"); + } + echo "
    \n"; + break; + case "```": + $text = implode("\n", $node["texts"]); + self::htmlPrepare($text); + echo "
    \n$text\n
    \n"; + break; + case ">": + $text = implode("\n", $node["texts"]); + self::htmlPrepare($text); + if ($textDecoration) self::addTextDecoration($text); + echo "
    \n$text\n
    \n"; + break; + case "=>": + $link = $node["link"]; + $linkText = $node["text"]; + if (empty($linkText)) { + $linkText = $link; + self::htmlPrepare($linkText); + } else { + self::htmlPrepare($linkText); + if ($textDecoration) self::addTextDecoration($linkText); + } + preg_match("/^([^:]+):/", $link, $matches); + $protocol = @$matches[1]; + if (empty($protocol)) $protocol = "local"; + echo "

    $linkText

    \n"; + break; + case "#": + $title = $node["title"]; + self::htmlPrepare($title); + if (empty($this->pageTitle)) $this->pageTitle = $title; + echo "

    $title

    \n"; + break; + case "##": + $title = $node["title"]; + self::htmlPrepare($title); + echo "

    $title

    \n"; + break; + case "###": + $title = $node["title"]; + self::htmlPrepare($title); + echo "

    $title

    \n"; + break; + case "^^^": + $this->textDecorationEnabled = !$this->textDecorationEnabled; + break; + default: + die("Unknown mode: '{$node["mode"]}'\n"); + } + } + + $this->translatedGemtext = ob_get_contents(); + ob_end_clean(); + } + + function getFullHtml() { + if (!$this->cssList) + $css = array("/htmgem/css/htmgem.css"); + else + $css = $this->cssList; + echo << + + +{$this->pageTitle} + +EOL; + foreach ($css as $c) { + echo "\n"; + } + echo << +\n +EOL; + echo $this->translatedGemtext; + echo "\n\n"; + } + + public function __toString() { + return $this->translatedGemtext; + } +} // GemTextTranslate_html ?> diff --git a/tutogemtext.gmi b/tutogemtext.gmi index a0d6f1c..32f03c7 100644 --- a/tutogemtext.gmi +++ b/tutogemtext.gmi @@ -53,9 +53,11 @@ Il ne peut exister qu’un lien par ligne. Et la ligne est dédiée à ça ! Voi La décoration du texte ne fait pas partie des spécifications de Gemini. -^ Cette **ligne** utilise la //décoration du texte// qui peut ~~barrer~~ ou __souligner__ des mots. +Cette *​*ligne*​* utilise la /​/décoration du texte/​/ qui peut ~​~barrer~​~ ou _​_souligner_​_ des mots. Cette **ligne** utilise la //décoration du texte// qui peut ~~barrer~~ ou __souligner__ des mots. +On peut désactiver et activer la décoration du texte avec **^^^** sur une ligne. + Et voilà ! Vous savez tout ce qui est à savoir 🥳 Les spécifications complètes sont sur le site de Gemini : => https://gemini.circumlunar.space/docs/specification.gmi