markrs/
types.rs

1//! This module defines the types used in the markdown parser, including tokens, inline elements,
2//! block elements, and a cursor for navigating through tokens.
3
4use log::warn;
5
6use crate::html_generator::indent_html;
7use crate::{CONFIG, io::copy_image_to_output_dir, utils::build_rel_prefix};
8
/// Conversion of a parsed markdown element into its HTML representation.
pub trait ToHtml {
    /// Renders the implementing type as a `String` containing its HTML equivalent.
    ///
    /// # Arguments
    /// * `output_dir` - Output directory; the inline image renderer in this module passes it on
    ///   when copying images.
    /// * `input_dir` - Input directory; passed on together with `output_dir` when copying images.
    /// * `html_rel_path` - Path of the HTML page being generated; the inline image renderer uses
    ///   it to build the relative prefix of media URLs.
    fn to_html(&self, output_dir: &str, input_dir: &str, html_rel_path: &str) -> String;
}
13
/// The lexical tokens a line of markdown is broken into before parsing.
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
    /// A run of plain text.
    Text(String),
    /// A run of `length` consecutive `delimiter` characters (candidate emphasis markers).
    EmphasisRun { delimiter: char, length: usize },
    /// Punctuation, carrying its text.
    Punctuation(String),
    /// An opening bracket.
    OpenBracket,
    /// A closing bracket.
    CloseBracket,
    /// An opening parenthesis.
    OpenParenthesis,
    /// A closing parenthesis.
    CloseParenthesis,
    /// A separator between table cells.
    TableCellSeparator,
    /// An ordered-list marker, carrying its text.
    OrderedListMarker(String),
    /// Whitespace.
    Whitespace,
    /// A code tick (inline code delimiter).
    CodeTick,
    /// A code fence (code block delimiter).
    CodeFence,
    /// A thematic break.
    ThematicBreak,
    /// An escape sequence, carrying its text.
    /// NOTE(review): whether the payload includes the escape character itself is decided by the
    /// lexer — confirm there.
    Escape(String),
    /// A tab character.
    Tab,
    /// A line break.
    Newline,
    /// A block quote marker.
    BlockQuoteMarker,
    /// A raw HTML tag, carrying its text.
    RawHtmlTag(String),
}
36
37impl From<String> for Token {
38    fn from(s: String) -> Self {
39        Token::Text(s.to_string())
40    }
41}
42
43/// Represents block-level markdown elements.
44#[derive(Debug, PartialEq)]
45pub enum MdBlockElement {
46    Header {
47        level: u8,
48        content: Vec<MdInlineElement>,
49    },
50    Paragraph {
51        content: Vec<MdInlineElement>,
52    },
53    CodeBlock {
54        language: Option<String>,
55        lines: Vec<String>,
56    },
57    ThematicBreak,
58    UnorderedList {
59        items: Vec<MdListItem>,
60    },
61    OrderedList {
62        starting_num: usize,
63        items: Vec<MdListItem>,
64    },
65    Table {
66        headers: Vec<MdTableCell>,
67        body: Vec<Vec<MdTableCell>>,
68    },
69    BlockQuote {
70        content: Vec<MdBlockElement>,
71    },
72    RawHtml {
73        content: String,
74    },
75}
76
77impl ToHtml for MdBlockElement {
78    fn to_html(&self, output_dir: &str, input_dir: &str, html_rel_path: &str) -> String {
79        match self {
80            MdBlockElement::Header { level, content } => {
81                let inner_html = content
82                    .iter()
83                    .map(|el| el.to_html(output_dir, input_dir, html_rel_path))
84                    .collect::<String>();
85
86                let id = content
87                    .iter()
88                    .map(MdInlineElement::to_plain_text)
89                    .collect::<String>();
90
91                let id = clean_id(id);
92
93                format!("\n<h{level} id=\"{id}\">{inner_html}</h{level}>\n")
94            }
95            MdBlockElement::Paragraph { content } => {
96                let inner_html = content
97                    .iter()
98                    .map(|el| el.to_html(output_dir, input_dir, html_rel_path))
99                    .collect::<String>();
100                format!("<p>{inner_html}</p>")
101            }
102            MdBlockElement::CodeBlock { language, lines } => {
103                let language_class = match language {
104                    Some(language) => format!("language-{language}"),
105                    None => "language-none".to_string(),
106                };
107
108                if CONFIG.get().unwrap().html.use_prism {
109                    let code = lines.join("\n");
110
111                    format!(
112                        "<pre class=\"{language_class} line-numbers\" style=\"white-space: pre-wrap;\" data-prismjs-copy=\"📋\">\n<code class=\"{language_class} line-numbers\">{code}</code></pre>"
113                    )
114                } else {
115                    let code = lines
116                        .iter()
117                        .map(|line| format!("<code class=\"non_prism\">{line}</code>"))
118                        .collect::<String>();
119
120                    format!("<pre class=\"non_prism\">{code}</pre>")
121                }
122            }
123            MdBlockElement::ThematicBreak => "<hr>".to_string(),
124            MdBlockElement::UnorderedList { items } => {
125                let inner_items = items
126                    .iter()
127                    .map(|item| item.to_html(output_dir, input_dir, html_rel_path))
128                    .collect::<String>();
129
130                let inner_items = indent_html(&inner_items, 1);
131                format!("<ul>\n{inner_items}\n</ul>")
132            }
133            MdBlockElement::OrderedList {
134                items,
135                starting_num,
136            } => {
137                let inner_items = items
138                    .iter()
139                    .map(|item| item.to_html(output_dir, input_dir, html_rel_path))
140                    .collect::<String>();
141
142                let inner_items = indent_html(&inner_items, 1);
143                format!("<ol start=\"{starting_num}\">\n{inner_items}\n</ol>")
144            }
145            MdBlockElement::Table { headers, body } => {
146                let header_html = headers
147                    .iter()
148                    .map(|cell| cell.to_html(output_dir, input_dir, html_rel_path))
149                    .collect::<Vec<_>>()
150                    .join("\n");
151
152                let header_html = indent_html(&header_html, 3);
153
154                let body_html = body
155                    .iter()
156                    .map(|row| {
157                        let cell_html = row
158                            .iter()
159                            .map(|cell| cell.to_html(output_dir, input_dir, html_rel_path))
160                            .collect::<Vec<_>>()
161                            .join("\n");
162
163                        let cell_html = indent_html(&cell_html, 1);
164
165                        format!("<tr>\n{cell_html}\n</tr>")
166                    })
167                    .collect::<Vec<_>>()
168                    .join("\n");
169
170                let body_html = indent_html(&body_html, 2);
171
172                format!(
173                    "<table>\n\t<thead>\n\t\t<tr>\n{header_html}\n\t\t</tr>\n\t</thead>\n\t<tbody>\n{body_html}\n\t</tbody>\n</table>"
174                )
175            }
176            MdBlockElement::BlockQuote { content } => {
177                let inner_html = content
178                    .iter()
179                    .map(|el| el.to_html(output_dir, input_dir, html_rel_path))
180                    .collect::<String>();
181
182                format!("<blockquote>\n{inner_html}\n</blockquote>")
183            }
184            MdBlockElement::RawHtml { content } => {
185                format!("{}\n", content)
186            }
187        }
188    }
189}
190
/// Builds an id string from heading text.
///
/// Everything from a `<` up to and including the next `>` is dropped, as is every character that
/// is not alphanumeric, an underscore or a space. Spaces and underscores then become hyphens, the
/// result is lowercased, and leading/trailing hyphens are trimmed.
fn clean_id(old_id: String) -> String {
    let mut inside_tag = false;

    let kept: String = old_id
        .chars()
        .filter(|&c| match c {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag && (c.is_alphanumeric() || c == '_' || c == ' '),
        })
        .collect();

    let hyphenated = kept.replace([' ', '_'], "-").to_lowercase();
    hyphenated.trim_matches('-').to_string()
}
215
216/// Represents a list item in markdown, which can contain block elements.
217///
218/// # Fields
219/// * `content` - The content of the list item, which can be any block-level markdown element.
220#[derive(Debug, PartialEq)]
221pub struct MdListItem {
222    pub content: MdBlockElement,
223}
224
225impl ToHtml for MdListItem {
226    fn to_html(&self, output_dir: &str, input_dir: &str, html_rel_path: &str) -> String {
227        match &self.content {
228            MdBlockElement::UnorderedList { items } => {
229                let inner_items = items
230                    .iter()
231                    .map(|item| item.to_html(output_dir, input_dir, html_rel_path))
232                    .collect::<String>();
233                let inner_items = indent_html(&inner_items, 1);
234                format!("<ul>\n{inner_items}\n</ul>")
235            }
236            MdBlockElement::OrderedList {
237                items,
238                starting_num,
239            } => {
240                let inner_items = items
241                    .iter()
242                    .map(|item| item.to_html(output_dir, input_dir, html_rel_path))
243                    .collect::<String>();
244                format!("<ol start=\"{starting_num}\">\n{inner_items}\n</ol>")
245            }
246            _ => {
247                let inner_html = indent_html(
248                    &self.content.to_html(output_dir, input_dir, html_rel_path),
249                    1,
250                );
251                format!("<li>\n{inner_html}\n</li>\n")
252            }
253        }
254    }
255}
256
257/// Represents a cell in a markdown table.
258#[derive(Debug, PartialEq, Clone)]
259pub struct MdTableCell {
260    pub content: Vec<MdInlineElement>,
261    pub alignment: TableAlignment,
262    pub is_header: bool,
263}
264
265impl ToHtml for MdTableCell {
266    fn to_html(&self, output_dir: &str, input_dir: &str, html_rel_path: &str) -> String {
267        let inner_html = self
268            .content
269            .iter()
270            .map(|el| el.to_html(output_dir, input_dir, html_rel_path))
271            .collect::<String>();
272
273        let text_alignment = match self.alignment {
274            TableAlignment::Left | TableAlignment::None => "left",
275            TableAlignment::Center => "center",
276            TableAlignment::Right => "right",
277        };
278
279        match self.is_header {
280            true => format!("<th style=\"text-align:{text_alignment};\">{inner_html}</th>"),
281            false => format!("<td style=\"text-align:{text_alignment};\">{inner_html}</td>"),
282        }
283    }
284}
285
/// Horizontal alignment of the cells in a markdown table column.
#[derive(Debug, PartialEq, Clone)]
pub enum TableAlignment {
    /// Left-aligned text.
    Left,
    /// Centered text.
    Center,
    /// Right-aligned text.
    Right,
    /// No explicit alignment; rendered the same as `Left`.
    None,
}
294
/// Inline markdown elements (text, bold/italic, links, images, inline code, …).
#[derive(Debug, PartialEq, Clone)]
pub enum MdInlineElement {
    /// Plain text, emitted as-is.
    Text {
        content: String,
    },
    /// Bold content; rendered as `<b>`.
    Bold {
        content: Vec<MdInlineElement>,
    },
    /// Italic content; rendered as `<i>`.
    Italic {
        content: Vec<MdInlineElement>,
    },
    /// A hyperlink with its label (`text`), optional `title` attribute and target `url`.
    Link {
        text: Vec<MdInlineElement>,
        title: Option<String>,
        url: String,
    },
    /// An image with its alternative text, optional `title` attribute and source `url`.
    Image {
        alt_text: String,
        title: Option<String>,
        url: String,
    },
    /// Inline code; rendered as `<code>`.
    Code {
        content: String,
    },
    /// A stand-in for the character `ch` found at `token_position`; rendered as `ch` itself.
    /// NOTE(review): presumably inserted for delimiter runs that are resolved later — confirm
    /// against the inline parser.
    Placeholder {
        ch: char,
        token_position: usize,
    },
}
325
326impl From<String> for MdInlineElement {
327    fn from(s: String) -> Self {
328        MdInlineElement::Text {
329            content: s.to_string(),
330        }
331    }
332}
333
334impl ToHtml for MdInlineElement {
335    fn to_html(&self, output_dir: &str, input_dir: &str, html_rel_path: &str) -> String {
336        match self {
337            MdInlineElement::Text { content } => content.clone(),
338            MdInlineElement::Bold { content } => {
339                let inner_html = content
340                    .iter()
341                    .map(|el| el.to_html(output_dir, input_dir, html_rel_path))
342                    .collect::<String>();
343                format!("<b>{}</b>", inner_html)
344            }
345            MdInlineElement::Italic { content } => {
346                let inner_html = content
347                    .iter()
348                    .map(|el| el.to_html(output_dir, input_dir, html_rel_path))
349                    .collect::<String>();
350                format!("<i>{}</i>", inner_html)
351            }
352            MdInlineElement::Link { text, title, url } => {
353                let label_html = text
354                    .iter()
355                    .map(|el| el.to_html(output_dir, input_dir, html_rel_path))
356                    .collect::<String>();
357
358                if url.contains("youtube.com") && url.contains("v=") {
359                    let video_id = url
360                        .split("v=")
361                        .nth(1)
362                        .and_then(|s| s.split('&').next())
363                        .unwrap_or("");
364
365                    return format!(
366                        r#"<div class="video-container">
367                        <iframe width="560" height="315" src="https://www.youtube.com/embed/{}" 
368                        title="YouTube video player" frameborder="0" allowfullscreen></iframe>
369                        </div>"#,
370                        video_id
371                    );
372                }
373
374                // Links to external URLs will open in a new tab
375                if url.starts_with("http") {
376                    match title {
377                        Some(text) => {
378                            format!(
379                                "<a href=\"{url}\" title=\"{text}\" target=\"_blank\">{label_html}⮺</a>"
380                            )
381                        }
382                        None => format!("<a href=\"{url}\" target=\"_blank\">{label_html}⮺</a>"),
383                    }
384                } else {
385                    match title {
386                        Some(text) => {
387                            format!("<a href=\"{url}\" title=\"{text}\">{label_html}</a>")
388                        }
389                        None => format!("<a href=\"{url}\">{label_html}</a>"),
390                    }
391                }
392            }
393            MdInlineElement::Image {
394                alt_text,
395                title,
396                url,
397            } => {
398                // If the image uses a relative path, copy it to the output directory
399                let media_url = if !url.starts_with("http") {
400                    if let Err(e) = copy_image_to_output_dir(url, output_dir, input_dir) {
401                        warn!("Unable to copy image {url}: {e}");
402                    }
403
404                    // Update the URL to point to the copied image in the output directory
405                    let url = url.rsplit('/').next().unwrap_or(url);
406
407                    let rel_prefix = build_rel_prefix(html_rel_path);
408
409                    &format!("./{}/media/{}", rel_prefix.to_string_lossy(), url)
410                } else {
411                    url
412                };
413
414                match title {
415                    Some(text) => {
416                        format!("<img src=\"{media_url}\" alt=\"{alt_text}\" title=\"{text}\"/>")
417                    }
418                    None => format!("<img src=\"{media_url}\" alt=\"{alt_text}\"/>"),
419                }
420            }
421            MdInlineElement::Code { content } => format!("<code>{content}</code>"),
422            MdInlineElement::Placeholder {
423                ch,
424                token_position: _,
425            } => ch.to_string(),
426        }
427    }
428}
429
430impl MdInlineElement {
431    /// Converts the inline element to a plain text representation.
432    pub fn to_plain_text(&self) -> String {
433        match self {
434            MdInlineElement::Text { content } => content.clone(),
435            MdInlineElement::Bold { content } => content
436                .iter()
437                .map(MdInlineElement::to_plain_text)
438                .collect::<Vec<_>>()
439                .join(""),
440            MdInlineElement::Italic { content } => content
441                .iter()
442                .map(MdInlineElement::to_plain_text)
443                .collect::<Vec<_>>()
444                .join(""),
445            MdInlineElement::Link { text, .. } => text
446                .iter()
447                .map(MdInlineElement::to_plain_text)
448                .collect::<Vec<_>>()
449                .join(""),
450            MdInlineElement::Image { alt_text, .. } => alt_text.clone(),
451            MdInlineElement::Code { content } => content.clone(),
452            MdInlineElement::Placeholder {
453                ch,
454                token_position: _,
455            } => ch.to_string(),
456        }
457    }
458}
459
460/// Cursor for navigating through a vector of tokens
461///
462/// This struct provides methods to access the current token, peek ahead or behind, and advance the
463/// cursor position.
464///
465/// # Fields
466/// * `tokens` - A vector of tokens to navigate through.
467/// * `current_position` - The current position of the cursor within the token vector.
468#[derive(Debug)]
469pub struct TokenCursor {
470    pub tokens: Vec<Token>,
471    pub current_position: usize,
472}
473
474impl TokenCursor {
475    /// Returns the current token, if any.
476    pub fn current(&self) -> Option<&Token> {
477        self.tokens.get(self.current_position)
478    }
479
480    /// Returns the nth next token, if any.
481    ///
482    /// # Arguments
483    /// * `n` - The number of tokens to look ahead.
484    ///
485    /// # Returns
486    /// An `Option` containing a reference to the token if it exists, or `None` if it is out of
487    /// bounds.
488    pub fn peek_ahead(&self, n: usize) -> Option<&Token> {
489        self.tokens.get(self.current_position + n)
490    }
491
492    /// Returns the nth previous token, if any.
493    ///
494    /// # Arguments
495    /// * `n` - The number of tokens to look behind.
496    ///
497    /// # Returns
498    /// An `Option` containing a reference to the token if it exists, or `None` if it is out of
499    pub fn _peek_behind(&self, n: usize) -> Option<&Token> {
500        self.tokens.get(self.current_position - n)
501    }
502
503    /// Moves the cursor forward one position.
504    pub fn advance(&mut self) {
505        if self.current_position < self.tokens.len() {
506            self.current_position += 1;
507        }
508    }
509
510    /// Sets the cursor's position to the specified position.
511    ///
512    /// # Arguments
513    /// * `pos` - The position to set the cursor to.
514    ///
515    /// # Panics
516    /// Panics if the position is out of bounds for the token list.
517    pub fn _set_position(&mut self, pos: usize) {
518        if pos < self.tokens.len() {
519            self.current_position = pos;
520        } else {
521            panic!("Position {pos} is out of bounds for the TokenCursor");
522        }
523    }
524
525    /// Returns the current position of the cursor.
526    pub fn position(&self) -> usize {
527        self.current_position
528    }
529
530    /// Returns whether the cursor is at the end of the token list.
531    pub fn is_at_eof(&self) -> bool {
532        self.current_position >= self.tokens.len()
533    }
534}
535
/// Bookkeeping for one delimiter run in a markdown document.
///
/// A delimiter run is a sequence of the same character (e.g., `*`, `_`, `~`) that can be used for
/// bold/italic writing.
///
/// # Fields
/// * `ch` - The delimiter character (e.g., `*`, `_`, `~`).
/// * `run_length` - How many times the delimiter character appears in a row.
/// * `token_position` - The position of the first token in this delimiter run.
/// * `parsed_position` - The position in the `Vec<MdInlineElement>` where the content of this
///   delimiter run will be stored.
/// * `active` - Whether this delimiter run is still active (i.e., it has not been closed).
/// * `can_open` - Whether this run may open emphasis; requires it to be left-flanking.
/// * `can_close` - Whether this run may close emphasis; requires it to be right-flanking.
#[derive(Debug, Clone)]
pub struct Delimiter {
    pub ch: char,
    pub run_length: usize,
    pub token_position: usize,
    pub parsed_position: usize,
    pub active: bool,
    pub can_open: bool,  // Must be left-flanking
    pub can_close: bool, // Must be right-flanking
}
560
561impl Delimiter {
562    /// Determines whether a delimiter is "Left", "Right", or "Both" flanking
563    /// For exmample, it is left flanking if it's not followed by non-whitespace, and either:
564    /// 1. Not followed by punctuation
565    /// 2. Followed by punctuation and preceded by whitespace or punctuation
566    ///
567    /// Modifies the `can_open` and `can_close` fields in-place based on the classification.
568    ///
569    /// See i<https://spec.commonmark.org/0.31.2/#left-flanking-delimiter-run> for more information.
570    ///
571    /// # Arguments
572    /// * `tokens` - A slice of tokens to classify the delimiter against.
573    pub fn classify_flanking(&mut self, tokens: &[Token]) {
574        let before = if self.token_position > 0 {
575            Some(&tokens[self.token_position - 1])
576        } else {
577            None
578        };
579
580        let after = tokens.get(self.token_position + 1);
581        let followed_by_whitespace = after.is_none_or(is_whitespace);
582        let followed_by_punctuation = after.is_some_and(is_punctuation);
583
584        let preceded_by_whitespace = before.is_none_or(is_whitespace);
585        let preceded_by_punctuation = before.is_some_and(is_punctuation);
586
587        let is_left_flanking = if followed_by_whitespace {
588            false
589        } else if !followed_by_punctuation {
590            true
591        } else {
592            preceded_by_whitespace || preceded_by_punctuation
593        };
594
595        let is_right_flanking = if preceded_by_whitespace {
596            false
597        } else if !preceded_by_punctuation {
598            true
599        } else {
600            followed_by_whitespace || followed_by_punctuation
601        };
602
603        let delimiter_char = self.ch;
604
605        // Apply Rule of 3 and underscore restrictions
606        let is_underscore = delimiter_char == '_';
607
608        if is_underscore {
609            self.can_open = is_left_flanking && (!is_right_flanking || followed_by_punctuation);
610
611            self.can_close = is_right_flanking && (!is_left_flanking || followed_by_punctuation);
612        } else {
613            self.can_open = is_left_flanking;
614            self.can_close = is_right_flanking;
615        }
616    }
617}
618
619/// Helper function to determine if a token is whitespace or newline.
620///
621/// # Arguments
622/// * `token` - The token to check.
623fn is_whitespace(token: &Token) -> bool {
624    matches!(token, Token::Newline | Token::Whitespace)
625}
626
627/// Helper function to determine if a token is punctuation.
628///
629/// # Arguments
630/// * `token` - The token to check.
631fn is_punctuation(token: &Token) -> bool {
632    matches!(
633        token,
634        Token::Punctuation(_)
635            | Token::EmphasisRun {
636                delimiter: _,
637                length: _
638            }
639            | Token::OpenBracket
640            | Token::CloseBracket
641            | Token::OpenParenthesis
642            | Token::CloseParenthesis
643    )
644}