markrs/
lexer.rs

1//! This module provides functionality to tokenize a line of markdown text into a vector of `Token`
2//! enums.
3
4use std::mem::take;
5
6use crate::CONFIG;
7use crate::types::Token;
8use crate::utils::push_buffer_to_collection;
9use unicode_categories::UnicodeCategories;
10use unicode_segmentation::UnicodeSegmentation;
11
12/// Tokenizes a line of markdown text into a vector of `Token` enums.
13///
14/// # Arguments
15/// * `markdown_line` - A string slice representing a line of markdown text.
16///
17/// # Returns
18/// A vector of `Token` enums representing the tokenized line.
19///
20/// # Example
21/// ```
22/// use lexer::tokenize;
23/// use types::Token;
24/// let tokens = tokenize("This is *italic* and **bold** text.");
25/// assert_eq!(tokens.len(), 9);
26/// assert_eq!(tokens[4], Token::EmphasisRun { delimiter: '*', length: 1 });
27/// ```
28pub fn tokenize(markdown_line: &str) -> Vec<Token> {
29    if markdown_line.is_empty() {
30        return vec![Token::Newline];
31    }
32
33    let mut tokens: Vec<Token> = Vec::new();
34    let mut buffer: String = String::new();
35
36    let chars = Vec::from_iter(markdown_line.graphemes(true));
37    let str_len = chars.len();
38
39    // Loop through each character, and perform foward lookups for *
40    let mut i = 0;
41    while i < str_len {
42        match chars[i] {
43            "*" if (i == 0 && matches!(chars.get(i + 1), Some(&" ")))
44                || (i > 0
45                    && matches!(chars.get(i - 1), Some(&" ") | Some(&"\t"))
46                    && matches!(chars.get(i + 1), Some(&" "))) =>
47            {
48                push_buffer_to_collection(&mut tokens, &mut buffer);
49
50                // Start of unordered list
51                tokens.push(Token::Punctuation(String::from(chars[i])));
52            }
53            "*" | "_" => {
54                // if the current buffer isn't empty, append a Text token to the Vec<Token>
55                push_buffer_to_collection(&mut tokens, &mut buffer);
56
57                let delimiter = chars[i];
58                let mut run_length = 1;
59                while i + run_length < str_len && chars[i + run_length] == delimiter {
60                    run_length += 1;
61                }
62
63                tokens.push(Token::EmphasisRun {
64                    delimiter: delimiter.chars().next().unwrap(),
65                    length: run_length,
66                });
67
68                i += run_length - 1;
69            }
70            "`" => {
71                push_buffer_to_collection(&mut tokens, &mut buffer);
72
73                if i + 2 < str_len && chars[i + 1] == "`" && chars[i + 2] == "`" {
74                    tokens.push(Token::CodeFence);
75                    i += 2;
76                } else {
77                    tokens.push(Token::CodeTick);
78                }
79            }
80            "\\" => {
81                push_buffer_to_collection(&mut tokens, &mut buffer);
82
83                if i + 1 < str_len {
84                    tokens.push(Token::Escape(String::from(chars[i + 1])));
85                    i += 1;
86                } else {
87                    buffer.push_str(chars[i]);
88                }
89            }
90            "-" => {
91                // Check for thematic break
92                push_buffer_to_collection(&mut tokens, &mut buffer);
93
94                if i + 2 < str_len && chars[i + 1] == "-" && chars[i + 2] == "-" {
95                    tokens.push(Token::ThematicBreak);
96                    i += 2;
97                } else {
98                    tokens.push(Token::Punctuation(String::from(chars[i])));
99                }
100            }
101            "[" => {
102                push_buffer_to_collection(&mut tokens, &mut buffer);
103
104                tokens.push(Token::OpenBracket);
105            }
106            "]" => {
107                push_buffer_to_collection(&mut tokens, &mut buffer);
108
109                tokens.push(Token::CloseBracket);
110            }
111            "(" => {
112                push_buffer_to_collection(&mut tokens, &mut buffer);
113
114                tokens.push(Token::OpenParenthesis);
115            }
116            ")" => {
117                push_buffer_to_collection(&mut tokens, &mut buffer);
118
119                tokens.push(Token::CloseParenthesis);
120            }
121            "|" => {
122                push_buffer_to_collection(&mut tokens, &mut buffer);
123
124                tokens.push(Token::TableCellSeparator);
125            }
126            ">" => {
127                push_buffer_to_collection(&mut tokens, &mut buffer);
128
129                if i == 0 {
130                    tokens.push(Token::BlockQuoteMarker);
131                } else {
132                    buffer.push_str(chars[i]);
133                }
134            }
135            "<" => {
136                push_buffer_to_collection(&mut tokens, &mut buffer);
137
138                while i + 1 < str_len && chars[i + 1] != ">" {
139                    buffer.push_str(chars[i]);
140                    i += 1;
141                }
142
143                if i + 1 < str_len && chars[i + 1] == ">" {
144                    buffer.push_str(chars[i]);
145                    buffer.push_str(chars[i + 1]);
146                    tokens.push(Token::RawHtmlTag(take(&mut buffer)));
147                    i += 1;
148                } else {
149                    buffer.push_str(chars[i]);
150                }
151            }
152            "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" => {
153                let mut marker = String::from(chars[i]);
154                while i + 1 < str_len && chars[i + 1].chars().next().unwrap().is_ascii_digit() {
155                    i += 1;
156                    marker.push_str(chars[i]);
157                }
158
159                if i + 1 < str_len && chars[i + 1] != "." {
160                    buffer.push_str(&marker);
161                } else if i + 2 < str_len && chars[i + 2] == " " {
162                    push_buffer_to_collection(&mut tokens, &mut buffer);
163                    tokens.push(Token::OrderedListMarker(marker));
164
165                    i += 2;
166                    continue;
167                } else {
168                    buffer.push_str(&marker);
169                }
170            }
171            "\t" => {
172                push_buffer_to_collection(&mut tokens, &mut buffer);
173
174                tokens.push(Token::Tab);
175            }
176            " " => {
177                // Will be configurable later, but for now we'll stick to 4 spaces = 1 tab
178                let tab_size = CONFIG.get().unwrap().lexer.tab_size;
179                if i + tab_size <= str_len && chars[i + 1..i + tab_size].iter().all(|&c| c == " ") {
180                    push_buffer_to_collection(&mut tokens, &mut buffer);
181                    tokens.push(Token::Tab);
182                    i += tab_size; // i won't increment after continue, so we do it here
183                    continue;
184                }
185
186                push_buffer_to_collection(&mut tokens, &mut buffer);
187
188                tokens.push(Token::Whitespace);
189            }
190            "" | "\n" => {
191                push_buffer_to_collection(&mut tokens, &mut buffer);
192
193                tokens.push(Token::Newline);
194            }
195            // Note that graphemes() returns strings because graphemes can consist of things like a
196            // char + a modifier
197            _ if is_punctuation(chars[i]) => {
198                push_buffer_to_collection(&mut tokens, &mut buffer);
199                tokens.push(Token::Punctuation(String::from(chars[i])));
200            }
201            _ => buffer.push_str(chars[i]),
202        }
203
204        i += 1;
205    }
206
207    // If the current buffer isn't empty when the loop is over, append it to the tokens vector
208    push_buffer_to_collection(&mut tokens, &mut buffer);
209
210    tokens
211}
212
213/// Helper function to determine if a string is a single punctuation character.
214///
215/// # Arguments
216/// * `input_str` - A string slice to check.
217///
218/// # Returns
219/// Returns `true` if the string is a single punctuation character or symbol currency, otherwise
220/// `false`.
221///
222/// # Example
223/// ```
224/// use lexer::is_punctuation;
225/// assert!(is_punctuation("!"));
226/// assert!(!is_punctuation("Hello"));
227/// assert!(is_punctuation("$"));
228/// ```
229fn is_punctuation(input_str: &str) -> bool {
230    if let Some(ch) = input_str.chars().next() {
231        ch.is_punctuation() || ch.is_symbol_currency()
232    } else {
233        false
234    }
235}
236
237#[cfg(test)]
238mod test;