1use std::mem::take;
5
6use crate::CONFIG;
7use crate::types::Token;
8use crate::utils::push_buffer_to_collection;
9use unicode_categories::UnicodeCategories;
10use unicode_segmentation::UnicodeSegmentation;
11
12pub fn tokenize(markdown_line: &str) -> Vec<Token> {
29 if markdown_line.is_empty() {
30 return vec![Token::Newline];
31 }
32
33 let mut tokens: Vec<Token> = Vec::new();
34 let mut buffer: String = String::new();
35
36 let chars = Vec::from_iter(markdown_line.graphemes(true));
37 let str_len = chars.len();
38
39 let mut i = 0;
41 while i < str_len {
42 match chars[i] {
43 "*" if (i == 0 && matches!(chars.get(i + 1), Some(&" ")))
44 || (i > 0
45 && matches!(chars.get(i - 1), Some(&" ") | Some(&"\t"))
46 && matches!(chars.get(i + 1), Some(&" "))) =>
47 {
48 push_buffer_to_collection(&mut tokens, &mut buffer);
49
50 tokens.push(Token::Punctuation(String::from(chars[i])));
52 }
53 "*" | "_" => {
54 push_buffer_to_collection(&mut tokens, &mut buffer);
56
57 let delimiter = chars[i];
58 let mut run_length = 1;
59 while i + run_length < str_len && chars[i + run_length] == delimiter {
60 run_length += 1;
61 }
62
63 tokens.push(Token::EmphasisRun {
64 delimiter: delimiter.chars().next().unwrap(),
65 length: run_length,
66 });
67
68 i += run_length - 1;
69 }
70 "`" => {
71 push_buffer_to_collection(&mut tokens, &mut buffer);
72
73 if i + 2 < str_len && chars[i + 1] == "`" && chars[i + 2] == "`" {
74 tokens.push(Token::CodeFence);
75 i += 2;
76 } else {
77 tokens.push(Token::CodeTick);
78 }
79 }
80 "\\" => {
81 push_buffer_to_collection(&mut tokens, &mut buffer);
82
83 if i + 1 < str_len {
84 tokens.push(Token::Escape(String::from(chars[i + 1])));
85 i += 1;
86 } else {
87 buffer.push_str(chars[i]);
88 }
89 }
90 "-" => {
91 push_buffer_to_collection(&mut tokens, &mut buffer);
93
94 if i + 2 < str_len && chars[i + 1] == "-" && chars[i + 2] == "-" {
95 tokens.push(Token::ThematicBreak);
96 i += 2;
97 } else {
98 tokens.push(Token::Punctuation(String::from(chars[i])));
99 }
100 }
101 "[" => {
102 push_buffer_to_collection(&mut tokens, &mut buffer);
103
104 tokens.push(Token::OpenBracket);
105 }
106 "]" => {
107 push_buffer_to_collection(&mut tokens, &mut buffer);
108
109 tokens.push(Token::CloseBracket);
110 }
111 "(" => {
112 push_buffer_to_collection(&mut tokens, &mut buffer);
113
114 tokens.push(Token::OpenParenthesis);
115 }
116 ")" => {
117 push_buffer_to_collection(&mut tokens, &mut buffer);
118
119 tokens.push(Token::CloseParenthesis);
120 }
121 "|" => {
122 push_buffer_to_collection(&mut tokens, &mut buffer);
123
124 tokens.push(Token::TableCellSeparator);
125 }
126 ">" => {
127 push_buffer_to_collection(&mut tokens, &mut buffer);
128
129 if i == 0 {
130 tokens.push(Token::BlockQuoteMarker);
131 } else {
132 buffer.push_str(chars[i]);
133 }
134 }
135 "<" => {
136 push_buffer_to_collection(&mut tokens, &mut buffer);
137
138 while i + 1 < str_len && chars[i + 1] != ">" {
139 buffer.push_str(chars[i]);
140 i += 1;
141 }
142
143 if i + 1 < str_len && chars[i + 1] == ">" {
144 buffer.push_str(chars[i]);
145 buffer.push_str(chars[i + 1]);
146 tokens.push(Token::RawHtmlTag(take(&mut buffer)));
147 i += 1;
148 } else {
149 buffer.push_str(chars[i]);
150 }
151 }
152 "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" => {
153 let mut marker = String::from(chars[i]);
154 while i + 1 < str_len && chars[i + 1].chars().next().unwrap().is_ascii_digit() {
155 i += 1;
156 marker.push_str(chars[i]);
157 }
158
159 if i + 1 < str_len && chars[i + 1] != "." {
160 buffer.push_str(&marker);
161 } else if i + 2 < str_len && chars[i + 2] == " " {
162 push_buffer_to_collection(&mut tokens, &mut buffer);
163 tokens.push(Token::OrderedListMarker(marker));
164
165 i += 2;
166 continue;
167 } else {
168 buffer.push_str(&marker);
169 }
170 }
171 "\t" => {
172 push_buffer_to_collection(&mut tokens, &mut buffer);
173
174 tokens.push(Token::Tab);
175 }
176 " " => {
177 let tab_size = CONFIG.get().unwrap().lexer.tab_size;
179 if i + tab_size <= str_len && chars[i + 1..i + tab_size].iter().all(|&c| c == " ") {
180 push_buffer_to_collection(&mut tokens, &mut buffer);
181 tokens.push(Token::Tab);
182 i += tab_size; continue;
184 }
185
186 push_buffer_to_collection(&mut tokens, &mut buffer);
187
188 tokens.push(Token::Whitespace);
189 }
190 "" | "\n" => {
191 push_buffer_to_collection(&mut tokens, &mut buffer);
192
193 tokens.push(Token::Newline);
194 }
195 _ if is_punctuation(chars[i]) => {
198 push_buffer_to_collection(&mut tokens, &mut buffer);
199 tokens.push(Token::Punctuation(String::from(chars[i])));
200 }
201 _ => buffer.push_str(chars[i]),
202 }
203
204 i += 1;
205 }
206
207 push_buffer_to_collection(&mut tokens, &mut buffer);
209
210 tokens
211}
212
213fn is_punctuation(input_str: &str) -> bool {
230 if let Some(ch) = input_str.chars().next() {
231 ch.is_punctuation() || ch.is_symbol_currency()
232 } else {
233 false
234 }
235}
236
237#[cfg(test)]
238mod test;