#![allow(dead_code)]
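//! A small, self-contained HTML tokenizer: a byte-driven state machine that
//! turns markup into a flat stream of `Token`s (doctype, tags, characters,
//! comments, and a final EOF marker).
//!
//! Rough usage sketch (the module path is illustrative, not fixed by this file):
//!
//! ```ignore
//! let mut t = HtmlTokenizer::from_text("<p class=\"intro\">hi</p>");
//! let tokens = t.tokenize_all();
//! assert_eq!(*tokens.last().unwrap(), Token::Eof);
//! ```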

use alloc::{
    string::{String, ToString},
    vec::Vec,
};

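/// A single attribute collected from a start tag: lower-cased name plus its
/// (entity-decoded) value.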
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Attribute {
    pub name: String,
    pub value: String,
}

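/// A token produced by the tokenizer. `StartTag` carries the lower-cased tag
/// name, its attributes, and a flag that is true when the tag is written as
/// self-closing or names a void element; `Doctype` carries the lower-cased
/// doctype name.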
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    Doctype(String),
    StartTag(String, Vec<Attribute>, bool),
    EndTag(String),
    Character(char),
    Comment(String),
    Eof,
}

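/// Internal states of the scanning state machine (a simplified subset of the
/// HTML tokenization states).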
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenizerState {
    Data,
    TagOpen,
    EndTagOpen,
    TagName,
    BeforeAttrName,
    AttrName,
    AfterAttrName,
    BeforeAttrValue,
    AttrValueDoubleQuoted,
    AttrValueSingleQuoted,
    AttrValueUnquoted,
    AfterAttrValueQuoted,
    SelfClosingStartTag,
    BogusComment,
    MarkupDeclarationOpen,
    CommentStart,
    CommentStartDash,
    InComment,
    CommentEndDash,
    CommentEnd,
}

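/// Elements that never take a closing tag; their start tags are always emitted
/// with the self-closing flag set.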
const VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
    "track", "wbr",
];

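/// Streaming HTML tokenizer. Holds the input bytes, a cursor, the buffers for
/// the tag/attribute/comment currently being built, and a one-byte
/// "reconsume" slot used to replay the last consumed byte.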
pub struct HtmlTokenizer {
    input: Vec<u8>,
    pos: usize,
    state: TokenizerState,
    current_tag_name: String,
    current_tag_attrs: Vec<Attribute>,
    current_attr_name: String,
    current_attr_value: String,
    self_closing: bool,
    is_end_tag: bool,
    comment_data: String,
    reconsume: bool,
    current_char: u8,
}

impl HtmlTokenizer {
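    /// Creates a tokenizer over a copy of `input`, starting in the `Data` state.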
    pub fn new(input: &[u8]) -> Self {
        Self {
            input: input.to_vec(),
            pos: 0,
            state: TokenizerState::Data,
            current_tag_name: String::new(),
            current_tag_attrs: Vec::new(),
            current_attr_name: String::new(),
            current_attr_value: String::new(),
            self_closing: false,
            is_end_tag: false,
            comment_data: String::new(),
            reconsume: false,
            current_char: 0,
        }
    }

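    /// Convenience constructor for string input. Bytes are consumed one at a
    /// time, so multi-byte (non-ASCII) UTF-8 sequences are not recombined into
    /// single characters.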
    pub fn from_text(input: &str) -> Self {
        Self::new(input.as_bytes())
    }

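    /// Repeatedly calls `next_token` and collects every token, including the
    /// trailing `Token::Eof`.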
    pub fn tokenize_all(&mut self) -> Vec<Token> {
        let mut tokens = Vec::new();
        loop {
            let token = self.next_token();
            if token == Token::Eof {
                tokens.push(Token::Eof);
                break;
            }
            tokens.push(token);
        }
        tokens
    }

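    /// Returns the next input byte. If the reconsume flag is set, the most
    /// recently consumed byte is replayed instead of advancing the cursor.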
    fn consume_next(&mut self) -> Option<u8> {
        if self.reconsume {
            self.reconsume = false;
            return Some(self.current_char);
        }
        if self.pos < self.input.len() {
            let ch = self.input[self.pos];
            self.pos += 1;
            self.current_char = ch;
            Some(ch)
        } else {
            None
        }
    }

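    /// Looks at the next byte without consuming it.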
    fn peek(&self) -> Option<u8> {
        if self.pos < self.input.len() {
            Some(self.input[self.pos])
        } else {
            None
        }
    }

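    /// ASCII case-insensitive test of whether the remaining input starts with `s`.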
    fn starts_with_ci(&self, s: &str) -> bool {
        let bytes = s.as_bytes();
        if self.pos + bytes.len() > self.input.len() {
            return false;
        }
        for (i, &b) in bytes.iter().enumerate() {
            let input_byte = self.input[self.pos + i];
            if !input_byte.eq_ignore_ascii_case(&b) {
                return false;
            }
        }
        true
    }

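    /// Builds the pending start or end tag token and clears the tag buffers.
    /// Start tags for void elements are always flagged as self-closing.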
    fn emit_tag(&mut self) -> Token {
        let name = core::mem::take(&mut self.current_tag_name);
        let attrs = core::mem::take(&mut self.current_tag_attrs);
        let sc = self.self_closing;
        self.self_closing = false;

        if self.is_end_tag {
            self.is_end_tag = false;
            Token::EndTag(name)
        } else {
            let is_void = VOID_ELEMENTS.contains(&name.as_str());
            Token::StartTag(name, attrs, sc || is_void)
        }
    }

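    /// Pushes the attribute currently being built onto the pending tag, if a
    /// name has been collected; otherwise just resets the attribute buffers.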
    fn finish_attr(&mut self) {
        if !self.current_attr_name.is_empty() {
            let attr = Attribute {
                name: core::mem::take(&mut self.current_attr_name),
                value: core::mem::take(&mut self.current_attr_value),
            };
            self.current_tag_attrs.push(attr);
        } else {
            self.current_attr_name.clear();
            self.current_attr_value.clear();
        }
    }

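    /// Decodes a character reference that follows an already-consumed `&`.
    /// Handles a handful of named entities (amp, lt, gt, quot, apos, nbsp) and
    /// decimal/hex numeric references, scanning at most ten characters.
    /// Unrecognized names rewind the cursor and yield a literal `&`; invalid
    /// numeric references yield `None`.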
    fn try_decode_entity(&mut self) -> Option<char> {
        let start = self.pos;
        let mut entity = String::new();

        for _ in 0..10 {
            match self.peek() {
                Some(b';') => {
                    self.pos += 1;
                    break;
                }
                Some(ch) if ch.is_ascii_alphanumeric() || ch == b'#' => {
                    self.pos += 1;
                    entity.push(ch as char);
                }
                _ => break,
            }
        }

        match entity.as_str() {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            "nbsp" => Some('\u{00A0}'),
            s if s.starts_with('#') => {
                let num_str = &s[1..];
                let code = if let Some(hex) = num_str.strip_prefix('x') {
                    u32::from_str_radix(hex, 16).ok()
                } else {
                    num_str.parse::<u32>().ok()
                };
                code.and_then(char::from_u32)
            }
            _ => {
                self.pos = start;
                Some('&')
            }
        }
    }

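    /// Drives the state machine forward until a complete token can be emitted.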
    pub fn next_token(&mut self) -> Token {
        loop {
            match self.state {
                TokenizerState::Data => match self.consume_next() {
                    Some(b'<') => {
                        self.state = TokenizerState::TagOpen;
                    }
                    Some(b'&') => {
                        if let Some(ch) = self.try_decode_entity() {
                            return Token::Character(ch);
                        }
                        return Token::Character('&');
                    }
                    Some(ch) => {
                        return Token::Character(ch as char);
                    }
                    None => return Token::Eof,
                },

                TokenizerState::TagOpen => {
                    match self.consume_next() {
                        Some(b'!') => {
                            self.state = TokenizerState::MarkupDeclarationOpen;
                        }
                        Some(b'/') => {
                            self.state = TokenizerState::EndTagOpen;
                        }
                        Some(ch) if ch.is_ascii_alphabetic() => {
                            self.current_tag_name.clear();
                            self.current_tag_attrs.clear();
                            self.self_closing = false;
                            self.is_end_tag = false;
                            self.current_tag_name.push(ch.to_ascii_lowercase() as char);
                            self.state = TokenizerState::TagName;
                        }
                        Some(b'?') => {
                            self.comment_data.clear();
                            self.state = TokenizerState::BogusComment;
                        }
                        None => {
                            // EOF right after `<`: emit the `<` as text; the next
                            // call returns Eof. (Setting `reconsume` here would
                            // replay the `<` forever.)
                            self.state = TokenizerState::Data;
                            return Token::Character('<');
                        }
                        Some(_) => {
                            self.reconsume = true;
                            self.state = TokenizerState::Data;
                            return Token::Character('<');
                        }
                    }
                }

                TokenizerState::EndTagOpen => {
                    match self.consume_next() {
                        Some(ch) if ch.is_ascii_alphabetic() => {
                            self.current_tag_name.clear();
                            self.current_tag_attrs.clear();
                            self.self_closing = false;
                            self.is_end_tag = true;
                            self.current_tag_name.push(ch.to_ascii_lowercase() as char);
                            self.state = TokenizerState::TagName;
                        }
                        Some(b'>') => {
                            self.state = TokenizerState::Data;
                        }
                        _ => {
                            self.comment_data.clear();
                            self.reconsume = true;
                            self.state = TokenizerState::BogusComment;
                        }
                    }
                }

                TokenizerState::TagName => match self.consume_next() {
                    Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
                        self.state = TokenizerState::BeforeAttrName;
                    }
                    Some(b'/') => {
                        self.state = TokenizerState::SelfClosingStartTag;
                    }
                    Some(b'>') => {
                        self.state = TokenizerState::Data;
                        return self.emit_tag();
                    }
                    Some(ch) => {
                        self.current_tag_name.push(ch.to_ascii_lowercase() as char);
                    }
                    None => return Token::Eof,
                },

                TokenizerState::BeforeAttrName => {
                    match self.consume_next() {
                        Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {}
                        Some(b'/') => {
                            self.state = TokenizerState::SelfClosingStartTag;
                        }
                        Some(b'>') => {
                            self.state = TokenizerState::Data;
                            return self.emit_tag();
                        }
                        Some(ch) => {
                            self.current_attr_name.clear();
                            self.current_attr_value.clear();
                            self.current_attr_name.push(ch.to_ascii_lowercase() as char);
                            self.state = TokenizerState::AttrName;
                        }
                        None => return Token::Eof,
                    }
                }

                TokenizerState::AttrName => match self.consume_next() {
                    Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
                        self.state = TokenizerState::AfterAttrName;
                    }
                    Some(b'/') => {
                        self.finish_attr();
                        self.state = TokenizerState::SelfClosingStartTag;
                    }
                    Some(b'=') => {
                        self.state = TokenizerState::BeforeAttrValue;
                    }
                    Some(b'>') => {
                        self.finish_attr();
                        self.state = TokenizerState::Data;
                        return self.emit_tag();
                    }
                    Some(ch) => {
                        self.current_attr_name.push(ch.to_ascii_lowercase() as char);
                    }
                    None => return Token::Eof,
                },

                TokenizerState::AfterAttrName => {
                    match self.consume_next() {
                        Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {}
                        Some(b'/') => {
                            self.finish_attr();
                            self.state = TokenizerState::SelfClosingStartTag;
                        }
                        Some(b'=') => {
                            self.state = TokenizerState::BeforeAttrValue;
                        }
                        Some(b'>') => {
                            self.finish_attr();
                            self.state = TokenizerState::Data;
                            return self.emit_tag();
                        }
                        Some(ch) => {
                            self.finish_attr();
                            self.current_attr_name.clear();
                            self.current_attr_value.clear();
                            self.current_attr_name.push(ch.to_ascii_lowercase() as char);
                            self.state = TokenizerState::AttrName;
                        }
                        None => return Token::Eof,
                    }
                }

                TokenizerState::BeforeAttrValue => {
                    match self.consume_next() {
                        Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {}
                        Some(b'"') => {
                            self.state = TokenizerState::AttrValueDoubleQuoted;
                        }
                        Some(b'\'') => {
                            self.state = TokenizerState::AttrValueSingleQuoted;
                        }
                        Some(b'>') => {
                            self.finish_attr();
                            self.state = TokenizerState::Data;
                            return self.emit_tag();
                        }
                        Some(ch) => {
                            self.current_attr_value.push(ch as char);
                            self.state = TokenizerState::AttrValueUnquoted;
                        }
                        None => return Token::Eof,
                    }
                }

                TokenizerState::AttrValueDoubleQuoted => match self.consume_next() {
                    Some(b'"') => {
                        self.finish_attr();
                        self.state = TokenizerState::AfterAttrValueQuoted;
                    }
                    Some(b'&') => {
                        if let Some(ch) = self.try_decode_entity() {
                            self.current_attr_value.push(ch);
                        } else {
                            self.current_attr_value.push('&');
                        }
                    }
                    Some(ch) => {
                        self.current_attr_value.push(ch as char);
                    }
                    None => return Token::Eof,
                },

                TokenizerState::AttrValueSingleQuoted => match self.consume_next() {
                    Some(b'\'') => {
                        self.finish_attr();
                        self.state = TokenizerState::AfterAttrValueQuoted;
                    }
                    Some(b'&') => {
                        if let Some(ch) = self.try_decode_entity() {
                            self.current_attr_value.push(ch);
                        } else {
                            self.current_attr_value.push('&');
                        }
                    }
                    Some(ch) => {
                        self.current_attr_value.push(ch as char);
                    }
                    None => return Token::Eof,
                },

                TokenizerState::AttrValueUnquoted => match self.consume_next() {
                    Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
                        self.finish_attr();
                        self.state = TokenizerState::BeforeAttrName;
                    }
                    Some(b'&') => {
                        if let Some(ch) = self.try_decode_entity() {
                            self.current_attr_value.push(ch);
                        } else {
                            self.current_attr_value.push('&');
                        }
                    }
                    Some(b'>') => {
                        self.finish_attr();
                        self.state = TokenizerState::Data;
                        return self.emit_tag();
                    }
                    Some(ch) => {
                        self.current_attr_value.push(ch as char);
                    }
                    None => return Token::Eof,
                },

                TokenizerState::AfterAttrValueQuoted => match self.consume_next() {
                    Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
                        self.state = TokenizerState::BeforeAttrName;
                    }
                    Some(b'/') => {
                        self.state = TokenizerState::SelfClosingStartTag;
                    }
                    Some(b'>') => {
                        self.state = TokenizerState::Data;
                        return self.emit_tag();
                    }
                    // EOF ends tokenization; any other byte is reprocessed as the
                    // start of a new attribute name.
                    None => return Token::Eof,
                    Some(_) => {
                        self.reconsume = true;
                        self.state = TokenizerState::BeforeAttrName;
                    }
                },

                TokenizerState::SelfClosingStartTag => match self.consume_next() {
                    Some(b'>') => {
                        self.self_closing = true;
                        self.state = TokenizerState::Data;
                        return self.emit_tag();
                    }
                    // EOF ends tokenization here; reconsuming would spin forever
                    // because there is no fresh byte to replay.
                    None => return Token::Eof,
                    Some(_) => {
                        self.reconsume = true;
                        self.state = TokenizerState::BeforeAttrName;
                    }
                },

                TokenizerState::BogusComment => match self.consume_next() {
                    Some(b'>') => {
                        let data = core::mem::take(&mut self.comment_data);
                        self.state = TokenizerState::Data;
                        return Token::Comment(data);
                    }
                    Some(ch) => {
                        self.comment_data.push(ch as char);
                    }
                    None => {
                        let data = core::mem::take(&mut self.comment_data);
                        self.state = TokenizerState::Data;
                        return Token::Comment(data);
                    }
                },

                TokenizerState::MarkupDeclarationOpen => {
                    if self.starts_with_ci("--") {
                        self.pos += 2;
                        self.comment_data.clear();
                        self.state = TokenizerState::CommentStart;
                    } else if self.starts_with_ci("doctype") {
                        // Simplified DOCTYPE handling: skip whitespace, then take
                        // everything up to `>` as the (lower-cased) doctype name.
                        self.pos += 7;
                        let mut name = String::new();
                        while let Some(&ch) = self.input.get(self.pos) {
                            if ch == b' ' || ch == b'\t' || ch == b'\n' {
                                self.pos += 1;
                            } else {
                                break;
                            }
                        }
                        while let Some(&ch) = self.input.get(self.pos) {
                            if ch == b'>' {
                                self.pos += 1;
                                break;
                            }
                            name.push(ch.to_ascii_lowercase() as char);
                            self.pos += 1;
                        }
                        self.state = TokenizerState::Data;
                        return Token::Doctype(name.trim().to_string());
                    } else {
                        self.comment_data.clear();
                        self.state = TokenizerState::BogusComment;
                    }
                }

                TokenizerState::CommentStart => match self.consume_next() {
                    Some(b'-') => {
                        self.state = TokenizerState::CommentStartDash;
                    }
                    Some(b'>') => {
                        let data = core::mem::take(&mut self.comment_data);
                        self.state = TokenizerState::Data;
                        return Token::Comment(data);
                    }
                    Some(ch) => {
                        self.comment_data.push(ch as char);
                        self.state = TokenizerState::InComment;
                    }
                    None => {
                        let data = core::mem::take(&mut self.comment_data);
                        return Token::Comment(data);
                    }
                },

                TokenizerState::CommentStartDash => match self.consume_next() {
                    Some(b'-') => {
                        self.state = TokenizerState::CommentEnd;
                    }
                    Some(b'>') => {
                        let data = core::mem::take(&mut self.comment_data);
                        self.state = TokenizerState::Data;
                        return Token::Comment(data);
                    }
                    Some(ch) => {
                        self.comment_data.push('-');
                        self.comment_data.push(ch as char);
                        self.state = TokenizerState::InComment;
                    }
                    None => {
                        let data = core::mem::take(&mut self.comment_data);
                        return Token::Comment(data);
                    }
                },

                TokenizerState::InComment => match self.consume_next() {
                    Some(b'-') => {
                        self.state = TokenizerState::CommentEndDash;
                    }
                    Some(ch) => {
                        self.comment_data.push(ch as char);
                    }
                    None => {
                        let data = core::mem::take(&mut self.comment_data);
                        return Token::Comment(data);
                    }
                },

                TokenizerState::CommentEndDash => match self.consume_next() {
                    Some(b'-') => {
                        self.state = TokenizerState::CommentEnd;
                    }
                    Some(ch) => {
                        self.comment_data.push('-');
                        self.comment_data.push(ch as char);
                        self.state = TokenizerState::InComment;
                    }
                    None => {
                        let data = core::mem::take(&mut self.comment_data);
                        return Token::Comment(data);
                    }
                },

                TokenizerState::CommentEnd => match self.consume_next() {
                    Some(b'>') => {
                        let data = core::mem::take(&mut self.comment_data);
                        self.state = TokenizerState::Data;
                        return Token::Comment(data);
                    }
                    Some(b'-') => {
                        self.comment_data.push('-');
                    }
                    Some(ch) => {
                        self.comment_data.push('-');
                        self.comment_data.push('-');
                        self.comment_data.push(ch as char);
                        self.state = TokenizerState::InComment;
                    }
                    None => {
                        let data = core::mem::take(&mut self.comment_data);
                        return Token::Comment(data);
                    }
                },
            }
        }
    }
}

#[cfg(test)]
mod tests {
    #[allow(unused_imports)]
    use alloc::vec;

    use super::*;

    #[test]
    fn test_empty_input() {
        let mut t = HtmlTokenizer::from_text("");
        assert_eq!(t.next_token(), Token::Eof);
    }

    #[test]
    fn test_plain_text() {
        let mut t = HtmlTokenizer::from_text("hello");
        assert_eq!(t.next_token(), Token::Character('h'));
        assert_eq!(t.next_token(), Token::Character('e'));
        assert_eq!(t.next_token(), Token::Character('l'));
        assert_eq!(t.next_token(), Token::Character('l'));
        assert_eq!(t.next_token(), Token::Character('o'));
        assert_eq!(t.next_token(), Token::Eof);
    }

    #[test]
    fn test_simple_tag() {
        let mut t = HtmlTokenizer::from_text("<p>");
        let token = t.next_token();
        assert_eq!(token, Token::StartTag("p".into(), vec![], false));
    }

    #[test]
    fn test_end_tag() {
        let mut t = HtmlTokenizer::from_text("</p>");
        let token = t.next_token();
        assert_eq!(token, Token::EndTag("p".into()));
    }

    #[test]
    fn test_self_closing_tag() {
        let mut t = HtmlTokenizer::from_text("<br/>");
        let token = t.next_token();
        assert_eq!(token, Token::StartTag("br".into(), vec![], true));
    }

    #[test]
    fn test_void_element_auto_self_closing() {
        let mut t = HtmlTokenizer::from_text("<br>");
        let token = t.next_token();
        assert_eq!(token, Token::StartTag("br".into(), vec![], true));
    }

    #[test]
    fn test_tag_with_attribute() {
        let mut t = HtmlTokenizer::from_text("<div class=\"main\">");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "div".into(),
                vec![Attribute {
                    name: "class".into(),
                    value: "main".into()
                }],
                false
            )
        );
    }

    #[test]
    fn test_tag_with_multiple_attrs() {
        let mut t = HtmlTokenizer::from_text("<a href=\"/\" id=\"home\">");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "a".into(),
                vec![
                    Attribute {
                        name: "href".into(),
                        value: "/".into()
                    },
                    Attribute {
                        name: "id".into(),
                        value: "home".into()
                    },
                ],
                false
            )
        );
    }

    #[test]
    fn test_single_quoted_attr() {
        let mut t = HtmlTokenizer::from_text("<div class='main'>");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "div".into(),
                vec![Attribute {
                    name: "class".into(),
                    value: "main".into()
                }],
                false
            )
        );
    }

    #[test]
    fn test_unquoted_attr() {
        let mut t = HtmlTokenizer::from_text("<div class=main>");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "div".into(),
                vec![Attribute {
                    name: "class".into(),
                    value: "main".into()
                }],
                false
            )
        );
    }

    #[test]
    fn test_entity_amp() {
        let mut t = HtmlTokenizer::from_text("&amp;");
        assert_eq!(t.next_token(), Token::Character('&'));
    }

    #[test]
    fn test_entity_lt() {
        let mut t = HtmlTokenizer::from_text("&lt;");
        assert_eq!(t.next_token(), Token::Character('<'));
    }

    #[test]
    fn test_entity_gt() {
        let mut t = HtmlTokenizer::from_text("&gt;");
        assert_eq!(t.next_token(), Token::Character('>'));
    }

    #[test]
    fn test_entity_quot() {
        let mut t = HtmlTokenizer::from_text("&quot;");
        assert_eq!(t.next_token(), Token::Character('"'));
    }

    #[test]
    fn test_entity_numeric() {
        let mut t = HtmlTokenizer::from_text("&#65;");
        assert_eq!(t.next_token(), Token::Character('A'));
    }

    #[test]
    fn test_entity_hex() {
        let mut t = HtmlTokenizer::from_text("&#x41;");
        assert_eq!(t.next_token(), Token::Character('A'));
    }

    #[test]
    fn test_entity_in_attr() {
        let mut t = HtmlTokenizer::from_text("<a href=\"a&amp;b\">");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "a".into(),
                vec![Attribute {
                    name: "href".into(),
                    value: "a&b".into()
                }],
                false
            )
        );
    }

    #[test]
    fn test_comment() {
        let mut t = HtmlTokenizer::from_text("<!-- hello -->");
        let token = t.next_token();
        assert_eq!(token, Token::Comment(" hello ".into()));
    }

    #[test]
    fn test_empty_comment() {
        let mut t = HtmlTokenizer::from_text("<!---->");
        let token = t.next_token();
        assert_eq!(token, Token::Comment(String::new()));
    }

    #[test]
    fn test_doctype() {
        let mut t = HtmlTokenizer::from_text("<!DOCTYPE html>");
        let token = t.next_token();
        assert_eq!(token, Token::Doctype("html".into()));
    }

    #[test]
    fn test_case_insensitive_tags() {
        let mut t = HtmlTokenizer::from_text("<DIV>");
        let token = t.next_token();
        assert_eq!(token, Token::StartTag("div".into(), vec![], false));
    }

    #[test]
    fn test_img_void() {
        let mut t = HtmlTokenizer::from_text("<img src=\"a.png\">");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "img".into(),
                vec![Attribute {
                    name: "src".into(),
                    value: "a.png".into()
                }],
                true
            )
        );
    }

    #[test]
    fn test_input_void() {
        let mut t = HtmlTokenizer::from_text("<input type=\"text\">");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "input".into(),
                vec![Attribute {
                    name: "type".into(),
                    value: "text".into()
                }],
                true
            )
        );
    }

    #[test]
    fn test_hr_void() {
        let mut t = HtmlTokenizer::from_text("<hr>");
        let token = t.next_token();
        assert_eq!(token, Token::StartTag("hr".into(), vec![], true));
    }

    #[test]
    fn test_meta_void() {
        let mut t = HtmlTokenizer::from_text("<meta charset=\"utf-8\">");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "meta".into(),
                vec![Attribute {
                    name: "charset".into(),
                    value: "utf-8".into()
                }],
                true
            )
        );
    }

    #[test]
    fn test_link_void() {
        let mut t = HtmlTokenizer::from_text("<link rel=\"stylesheet\">");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "link".into(),
                vec![Attribute {
                    name: "rel".into(),
                    value: "stylesheet".into()
                }],
                true
            )
        );
    }

    #[test]
    fn test_boolean_attribute() {
        let mut t = HtmlTokenizer::from_text("<input disabled>");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "input".into(),
                vec![Attribute {
                    name: "disabled".into(),
                    value: String::new(),
                }],
                true
            )
        );
    }

    #[test]
    fn test_full_document() {
        let html =
            "<!DOCTYPE html><html><head><title>Hi</title></head><body><p>Hello</p></body></html>";
        let mut t = HtmlTokenizer::from_text(html);
        let tokens = t.tokenize_all();
        assert_eq!(tokens[0], Token::Doctype("html".into()));
        assert_eq!(tokens[1], Token::StartTag("html".into(), vec![], false));
        assert_eq!(*tokens.last().unwrap(), Token::Eof);
    }

    #[test]
    fn test_tokenize_all() {
        let mut t = HtmlTokenizer::from_text("<b>hi</b>");
        let tokens = t.tokenize_all();
        assert_eq!(tokens.len(), 5);
    }

    #[test]
    fn test_text_between_tags() {
        let mut t = HtmlTokenizer::from_text("<p>ab</p>");
        assert_eq!(t.next_token(), Token::StartTag("p".into(), vec![], false));
        assert_eq!(t.next_token(), Token::Character('a'));
        assert_eq!(t.next_token(), Token::Character('b'));
        assert_eq!(t.next_token(), Token::EndTag("p".into()));
        assert_eq!(t.next_token(), Token::Eof);
    }

    #[test]
    fn test_nested_tags() {
        let mut t = HtmlTokenizer::from_text("<div><span></span></div>");
        let tokens = t.tokenize_all();
        assert_eq!(tokens[0], Token::StartTag("div".into(), vec![], false));
        assert_eq!(tokens[1], Token::StartTag("span".into(), vec![], false));
        assert_eq!(tokens[2], Token::EndTag("span".into()));
        assert_eq!(tokens[3], Token::EndTag("div".into()));
        assert_eq!(tokens[4], Token::Eof);
    }

    #[test]
    fn test_attribute_no_value_before_another() {
        let mut t = HtmlTokenizer::from_text("<input disabled type=\"text\">");
        let token = t.next_token();
        assert_eq!(
            token,
            Token::StartTag(
                "input".into(),
                vec![
                    Attribute {
                        name: "disabled".into(),
                        value: String::new()
                    },
                    Attribute {
                        name: "type".into(),
                        value: "text".into()
                    },
                ],
                true
            )
        );
    }

    #[test]
    fn test_entity_nbsp() {
        let mut t = HtmlTokenizer::from_text("&nbsp;");
        assert_eq!(t.next_token(), Token::Character('\u{00A0}'));
    }

    #[test]
    fn test_entity_apos() {
        let mut t = HtmlTokenizer::from_text("&apos;");
        assert_eq!(t.next_token(), Token::Character('\''));
    }

    #[test]
    fn test_processing_instruction() {
        let mut t = HtmlTokenizer::from_text("<?xml version=\"1.0\"?>");
        let token = t.next_token();
        assert!(matches!(token, Token::Comment(_)), "Expected Comment for PI");
    }

    #[test]
    fn test_mixed_content() {
        let mut t = HtmlTokenizer::from_text("a<b>c</b>d");
        assert_eq!(t.next_token(), Token::Character('a'));
        assert_eq!(t.next_token(), Token::StartTag("b".into(), vec![], false));
        assert_eq!(t.next_token(), Token::Character('c'));
        assert_eq!(t.next_token(), Token::EndTag("b".into()));
        assert_eq!(t.next_token(), Token::Character('d'));
        assert_eq!(t.next_token(), Token::Eof);
    }

    #[test]
    fn test_whitespace_in_tags() {
        let mut t = HtmlTokenizer::from_text("< p >");
        assert_eq!(t.next_token(), Token::Character('<'));
    }

    #[test]
    fn test_comment_with_dashes() {
        let mut t = HtmlTokenizer::from_text("<!-- a-b -->");
        let token = t.next_token();
        assert_eq!(token, Token::Comment(" a-b ".into()));
    }

    #[test]
    fn test_self_closing_nonvoid() {
        let mut t = HtmlTokenizer::from_text("<div/>");
        let token = t.next_token();
        assert_eq!(token, Token::StartTag("div".into(), vec![], true));
    }
}