⚠️ VeridianOS Kernel Documentation - This is low-level kernel code built for a no_std environment. The functions in this module are safe Rust; any `unsafe` code would be explicitly marked as such.

veridian_kernel/browser/
html_tokenizer.rs

1//! HTML Tokenizer
2//!
3//! Implements a state-machine HTML tokenizer that converts raw HTML bytes
4//! into a stream of tokens (start tags, end tags, text, comments, doctype).
5//! Handles entity references (&, <, >, ") and void elements.
6
7#![allow(dead_code)]
8
9use alloc::{
10    string::{String, ToString},
11    vec::Vec,
12};
13
/// A single HTML attribute as a `name="value"` pair.
///
/// Attribute names are lower-cased by the tokenizer as they are read.
/// An attribute written without a value (e.g. `disabled`) is stored with
/// an empty `value` string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Attribute {
    pub name: String,
    pub value: String,
}
20
/// Tokens produced by the HTML tokenizer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// `<!DOCTYPE ...>` — carries the lower-cased, whitespace-trimmed
    /// doctype content.
    Doctype(String),
    /// `<tag attr="val">` — lower-cased tag name, collected attributes, and
    /// a self-closing flag. The flag is `true` both for explicit `<tag/>`
    /// syntax and for void elements such as `<br>` (see `VOID_ELEMENTS`).
    StartTag(String, Vec<Attribute>, bool),
    /// `</tag>` — lower-cased closing tag name.
    EndTag(String),
    /// A single character of text content (entity references already
    /// decoded to their character value).
    Character(char),
    /// `<!-- comment -->` — the text between the comment delimiters.
    Comment(String),
    /// End of input.
    Eof,
}
37
/// States for the tokenizer state machine.
///
/// The states mirror (a simplified subset of) the WHATWG HTML tokenization
/// states; each variant names the position within the markup currently
/// being parsed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenizerState {
    /// Default state: emitting text characters, watching for `<` and `&`.
    Data,
    /// Just consumed `<`; deciding between tag, end tag, comment/doctype.
    TagOpen,
    /// Just consumed `</`.
    EndTagOpen,
    /// Accumulating a tag name.
    TagName,
    /// Whitespace region before an attribute name.
    BeforeAttrName,
    /// Accumulating an attribute name.
    AttrName,
    /// After an attribute name, before `=`, `>`, or the next attribute.
    AfterAttrName,
    /// Just consumed `=`; deciding how the value is quoted.
    BeforeAttrValue,
    /// Inside a `"`-quoted attribute value.
    AttrValueDoubleQuoted,
    /// Inside a `'`-quoted attribute value.
    AttrValueSingleQuoted,
    /// Inside an unquoted attribute value.
    AttrValueUnquoted,
    /// Just closed a quoted attribute value.
    AfterAttrValueQuoted,
    /// Just consumed `/` inside a tag (expecting `>`).
    SelfClosingStartTag,
    /// Swallowing malformed markup (`<?...>`, `</junk`) as a comment.
    BogusComment,
    /// Just consumed `<!`; deciding between comment and doctype.
    MarkupDeclarationOpen,
    /// Just consumed `<!--`.
    CommentStart,
    /// Consumed `<!---`.
    CommentStartDash,
    /// Inside comment text.
    InComment,
    /// Saw one `-` inside a comment.
    CommentEndDash,
    /// Saw `--` inside a comment (expecting `>`).
    CommentEnd,
}
62
/// HTML void elements: tags that never contain content and never have a
/// closing tag. `emit_tag` reports their start tags as self-closing even
/// when written without a trailing `/`.
const VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
    "track", "wbr",
];
68
/// HTML tokenizer that converts input bytes into [`Token`]s.
///
/// Operates byte-at-a-time: each byte is cast directly to `char`, so
/// multi-byte UTF-8 sequences are emitted as one `Character` token per
/// byte rather than being decoded as UTF-8.
pub struct HtmlTokenizer {
    /// The raw input, owned by the tokenizer.
    input: Vec<u8>,
    /// Index of the next byte to consume.
    pos: usize,
    /// Current state-machine state.
    state: TokenizerState,
    /// Name of the tag currently being built (lower-cased).
    current_tag_name: String,
    /// Attributes collected so far for the current tag.
    current_tag_attrs: Vec<Attribute>,
    /// Name of the attribute currently being built (lower-cased).
    current_attr_name: String,
    /// Value of the attribute currently being built.
    current_attr_value: String,
    /// Whether the current tag ended with `/>`.
    self_closing: bool,
    /// Whether the current tag is an end tag (`</...>`).
    is_end_tag: bool,
    /// Text accumulated for the comment currently being built.
    comment_data: String,
    /// When set, `consume_next` re-delivers `current_char` once instead of
    /// advancing `pos`.
    reconsume: bool,
    /// The most recently consumed byte (re-delivered on reconsumption).
    current_char: u8,
}
84
85impl HtmlTokenizer {
86    /// Create a new tokenizer from input bytes
87    pub fn new(input: &[u8]) -> Self {
88        Self {
89            input: input.to_vec(),
90            pos: 0,
91            state: TokenizerState::Data,
92            current_tag_name: String::new(),
93            current_tag_attrs: Vec::new(),
94            current_attr_name: String::new(),
95            current_attr_value: String::new(),
96            self_closing: false,
97            is_end_tag: false,
98            comment_data: String::new(),
99            reconsume: false,
100            current_char: 0,
101        }
102    }
103
104    /// Create a tokenizer from a string slice
105    pub fn from_text(input: &str) -> Self {
106        Self::new(input.as_bytes())
107    }
108
109    /// Tokenize the entire input into a Vec of tokens
110    pub fn tokenize_all(&mut self) -> Vec<Token> {
111        let mut tokens = Vec::new();
112        loop {
113            let token = self.next_token();
114            if token == Token::Eof {
115                tokens.push(Token::Eof);
116                break;
117            }
118            tokens.push(token);
119        }
120        tokens
121    }
122
123    /// Consume the next character from input
124    fn consume_next(&mut self) -> Option<u8> {
125        if self.reconsume {
126            self.reconsume = false;
127            return Some(self.current_char);
128        }
129        if self.pos < self.input.len() {
130            let ch = self.input[self.pos];
131            self.pos += 1;
132            self.current_char = ch;
133            Some(ch)
134        } else {
135            None
136        }
137    }
138
139    /// Peek at the next character without consuming
140    fn peek(&self) -> Option<u8> {
141        if self.pos < self.input.len() {
142            Some(self.input[self.pos])
143        } else {
144            None
145        }
146    }
147
148    /// Check if the next bytes match a string (case-insensitive)
149    fn starts_with_ci(&self, s: &str) -> bool {
150        let bytes = s.as_bytes();
151        if self.pos + bytes.len() > self.input.len() {
152            return false;
153        }
154        for (i, &b) in bytes.iter().enumerate() {
155            let input_byte = self.input[self.pos + i];
156            if !input_byte.eq_ignore_ascii_case(&b) {
157                return false;
158            }
159        }
160        true
161    }
162
163    /// Emit the current tag as a token
164    fn emit_tag(&mut self) -> Token {
165        let name = core::mem::take(&mut self.current_tag_name);
166        let attrs = core::mem::take(&mut self.current_tag_attrs);
167        let sc = self.self_closing;
168        self.self_closing = false;
169
170        if self.is_end_tag {
171            self.is_end_tag = false;
172            Token::EndTag(name)
173        } else {
174            // Check if it's a void element
175            let is_void = VOID_ELEMENTS.contains(&name.as_str());
176            Token::StartTag(name, attrs, sc || is_void)
177        }
178    }
179
180    /// Finalize the current attribute and add it to the list
181    fn finish_attr(&mut self) {
182        if !self.current_attr_name.is_empty() {
183            let attr = Attribute {
184                name: core::mem::take(&mut self.current_attr_name),
185                value: core::mem::take(&mut self.current_attr_value),
186            };
187            self.current_tag_attrs.push(attr);
188        } else {
189            self.current_attr_name.clear();
190            self.current_attr_value.clear();
191        }
192    }
193
194    /// Try to decode an entity reference starting at current position.
195    /// Returns the decoded character or None.
196    fn try_decode_entity(&mut self) -> Option<char> {
197        // We've already consumed '&'
198        let start = self.pos;
199        let mut entity = String::new();
200
201        // Read up to 10 chars until ';' or non-alpha
202        for _ in 0..10 {
203            match self.peek() {
204                Some(b';') => {
205                    self.pos += 1; // consume ';'
206                    break;
207                }
208                Some(ch) if ch.is_ascii_alphanumeric() || ch == b'#' => {
209                    self.pos += 1;
210                    entity.push(ch as char);
211                }
212                _ => break,
213            }
214        }
215
216        match entity.as_str() {
217            "amp" => Some('&'),
218            "lt" => Some('<'),
219            "gt" => Some('>'),
220            "quot" => Some('"'),
221            "apos" => Some('\''),
222            "nbsp" => Some('\u{00A0}'),
223            s if s.starts_with('#') => {
224                let num_str = &s[1..];
225                let code = if let Some(hex) = num_str.strip_prefix('x') {
226                    u32::from_str_radix(hex, 16).ok()
227                } else {
228                    num_str.parse::<u32>().ok()
229                };
230                code.and_then(char::from_u32)
231            }
232            _ => {
233                // Unknown entity, rewind and emit '&' literally
234                self.pos = start;
235                Some('&')
236            }
237        }
238    }
239
240    /// Get the next token from the input
241    pub fn next_token(&mut self) -> Token {
242        loop {
243            match self.state {
244                TokenizerState::Data => match self.consume_next() {
245                    Some(b'<') => {
246                        self.state = TokenizerState::TagOpen;
247                    }
248                    Some(b'&') => {
249                        if let Some(ch) = self.try_decode_entity() {
250                            return Token::Character(ch);
251                        }
252                        return Token::Character('&');
253                    }
254                    Some(ch) => {
255                        return Token::Character(ch as char);
256                    }
257                    None => return Token::Eof,
258                },
259
260                TokenizerState::TagOpen => {
261                    match self.consume_next() {
262                        Some(b'!') => {
263                            self.state = TokenizerState::MarkupDeclarationOpen;
264                        }
265                        Some(b'/') => {
266                            self.state = TokenizerState::EndTagOpen;
267                        }
268                        Some(ch) if ch.is_ascii_alphabetic() => {
269                            self.current_tag_name.clear();
270                            self.current_tag_attrs.clear();
271                            self.self_closing = false;
272                            self.is_end_tag = false;
273                            self.current_tag_name.push(ch.to_ascii_lowercase() as char);
274                            self.state = TokenizerState::TagName;
275                        }
276                        Some(b'?') => {
277                            // Processing instruction, treat as bogus comment
278                            self.comment_data.clear();
279                            self.state = TokenizerState::BogusComment;
280                        }
281                        _ => {
282                            // Not a tag, emit '<' as character
283                            self.reconsume = true;
284                            self.state = TokenizerState::Data;
285                            return Token::Character('<');
286                        }
287                    }
288                }
289
290                TokenizerState::EndTagOpen => {
291                    match self.consume_next() {
292                        Some(ch) if ch.is_ascii_alphabetic() => {
293                            self.current_tag_name.clear();
294                            self.current_tag_attrs.clear();
295                            self.self_closing = false;
296                            self.is_end_tag = true;
297                            self.current_tag_name.push(ch.to_ascii_lowercase() as char);
298                            self.state = TokenizerState::TagName;
299                        }
300                        Some(b'>') => {
301                            // </> is invalid, ignore
302                            self.state = TokenizerState::Data;
303                        }
304                        _ => {
305                            self.comment_data.clear();
306                            self.reconsume = true;
307                            self.state = TokenizerState::BogusComment;
308                        }
309                    }
310                }
311
312                TokenizerState::TagName => match self.consume_next() {
313                    Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
314                        self.state = TokenizerState::BeforeAttrName;
315                    }
316                    Some(b'/') => {
317                        self.state = TokenizerState::SelfClosingStartTag;
318                    }
319                    Some(b'>') => {
320                        self.state = TokenizerState::Data;
321                        return self.emit_tag();
322                    }
323                    Some(ch) => {
324                        self.current_tag_name.push(ch.to_ascii_lowercase() as char);
325                    }
326                    None => return Token::Eof,
327                },
328
329                TokenizerState::BeforeAttrName => {
330                    match self.consume_next() {
331                        Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
332                            // skip whitespace
333                        }
334                        Some(b'/') => {
335                            self.state = TokenizerState::SelfClosingStartTag;
336                        }
337                        Some(b'>') => {
338                            self.state = TokenizerState::Data;
339                            return self.emit_tag();
340                        }
341                        Some(ch) => {
342                            self.current_attr_name.clear();
343                            self.current_attr_value.clear();
344                            self.current_attr_name.push(ch.to_ascii_lowercase() as char);
345                            self.state = TokenizerState::AttrName;
346                        }
347                        None => return Token::Eof,
348                    }
349                }
350
351                TokenizerState::AttrName => match self.consume_next() {
352                    Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
353                        self.state = TokenizerState::AfterAttrName;
354                    }
355                    Some(b'/') => {
356                        self.finish_attr();
357                        self.state = TokenizerState::SelfClosingStartTag;
358                    }
359                    Some(b'=') => {
360                        self.state = TokenizerState::BeforeAttrValue;
361                    }
362                    Some(b'>') => {
363                        self.finish_attr();
364                        self.state = TokenizerState::Data;
365                        return self.emit_tag();
366                    }
367                    Some(ch) => {
368                        self.current_attr_name.push(ch.to_ascii_lowercase() as char);
369                    }
370                    None => return Token::Eof,
371                },
372
373                TokenizerState::AfterAttrName => {
374                    match self.consume_next() {
375                        Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
376                            // skip whitespace
377                        }
378                        Some(b'/') => {
379                            self.finish_attr();
380                            self.state = TokenizerState::SelfClosingStartTag;
381                        }
382                        Some(b'=') => {
383                            self.state = TokenizerState::BeforeAttrValue;
384                        }
385                        Some(b'>') => {
386                            self.finish_attr();
387                            self.state = TokenizerState::Data;
388                            return self.emit_tag();
389                        }
390                        Some(ch) => {
391                            // New attribute without value
392                            self.finish_attr();
393                            self.current_attr_name.clear();
394                            self.current_attr_value.clear();
395                            self.current_attr_name.push(ch.to_ascii_lowercase() as char);
396                            self.state = TokenizerState::AttrName;
397                        }
398                        None => return Token::Eof,
399                    }
400                }
401
402                TokenizerState::BeforeAttrValue => {
403                    match self.consume_next() {
404                        Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
405                            // skip whitespace
406                        }
407                        Some(b'"') => {
408                            self.state = TokenizerState::AttrValueDoubleQuoted;
409                        }
410                        Some(b'\'') => {
411                            self.state = TokenizerState::AttrValueSingleQuoted;
412                        }
413                        Some(b'>') => {
414                            self.finish_attr();
415                            self.state = TokenizerState::Data;
416                            return self.emit_tag();
417                        }
418                        Some(ch) => {
419                            self.current_attr_value.push(ch as char);
420                            self.state = TokenizerState::AttrValueUnquoted;
421                        }
422                        None => return Token::Eof,
423                    }
424                }
425
426                TokenizerState::AttrValueDoubleQuoted => match self.consume_next() {
427                    Some(b'"') => {
428                        self.finish_attr();
429                        self.state = TokenizerState::AfterAttrValueQuoted;
430                    }
431                    Some(b'&') => {
432                        if let Some(ch) = self.try_decode_entity() {
433                            self.current_attr_value.push(ch);
434                        } else {
435                            self.current_attr_value.push('&');
436                        }
437                    }
438                    Some(ch) => {
439                        self.current_attr_value.push(ch as char);
440                    }
441                    None => return Token::Eof,
442                },
443
444                TokenizerState::AttrValueSingleQuoted => match self.consume_next() {
445                    Some(b'\'') => {
446                        self.finish_attr();
447                        self.state = TokenizerState::AfterAttrValueQuoted;
448                    }
449                    Some(b'&') => {
450                        if let Some(ch) = self.try_decode_entity() {
451                            self.current_attr_value.push(ch);
452                        } else {
453                            self.current_attr_value.push('&');
454                        }
455                    }
456                    Some(ch) => {
457                        self.current_attr_value.push(ch as char);
458                    }
459                    None => return Token::Eof,
460                },
461
462                TokenizerState::AttrValueUnquoted => match self.consume_next() {
463                    Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
464                        self.finish_attr();
465                        self.state = TokenizerState::BeforeAttrName;
466                    }
467                    Some(b'&') => {
468                        if let Some(ch) = self.try_decode_entity() {
469                            self.current_attr_value.push(ch);
470                        } else {
471                            self.current_attr_value.push('&');
472                        }
473                    }
474                    Some(b'>') => {
475                        self.finish_attr();
476                        self.state = TokenizerState::Data;
477                        return self.emit_tag();
478                    }
479                    Some(ch) => {
480                        self.current_attr_value.push(ch as char);
481                    }
482                    None => return Token::Eof,
483                },
484
485                TokenizerState::AfterAttrValueQuoted => match self.consume_next() {
486                    Some(b'\t') | Some(b'\n') | Some(b'\x0C') | Some(b' ') => {
487                        self.state = TokenizerState::BeforeAttrName;
488                    }
489                    Some(b'/') => {
490                        self.state = TokenizerState::SelfClosingStartTag;
491                    }
492                    Some(b'>') => {
493                        self.state = TokenizerState::Data;
494                        return self.emit_tag();
495                    }
496                    _ => {
497                        self.reconsume = true;
498                        self.state = TokenizerState::BeforeAttrName;
499                    }
500                },
501
502                TokenizerState::SelfClosingStartTag => match self.consume_next() {
503                    Some(b'>') => {
504                        self.self_closing = true;
505                        self.state = TokenizerState::Data;
506                        return self.emit_tag();
507                    }
508                    _ => {
509                        self.reconsume = true;
510                        self.state = TokenizerState::BeforeAttrName;
511                    }
512                },
513
514                TokenizerState::BogusComment => match self.consume_next() {
515                    Some(b'>') => {
516                        let data = core::mem::take(&mut self.comment_data);
517                        self.state = TokenizerState::Data;
518                        return Token::Comment(data);
519                    }
520                    Some(ch) => {
521                        self.comment_data.push(ch as char);
522                    }
523                    None => {
524                        let data = core::mem::take(&mut self.comment_data);
525                        self.state = TokenizerState::Data;
526                        return Token::Comment(data);
527                    }
528                },
529
530                TokenizerState::MarkupDeclarationOpen => {
531                    if self.starts_with_ci("--") {
532                        self.pos += 2;
533                        self.comment_data.clear();
534                        self.state = TokenizerState::CommentStart;
535                    } else if self.starts_with_ci("doctype") {
536                        self.pos += 7;
537                        // Read doctype name
538                        let mut name = String::new();
539                        // Skip whitespace
540                        while let Some(&ch) = self.input.get(self.pos) {
541                            if ch == b' ' || ch == b'\t' || ch == b'\n' {
542                                self.pos += 1;
543                            } else {
544                                break;
545                            }
546                        }
547                        // Read name until '>'
548                        while let Some(&ch) = self.input.get(self.pos) {
549                            if ch == b'>' {
550                                self.pos += 1;
551                                break;
552                            }
553                            name.push(ch.to_ascii_lowercase() as char);
554                            self.pos += 1;
555                        }
556                        self.state = TokenizerState::Data;
557                        return Token::Doctype(name.trim().to_string());
558                    } else {
559                        // Bogus comment
560                        self.comment_data.clear();
561                        self.state = TokenizerState::BogusComment;
562                    }
563                }
564
565                TokenizerState::CommentStart => match self.consume_next() {
566                    Some(b'-') => {
567                        self.state = TokenizerState::CommentStartDash;
568                    }
569                    Some(b'>') => {
570                        let data = core::mem::take(&mut self.comment_data);
571                        self.state = TokenizerState::Data;
572                        return Token::Comment(data);
573                    }
574                    Some(ch) => {
575                        self.comment_data.push(ch as char);
576                        self.state = TokenizerState::InComment;
577                    }
578                    None => {
579                        let data = core::mem::take(&mut self.comment_data);
580                        return Token::Comment(data);
581                    }
582                },
583
584                TokenizerState::CommentStartDash => match self.consume_next() {
585                    Some(b'-') => {
586                        self.state = TokenizerState::CommentEnd;
587                    }
588                    Some(b'>') => {
589                        let data = core::mem::take(&mut self.comment_data);
590                        self.state = TokenizerState::Data;
591                        return Token::Comment(data);
592                    }
593                    Some(ch) => {
594                        self.comment_data.push('-');
595                        self.comment_data.push(ch as char);
596                        self.state = TokenizerState::InComment;
597                    }
598                    None => {
599                        let data = core::mem::take(&mut self.comment_data);
600                        return Token::Comment(data);
601                    }
602                },
603
604                TokenizerState::InComment => match self.consume_next() {
605                    Some(b'-') => {
606                        self.state = TokenizerState::CommentEndDash;
607                    }
608                    Some(ch) => {
609                        self.comment_data.push(ch as char);
610                    }
611                    None => {
612                        let data = core::mem::take(&mut self.comment_data);
613                        return Token::Comment(data);
614                    }
615                },
616
617                TokenizerState::CommentEndDash => match self.consume_next() {
618                    Some(b'-') => {
619                        self.state = TokenizerState::CommentEnd;
620                    }
621                    Some(ch) => {
622                        self.comment_data.push('-');
623                        self.comment_data.push(ch as char);
624                        self.state = TokenizerState::InComment;
625                    }
626                    None => {
627                        let data = core::mem::take(&mut self.comment_data);
628                        return Token::Comment(data);
629                    }
630                },
631
632                TokenizerState::CommentEnd => match self.consume_next() {
633                    Some(b'>') => {
634                        let data = core::mem::take(&mut self.comment_data);
635                        self.state = TokenizerState::Data;
636                        return Token::Comment(data);
637                    }
638                    Some(b'-') => {
639                        self.comment_data.push('-');
640                    }
641                    Some(ch) => {
642                        self.comment_data.push('-');
643                        self.comment_data.push('-');
644                        self.comment_data.push(ch as char);
645                        self.state = TokenizerState::InComment;
646                    }
647                    None => {
648                        let data = core::mem::take(&mut self.comment_data);
649                        return Token::Comment(data);
650                    }
651                },
652            }
653        }
654    }
655}
656
657#[cfg(test)]
658mod tests {
659    #[allow(unused_imports)]
660    use alloc::vec;
661
662    use super::*;
663
664    #[test]
665    fn test_empty_input() {
666        let mut t = HtmlTokenizer::from_text("");
667        assert_eq!(t.next_token(), Token::Eof);
668    }
669
670    #[test]
671    fn test_plain_text() {
672        let mut t = HtmlTokenizer::from_text("hello");
673        assert_eq!(t.next_token(), Token::Character('h'));
674        assert_eq!(t.next_token(), Token::Character('e'));
675        assert_eq!(t.next_token(), Token::Character('l'));
676        assert_eq!(t.next_token(), Token::Character('l'));
677        assert_eq!(t.next_token(), Token::Character('o'));
678        assert_eq!(t.next_token(), Token::Eof);
679    }
680
681    #[test]
682    fn test_simple_tag() {
683        let mut t = HtmlTokenizer::from_text("<p>");
684        let token = t.next_token();
685        assert_eq!(token, Token::StartTag("p".into(), vec![], false));
686    }
687
688    #[test]
689    fn test_end_tag() {
690        let mut t = HtmlTokenizer::from_text("</p>");
691        let token = t.next_token();
692        assert_eq!(token, Token::EndTag("p".into()));
693    }
694
695    #[test]
696    fn test_self_closing_tag() {
697        let mut t = HtmlTokenizer::from_text("<br/>");
698        let token = t.next_token();
699        assert_eq!(token, Token::StartTag("br".into(), vec![], true));
700    }
701
702    #[test]
703    fn test_void_element_auto_self_closing() {
704        let mut t = HtmlTokenizer::from_text("<br>");
705        let token = t.next_token();
706        assert_eq!(token, Token::StartTag("br".into(), vec![], true));
707    }
708
709    #[test]
710    fn test_tag_with_attribute() {
711        let mut t = HtmlTokenizer::from_text("<div class=\"main\">");
712        let token = t.next_token();
713        assert_eq!(
714            token,
715            Token::StartTag(
716                "div".into(),
717                vec![Attribute {
718                    name: "class".into(),
719                    value: "main".into()
720                }],
721                false
722            )
723        );
724    }
725
726    #[test]
727    fn test_tag_with_multiple_attrs() {
728        let mut t = HtmlTokenizer::from_text("<a href=\"/\" id=\"home\">");
729        let token = t.next_token();
730        assert_eq!(
731            token,
732            Token::StartTag(
733                "a".into(),
734                vec![
735                    Attribute {
736                        name: "href".into(),
737                        value: "/".into()
738                    },
739                    Attribute {
740                        name: "id".into(),
741                        value: "home".into()
742                    },
743                ],
744                false
745            )
746        );
747    }
748
749    #[test]
750    fn test_single_quoted_attr() {
751        let mut t = HtmlTokenizer::from_text("<div class='main'>");
752        let token = t.next_token();
753        assert_eq!(
754            token,
755            Token::StartTag(
756                "div".into(),
757                vec![Attribute {
758                    name: "class".into(),
759                    value: "main".into()
760                }],
761                false
762            )
763        );
764    }
765
766    #[test]
767    fn test_unquoted_attr() {
768        let mut t = HtmlTokenizer::from_text("<div class=main>");
769        let token = t.next_token();
770        assert_eq!(
771            token,
772            Token::StartTag(
773                "div".into(),
774                vec![Attribute {
775                    name: "class".into(),
776                    value: "main".into()
777                }],
778                false
779            )
780        );
781    }
782
783    #[test]
784    fn test_entity_amp() {
785        let mut t = HtmlTokenizer::from_text("&amp;");
786        assert_eq!(t.next_token(), Token::Character('&'));
787    }
788
789    #[test]
790    fn test_entity_lt() {
791        let mut t = HtmlTokenizer::from_text("&lt;");
792        assert_eq!(t.next_token(), Token::Character('<'));
793    }
794
795    #[test]
796    fn test_entity_gt() {
797        let mut t = HtmlTokenizer::from_text("&gt;");
798        assert_eq!(t.next_token(), Token::Character('>'));
799    }
800
801    #[test]
802    fn test_entity_quot() {
803        let mut t = HtmlTokenizer::from_text("&quot;");
804        assert_eq!(t.next_token(), Token::Character('"'));
805    }
806
807    #[test]
808    fn test_entity_numeric() {
809        let mut t = HtmlTokenizer::from_text("&#65;");
810        assert_eq!(t.next_token(), Token::Character('A'));
811    }
812
813    #[test]
814    fn test_entity_hex() {
815        let mut t = HtmlTokenizer::from_text("&#x41;");
816        assert_eq!(t.next_token(), Token::Character('A'));
817    }
818
819    #[test]
820    fn test_entity_in_attr() {
821        let mut t = HtmlTokenizer::from_text("<a href=\"a&amp;b\">");
822        let token = t.next_token();
823        assert_eq!(
824            token,
825            Token::StartTag(
826                "a".into(),
827                vec![Attribute {
828                    name: "href".into(),
829                    value: "a&b".into()
830                }],
831                false
832            )
833        );
834    }
835
836    #[test]
837    fn test_comment() {
838        let mut t = HtmlTokenizer::from_text("<!-- hello -->");
839        let token = t.next_token();
840        assert_eq!(token, Token::Comment(" hello ".into()));
841    }
842
843    #[test]
844    fn test_empty_comment() {
845        let mut t = HtmlTokenizer::from_text("<!---->");
846        let token = t.next_token();
847        assert_eq!(token, Token::Comment(String::new()));
848    }
849
850    #[test]
851    fn test_doctype() {
852        let mut t = HtmlTokenizer::from_text("<!DOCTYPE html>");
853        let token = t.next_token();
854        assert_eq!(token, Token::Doctype("html".into()));
855    }
856
857    #[test]
858    fn test_case_insensitive_tags() {
859        let mut t = HtmlTokenizer::from_text("<DIV>");
860        let token = t.next_token();
861        assert_eq!(token, Token::StartTag("div".into(), vec![], false));
862    }
863
864    #[test]
865    fn test_img_void() {
866        let mut t = HtmlTokenizer::from_text("<img src=\"a.png\">");
867        let token = t.next_token();
868        assert_eq!(
869            token,
870            Token::StartTag(
871                "img".into(),
872                vec![Attribute {
873                    name: "src".into(),
874                    value: "a.png".into()
875                }],
876                true
877            )
878        );
879    }
880
881    #[test]
882    fn test_input_void() {
883        let mut t = HtmlTokenizer::from_text("<input type=\"text\">");
884        let token = t.next_token();
885        assert_eq!(
886            token,
887            Token::StartTag(
888                "input".into(),
889                vec![Attribute {
890                    name: "type".into(),
891                    value: "text".into()
892                }],
893                true
894            )
895        );
896    }
897
898    #[test]
899    fn test_hr_void() {
900        let mut t = HtmlTokenizer::from_text("<hr>");
901        let token = t.next_token();
902        assert_eq!(token, Token::StartTag("hr".into(), vec![], true));
903    }
904
905    #[test]
906    fn test_meta_void() {
907        let mut t = HtmlTokenizer::from_text("<meta charset=\"utf-8\">");
908        let token = t.next_token();
909        assert_eq!(
910            token,
911            Token::StartTag(
912                "meta".into(),
913                vec![Attribute {
914                    name: "charset".into(),
915                    value: "utf-8".into()
916                }],
917                true
918            )
919        );
920    }
921
922    #[test]
923    fn test_link_void() {
924        let mut t = HtmlTokenizer::from_text("<link rel=\"stylesheet\">");
925        let token = t.next_token();
926        assert_eq!(
927            token,
928            Token::StartTag(
929                "link".into(),
930                vec![Attribute {
931                    name: "rel".into(),
932                    value: "stylesheet".into()
933                }],
934                true
935            )
936        );
937    }
938
939    #[test]
940    fn test_boolean_attribute() {
941        let mut t = HtmlTokenizer::from_text("<input disabled>");
942        let token = t.next_token();
943        assert_eq!(
944            token,
945            Token::StartTag(
946                "input".into(),
947                vec![Attribute {
948                    name: "disabled".into(),
949                    value: String::new(),
950                }],
951                true
952            )
953        );
954    }
955
956    #[test]
957    fn test_full_document() {
958        let html =
959            "<!DOCTYPE html><html><head><title>Hi</title></head><body><p>Hello</p></body></html>";
960        let mut t = HtmlTokenizer::from_text(html);
961        let tokens = t.tokenize_all();
962        // Should start with Doctype, then StartTag html, etc.
963        assert_eq!(tokens[0], Token::Doctype("html".into()));
964        assert_eq!(tokens[1], Token::StartTag("html".into(), vec![], false));
965        // Last should be Eof
966        assert_eq!(*tokens.last().unwrap(), Token::Eof);
967    }
968
969    #[test]
970    fn test_tokenize_all() {
971        let mut t = HtmlTokenizer::from_text("<b>hi</b>");
972        let tokens = t.tokenize_all();
973        assert_eq!(tokens.len(), 5); // StartTag, 'h', 'i', EndTag, Eof
974    }
975
976    #[test]
977    fn test_text_between_tags() {
978        let mut t = HtmlTokenizer::from_text("<p>ab</p>");
979        assert_eq!(t.next_token(), Token::StartTag("p".into(), vec![], false));
980        assert_eq!(t.next_token(), Token::Character('a'));
981        assert_eq!(t.next_token(), Token::Character('b'));
982        assert_eq!(t.next_token(), Token::EndTag("p".into()));
983        assert_eq!(t.next_token(), Token::Eof);
984    }
985
986    #[test]
987    fn test_nested_tags() {
988        let mut t = HtmlTokenizer::from_text("<div><span></span></div>");
989        let tokens = t.tokenize_all();
990        assert_eq!(tokens[0], Token::StartTag("div".into(), vec![], false));
991        assert_eq!(tokens[1], Token::StartTag("span".into(), vec![], false));
992        assert_eq!(tokens[2], Token::EndTag("span".into()));
993        assert_eq!(tokens[3], Token::EndTag("div".into()));
994        assert_eq!(tokens[4], Token::Eof);
995    }
996
997    #[test]
998    fn test_attribute_no_value_before_another() {
999        let mut t = HtmlTokenizer::from_text("<input disabled type=\"text\">");
1000        let token = t.next_token();
1001        assert_eq!(
1002            token,
1003            Token::StartTag(
1004                "input".into(),
1005                vec![
1006                    Attribute {
1007                        name: "disabled".into(),
1008                        value: String::new()
1009                    },
1010                    Attribute {
1011                        name: "type".into(),
1012                        value: "text".into()
1013                    },
1014                ],
1015                true
1016            )
1017        );
1018    }
1019
1020    #[test]
1021    fn test_entity_nbsp() {
1022        let mut t = HtmlTokenizer::from_text("&nbsp;");
1023        assert_eq!(t.next_token(), Token::Character('\u{00A0}'));
1024    }
1025
1026    #[test]
1027    fn test_entity_apos() {
1028        let mut t = HtmlTokenizer::from_text("&apos;");
1029        assert_eq!(t.next_token(), Token::Character('\''));
1030    }
1031
1032    #[test]
1033    fn test_processing_instruction() {
1034        let mut t = HtmlTokenizer::from_text("<?xml version=\"1.0\"?>");
1035        let token = t.next_token();
1036        // Treated as bogus comment
1037        if let Token::Comment(_) = token {
1038            // ok
1039        } else {
1040            panic!("Expected Comment for PI");
1041        }
1042    }
1043
1044    #[test]
1045    fn test_mixed_content() {
1046        let mut t = HtmlTokenizer::from_text("a<b>c</b>d");
1047        assert_eq!(t.next_token(), Token::Character('a'));
1048        assert_eq!(t.next_token(), Token::StartTag("b".into(), vec![], false));
1049        assert_eq!(t.next_token(), Token::Character('c'));
1050        assert_eq!(t.next_token(), Token::EndTag("b".into()));
1051        assert_eq!(t.next_token(), Token::Character('d'));
1052        assert_eq!(t.next_token(), Token::Eof);
1053    }
1054
1055    #[test]
1056    fn test_whitespace_in_tags() {
1057        let mut t = HtmlTokenizer::from_text("<  p  >");
1058        // '<' followed by space is not a tag
1059        assert_eq!(t.next_token(), Token::Character('<'));
1060    }
1061
1062    #[test]
1063    fn test_comment_with_dashes() {
1064        let mut t = HtmlTokenizer::from_text("<!-- a-b -->");
1065        let token = t.next_token();
1066        assert_eq!(token, Token::Comment(" a-b ".into()));
1067    }
1068
1069    #[test]
1070    fn test_self_closing_nonvoid() {
1071        let mut t = HtmlTokenizer::from_text("<div/>");
1072        let token = t.next_token();
1073        // Self-closing on non-void is allowed in tokenizer
1074        assert_eq!(token, Token::StartTag("div".into(), vec![], true));
1075    }
1076}