⚠️ VeridianOS Kernel Documentation - This is low-level kernel code. All functions are unsafe unless explicitly marked otherwise. no_std

veridian_kernel/browser/
tree_builder.rs

1//! HTML Tree Builder
2//!
3//! Consumes tokens from the HTML tokenizer and builds a DOM tree.
4//! Implements a simplified version of the HTML5 tree construction
5//! algorithm with insertion modes, auto-closing, and formatting
6//! element reconstruction.
7
8#![allow(dead_code)]
9
10use alloc::{
11    collections::BTreeMap,
12    string::{String, ToString},
13    vec::Vec,
14};
15
16use super::{
17    dom::{Document, NodeId, NodeType},
18    html_tokenizer::{Attribute, HtmlTokenizer, Token},
19};
20
21/// Insertion modes for the tree builder
22#[derive(Debug, Clone, Copy, PartialEq, Eq)]
23enum InsertionMode {
24    Initial,
25    BeforeHtml,
26    BeforeHead,
27    InHead,
28    AfterHead,
29    InBody,
30    InTable,
31    InTableBody,
32    InRow,
33    InCell,
34    AfterBody,
35    AfterAfterBody,
36}
37
38/// Elements that auto-close a <p> tag
39const P_CLOSING_TAGS: &[&str] = &[
40    "address",
41    "article",
42    "aside",
43    "blockquote",
44    "details",
45    "div",
46    "dl",
47    "fieldset",
48    "figcaption",
49    "figure",
50    "footer",
51    "form",
52    "h1",
53    "h2",
54    "h3",
55    "h4",
56    "h5",
57    "h6",
58    "header",
59    "hgroup",
60    "hr",
61    "li",
62    "main",
63    "nav",
64    "ol",
65    "p",
66    "pre",
67    "section",
68    "table",
69    "ul",
70];
71
72/// Elements that auto-close themselves
73const SELF_CLOSING_TAGS: &[(&str, &[&str])] = &[
74    ("li", &["li"]),
75    ("dt", &["dt", "dd"]),
76    ("dd", &["dt", "dd"]),
77    ("td", &["td", "th"]),
78    ("th", &["td", "th"]),
79    ("tr", &["tr"]),
80    ("thead", &["tbody", "tfoot"]),
81    ("tbody", &["tbody", "tfoot"]),
82    ("tfoot", &["tbody"]),
83    ("option", &["option"]),
84    ("optgroup", &["optgroup"]),
85];
86
87/// Formatting element names
88const FORMATTING_ELEMENTS: &[&str] = &[
89    "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u",
90];
91
92/// Active formatting entry
93#[derive(Debug, Clone)]
94struct FormattingEntry {
95    tag_name: String,
96    node_id: NodeId,
97    attrs: BTreeMap<String, String>,
98}
99
100/// HTML tree builder that constructs a DOM from tokens
101pub struct TreeBuilder {
102    document: Document,
103    insertion_mode: InsertionMode,
104    open_elements: Vec<NodeId>,
105    head_element: Option<NodeId>,
106    body_element: Option<NodeId>,
107    foster_parenting: bool,
108    active_formatting: Vec<Option<FormattingEntry>>,
109    frameset_ok: bool,
110}
111
112impl TreeBuilder {
113    /// Create a new tree builder
114    pub fn new() -> Self {
115        Self {
116            document: Document::new(),
117            insertion_mode: InsertionMode::Initial,
118            open_elements: Vec::new(),
119            head_element: None,
120            body_element: None,
121            foster_parenting: false,
122            active_formatting: Vec::new(),
123            frameset_ok: true,
124        }
125    }
126
127    /// Build a DOM tree from an HTML string
128    pub fn build(html: &str) -> Document {
129        let mut tokenizer = HtmlTokenizer::from_text(html);
130        let mut builder = Self::new();
131        loop {
132            let token = tokenizer.next_token();
133            let is_eof = token == Token::Eof;
134            builder.process_token(token);
135            if is_eof {
136                break;
137            }
138        }
139        builder.document
140    }
141
142    /// Get the current insertion point (last open element or document root)
143    fn current_node(&self) -> NodeId {
144        self.open_elements
145            .last()
146            .copied()
147            .unwrap_or(self.document.root)
148    }
149
150    /// Get the tag name of the current node
151    fn current_tag_name(&self) -> Option<String> {
152        let id = self.current_node();
153        self.document.tag_name(id).map(|s| s.to_string())
154    }
155
156    /// Convert tokenizer attributes to a BTreeMap
157    fn attrs_to_map(attrs: &[Attribute]) -> BTreeMap<String, String> {
158        let mut map = BTreeMap::new();
159        for attr in attrs {
160            map.insert(attr.name.clone(), attr.value.clone());
161        }
162        map
163    }
164
165    /// Insert an element and push to open elements
166    fn insert_element(&mut self, tag_name: &str, attrs: &[Attribute]) -> NodeId {
167        let attr_map = Self::attrs_to_map(attrs);
168        let node_id = self.document.create_element_with_attrs(tag_name, attr_map);
169        let parent = self.current_node();
170        self.document.append_child(parent, node_id);
171        self.open_elements.push(node_id);
172        node_id
173    }
174
175    /// Insert a void element (no push to open elements)
176    fn insert_void_element(&mut self, tag_name: &str, attrs: &[Attribute]) -> NodeId {
177        let attr_map = Self::attrs_to_map(attrs);
178        let node_id = self.document.create_element_with_attrs(tag_name, attr_map);
179        let parent = self.current_node();
180        self.document.append_child(parent, node_id);
181        node_id
182    }
183
184    /// Insert text at the current insertion point
185    fn insert_text(&mut self, ch: char) {
186        let parent = self.current_node();
187        // Try to append to existing text node
188        if let Some(node) = self.document.arena.get(parent) {
189            if let Some(&last_child_id) = node.children.last() {
190                if let Some(last_child) = self.document.arena.get_mut(last_child_id) {
191                    if last_child.node_type == NodeType::Text {
192                        if let Some(ref mut text) = last_child.text_content {
193                            text.push(ch);
194                            return;
195                        }
196                    }
197                }
198            }
199        }
200        // Create new text node
201        let text_id = self.document.create_text(&String::from(ch));
202        self.document.append_child(parent, text_id);
203    }
204
205    /// Auto-close elements when a new tag implies closing
206    fn auto_close_for_tag(&mut self, tag_name: &str) {
207        // Close <p> if needed
208        if P_CLOSING_TAGS.contains(&tag_name) {
209            self.close_p_if_in_scope();
210        }
211
212        // Check self-closing patterns (li, dt, dd, etc.)
213        for &(new_tag, closers) in SELF_CLOSING_TAGS {
214            if new_tag == tag_name {
215                for &closer in closers {
216                    if self.current_tag_name().as_deref() == Some(closer) {
217                        self.open_elements.pop();
218                        break;
219                    }
220                }
221                break;
222            }
223        }
224    }
225
226    /// Close <p> if one is in button scope
227    fn close_p_if_in_scope(&mut self) {
228        if self.has_in_scope("p") {
229            self.pop_until("p");
230        }
231    }
232
233    /// Check if a tag is in scope
234    fn has_in_scope(&self, tag: &str) -> bool {
235        for &id in self.open_elements.iter().rev() {
236            if self.document.tag_name(id) == Some(tag) {
237                return true;
238            }
239            // Scope-breaking elements
240            let name = self.document.tag_name(id).unwrap_or("");
241            if matches!(
242                name,
243                "applet"
244                    | "caption"
245                    | "html"
246                    | "table"
247                    | "td"
248                    | "th"
249                    | "marquee"
250                    | "object"
251                    | "template"
252            ) {
253                return false;
254            }
255        }
256        false
257    }
258
259    /// Pop elements until we find the given tag
260    fn pop_until(&mut self, tag: &str) {
261        while let Some(&id) = self.open_elements.last() {
262            self.open_elements.pop();
263            if self.document.tag_name(id) == Some(tag) {
264                break;
265            }
266        }
267    }
268
269    /// Push a formatting marker
270    fn push_formatting_marker(&mut self) {
271        self.active_formatting.push(None);
272    }
273
274    /// Add a formatting element
275    fn add_formatting_element(&mut self, tag_name: &str, node_id: NodeId, attrs: &[Attribute]) {
276        let entry = FormattingEntry {
277            tag_name: tag_name.to_string(),
278            node_id,
279            attrs: Self::attrs_to_map(attrs),
280        };
281        self.active_formatting.push(Some(entry));
282    }
283
284    /// Reconstruct active formatting elements
285    fn reconstruct_formatting(&mut self) {
286        if self.active_formatting.is_empty() {
287            return;
288        }
289
290        // Find the last marker or start
291        let last = self.active_formatting.len() - 1;
292        if self.active_formatting[last].is_none() {
293            return;
294        }
295
296        // Check if the last entry's element is on the stack
297        if let Some(ref entry) = self.active_formatting[last] {
298            if self.open_elements.contains(&entry.node_id) {
299                return;
300            }
301        }
302
303        // Reconstruct: re-insert formatting elements
304        let entries_to_reopen: Vec<_> = self
305            .active_formatting
306            .iter()
307            .rev()
308            .take_while(|e| e.is_some())
309            .filter_map(|e| e.clone())
310            .collect();
311
312        for entry in entries_to_reopen.into_iter().rev() {
313            if !self.open_elements.contains(&entry.node_id) {
314                let attrs: Vec<Attribute> = entry
315                    .attrs
316                    .iter()
317                    .map(|(k, v)| Attribute {
318                        name: k.clone(),
319                        value: v.clone(),
320                    })
321                    .collect();
322                let new_id = self.insert_element(&entry.tag_name, &attrs);
323                // Update the formatting list
324                for e in self.active_formatting.iter_mut().flatten() {
325                    if e.node_id == entry.node_id {
326                        e.node_id = new_id;
327                    }
328                }
329            }
330        }
331    }
332
333    /// Is the given tag a formatting element
334    fn is_formatting_element(tag: &str) -> bool {
335        FORMATTING_ELEMENTS.contains(&tag)
336    }
337
338    /// Process a single token
339    pub fn process_token(&mut self, token: Token) {
340        match self.insertion_mode {
341            InsertionMode::Initial => self.process_initial(token),
342            InsertionMode::BeforeHtml => self.process_before_html(token),
343            InsertionMode::BeforeHead => self.process_before_head(token),
344            InsertionMode::InHead => self.process_in_head(token),
345            InsertionMode::AfterHead => self.process_after_head(token),
346            InsertionMode::InBody => self.process_in_body(token),
347            InsertionMode::InTable => self.process_in_table(token),
348            InsertionMode::InTableBody => self.process_in_table_body(token),
349            InsertionMode::InRow => self.process_in_row(token),
350            InsertionMode::InCell => self.process_in_cell(token),
351            InsertionMode::AfterBody => self.process_after_body(token),
352            InsertionMode::AfterAfterBody => self.process_after_after_body(token),
353        }
354    }
355
356    fn process_initial(&mut self, token: Token) {
357        match token {
358            Token::Doctype(name) => {
359                let dt = self.document.create_doctype(&name);
360                self.document.append_child(self.document.root, dt);
361                self.insertion_mode = InsertionMode::BeforeHtml;
362            }
363            Token::Character(c) if c.is_whitespace() => {
364                // Ignore whitespace in initial mode
365            }
366            Token::Comment(text) => {
367                let c = self.document.create_comment(&text);
368                self.document.append_child(self.document.root, c);
369            }
370            _ => {
371                // No doctype, switch to before html and reprocess
372                self.insertion_mode = InsertionMode::BeforeHtml;
373                self.process_token(token);
374            }
375        }
376    }
377
378    fn process_before_html(&mut self, token: Token) {
379        match token {
380            Token::StartTag(ref name, ref attrs, _) if name == "html" => {
381                let id = self.insert_element(name, attrs);
382                let _ = id;
383                self.insertion_mode = InsertionMode::BeforeHead;
384            }
385            Token::Character(c) if c.is_whitespace() => {
386                // ignore
387            }
388            Token::Comment(text) => {
389                let c = self.document.create_comment(&text);
390                self.document.append_child(self.document.root, c);
391            }
392            _ => {
393                // Implied <html>
394                let html = self.document.create_element("html");
395                self.document.append_child(self.document.root, html);
396                self.open_elements.push(html);
397                self.insertion_mode = InsertionMode::BeforeHead;
398                self.process_token(token);
399            }
400        }
401    }
402
403    fn process_before_head(&mut self, token: Token) {
404        match token {
405            Token::StartTag(ref name, ref attrs, _) if name == "head" => {
406                let id = self.insert_element(name, attrs);
407                self.head_element = Some(id);
408                self.insertion_mode = InsertionMode::InHead;
409            }
410            Token::Character(c) if c.is_whitespace() => {
411                // ignore
412            }
413            _ => {
414                // Implied <head>
415                let head = self.document.create_element("head");
416                let parent = self.current_node();
417                self.document.append_child(parent, head);
418                self.open_elements.push(head);
419                self.head_element = Some(head);
420                self.insertion_mode = InsertionMode::InHead;
421                self.process_token(token);
422            }
423        }
424    }
425
426    fn process_in_head(&mut self, token: Token) {
427        match token {
428            Token::Character(c) if c.is_whitespace() => {
429                self.insert_text(c);
430            }
431            Token::Character(c) => {
432                // Non-whitespace text inside <title>/<style>/<script>
433                let current = self.current_node();
434                let is_raw_text = self
435                    .document
436                    .arena
437                    .get(current)
438                    .and_then(|n| n.element_data.as_ref())
439                    .map(|ed| {
440                        matches!(
441                            ed.tag_name.as_str(),
442                            "title" | "style" | "script" | "noscript"
443                        )
444                    })
445                    .unwrap_or(false);
446                if is_raw_text {
447                    self.insert_text(c);
448                } else {
449                    self.open_elements.pop();
450                    self.insertion_mode = InsertionMode::AfterHead;
451                    self.process_token(Token::Character(c));
452                }
453            }
454            Token::Comment(text) => {
455                let c = self.document.create_comment(&text);
456                let parent = self.current_node();
457                self.document.append_child(parent, c);
458            }
459            Token::StartTag(ref name, ref attrs, self_closing) => {
460                match name.as_str() {
461                    "title" | "style" | "script" | "noscript" => {
462                        if self_closing {
463                            self.insert_void_element(name, attrs);
464                        } else {
465                            self.insert_element(name, attrs);
466                        }
467                    }
468                    "meta" | "link" | "base" => {
469                        self.insert_void_element(name, attrs);
470                    }
471                    "head" => {
472                        // Ignore duplicate head
473                    }
474                    _ => {
475                        // Implied </head>
476                        self.open_elements.pop(); // pop head
477                        self.insertion_mode = InsertionMode::AfterHead;
478                        self.process_token(Token::StartTag(
479                            name.clone(),
480                            attrs.clone(),
481                            self_closing,
482                        ));
483                    }
484                }
485            }
486            Token::EndTag(ref name) => {
487                match name.as_str() {
488                    "head" => {
489                        self.open_elements.pop();
490                        self.insertion_mode = InsertionMode::AfterHead;
491                    }
492                    "title" | "style" | "script" | "noscript" => {
493                        self.open_elements.pop();
494                    }
495                    _ => {
496                        // Implied </head>
497                        self.open_elements.pop();
498                        self.insertion_mode = InsertionMode::AfterHead;
499                    }
500                }
501            }
502            Token::Eof => {
503                self.open_elements.pop();
504                self.insertion_mode = InsertionMode::AfterHead;
505                self.process_token(Token::Eof);
506            }
507            _ => {
508                self.open_elements.pop();
509                self.insertion_mode = InsertionMode::AfterHead;
510                self.process_token(token);
511            }
512        }
513    }
514
515    fn process_after_head(&mut self, token: Token) {
516        match token {
517            Token::StartTag(ref name, ref attrs, _) if name == "body" => {
518                let id = self.insert_element(name, attrs);
519                self.body_element = Some(id);
520                self.insertion_mode = InsertionMode::InBody;
521            }
522            Token::Character(c) if c.is_whitespace() => {
523                self.insert_text(c);
524            }
525            Token::Comment(text) => {
526                let c = self.document.create_comment(&text);
527                let parent = self.current_node();
528                self.document.append_child(parent, c);
529            }
530            _ => {
531                // Implied <body>
532                let body = self.document.create_element("body");
533                let parent = self.current_node();
534                self.document.append_child(parent, body);
535                self.open_elements.push(body);
536                self.body_element = Some(body);
537                self.insertion_mode = InsertionMode::InBody;
538                self.process_token(token);
539            }
540        }
541    }
542
543    fn process_in_body(&mut self, token: Token) {
544        match token {
545            Token::Character(c) => {
546                self.reconstruct_formatting();
547                self.insert_text(c);
548            }
549            Token::Comment(text) => {
550                let c = self.document.create_comment(&text);
551                let parent = self.current_node();
552                self.document.append_child(parent, c);
553            }
554            Token::StartTag(ref name, ref attrs, self_closing) => {
555                self.process_in_body_start_tag(name.clone(), attrs.clone(), self_closing);
556            }
557            Token::EndTag(ref name) => {
558                self.process_in_body_end_tag(name.clone());
559            }
560            Token::Eof => {
561                // Done
562            }
563            Token::Doctype(_) => {
564                // Ignore in body
565            }
566        }
567    }
568
569    fn process_in_body_start_tag(
570        &mut self,
571        name: String,
572        attrs: Vec<Attribute>,
573        self_closing: bool,
574    ) {
575        match name.as_str() {
576            "html" => {
577                // Merge attributes onto existing html element
578            }
579            "body" => {
580                // Merge attributes onto existing body element
581            }
582            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
583                self.auto_close_for_tag(&name);
584                // Auto-close other heading if open
585                if let Some(tag) = self.current_tag_name() {
586                    if matches!(tag.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
587                        self.open_elements.pop();
588                    }
589                }
590                self.insert_element(&name, &attrs);
591            }
592            "p" | "div" | "section" | "article" | "aside" | "nav" | "header" | "footer"
593            | "main" | "address" | "blockquote" | "figure" | "figcaption" | "details"
594            | "summary" | "ul" | "ol" | "dl" | "pre" | "fieldset" | "form" | "hgroup" => {
595                self.auto_close_for_tag(&name);
596                self.insert_element(&name, &attrs);
597            }
598            "li" => {
599                self.auto_close_for_tag("li");
600                self.insert_element(&name, &attrs);
601            }
602            "dt" | "dd" => {
603                self.auto_close_for_tag(&name);
604                self.insert_element(&name, &attrs);
605            }
606            "table" => {
607                self.close_p_if_in_scope();
608                self.insert_element(&name, &attrs);
609                self.insertion_mode = InsertionMode::InTable;
610            }
611            "b" | "big" | "code" | "em" | "font" | "i" | "s" | "small" | "strike" | "strong"
612            | "tt" | "u" => {
613                self.reconstruct_formatting();
614                let id = self.insert_element(&name, &attrs);
615                self.add_formatting_element(&name, id, &attrs);
616            }
617            "a" => {
618                self.reconstruct_formatting();
619                let id = self.insert_element(&name, &attrs);
620                self.add_formatting_element(&name, id, &attrs);
621            }
622            "br" | "hr" | "img" | "input" | "meta" | "link" | "embed" | "source" | "track"
623            | "wbr" | "area" | "col" | "param" | "base" => {
624                self.reconstruct_formatting();
625                self.insert_void_element(&name, &attrs);
626            }
627            "span" | "label" | "abbr" | "cite" | "dfn" | "kbd" | "mark" | "q" | "sub" | "sup"
628            | "time" | "var" | "data" | "ruby" | "rt" | "rp" | "bdi" | "bdo" | "output"
629            | "progress" | "meter" | "slot" | "canvas" | "dialog" | "picture" | "video"
630            | "audio" | "map" | "object" | "iframe" | "button" | "select" | "textarea" => {
631                self.reconstruct_formatting();
632                if self_closing {
633                    self.insert_void_element(&name, &attrs);
634                } else {
635                    self.insert_element(&name, &attrs);
636                }
637            }
638            _ => {
639                self.reconstruct_formatting();
640                if self_closing {
641                    self.insert_void_element(&name, &attrs);
642                } else {
643                    self.insert_element(&name, &attrs);
644                }
645            }
646        }
647    }
648
649    fn process_in_body_end_tag(&mut self, name: String) {
650        match name.as_str() {
651            "body" => {
652                self.insertion_mode = InsertionMode::AfterBody;
653            }
654            "html" => {
655                self.insertion_mode = InsertionMode::AfterBody;
656                self.process_token(Token::EndTag(name));
657            }
658            "p" => {
659                if !self.has_in_scope("p") {
660                    // Create empty <p>
661                    let p = self.document.create_element("p");
662                    let parent = self.current_node();
663                    self.document.append_child(parent, p);
664                } else {
665                    self.pop_until("p");
666                }
667            }
668            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
669                // Pop until any heading
670                let mut found = false;
671                for &id in self.open_elements.iter().rev() {
672                    if let Some(tag) = self.document.tag_name(id) {
673                        if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
674                            found = true;
675                            break;
676                        }
677                    }
678                }
679                if found {
680                    while let Some(&id) = self.open_elements.last() {
681                        self.open_elements.pop();
682                        if let Some(tag) = self.document.tag_name(id) {
683                            if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
684                                break;
685                            }
686                        }
687                    }
688                }
689            }
690            "div" | "section" | "article" | "aside" | "nav" | "header" | "footer" | "main"
691            | "address" | "blockquote" | "figure" | "figcaption" | "details" | "summary" | "ul"
692            | "ol" | "dl" | "pre" | "fieldset" | "form" | "hgroup" | "li" | "dd" | "dt" => {
693                if self.has_in_scope(&name) {
694                    self.pop_until(&name);
695                }
696            }
697            "b" | "big" | "code" | "em" | "font" | "i" | "s" | "small" | "strike" | "strong"
698            | "tt" | "u" | "a" => {
699                // Adoption agency algorithm (simplified)
700                if self.has_in_scope(&name) {
701                    self.pop_until(&name);
702                    // Remove from active formatting
703                    self.active_formatting.retain(|e| {
704                        if let Some(ref entry) = e {
705                            entry.tag_name != name
706                        } else {
707                            true
708                        }
709                    });
710                }
711            }
712            _ => {
713                // Any other end tag
714                self.pop_matching_end_tag(&name);
715            }
716        }
717    }
718
719    /// Pop elements to close a matching start tag
720    fn pop_matching_end_tag(&mut self, tag: &str) {
721        for i in (0..self.open_elements.len()).rev() {
722            let id = self.open_elements[i];
723            if self.document.tag_name(id) == Some(tag) {
724                self.open_elements.truncate(i);
725                return;
726            }
727            // Stop at special elements
728            if let Some(name) = self.document.tag_name(id) {
729                if matches!(
730                    name,
731                    "address"
732                        | "applet"
733                        | "area"
734                        | "article"
735                        | "aside"
736                        | "base"
737                        | "basefont"
738                        | "bgsound"
739                        | "blockquote"
740                        | "body"
741                        | "br"
742                        | "button"
743                        | "caption"
744                        | "center"
745                        | "col"
746                        | "colgroup"
747                        | "dd"
748                        | "details"
749                        | "dir"
750                        | "div"
751                        | "dl"
752                        | "dt"
753                        | "embed"
754                        | "fieldset"
755                        | "figcaption"
756                        | "figure"
757                        | "footer"
758                        | "form"
759                        | "frame"
760                        | "frameset"
761                        | "h1"
762                        | "h2"
763                        | "h3"
764                        | "h4"
765                        | "h5"
766                        | "h6"
767                        | "head"
768                        | "header"
769                        | "hgroup"
770                        | "hr"
771                        | "html"
772                        | "iframe"
773                        | "img"
774                        | "input"
775                        | "li"
776                        | "link"
777                        | "listing"
778                        | "main"
779                        | "marquee"
780                        | "menu"
781                        | "meta"
782                        | "nav"
783                        | "noembed"
784                        | "noframes"
785                        | "noscript"
786                        | "object"
787                        | "ol"
788                        | "p"
789                        | "param"
790                        | "plaintext"
791                        | "pre"
792                        | "script"
793                        | "section"
794                        | "select"
795                        | "source"
796                        | "style"
797                        | "summary"
798                        | "table"
799                        | "tbody"
800                        | "td"
801                        | "template"
802                        | "textarea"
803                        | "tfoot"
804                        | "th"
805                        | "thead"
806                        | "title"
807                        | "tr"
808                        | "track"
809                        | "ul"
810                        | "wbr"
811                ) {
812                    return;
813                }
814            }
815        }
816    }
817
818    fn process_in_table(&mut self, token: Token) {
819        match token {
820            Token::StartTag(ref name, ref attrs, _) => {
821                match name.as_str() {
822                    "caption" | "colgroup" | "col" => {
823                        self.insert_element(name, attrs);
824                    }
825                    "thead" | "tbody" | "tfoot" => {
826                        self.insert_element(name, attrs);
827                        self.insertion_mode = InsertionMode::InTableBody;
828                    }
829                    "tr" => {
830                        // Implied <tbody>
831                        let tbody = self.document.create_element("tbody");
832                        let parent = self.current_node();
833                        self.document.append_child(parent, tbody);
834                        self.open_elements.push(tbody);
835                        self.insert_element(name, attrs);
836                        self.insertion_mode = InsertionMode::InRow;
837                    }
838                    "td" | "th" => {
839                        // Implied <tbody><tr>
840                        let tbody = self.document.create_element("tbody");
841                        let parent = self.current_node();
842                        self.document.append_child(parent, tbody);
843                        self.open_elements.push(tbody);
844                        let tr = self.document.create_element("tr");
845                        self.document.append_child(tbody, tr);
846                        self.open_elements.push(tr);
847                        self.insert_element(name, attrs);
848                        self.insertion_mode = InsertionMode::InCell;
849                    }
850                    _ => {
851                        // Foster parenting: process as in body
852                        self.process_in_body(Token::StartTag(name.clone(), attrs.clone(), false));
853                    }
854                }
855            }
856            Token::EndTag(ref name) if name == "table" => {
857                self.pop_until("table");
858                self.insertion_mode = InsertionMode::InBody;
859            }
860            Token::Character(_) | Token::Comment(_) => {
861                self.process_in_body(token);
862            }
863            Token::Eof => {}
864            _ => {
865                self.process_in_body(token);
866            }
867        }
868    }
869
870    fn process_in_table_body(&mut self, token: Token) {
871        match token {
872            Token::StartTag(ref name, ref attrs, _) => match name.as_str() {
873                "tr" => {
874                    self.insert_element(name, attrs);
875                    self.insertion_mode = InsertionMode::InRow;
876                }
877                "td" | "th" => {
878                    let tr = self.document.create_element("tr");
879                    let parent = self.current_node();
880                    self.document.append_child(parent, tr);
881                    self.open_elements.push(tr);
882                    self.insert_element(name, attrs);
883                    self.insertion_mode = InsertionMode::InCell;
884                }
885                _ => {
886                    self.process_in_table(token);
887                }
888            },
889            Token::EndTag(ref name) => {
890                match name.as_str() {
891                    "thead" | "tbody" | "tfoot" => {
892                        self.pop_until(name);
893                        self.insertion_mode = InsertionMode::InTable;
894                    }
895                    "table" => {
896                        // Close table body first
897                        if let Some(tag) = self.current_tag_name() {
898                            if matches!(tag.as_str(), "thead" | "tbody" | "tfoot") {
899                                self.open_elements.pop();
900                            }
901                        }
902                        self.insertion_mode = InsertionMode::InTable;
903                        self.process_token(token);
904                    }
905                    _ => {
906                        self.process_in_table(token);
907                    }
908                }
909            }
910            _ => {
911                self.process_in_table(token);
912            }
913        }
914    }
915
916    fn process_in_row(&mut self, token: Token) {
917        match token {
918            Token::StartTag(ref name, ref attrs, _) => match name.as_str() {
919                "td" | "th" => {
920                    self.insert_element(name, attrs);
921                    self.insertion_mode = InsertionMode::InCell;
922                    self.push_formatting_marker();
923                }
924                _ => {
925                    self.process_in_table(token);
926                }
927            },
928            Token::EndTag(ref name) => match name.as_str() {
929                "tr" => {
930                    self.pop_until("tr");
931                    self.insertion_mode = InsertionMode::InTableBody;
932                }
933                "table" => {
934                    self.pop_until("tr");
935                    self.insertion_mode = InsertionMode::InTableBody;
936                    self.process_token(token);
937                }
938                _ => {
939                    self.process_in_table(token);
940                }
941            },
942            _ => {
943                self.process_in_table(token);
944            }
945        }
946    }
947
948    fn process_in_cell(&mut self, token: Token) {
949        match token {
950            Token::EndTag(ref name) if name == "td" || name == "th" => {
951                self.pop_until(name);
952                self.insertion_mode = InsertionMode::InRow;
953            }
954            Token::StartTag(ref name, _, _) if matches!(name.as_str(), "td" | "th" | "tr") => {
955                // Close current cell
956                if let Some(tag) = self.current_tag_name() {
957                    if tag == "td" || tag == "th" {
958                        self.open_elements.pop();
959                    }
960                }
961                self.insertion_mode = InsertionMode::InRow;
962                self.process_token(token);
963            }
964            Token::EndTag(ref name) if name == "table" => {
965                // Close cell, close row
966                if let Some(tag) = self.current_tag_name() {
967                    if tag == "td" || tag == "th" {
968                        self.open_elements.pop();
969                    }
970                }
971                self.insertion_mode = InsertionMode::InRow;
972                self.process_token(token);
973            }
974            _ => {
975                self.process_in_body(token);
976            }
977        }
978    }
979
980    fn process_after_body(&mut self, token: Token) {
981        match token {
982            Token::Character(c) if c.is_whitespace() => {
983                self.insert_text(c);
984            }
985            Token::Comment(text) => {
986                let c = self.document.create_comment(&text);
987                self.document.append_child(self.document.root, c);
988            }
989            Token::EndTag(ref name) if name == "html" => {
990                self.insertion_mode = InsertionMode::AfterAfterBody;
991            }
992            Token::Eof => {}
993            _ => {
994                self.insertion_mode = InsertionMode::InBody;
995                self.process_token(token);
996            }
997        }
998    }
999
1000    fn process_after_after_body(&mut self, token: Token) {
1001        match token {
1002            Token::Character(c) if c.is_whitespace() => {}
1003            Token::Comment(text) => {
1004                let c = self.document.create_comment(&text);
1005                self.document.append_child(self.document.root, c);
1006            }
1007            Token::Eof => {}
1008            _ => {
1009                self.insertion_mode = InsertionMode::InBody;
1010                self.process_token(token);
1011            }
1012        }
1013    }
1014}
1015
1016impl Default for TreeBuilder {
1017    fn default() -> Self {
1018        Self::new()
1019    }
1020}
1021
1022#[cfg(test)]
1023mod tests {
1024    #[allow(unused_imports)]
1025    use alloc::vec;
1026
1027    use super::*;
1028
1029    #[test]
1030    fn test_empty_document() {
1031        let doc = TreeBuilder::build("");
1032        assert!(doc.node_count() >= 1);
1033    }
1034
1035    #[test]
1036    fn test_simple_paragraph() {
1037        let doc = TreeBuilder::build("<p>Hello</p>");
1038        let ps = doc.get_elements_by_tag_name("p");
1039        assert_eq!(ps.len(), 1);
1040        assert_eq!(doc.inner_text(ps[0]), "Hello");
1041    }
1042
1043    #[test]
1044    fn test_full_html_document() {
1045        let doc = TreeBuilder::build(
1046            "<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>",
1047        );
1048        let titles = doc.get_elements_by_tag_name("title");
1049        assert_eq!(titles.len(), 1);
1050        let ps = doc.get_elements_by_tag_name("p");
1051        assert_eq!(ps.len(), 1);
1052    }
1053
1054    #[test]
1055    fn test_implied_html_head_body() {
1056        let doc = TreeBuilder::build("<p>Text</p>");
1057        // Should have implied html, head, body
1058        let htmls = doc.get_elements_by_tag_name("html");
1059        assert!(!htmls.is_empty());
1060        let bodies = doc.get_elements_by_tag_name("body");
1061        assert!(!bodies.is_empty());
1062    }
1063
1064    #[test]
1065    fn test_nested_elements() {
1066        let doc = TreeBuilder::build("<div><span>A</span><span>B</span></div>");
1067        let spans = doc.get_elements_by_tag_name("span");
1068        assert_eq!(spans.len(), 2);
1069    }
1070
1071    #[test]
1072    fn test_auto_close_p() {
1073        let doc = TreeBuilder::build("<p>one<p>two");
1074        let ps = doc.get_elements_by_tag_name("p");
1075        assert_eq!(ps.len(), 2);
1076    }
1077
1078    #[test]
1079    fn test_auto_close_li() {
1080        let doc = TreeBuilder::build("<ul><li>a<li>b<li>c</ul>");
1081        let lis = doc.get_elements_by_tag_name("li");
1082        assert_eq!(lis.len(), 3);
1083    }
1084
1085    #[test]
1086    fn test_headings() {
1087        let doc = TreeBuilder::build("<h1>Title</h1><h2>Sub</h2>");
1088        let h1s = doc.get_elements_by_tag_name("h1");
1089        let h2s = doc.get_elements_by_tag_name("h2");
1090        assert_eq!(h1s.len(), 1);
1091        assert_eq!(h2s.len(), 1);
1092    }
1093
1094    #[test]
1095    fn test_heading_auto_close() {
1096        let doc = TreeBuilder::build("<h1>A<h2>B");
1097        let h1s = doc.get_elements_by_tag_name("h1");
1098        let h2s = doc.get_elements_by_tag_name("h2");
1099        assert_eq!(h1s.len(), 1);
1100        assert_eq!(h2s.len(), 1);
1101    }
1102
1103    #[test]
1104    fn test_void_elements() {
1105        let doc = TreeBuilder::build("<p>before<br>after</p>");
1106        let brs = doc.get_elements_by_tag_name("br");
1107        assert_eq!(brs.len(), 1);
1108        let ps = doc.get_elements_by_tag_name("p");
1109        assert_eq!(doc.inner_text(ps[0]), "beforeafter");
1110    }
1111
1112    #[test]
1113    fn test_formatting_elements() {
1114        let doc = TreeBuilder::build("<p><b>bold</b> normal</p>");
1115        let bs = doc.get_elements_by_tag_name("b");
1116        assert_eq!(bs.len(), 1);
1117        assert_eq!(doc.inner_text(bs[0]), "bold");
1118    }
1119
1120    #[test]
1121    fn test_comment_in_body() {
1122        let doc = TreeBuilder::build("<body><!-- comment --><p>text</p></body>");
1123        let ps = doc.get_elements_by_tag_name("p");
1124        assert_eq!(ps.len(), 1);
1125    }
1126
1127    #[test]
1128    fn test_doctype() {
1129        let doc = TreeBuilder::build("<!DOCTYPE html><html><body></body></html>");
1130        // Should have doctype node
1131        let root_node = doc.arena.get(doc.root).unwrap();
1132        let first_child = root_node.children[0];
1133        let dt = doc.arena.get(first_child).unwrap();
1134        assert_eq!(dt.node_type, NodeType::DocumentType);
1135    }
1136
1137    #[test]
1138    fn test_text_concatenation() {
1139        let doc = TreeBuilder::build("<p>abc</p>");
1140        let ps = doc.get_elements_by_tag_name("p");
1141        assert_eq!(doc.inner_text(ps[0]), "abc");
1142    }
1143
1144    #[test]
1145    fn test_mixed_text_and_elements() {
1146        let doc = TreeBuilder::build("<div>Hello <b>world</b>!</div>");
1147        let divs = doc.get_elements_by_tag_name("div");
1148        assert_eq!(doc.inner_text(divs[0]), "Hello world!");
1149    }
1150
1151    #[test]
1152    fn test_attributes_preserved() {
1153        let doc = TreeBuilder::build("<div id=\"main\" class=\"container\">text</div>");
1154        let el = doc.get_element_by_id("main");
1155        assert!(el.is_some());
1156    }
1157
1158    #[test]
1159    fn test_table_basic() {
1160        let doc = TreeBuilder::build("<table><tr><td>cell</td></tr></table>");
1161        let tds = doc.get_elements_by_tag_name("td");
1162        assert_eq!(tds.len(), 1);
1163        assert_eq!(doc.inner_text(tds[0]), "cell");
1164    }
1165
1166    #[test]
1167    fn test_table_implied_tbody() {
1168        let doc = TreeBuilder::build("<table><tr><td>A</td></tr></table>");
1169        let tbodies = doc.get_elements_by_tag_name("tbody");
1170        assert!(!tbodies.is_empty());
1171    }
1172
1173    #[test]
1174    fn test_table_with_thead() {
1175        let doc = TreeBuilder::build(
1176            "<table><thead><tr><th>H</th></tr></thead><tbody><tr><td>D</td></tr></tbody></table>",
1177        );
1178        let ths = doc.get_elements_by_tag_name("th");
1179        let tds = doc.get_elements_by_tag_name("td");
1180        assert_eq!(ths.len(), 1);
1181        assert_eq!(tds.len(), 1);
1182    }
1183
1184    #[test]
1185    fn test_dt_dd_auto_close() {
1186        let doc = TreeBuilder::build("<dl><dt>term<dd>def<dt>term2<dd>def2</dl>");
1187        let dts = doc.get_elements_by_tag_name("dt");
1188        let dds = doc.get_elements_by_tag_name("dd");
1189        assert_eq!(dts.len(), 2);
1190        assert_eq!(dds.len(), 2);
1191    }
1192
1193    #[test]
1194    fn test_no_body_text() {
1195        let doc = TreeBuilder::build("just text");
1196        // Should still produce a document with implied body
1197        let bodies = doc.get_elements_by_tag_name("body");
1198        assert!(!bodies.is_empty());
1199    }
1200
1201    #[test]
1202    fn test_anchor_tag() {
1203        let doc = TreeBuilder::build("<a href=\"/page\">link</a>");
1204        let anchors = doc.get_elements_by_tag_name("a");
1205        assert_eq!(anchors.len(), 1);
1206        assert_eq!(
1207            doc.get_attribute(anchors[0], "href"),
1208            Some("/page".to_string())
1209        );
1210    }
1211
1212    #[test]
1213    fn test_img_in_body() {
1214        let doc = TreeBuilder::build("<body><img src=\"pic.png\"></body>");
1215        let imgs = doc.get_elements_by_tag_name("img");
1216        assert_eq!(imgs.len(), 1);
1217    }
1218
1219    #[test]
1220    fn test_multiple_text_runs() {
1221        let doc = TreeBuilder::build("<p>Hello</p><p>World</p>");
1222        let ps = doc.get_elements_by_tag_name("p");
1223        assert_eq!(ps.len(), 2);
1224        assert_eq!(doc.inner_text(ps[0]), "Hello");
1225        assert_eq!(doc.inner_text(ps[1]), "World");
1226    }
1227
1228    #[test]
1229    fn test_deeply_nested() {
1230        let doc = TreeBuilder::build("<div><div><div><span>deep</span></div></div></div>");
1231        let spans = doc.get_elements_by_tag_name("span");
1232        assert_eq!(spans.len(), 1);
1233        assert_eq!(doc.inner_text(spans[0]), "deep");
1234    }
1235
1236    #[test]
1237    fn test_form_in_body() {
1238        let doc = TreeBuilder::build("<form><input type=\"text\"><button>Submit</button></form>");
1239        let forms = doc.get_elements_by_tag_name("form");
1240        assert_eq!(forms.len(), 1);
1241        let inputs = doc.get_elements_by_tag_name("input");
1242        assert_eq!(inputs.len(), 1);
1243    }
1244
1245    #[test]
1246    fn test_style_in_head() {
1247        let doc = TreeBuilder::build(
1248            "<html><head><style>body{color:red}</style></head><body>text</body></html>",
1249        );
1250        let styles = doc.get_elements_by_tag_name("style");
1251        assert_eq!(styles.len(), 1);
1252    }
1253
1254    #[test]
1255    fn test_meta_in_head() {
1256        let doc =
1257            TreeBuilder::build("<html><head><meta charset=\"utf-8\"></head><body></body></html>");
1258        let metas = doc.get_elements_by_tag_name("meta");
1259        assert_eq!(metas.len(), 1);
1260    }
1261
1262    #[test]
1263    fn test_link_in_head() {
1264        let doc = TreeBuilder::build(
1265            "<html><head><link rel=\"stylesheet\" href=\"s.css\"></head><body></body></html>",
1266        );
1267        let links = doc.get_elements_by_tag_name("link");
1268        assert_eq!(links.len(), 1);
1269    }
1270
1271    #[test]
1272    fn test_strong_em() {
1273        let doc = TreeBuilder::build("<p><strong>bold</strong> and <em>italic</em></p>");
1274        let strong = doc.get_elements_by_tag_name("strong");
1275        let em = doc.get_elements_by_tag_name("em");
1276        assert_eq!(strong.len(), 1);
1277        assert_eq!(em.len(), 1);
1278    }
1279
1280    #[test]
1281    fn test_default_tree_builder() {
1282        let _builder = TreeBuilder::default();
1283    }
1284
1285    #[test]
1286    fn test_ul_ol_nesting() {
1287        let doc = TreeBuilder::build("<ul><li>a</li><li>b<ul><li>c</li></ul></li></ul>");
1288        let lis = doc.get_elements_by_tag_name("li");
1289        assert_eq!(lis.len(), 3);
1290    }
1291
1292    #[test]
1293    fn test_entity_in_text() {
1294        let doc = TreeBuilder::build("<p>&amp;</p>");
1295        let ps = doc.get_elements_by_tag_name("p");
1296        assert_eq!(doc.inner_text(ps[0]), "&");
1297    }
1298}