1#![allow(dead_code)]
9
10use alloc::{
11 collections::BTreeMap,
12 string::{String, ToString},
13 vec::Vec,
14};
15
16use super::{
17 dom::{Document, NodeId, NodeType},
18 html_tokenizer::{Attribute, HtmlTokenizer, Token},
19};
20
21#[derive(Debug, Clone, Copy, PartialEq, Eq)]
23enum InsertionMode {
24 Initial,
25 BeforeHtml,
26 BeforeHead,
27 InHead,
28 AfterHead,
29 InBody,
30 InTable,
31 InTableBody,
32 InRow,
33 InCell,
34 AfterBody,
35 AfterAfterBody,
36}
37
38const P_CLOSING_TAGS: &[&str] = &[
40 "address",
41 "article",
42 "aside",
43 "blockquote",
44 "details",
45 "div",
46 "dl",
47 "fieldset",
48 "figcaption",
49 "figure",
50 "footer",
51 "form",
52 "h1",
53 "h2",
54 "h3",
55 "h4",
56 "h5",
57 "h6",
58 "header",
59 "hgroup",
60 "hr",
61 "li",
62 "main",
63 "nav",
64 "ol",
65 "p",
66 "pre",
67 "section",
68 "table",
69 "ul",
70];
71
72const SELF_CLOSING_TAGS: &[(&str, &[&str])] = &[
74 ("li", &["li"]),
75 ("dt", &["dt", "dd"]),
76 ("dd", &["dt", "dd"]),
77 ("td", &["td", "th"]),
78 ("th", &["td", "th"]),
79 ("tr", &["tr"]),
80 ("thead", &["tbody", "tfoot"]),
81 ("tbody", &["tbody", "tfoot"]),
82 ("tfoot", &["tbody"]),
83 ("option", &["option"]),
84 ("optgroup", &["optgroup"]),
85];
86
87const FORMATTING_ELEMENTS: &[&str] = &[
89 "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u",
90];
91
92#[derive(Debug, Clone)]
94struct FormattingEntry {
95 tag_name: String,
96 node_id: NodeId,
97 attrs: BTreeMap<String, String>,
98}
99
100pub struct TreeBuilder {
102 document: Document,
103 insertion_mode: InsertionMode,
104 open_elements: Vec<NodeId>,
105 head_element: Option<NodeId>,
106 body_element: Option<NodeId>,
107 foster_parenting: bool,
108 active_formatting: Vec<Option<FormattingEntry>>,
109 frameset_ok: bool,
110}
111
112impl TreeBuilder {
113 pub fn new() -> Self {
115 Self {
116 document: Document::new(),
117 insertion_mode: InsertionMode::Initial,
118 open_elements: Vec::new(),
119 head_element: None,
120 body_element: None,
121 foster_parenting: false,
122 active_formatting: Vec::new(),
123 frameset_ok: true,
124 }
125 }
126
127 pub fn build(html: &str) -> Document {
129 let mut tokenizer = HtmlTokenizer::from_text(html);
130 let mut builder = Self::new();
131 loop {
132 let token = tokenizer.next_token();
133 let is_eof = token == Token::Eof;
134 builder.process_token(token);
135 if is_eof {
136 break;
137 }
138 }
139 builder.document
140 }
141
142 fn current_node(&self) -> NodeId {
144 self.open_elements
145 .last()
146 .copied()
147 .unwrap_or(self.document.root)
148 }
149
150 fn current_tag_name(&self) -> Option<String> {
152 let id = self.current_node();
153 self.document.tag_name(id).map(|s| s.to_string())
154 }
155
156 fn attrs_to_map(attrs: &[Attribute]) -> BTreeMap<String, String> {
158 let mut map = BTreeMap::new();
159 for attr in attrs {
160 map.insert(attr.name.clone(), attr.value.clone());
161 }
162 map
163 }
164
165 fn insert_element(&mut self, tag_name: &str, attrs: &[Attribute]) -> NodeId {
167 let attr_map = Self::attrs_to_map(attrs);
168 let node_id = self.document.create_element_with_attrs(tag_name, attr_map);
169 let parent = self.current_node();
170 self.document.append_child(parent, node_id);
171 self.open_elements.push(node_id);
172 node_id
173 }
174
175 fn insert_void_element(&mut self, tag_name: &str, attrs: &[Attribute]) -> NodeId {
177 let attr_map = Self::attrs_to_map(attrs);
178 let node_id = self.document.create_element_with_attrs(tag_name, attr_map);
179 let parent = self.current_node();
180 self.document.append_child(parent, node_id);
181 node_id
182 }
183
184 fn insert_text(&mut self, ch: char) {
186 let parent = self.current_node();
187 if let Some(node) = self.document.arena.get(parent) {
189 if let Some(&last_child_id) = node.children.last() {
190 if let Some(last_child) = self.document.arena.get_mut(last_child_id) {
191 if last_child.node_type == NodeType::Text {
192 if let Some(ref mut text) = last_child.text_content {
193 text.push(ch);
194 return;
195 }
196 }
197 }
198 }
199 }
200 let text_id = self.document.create_text(&String::from(ch));
202 self.document.append_child(parent, text_id);
203 }
204
205 fn auto_close_for_tag(&mut self, tag_name: &str) {
207 if P_CLOSING_TAGS.contains(&tag_name) {
209 self.close_p_if_in_scope();
210 }
211
212 for &(new_tag, closers) in SELF_CLOSING_TAGS {
214 if new_tag == tag_name {
215 for &closer in closers {
216 if self.current_tag_name().as_deref() == Some(closer) {
217 self.open_elements.pop();
218 break;
219 }
220 }
221 break;
222 }
223 }
224 }
225
226 fn close_p_if_in_scope(&mut self) {
228 if self.has_in_scope("p") {
229 self.pop_until("p");
230 }
231 }
232
233 fn has_in_scope(&self, tag: &str) -> bool {
235 for &id in self.open_elements.iter().rev() {
236 if self.document.tag_name(id) == Some(tag) {
237 return true;
238 }
239 let name = self.document.tag_name(id).unwrap_or("");
241 if matches!(
242 name,
243 "applet"
244 | "caption"
245 | "html"
246 | "table"
247 | "td"
248 | "th"
249 | "marquee"
250 | "object"
251 | "template"
252 ) {
253 return false;
254 }
255 }
256 false
257 }
258
259 fn pop_until(&mut self, tag: &str) {
261 while let Some(&id) = self.open_elements.last() {
262 self.open_elements.pop();
263 if self.document.tag_name(id) == Some(tag) {
264 break;
265 }
266 }
267 }
268
269 fn push_formatting_marker(&mut self) {
271 self.active_formatting.push(None);
272 }
273
274 fn add_formatting_element(&mut self, tag_name: &str, node_id: NodeId, attrs: &[Attribute]) {
276 let entry = FormattingEntry {
277 tag_name: tag_name.to_string(),
278 node_id,
279 attrs: Self::attrs_to_map(attrs),
280 };
281 self.active_formatting.push(Some(entry));
282 }
283
284 fn reconstruct_formatting(&mut self) {
286 if self.active_formatting.is_empty() {
287 return;
288 }
289
290 let last = self.active_formatting.len() - 1;
292 if self.active_formatting[last].is_none() {
293 return;
294 }
295
296 if let Some(ref entry) = self.active_formatting[last] {
298 if self.open_elements.contains(&entry.node_id) {
299 return;
300 }
301 }
302
303 let entries_to_reopen: Vec<_> = self
305 .active_formatting
306 .iter()
307 .rev()
308 .take_while(|e| e.is_some())
309 .filter_map(|e| e.clone())
310 .collect();
311
312 for entry in entries_to_reopen.into_iter().rev() {
313 if !self.open_elements.contains(&entry.node_id) {
314 let attrs: Vec<Attribute> = entry
315 .attrs
316 .iter()
317 .map(|(k, v)| Attribute {
318 name: k.clone(),
319 value: v.clone(),
320 })
321 .collect();
322 let new_id = self.insert_element(&entry.tag_name, &attrs);
323 for e in self.active_formatting.iter_mut().flatten() {
325 if e.node_id == entry.node_id {
326 e.node_id = new_id;
327 }
328 }
329 }
330 }
331 }
332
333 fn is_formatting_element(tag: &str) -> bool {
335 FORMATTING_ELEMENTS.contains(&tag)
336 }
337
338 pub fn process_token(&mut self, token: Token) {
340 match self.insertion_mode {
341 InsertionMode::Initial => self.process_initial(token),
342 InsertionMode::BeforeHtml => self.process_before_html(token),
343 InsertionMode::BeforeHead => self.process_before_head(token),
344 InsertionMode::InHead => self.process_in_head(token),
345 InsertionMode::AfterHead => self.process_after_head(token),
346 InsertionMode::InBody => self.process_in_body(token),
347 InsertionMode::InTable => self.process_in_table(token),
348 InsertionMode::InTableBody => self.process_in_table_body(token),
349 InsertionMode::InRow => self.process_in_row(token),
350 InsertionMode::InCell => self.process_in_cell(token),
351 InsertionMode::AfterBody => self.process_after_body(token),
352 InsertionMode::AfterAfterBody => self.process_after_after_body(token),
353 }
354 }
355
356 fn process_initial(&mut self, token: Token) {
357 match token {
358 Token::Doctype(name) => {
359 let dt = self.document.create_doctype(&name);
360 self.document.append_child(self.document.root, dt);
361 self.insertion_mode = InsertionMode::BeforeHtml;
362 }
363 Token::Character(c) if c.is_whitespace() => {
364 }
366 Token::Comment(text) => {
367 let c = self.document.create_comment(&text);
368 self.document.append_child(self.document.root, c);
369 }
370 _ => {
371 self.insertion_mode = InsertionMode::BeforeHtml;
373 self.process_token(token);
374 }
375 }
376 }
377
378 fn process_before_html(&mut self, token: Token) {
379 match token {
380 Token::StartTag(ref name, ref attrs, _) if name == "html" => {
381 let id = self.insert_element(name, attrs);
382 let _ = id;
383 self.insertion_mode = InsertionMode::BeforeHead;
384 }
385 Token::Character(c) if c.is_whitespace() => {
386 }
388 Token::Comment(text) => {
389 let c = self.document.create_comment(&text);
390 self.document.append_child(self.document.root, c);
391 }
392 _ => {
393 let html = self.document.create_element("html");
395 self.document.append_child(self.document.root, html);
396 self.open_elements.push(html);
397 self.insertion_mode = InsertionMode::BeforeHead;
398 self.process_token(token);
399 }
400 }
401 }
402
403 fn process_before_head(&mut self, token: Token) {
404 match token {
405 Token::StartTag(ref name, ref attrs, _) if name == "head" => {
406 let id = self.insert_element(name, attrs);
407 self.head_element = Some(id);
408 self.insertion_mode = InsertionMode::InHead;
409 }
410 Token::Character(c) if c.is_whitespace() => {
411 }
413 _ => {
414 let head = self.document.create_element("head");
416 let parent = self.current_node();
417 self.document.append_child(parent, head);
418 self.open_elements.push(head);
419 self.head_element = Some(head);
420 self.insertion_mode = InsertionMode::InHead;
421 self.process_token(token);
422 }
423 }
424 }
425
426 fn process_in_head(&mut self, token: Token) {
427 match token {
428 Token::Character(c) if c.is_whitespace() => {
429 self.insert_text(c);
430 }
431 Token::Character(c) => {
432 let current = self.current_node();
434 let is_raw_text = self
435 .document
436 .arena
437 .get(current)
438 .and_then(|n| n.element_data.as_ref())
439 .map(|ed| {
440 matches!(
441 ed.tag_name.as_str(),
442 "title" | "style" | "script" | "noscript"
443 )
444 })
445 .unwrap_or(false);
446 if is_raw_text {
447 self.insert_text(c);
448 } else {
449 self.open_elements.pop();
450 self.insertion_mode = InsertionMode::AfterHead;
451 self.process_token(Token::Character(c));
452 }
453 }
454 Token::Comment(text) => {
455 let c = self.document.create_comment(&text);
456 let parent = self.current_node();
457 self.document.append_child(parent, c);
458 }
459 Token::StartTag(ref name, ref attrs, self_closing) => {
460 match name.as_str() {
461 "title" | "style" | "script" | "noscript" => {
462 if self_closing {
463 self.insert_void_element(name, attrs);
464 } else {
465 self.insert_element(name, attrs);
466 }
467 }
468 "meta" | "link" | "base" => {
469 self.insert_void_element(name, attrs);
470 }
471 "head" => {
472 }
474 _ => {
475 self.open_elements.pop(); self.insertion_mode = InsertionMode::AfterHead;
478 self.process_token(Token::StartTag(
479 name.clone(),
480 attrs.clone(),
481 self_closing,
482 ));
483 }
484 }
485 }
486 Token::EndTag(ref name) => {
487 match name.as_str() {
488 "head" => {
489 self.open_elements.pop();
490 self.insertion_mode = InsertionMode::AfterHead;
491 }
492 "title" | "style" | "script" | "noscript" => {
493 self.open_elements.pop();
494 }
495 _ => {
496 self.open_elements.pop();
498 self.insertion_mode = InsertionMode::AfterHead;
499 }
500 }
501 }
502 Token::Eof => {
503 self.open_elements.pop();
504 self.insertion_mode = InsertionMode::AfterHead;
505 self.process_token(Token::Eof);
506 }
507 _ => {
508 self.open_elements.pop();
509 self.insertion_mode = InsertionMode::AfterHead;
510 self.process_token(token);
511 }
512 }
513 }
514
515 fn process_after_head(&mut self, token: Token) {
516 match token {
517 Token::StartTag(ref name, ref attrs, _) if name == "body" => {
518 let id = self.insert_element(name, attrs);
519 self.body_element = Some(id);
520 self.insertion_mode = InsertionMode::InBody;
521 }
522 Token::Character(c) if c.is_whitespace() => {
523 self.insert_text(c);
524 }
525 Token::Comment(text) => {
526 let c = self.document.create_comment(&text);
527 let parent = self.current_node();
528 self.document.append_child(parent, c);
529 }
530 _ => {
531 let body = self.document.create_element("body");
533 let parent = self.current_node();
534 self.document.append_child(parent, body);
535 self.open_elements.push(body);
536 self.body_element = Some(body);
537 self.insertion_mode = InsertionMode::InBody;
538 self.process_token(token);
539 }
540 }
541 }
542
543 fn process_in_body(&mut self, token: Token) {
544 match token {
545 Token::Character(c) => {
546 self.reconstruct_formatting();
547 self.insert_text(c);
548 }
549 Token::Comment(text) => {
550 let c = self.document.create_comment(&text);
551 let parent = self.current_node();
552 self.document.append_child(parent, c);
553 }
554 Token::StartTag(ref name, ref attrs, self_closing) => {
555 self.process_in_body_start_tag(name.clone(), attrs.clone(), self_closing);
556 }
557 Token::EndTag(ref name) => {
558 self.process_in_body_end_tag(name.clone());
559 }
560 Token::Eof => {
561 }
563 Token::Doctype(_) => {
564 }
566 }
567 }
568
569 fn process_in_body_start_tag(
570 &mut self,
571 name: String,
572 attrs: Vec<Attribute>,
573 self_closing: bool,
574 ) {
575 match name.as_str() {
576 "html" => {
577 }
579 "body" => {
580 }
582 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
583 self.auto_close_for_tag(&name);
584 if let Some(tag) = self.current_tag_name() {
586 if matches!(tag.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
587 self.open_elements.pop();
588 }
589 }
590 self.insert_element(&name, &attrs);
591 }
592 "p" | "div" | "section" | "article" | "aside" | "nav" | "header" | "footer"
593 | "main" | "address" | "blockquote" | "figure" | "figcaption" | "details"
594 | "summary" | "ul" | "ol" | "dl" | "pre" | "fieldset" | "form" | "hgroup" => {
595 self.auto_close_for_tag(&name);
596 self.insert_element(&name, &attrs);
597 }
598 "li" => {
599 self.auto_close_for_tag("li");
600 self.insert_element(&name, &attrs);
601 }
602 "dt" | "dd" => {
603 self.auto_close_for_tag(&name);
604 self.insert_element(&name, &attrs);
605 }
606 "table" => {
607 self.close_p_if_in_scope();
608 self.insert_element(&name, &attrs);
609 self.insertion_mode = InsertionMode::InTable;
610 }
611 "b" | "big" | "code" | "em" | "font" | "i" | "s" | "small" | "strike" | "strong"
612 | "tt" | "u" => {
613 self.reconstruct_formatting();
614 let id = self.insert_element(&name, &attrs);
615 self.add_formatting_element(&name, id, &attrs);
616 }
617 "a" => {
618 self.reconstruct_formatting();
619 let id = self.insert_element(&name, &attrs);
620 self.add_formatting_element(&name, id, &attrs);
621 }
622 "br" | "hr" | "img" | "input" | "meta" | "link" | "embed" | "source" | "track"
623 | "wbr" | "area" | "col" | "param" | "base" => {
624 self.reconstruct_formatting();
625 self.insert_void_element(&name, &attrs);
626 }
627 "span" | "label" | "abbr" | "cite" | "dfn" | "kbd" | "mark" | "q" | "sub" | "sup"
628 | "time" | "var" | "data" | "ruby" | "rt" | "rp" | "bdi" | "bdo" | "output"
629 | "progress" | "meter" | "slot" | "canvas" | "dialog" | "picture" | "video"
630 | "audio" | "map" | "object" | "iframe" | "button" | "select" | "textarea" => {
631 self.reconstruct_formatting();
632 if self_closing {
633 self.insert_void_element(&name, &attrs);
634 } else {
635 self.insert_element(&name, &attrs);
636 }
637 }
638 _ => {
639 self.reconstruct_formatting();
640 if self_closing {
641 self.insert_void_element(&name, &attrs);
642 } else {
643 self.insert_element(&name, &attrs);
644 }
645 }
646 }
647 }
648
649 fn process_in_body_end_tag(&mut self, name: String) {
650 match name.as_str() {
651 "body" => {
652 self.insertion_mode = InsertionMode::AfterBody;
653 }
654 "html" => {
655 self.insertion_mode = InsertionMode::AfterBody;
656 self.process_token(Token::EndTag(name));
657 }
658 "p" => {
659 if !self.has_in_scope("p") {
660 let p = self.document.create_element("p");
662 let parent = self.current_node();
663 self.document.append_child(parent, p);
664 } else {
665 self.pop_until("p");
666 }
667 }
668 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
669 let mut found = false;
671 for &id in self.open_elements.iter().rev() {
672 if let Some(tag) = self.document.tag_name(id) {
673 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
674 found = true;
675 break;
676 }
677 }
678 }
679 if found {
680 while let Some(&id) = self.open_elements.last() {
681 self.open_elements.pop();
682 if let Some(tag) = self.document.tag_name(id) {
683 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
684 break;
685 }
686 }
687 }
688 }
689 }
690 "div" | "section" | "article" | "aside" | "nav" | "header" | "footer" | "main"
691 | "address" | "blockquote" | "figure" | "figcaption" | "details" | "summary" | "ul"
692 | "ol" | "dl" | "pre" | "fieldset" | "form" | "hgroup" | "li" | "dd" | "dt" => {
693 if self.has_in_scope(&name) {
694 self.pop_until(&name);
695 }
696 }
697 "b" | "big" | "code" | "em" | "font" | "i" | "s" | "small" | "strike" | "strong"
698 | "tt" | "u" | "a" => {
699 if self.has_in_scope(&name) {
701 self.pop_until(&name);
702 self.active_formatting.retain(|e| {
704 if let Some(ref entry) = e {
705 entry.tag_name != name
706 } else {
707 true
708 }
709 });
710 }
711 }
712 _ => {
713 self.pop_matching_end_tag(&name);
715 }
716 }
717 }
718
719 fn pop_matching_end_tag(&mut self, tag: &str) {
721 for i in (0..self.open_elements.len()).rev() {
722 let id = self.open_elements[i];
723 if self.document.tag_name(id) == Some(tag) {
724 self.open_elements.truncate(i);
725 return;
726 }
727 if let Some(name) = self.document.tag_name(id) {
729 if matches!(
730 name,
731 "address"
732 | "applet"
733 | "area"
734 | "article"
735 | "aside"
736 | "base"
737 | "basefont"
738 | "bgsound"
739 | "blockquote"
740 | "body"
741 | "br"
742 | "button"
743 | "caption"
744 | "center"
745 | "col"
746 | "colgroup"
747 | "dd"
748 | "details"
749 | "dir"
750 | "div"
751 | "dl"
752 | "dt"
753 | "embed"
754 | "fieldset"
755 | "figcaption"
756 | "figure"
757 | "footer"
758 | "form"
759 | "frame"
760 | "frameset"
761 | "h1"
762 | "h2"
763 | "h3"
764 | "h4"
765 | "h5"
766 | "h6"
767 | "head"
768 | "header"
769 | "hgroup"
770 | "hr"
771 | "html"
772 | "iframe"
773 | "img"
774 | "input"
775 | "li"
776 | "link"
777 | "listing"
778 | "main"
779 | "marquee"
780 | "menu"
781 | "meta"
782 | "nav"
783 | "noembed"
784 | "noframes"
785 | "noscript"
786 | "object"
787 | "ol"
788 | "p"
789 | "param"
790 | "plaintext"
791 | "pre"
792 | "script"
793 | "section"
794 | "select"
795 | "source"
796 | "style"
797 | "summary"
798 | "table"
799 | "tbody"
800 | "td"
801 | "template"
802 | "textarea"
803 | "tfoot"
804 | "th"
805 | "thead"
806 | "title"
807 | "tr"
808 | "track"
809 | "ul"
810 | "wbr"
811 ) {
812 return;
813 }
814 }
815 }
816 }
817
818 fn process_in_table(&mut self, token: Token) {
819 match token {
820 Token::StartTag(ref name, ref attrs, _) => {
821 match name.as_str() {
822 "caption" | "colgroup" | "col" => {
823 self.insert_element(name, attrs);
824 }
825 "thead" | "tbody" | "tfoot" => {
826 self.insert_element(name, attrs);
827 self.insertion_mode = InsertionMode::InTableBody;
828 }
829 "tr" => {
830 let tbody = self.document.create_element("tbody");
832 let parent = self.current_node();
833 self.document.append_child(parent, tbody);
834 self.open_elements.push(tbody);
835 self.insert_element(name, attrs);
836 self.insertion_mode = InsertionMode::InRow;
837 }
838 "td" | "th" => {
839 let tbody = self.document.create_element("tbody");
841 let parent = self.current_node();
842 self.document.append_child(parent, tbody);
843 self.open_elements.push(tbody);
844 let tr = self.document.create_element("tr");
845 self.document.append_child(tbody, tr);
846 self.open_elements.push(tr);
847 self.insert_element(name, attrs);
848 self.insertion_mode = InsertionMode::InCell;
849 }
850 _ => {
851 self.process_in_body(Token::StartTag(name.clone(), attrs.clone(), false));
853 }
854 }
855 }
856 Token::EndTag(ref name) if name == "table" => {
857 self.pop_until("table");
858 self.insertion_mode = InsertionMode::InBody;
859 }
860 Token::Character(_) | Token::Comment(_) => {
861 self.process_in_body(token);
862 }
863 Token::Eof => {}
864 _ => {
865 self.process_in_body(token);
866 }
867 }
868 }
869
870 fn process_in_table_body(&mut self, token: Token) {
871 match token {
872 Token::StartTag(ref name, ref attrs, _) => match name.as_str() {
873 "tr" => {
874 self.insert_element(name, attrs);
875 self.insertion_mode = InsertionMode::InRow;
876 }
877 "td" | "th" => {
878 let tr = self.document.create_element("tr");
879 let parent = self.current_node();
880 self.document.append_child(parent, tr);
881 self.open_elements.push(tr);
882 self.insert_element(name, attrs);
883 self.insertion_mode = InsertionMode::InCell;
884 }
885 _ => {
886 self.process_in_table(token);
887 }
888 },
889 Token::EndTag(ref name) => {
890 match name.as_str() {
891 "thead" | "tbody" | "tfoot" => {
892 self.pop_until(name);
893 self.insertion_mode = InsertionMode::InTable;
894 }
895 "table" => {
896 if let Some(tag) = self.current_tag_name() {
898 if matches!(tag.as_str(), "thead" | "tbody" | "tfoot") {
899 self.open_elements.pop();
900 }
901 }
902 self.insertion_mode = InsertionMode::InTable;
903 self.process_token(token);
904 }
905 _ => {
906 self.process_in_table(token);
907 }
908 }
909 }
910 _ => {
911 self.process_in_table(token);
912 }
913 }
914 }
915
916 fn process_in_row(&mut self, token: Token) {
917 match token {
918 Token::StartTag(ref name, ref attrs, _) => match name.as_str() {
919 "td" | "th" => {
920 self.insert_element(name, attrs);
921 self.insertion_mode = InsertionMode::InCell;
922 self.push_formatting_marker();
923 }
924 _ => {
925 self.process_in_table(token);
926 }
927 },
928 Token::EndTag(ref name) => match name.as_str() {
929 "tr" => {
930 self.pop_until("tr");
931 self.insertion_mode = InsertionMode::InTableBody;
932 }
933 "table" => {
934 self.pop_until("tr");
935 self.insertion_mode = InsertionMode::InTableBody;
936 self.process_token(token);
937 }
938 _ => {
939 self.process_in_table(token);
940 }
941 },
942 _ => {
943 self.process_in_table(token);
944 }
945 }
946 }
947
948 fn process_in_cell(&mut self, token: Token) {
949 match token {
950 Token::EndTag(ref name) if name == "td" || name == "th" => {
951 self.pop_until(name);
952 self.insertion_mode = InsertionMode::InRow;
953 }
954 Token::StartTag(ref name, _, _) if matches!(name.as_str(), "td" | "th" | "tr") => {
955 if let Some(tag) = self.current_tag_name() {
957 if tag == "td" || tag == "th" {
958 self.open_elements.pop();
959 }
960 }
961 self.insertion_mode = InsertionMode::InRow;
962 self.process_token(token);
963 }
964 Token::EndTag(ref name) if name == "table" => {
965 if let Some(tag) = self.current_tag_name() {
967 if tag == "td" || tag == "th" {
968 self.open_elements.pop();
969 }
970 }
971 self.insertion_mode = InsertionMode::InRow;
972 self.process_token(token);
973 }
974 _ => {
975 self.process_in_body(token);
976 }
977 }
978 }
979
980 fn process_after_body(&mut self, token: Token) {
981 match token {
982 Token::Character(c) if c.is_whitespace() => {
983 self.insert_text(c);
984 }
985 Token::Comment(text) => {
986 let c = self.document.create_comment(&text);
987 self.document.append_child(self.document.root, c);
988 }
989 Token::EndTag(ref name) if name == "html" => {
990 self.insertion_mode = InsertionMode::AfterAfterBody;
991 }
992 Token::Eof => {}
993 _ => {
994 self.insertion_mode = InsertionMode::InBody;
995 self.process_token(token);
996 }
997 }
998 }
999
1000 fn process_after_after_body(&mut self, token: Token) {
1001 match token {
1002 Token::Character(c) if c.is_whitespace() => {}
1003 Token::Comment(text) => {
1004 let c = self.document.create_comment(&text);
1005 self.document.append_child(self.document.root, c);
1006 }
1007 Token::Eof => {}
1008 _ => {
1009 self.insertion_mode = InsertionMode::InBody;
1010 self.process_token(token);
1011 }
1012 }
1013 }
1014}
1015
1016impl Default for TreeBuilder {
1017 fn default() -> Self {
1018 Self::new()
1019 }
1020}
1021
1022#[cfg(test)]
1023mod tests {
1024 #[allow(unused_imports)]
1025 use alloc::vec;
1026
1027 use super::*;
1028
1029 #[test]
1030 fn test_empty_document() {
1031 let doc = TreeBuilder::build("");
1032 assert!(doc.node_count() >= 1);
1033 }
1034
1035 #[test]
1036 fn test_simple_paragraph() {
1037 let doc = TreeBuilder::build("<p>Hello</p>");
1038 let ps = doc.get_elements_by_tag_name("p");
1039 assert_eq!(ps.len(), 1);
1040 assert_eq!(doc.inner_text(ps[0]), "Hello");
1041 }
1042
1043 #[test]
1044 fn test_full_html_document() {
1045 let doc = TreeBuilder::build(
1046 "<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>",
1047 );
1048 let titles = doc.get_elements_by_tag_name("title");
1049 assert_eq!(titles.len(), 1);
1050 let ps = doc.get_elements_by_tag_name("p");
1051 assert_eq!(ps.len(), 1);
1052 }
1053
1054 #[test]
1055 fn test_implied_html_head_body() {
1056 let doc = TreeBuilder::build("<p>Text</p>");
1057 let htmls = doc.get_elements_by_tag_name("html");
1059 assert!(!htmls.is_empty());
1060 let bodies = doc.get_elements_by_tag_name("body");
1061 assert!(!bodies.is_empty());
1062 }
1063
1064 #[test]
1065 fn test_nested_elements() {
1066 let doc = TreeBuilder::build("<div><span>A</span><span>B</span></div>");
1067 let spans = doc.get_elements_by_tag_name("span");
1068 assert_eq!(spans.len(), 2);
1069 }
1070
1071 #[test]
1072 fn test_auto_close_p() {
1073 let doc = TreeBuilder::build("<p>one<p>two");
1074 let ps = doc.get_elements_by_tag_name("p");
1075 assert_eq!(ps.len(), 2);
1076 }
1077
1078 #[test]
1079 fn test_auto_close_li() {
1080 let doc = TreeBuilder::build("<ul><li>a<li>b<li>c</ul>");
1081 let lis = doc.get_elements_by_tag_name("li");
1082 assert_eq!(lis.len(), 3);
1083 }
1084
1085 #[test]
1086 fn test_headings() {
1087 let doc = TreeBuilder::build("<h1>Title</h1><h2>Sub</h2>");
1088 let h1s = doc.get_elements_by_tag_name("h1");
1089 let h2s = doc.get_elements_by_tag_name("h2");
1090 assert_eq!(h1s.len(), 1);
1091 assert_eq!(h2s.len(), 1);
1092 }
1093
1094 #[test]
1095 fn test_heading_auto_close() {
1096 let doc = TreeBuilder::build("<h1>A<h2>B");
1097 let h1s = doc.get_elements_by_tag_name("h1");
1098 let h2s = doc.get_elements_by_tag_name("h2");
1099 assert_eq!(h1s.len(), 1);
1100 assert_eq!(h2s.len(), 1);
1101 }
1102
1103 #[test]
1104 fn test_void_elements() {
1105 let doc = TreeBuilder::build("<p>before<br>after</p>");
1106 let brs = doc.get_elements_by_tag_name("br");
1107 assert_eq!(brs.len(), 1);
1108 let ps = doc.get_elements_by_tag_name("p");
1109 assert_eq!(doc.inner_text(ps[0]), "beforeafter");
1110 }
1111
1112 #[test]
1113 fn test_formatting_elements() {
1114 let doc = TreeBuilder::build("<p><b>bold</b> normal</p>");
1115 let bs = doc.get_elements_by_tag_name("b");
1116 assert_eq!(bs.len(), 1);
1117 assert_eq!(doc.inner_text(bs[0]), "bold");
1118 }
1119
1120 #[test]
1121 fn test_comment_in_body() {
1122 let doc = TreeBuilder::build("<body><!-- comment --><p>text</p></body>");
1123 let ps = doc.get_elements_by_tag_name("p");
1124 assert_eq!(ps.len(), 1);
1125 }
1126
1127 #[test]
1128 fn test_doctype() {
1129 let doc = TreeBuilder::build("<!DOCTYPE html><html><body></body></html>");
1130 let root_node = doc.arena.get(doc.root).unwrap();
1132 let first_child = root_node.children[0];
1133 let dt = doc.arena.get(first_child).unwrap();
1134 assert_eq!(dt.node_type, NodeType::DocumentType);
1135 }
1136
1137 #[test]
1138 fn test_text_concatenation() {
1139 let doc = TreeBuilder::build("<p>abc</p>");
1140 let ps = doc.get_elements_by_tag_name("p");
1141 assert_eq!(doc.inner_text(ps[0]), "abc");
1142 }
1143
1144 #[test]
1145 fn test_mixed_text_and_elements() {
1146 let doc = TreeBuilder::build("<div>Hello <b>world</b>!</div>");
1147 let divs = doc.get_elements_by_tag_name("div");
1148 assert_eq!(doc.inner_text(divs[0]), "Hello world!");
1149 }
1150
1151 #[test]
1152 fn test_attributes_preserved() {
1153 let doc = TreeBuilder::build("<div id=\"main\" class=\"container\">text</div>");
1154 let el = doc.get_element_by_id("main");
1155 assert!(el.is_some());
1156 }
1157
1158 #[test]
1159 fn test_table_basic() {
1160 let doc = TreeBuilder::build("<table><tr><td>cell</td></tr></table>");
1161 let tds = doc.get_elements_by_tag_name("td");
1162 assert_eq!(tds.len(), 1);
1163 assert_eq!(doc.inner_text(tds[0]), "cell");
1164 }
1165
1166 #[test]
1167 fn test_table_implied_tbody() {
1168 let doc = TreeBuilder::build("<table><tr><td>A</td></tr></table>");
1169 let tbodies = doc.get_elements_by_tag_name("tbody");
1170 assert!(!tbodies.is_empty());
1171 }
1172
1173 #[test]
1174 fn test_table_with_thead() {
1175 let doc = TreeBuilder::build(
1176 "<table><thead><tr><th>H</th></tr></thead><tbody><tr><td>D</td></tr></tbody></table>",
1177 );
1178 let ths = doc.get_elements_by_tag_name("th");
1179 let tds = doc.get_elements_by_tag_name("td");
1180 assert_eq!(ths.len(), 1);
1181 assert_eq!(tds.len(), 1);
1182 }
1183
1184 #[test]
1185 fn test_dt_dd_auto_close() {
1186 let doc = TreeBuilder::build("<dl><dt>term<dd>def<dt>term2<dd>def2</dl>");
1187 let dts = doc.get_elements_by_tag_name("dt");
1188 let dds = doc.get_elements_by_tag_name("dd");
1189 assert_eq!(dts.len(), 2);
1190 assert_eq!(dds.len(), 2);
1191 }
1192
1193 #[test]
1194 fn test_no_body_text() {
1195 let doc = TreeBuilder::build("just text");
1196 let bodies = doc.get_elements_by_tag_name("body");
1198 assert!(!bodies.is_empty());
1199 }
1200
1201 #[test]
1202 fn test_anchor_tag() {
1203 let doc = TreeBuilder::build("<a href=\"/page\">link</a>");
1204 let anchors = doc.get_elements_by_tag_name("a");
1205 assert_eq!(anchors.len(), 1);
1206 assert_eq!(
1207 doc.get_attribute(anchors[0], "href"),
1208 Some("/page".to_string())
1209 );
1210 }
1211
1212 #[test]
1213 fn test_img_in_body() {
1214 let doc = TreeBuilder::build("<body><img src=\"pic.png\"></body>");
1215 let imgs = doc.get_elements_by_tag_name("img");
1216 assert_eq!(imgs.len(), 1);
1217 }
1218
1219 #[test]
1220 fn test_multiple_text_runs() {
1221 let doc = TreeBuilder::build("<p>Hello</p><p>World</p>");
1222 let ps = doc.get_elements_by_tag_name("p");
1223 assert_eq!(ps.len(), 2);
1224 assert_eq!(doc.inner_text(ps[0]), "Hello");
1225 assert_eq!(doc.inner_text(ps[1]), "World");
1226 }
1227
1228 #[test]
1229 fn test_deeply_nested() {
1230 let doc = TreeBuilder::build("<div><div><div><span>deep</span></div></div></div>");
1231 let spans = doc.get_elements_by_tag_name("span");
1232 assert_eq!(spans.len(), 1);
1233 assert_eq!(doc.inner_text(spans[0]), "deep");
1234 }
1235
1236 #[test]
1237 fn test_form_in_body() {
1238 let doc = TreeBuilder::build("<form><input type=\"text\"><button>Submit</button></form>");
1239 let forms = doc.get_elements_by_tag_name("form");
1240 assert_eq!(forms.len(), 1);
1241 let inputs = doc.get_elements_by_tag_name("input");
1242 assert_eq!(inputs.len(), 1);
1243 }
1244
1245 #[test]
1246 fn test_style_in_head() {
1247 let doc = TreeBuilder::build(
1248 "<html><head><style>body{color:red}</style></head><body>text</body></html>",
1249 );
1250 let styles = doc.get_elements_by_tag_name("style");
1251 assert_eq!(styles.len(), 1);
1252 }
1253
1254 #[test]
1255 fn test_meta_in_head() {
1256 let doc =
1257 TreeBuilder::build("<html><head><meta charset=\"utf-8\"></head><body></body></html>");
1258 let metas = doc.get_elements_by_tag_name("meta");
1259 assert_eq!(metas.len(), 1);
1260 }
1261
1262 #[test]
1263 fn test_link_in_head() {
1264 let doc = TreeBuilder::build(
1265 "<html><head><link rel=\"stylesheet\" href=\"s.css\"></head><body></body></html>",
1266 );
1267 let links = doc.get_elements_by_tag_name("link");
1268 assert_eq!(links.len(), 1);
1269 }
1270
1271 #[test]
1272 fn test_strong_em() {
1273 let doc = TreeBuilder::build("<p><strong>bold</strong> and <em>italic</em></p>");
1274 let strong = doc.get_elements_by_tag_name("strong");
1275 let em = doc.get_elements_by_tag_name("em");
1276 assert_eq!(strong.len(), 1);
1277 assert_eq!(em.len(), 1);
1278 }
1279
1280 #[test]
1281 fn test_default_tree_builder() {
1282 let _builder = TreeBuilder::default();
1283 }
1284
1285 #[test]
1286 fn test_ul_ol_nesting() {
1287 let doc = TreeBuilder::build("<ul><li>a</li><li>b<ul><li>c</li></ul></li></ul>");
1288 let lis = doc.get_elements_by_tag_name("li");
1289 assert_eq!(lis.len(), 3);
1290 }
1291
1292 #[test]
1293 fn test_entity_in_text() {
1294 let doc = TreeBuilder::build("<p>&</p>");
1295 let ps = doc.get_elements_by_tag_name("p");
1296 assert_eq!(doc.inner_text(ps[0]), "&");
1297 }
1298}