⚠️ VeridianOS Kernel Documentation — low-level `no_std` kernel code; all functions are unsafe unless explicitly marked otherwise.

veridian_kernel/desktop/
syntax.rs

1//! Syntax Highlighting Engine
2//!
3//! Provides syntax highlighting for the text editor with support for
4//! Rust, C, and Shell languages. Uses a character-by-character state machine
5//! approach to tokenize lines into colored spans.
6
7#![allow(dead_code)]
8
9use alloc::{boxed::Box, vec::Vec};
10
11// ---------------------------------------------------------------------------
12// Core types
13// ---------------------------------------------------------------------------
14
15/// A single highlighted span within a line.
16#[derive(Debug, Clone)]
17pub struct SyntaxToken {
18    /// Byte offset of the first character (inclusive).
19    pub start: usize,
20    /// Byte offset one past the last character (exclusive).
21    pub end: usize,
22    /// Semantic category that determines the color.
23    pub token_type: TokenType,
24}
25
/// Semantic token categories used by all highlighters.
///
/// Each variant maps to a color via [`get_token_color`]; some variants
/// deliberately share a color (`Punctuation` uses the operator color,
/// `Label` the lifetime color).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
    /// Language keyword (`fn`, `if`, `return`, ...).
    Keyword,
    /// Built-in or well-known type name.
    Type,
    /// String, raw-string, or character literal.
    StringLit,
    /// Line or block comment.
    Comment,
    /// Numeric literal (decimal, hex, octal, binary).
    Number,
    /// Operator characters such as `+`, `==`, `&&`.
    Operator,
    /// Brackets, braces, commas, semicolons, etc.
    Punctuation,
    /// Identifier in call position (`name(` heuristic).
    Function,
    /// Macro invocation (`name!` in Rust).
    Macro,
    /// Attribute (`#[...]` in Rust) or preprocessor directive (C).
    Attribute,
    /// Lifetime such as `'a` or `'static` (Rust).
    Lifetime,
    /// Loop label; currently colored the same as a lifetime.
    Label,
    /// Anything that matched no other category.
    Normal,
}
43
/// Supported source languages.
///
/// Detected from the filename extension by [`detect_language`]. Only
/// Rust, C/C++, and Shell currently have highlighter implementations
/// (see [`create_highlighter`]); the others are detected but rendered
/// unhighlighted.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Language {
    Rust,
    C,
    Cpp,
    Shell,
    Python,
    Markdown,
    /// No extension, or an unrecognized one.
    Unknown,
}
55
/// Color theme mapping each token type to a BGRA u32 color.
///
/// Format: 0xAABBGGRR when written to the BGRA framebuffer, but the
/// existing `draw_char_into_buffer` expects a *RGB* u32 (0x00RRGGBB)
/// which it converts internally. We store colors in that same convention.
///
/// There are deliberately fewer fields than [`TokenType`] variants:
/// `Punctuation` reuses `operator_color` and `Label` reuses
/// `lifetime_color` (see [`get_token_color`]).
#[derive(Debug, Clone)]
pub struct SyntaxTheme {
    pub keyword_color: u32,
    pub type_color: u32,
    pub string_color: u32,
    pub comment_color: u32,
    pub number_color: u32,
    pub operator_color: u32,
    pub function_color: u32,
    pub macro_color: u32,
    pub attribute_color: u32,
    pub lifetime_color: u32,
    pub normal_color: u32,
}
75
76// ---------------------------------------------------------------------------
77// Trait
78// ---------------------------------------------------------------------------
79
/// Language-specific line tokenizer.
///
/// NOTE(review): all current implementations are stateless unit structs,
/// so each line is tokenized independently — constructs that span
/// multiple lines are only recognized within a single line.
pub trait SyntaxHighlighter {
    /// Break a single line into a sequence of colored tokens.
    ///
    /// Returned spans are non-overlapping byte ranges in ascending order;
    /// bytes not covered by any span (e.g. whitespace) are left to the
    /// caller to draw in a default color.
    fn tokenize_line(&self, line: &str) -> Vec<SyntaxToken>;

    /// The language this highlighter handles.
    fn language(&self) -> Language;
}
88
89// ---------------------------------------------------------------------------
90// Language detection and factory
91// ---------------------------------------------------------------------------
92
93/// Detect the source language from a filename extension.
94pub fn detect_language(filename: &str) -> Language {
95    // Find the last '.' to extract the extension.
96    let ext = match filename.rfind('.') {
97        Some(pos) => &filename[pos + 1..],
98        None => return Language::Unknown,
99    };
100
101    match ext {
102        "rs" => Language::Rust,
103        "c" | "h" => Language::C,
104        "cpp" | "cxx" | "cc" | "hpp" | "hxx" | "hh" => Language::Cpp,
105        "sh" | "bash" => Language::Shell,
106        "py" => Language::Python,
107        "md" | "markdown" => Language::Markdown,
108        _ => Language::Unknown,
109    }
110}
111
112/// Create the appropriate highlighter for a language (returns `None` for
113/// languages without a highlighter implementation).
114pub fn create_highlighter(lang: Language) -> Option<Box<dyn SyntaxHighlighter>> {
115    match lang {
116        Language::Rust => Some(Box::new(RustHighlighter)),
117        Language::C | Language::Cpp => Some(Box::new(CHighlighter)),
118        Language::Shell => Some(Box::new(ShHighlighter)),
119        _ => None,
120    }
121}
122
/// Return a dark-theme color palette similar to VS Code Dark+.
///
/// Colors are in RGB format (0x00RRGGBB) to match the existing
/// `draw_char_into_buffer` convention.
///
/// Allocates nothing; the theme is a plain struct of `u32` values and
/// can be constructed freely.
pub fn default_theme() -> SyntaxTheme {
    SyntaxTheme {
        keyword_color: 0x569CD6,   // blue
        type_color: 0x4EC9B0,      // teal
        string_color: 0xCE9178,    // orange
        comment_color: 0x6A9955,   // green
        number_color: 0xB5CEA8,    // light green
        operator_color: 0xD4D4D4,  // white
        function_color: 0xDCDCAA,  // yellow
        macro_color: 0x569CD6,     // blue (same as keyword)
        attribute_color: 0x9CDCFE, // light blue
        lifetime_color: 0xD7BA7D,  // gold
        normal_color: 0xCCCCCC,    // light gray
    }
}
142
143/// Map a `TokenType` to its display color within a theme.
144pub fn get_token_color(token_type: &TokenType, theme: &SyntaxTheme) -> u32 {
145    match token_type {
146        TokenType::Keyword => theme.keyword_color,
147        TokenType::Type => theme.type_color,
148        TokenType::StringLit => theme.string_color,
149        TokenType::Comment => theme.comment_color,
150        TokenType::Number => theme.number_color,
151        TokenType::Operator => theme.operator_color,
152        TokenType::Punctuation => theme.operator_color,
153        TokenType::Function => theme.function_color,
154        TokenType::Macro => theme.macro_color,
155        TokenType::Attribute => theme.attribute_color,
156        TokenType::Lifetime => theme.lifetime_color,
157        TokenType::Label => theme.lifetime_color,
158        TokenType::Normal => theme.normal_color,
159    }
160}
161
162// ===========================================================================
163// Shared helpers
164// ===========================================================================
165
/// Return `true` when `ch` can appear inside an identifier
/// (ASCII letters, digits, or `_`).
fn is_ident_char(ch: u8) -> bool {
    matches!(ch, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
170
/// Return `true` when `ch` is an operator character common across languages.
fn is_operator(ch: u8) -> bool {
    // Membership test against the full operator alphabet.
    b"+-*/%=!<>&|^~".contains(&ch)
}
178
/// Return `true` when `ch` is punctuation.
fn is_punctuation(ch: u8) -> bool {
    // ( ) { } [ ] , ; : .
    b"(){}[],;:.".contains(&ch)
}
186
/// Check whether `word` appears in `list`.
///
/// Thin wrapper over `slice::contains` (the idiomatic replacement for a
/// manual equality loop); used for keyword and type lookup.
fn word_in_list(word: &[u8], list: &[&[u8]]) -> bool {
    list.contains(&word)
}
196
197/// Emit a token, but only if start < end.
198fn push_token(tokens: &mut Vec<SyntaxToken>, start: usize, end: usize, tt: TokenType) {
199    if start < end {
200        tokens.push(SyntaxToken {
201            start,
202            end,
203            token_type: tt,
204        });
205    }
206}
207
208// ===========================================================================
209// Rust Highlighter
210// ===========================================================================
211
/// Syntax highlighter for the Rust programming language.
///
/// Stateless unit struct: each line is tokenized independently.
pub struct RustHighlighter;
214
/// Rust keywords.
///
/// `true`/`false` are included so boolean literals receive keyword
/// coloring; `Self` is listed under [`RUST_TYPES`] instead.
const RUST_KEYWORDS: &[&[u8]] = &[
    b"fn",
    b"let",
    b"mut",
    b"const",
    b"static",
    b"if",
    b"else",
    b"match",
    b"for",
    b"while",
    b"loop",
    b"return",
    b"break",
    b"continue",
    b"pub",
    b"use",
    b"mod",
    b"struct",
    b"enum",
    b"impl",
    b"trait",
    b"where",
    b"type",
    b"as",
    b"in",
    b"ref",
    b"self",
    b"super",
    b"crate",
    b"unsafe",
    b"async",
    b"await",
    b"move",
    b"dyn",
    b"extern",
    b"true",
    b"false",
];
255
/// Rust built-in / standard-library types.
///
/// Primitives plus a few common alloc containers and `Self`, all colored
/// as types.
const RUST_TYPES: &[&[u8]] = &[
    b"bool", b"u8", b"u16", b"u32", b"u64", b"u128", b"usize", b"i8", b"i16", b"i32", b"i64",
    b"i128", b"isize", b"f32", b"f64", b"char", b"str", b"String", b"Vec", b"Option", b"Result",
    b"Box", b"Rc", b"Arc", b"Self",
];
262
263impl SyntaxHighlighter for RustHighlighter {
264    fn language(&self) -> Language {
265        Language::Rust
266    }
267
268    fn tokenize_line(&self, line: &str) -> Vec<SyntaxToken> {
269        let bytes = line.as_bytes();
270        let len = bytes.len();
271        let mut tokens: Vec<SyntaxToken> = Vec::new();
272        let mut i: usize = 0;
273
274        while i < len {
275            let ch = bytes[i];
276
277            // ----------------------------------------------------------
278            // Line comments: // or /// (doc comment)
279            // ----------------------------------------------------------
280            if ch == b'/' && i + 1 < len && bytes[i + 1] == b'/' {
281                push_token(&mut tokens, i, len, TokenType::Comment);
282                break; // rest of line is a comment
283            }
284
285            // ----------------------------------------------------------
286            // Block comment (single-line portion): /* ... */
287            // ----------------------------------------------------------
288            if ch == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
289                let start = i;
290                i += 2;
291                while i + 1 < len {
292                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
293                        i += 2;
294                        break;
295                    }
296                    i += 1;
297                }
298                // If we ran off the end without closing, consume rest of line.
299                if i >= len {
300                    i = len;
301                }
302                push_token(&mut tokens, start, i, TokenType::Comment);
303                continue;
304            }
305
306            // ----------------------------------------------------------
307            // Attribute: #[...] or #![...]
308            // ----------------------------------------------------------
309            if ch == b'#' && i + 1 < len && (bytes[i + 1] == b'[' || bytes[i + 1] == b'!') {
310                let start = i;
311                // Consume until matching ']' or end of line.
312                let mut depth: usize = 0;
313                while i < len {
314                    if bytes[i] == b'[' {
315                        depth += 1;
316                    } else if bytes[i] == b']' {
317                        depth = depth.saturating_sub(1);
318                        if depth == 0 {
319                            i += 1;
320                            break;
321                        }
322                    }
323                    i += 1;
324                }
325                push_token(&mut tokens, start, i, TokenType::Attribute);
326                continue;
327            }
328
329            // ----------------------------------------------------------
330            // Raw string: r"..." or r#"..."#
331            // ----------------------------------------------------------
332            if ch == b'r' && i + 1 < len && (bytes[i + 1] == b'"' || bytes[i + 1] == b'#') {
333                // Count leading '#' symbols.
334                let start = i;
335                i += 1; // skip 'r'
336                let mut hashes: usize = 0;
337                while i < len && bytes[i] == b'#' {
338                    hashes += 1;
339                    i += 1;
340                }
341                if i < len && bytes[i] == b'"' {
342                    i += 1; // skip opening '"'
343                            // Scan for closing '"' followed by the same number of '#'.
344                    'raw_scan: while i < len {
345                        if bytes[i] == b'"' {
346                            let mut matched: usize = 0;
347                            let after_quote = i + 1;
348                            while matched < hashes
349                                && after_quote + matched < len
350                                && bytes[after_quote + matched] == b'#'
351                            {
352                                matched += 1;
353                            }
354                            if matched == hashes {
355                                i = after_quote + matched;
356                                break 'raw_scan;
357                            }
358                        }
359                        i += 1;
360                    }
361                    if i > len {
362                        i = len;
363                    }
364                    push_token(&mut tokens, start, i, TokenType::StringLit);
365                    continue;
366                }
367                // Not actually a raw string -- fall through and let the identifier
368                // path handle the 'r'.
369                i = start;
370            }
371
372            // ----------------------------------------------------------
373            // String literal: "..."
374            // ----------------------------------------------------------
375            if ch == b'"' {
376                let start = i;
377                i += 1;
378                while i < len {
379                    if bytes[i] == b'\\' {
380                        i += 2; // skip escaped character
381                        continue;
382                    }
383                    if bytes[i] == b'"' {
384                        i += 1;
385                        break;
386                    }
387                    i += 1;
388                }
389                push_token(&mut tokens, start, i, TokenType::StringLit);
390                continue;
391            }
392
393            // ----------------------------------------------------------
394            // Character literal: '.'  (but NOT lifetime 'a)
395            // ----------------------------------------------------------
396            if ch == b'\'' && i + 2 < len && bytes[i + 2] == b'\'' && bytes[i + 1] != b'\\' {
397                push_token(&mut tokens, i, i + 3, TokenType::StringLit);
398                i += 3;
399                continue;
400            }
401            // Escaped char literal: '\n' etc.
402            if ch == b'\'' && i + 3 < len && bytes[i + 1] == b'\\' && bytes[i + 3] == b'\'' {
403                push_token(&mut tokens, i, i + 4, TokenType::StringLit);
404                i += 4;
405                continue;
406            }
407
408            // ----------------------------------------------------------
409            // Lifetime: 'a, 'static, '_ etc.
410            // ----------------------------------------------------------
411            if ch == b'\''
412                && i + 1 < len
413                && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_')
414            {
415                let start = i;
416                i += 1; // skip the tick
417                while i < len && is_ident_char(bytes[i]) {
418                    i += 1;
419                }
420                push_token(&mut tokens, start, i, TokenType::Lifetime);
421                continue;
422            }
423
424            // ----------------------------------------------------------
425            // Number literals: 0x.., 0b.., 0o.., decimal, with _ separators
426            // ----------------------------------------------------------
427            if ch.is_ascii_digit() {
428                let start = i;
429                if ch == b'0' && i + 1 < len {
430                    match bytes[i + 1] {
431                        b'x' | b'X' => {
432                            i += 2;
433                            while i < len && (bytes[i].is_ascii_hexdigit() || bytes[i] == b'_') {
434                                i += 1;
435                            }
436                            push_token(&mut tokens, start, i, TokenType::Number);
437                            continue;
438                        }
439                        b'b' | b'B' => {
440                            i += 2;
441                            while i < len
442                                && (bytes[i] == b'0' || bytes[i] == b'1' || bytes[i] == b'_')
443                            {
444                                i += 1;
445                            }
446                            push_token(&mut tokens, start, i, TokenType::Number);
447                            continue;
448                        }
449                        b'o' | b'O' => {
450                            i += 2;
451                            while i < len
452                                && ((bytes[i] >= b'0' && bytes[i] <= b'7') || bytes[i] == b'_')
453                            {
454                                i += 1;
455                            }
456                            push_token(&mut tokens, start, i, TokenType::Number);
457                            continue;
458                        }
459                        _ => {}
460                    }
461                }
462                // Decimal (possibly with '.' for float, but we keep it simple
463                // and just consume digits + underscores).
464                while i < len && (bytes[i].is_ascii_digit() || bytes[i] == b'_' || bytes[i] == b'.')
465                {
466                    // Avoid consuming '..' (range operator) as part of a number.
467                    if bytes[i] == b'.' && i + 1 < len && bytes[i + 1] == b'.' {
468                        break;
469                    }
470                    i += 1;
471                }
472                // Optional type suffix like u32, i64, usize, etc.
473                if i < len && (bytes[i] == b'u' || bytes[i] == b'i' || bytes[i] == b'f') {
474                    while i < len && is_ident_char(bytes[i]) {
475                        i += 1;
476                    }
477                }
478                push_token(&mut tokens, start, i, TokenType::Number);
479                continue;
480            }
481
482            // ----------------------------------------------------------
483            // Identifiers, keywords, types, macros, functions
484            // ----------------------------------------------------------
485            if ch.is_ascii_alphabetic() || ch == b'_' {
486                let start = i;
487                while i < len && is_ident_char(bytes[i]) {
488                    i += 1;
489                }
490                let word = &bytes[start..i];
491
492                // Macro invocation: name followed by '!'
493                if i < len && bytes[i] == b'!' {
494                    push_token(&mut tokens, start, i + 1, TokenType::Macro);
495                    i += 1;
496                    continue;
497                }
498
499                // Function call: name followed by '('
500                if i < len && bytes[i] == b'(' {
501                    // But only if it is not a keyword (e.g. `if (...)` in C-style).
502                    if !word_in_list(word, RUST_KEYWORDS) {
503                        push_token(&mut tokens, start, i, TokenType::Function);
504                        continue;
505                    }
506                }
507
508                if word_in_list(word, RUST_KEYWORDS) {
509                    push_token(&mut tokens, start, i, TokenType::Keyword);
510                } else if word_in_list(word, RUST_TYPES) {
511                    push_token(&mut tokens, start, i, TokenType::Type);
512                } else {
513                    push_token(&mut tokens, start, i, TokenType::Normal);
514                }
515                continue;
516            }
517
518            // ----------------------------------------------------------
519            // Operators
520            // ----------------------------------------------------------
521            if is_operator(ch) {
522                let start = i;
523                // Consume consecutive operator characters so that `==` etc.
524                // are a single token.
525                while i < len && is_operator(bytes[i]) {
526                    i += 1;
527                }
528                push_token(&mut tokens, start, i, TokenType::Operator);
529                continue;
530            }
531
532            // ----------------------------------------------------------
533            // Punctuation
534            // ----------------------------------------------------------
535            if is_punctuation(ch) {
536                push_token(&mut tokens, i, i + 1, TokenType::Punctuation);
537                i += 1;
538                continue;
539            }
540
541            // ----------------------------------------------------------
542            // Whitespace and anything else -- skip without emitting
543            // ----------------------------------------------------------
544            i += 1;
545        }
546
547        tokens
548    }
549}
550
551// ===========================================================================
552// C / C++ Highlighter
553// ===========================================================================
554
/// Syntax highlighter for C and C++ source code.
///
/// Stateless unit struct shared by both languages (see
/// [`create_highlighter`]).
pub struct CHighlighter;
557
/// C keywords.
///
/// `true`, `false`, and `NULL` are not C keywords proper, but are
/// highlighted as keywords for readability.
const C_KEYWORDS: &[&[u8]] = &[
    b"if",
    b"else",
    b"for",
    b"while",
    b"do",
    b"switch",
    b"case",
    b"break",
    b"continue",
    b"return",
    b"goto",
    b"typedef",
    b"struct",
    b"union",
    b"enum",
    b"sizeof",
    b"void",
    b"static",
    b"extern",
    b"const",
    b"volatile",
    b"register",
    b"inline",
    b"restrict",
    b"default",
    b"true",
    b"false",
    b"NULL",
];
588
/// Built-in C types plus common `stdint.h`/`stddef.h` typedefs.
const C_TYPES: &[&[u8]] = &[
    b"int",
    b"char",
    b"float",
    b"double",
    b"long",
    b"short",
    b"unsigned",
    b"signed",
    b"size_t",
    b"uint8_t",
    b"uint16_t",
    b"uint32_t",
    b"uint64_t",
    b"int8_t",
    b"int16_t",
    b"int32_t",
    b"int64_t",
    b"bool",
    b"FILE",
    b"ssize_t",
    b"ptrdiff_t",
];
612
613impl SyntaxHighlighter for CHighlighter {
614    fn language(&self) -> Language {
615        Language::C
616    }
617
618    fn tokenize_line(&self, line: &str) -> Vec<SyntaxToken> {
619        let bytes = line.as_bytes();
620        let len = bytes.len();
621        let mut tokens: Vec<SyntaxToken> = Vec::new();
622        let mut i: usize = 0;
623
624        // Skip leading whitespace to detect preprocessor directives.
625        let mut ws = 0;
626        while ws < len && (bytes[ws] == b' ' || bytes[ws] == b'\t') {
627            ws += 1;
628        }
629
630        // Preprocessor directive: line begins with optional whitespace then '#'
631        if ws < len && bytes[ws] == b'#' {
632            push_token(&mut tokens, ws, len, TokenType::Attribute);
633            return tokens;
634        }
635
636        while i < len {
637            let ch = bytes[i];
638
639            // ----------------------------------------------------------
640            // Line comment: //
641            // ----------------------------------------------------------
642            if ch == b'/' && i + 1 < len && bytes[i + 1] == b'/' {
643                push_token(&mut tokens, i, len, TokenType::Comment);
644                break;
645            }
646
647            // ----------------------------------------------------------
648            // Block comment: /* ... */
649            // ----------------------------------------------------------
650            if ch == b'/' && i + 1 < len && bytes[i + 1] == b'*' {
651                let start = i;
652                i += 2;
653                while i + 1 < len {
654                    if bytes[i] == b'*' && bytes[i + 1] == b'/' {
655                        i += 2;
656                        break;
657                    }
658                    i += 1;
659                }
660                if i >= len {
661                    i = len;
662                }
663                push_token(&mut tokens, start, i, TokenType::Comment);
664                continue;
665            }
666
667            // ----------------------------------------------------------
668            // String literal
669            // ----------------------------------------------------------
670            if ch == b'"' {
671                let start = i;
672                i += 1;
673                while i < len {
674                    if bytes[i] == b'\\' {
675                        i += 2;
676                        continue;
677                    }
678                    if bytes[i] == b'"' {
679                        i += 1;
680                        break;
681                    }
682                    i += 1;
683                }
684                push_token(&mut tokens, start, i, TokenType::StringLit);
685                continue;
686            }
687
688            // ----------------------------------------------------------
689            // Character literal
690            // ----------------------------------------------------------
691            if ch == b'\'' {
692                let start = i;
693                i += 1;
694                while i < len {
695                    if bytes[i] == b'\\' {
696                        i += 2;
697                        continue;
698                    }
699                    if bytes[i] == b'\'' {
700                        i += 1;
701                        break;
702                    }
703                    i += 1;
704                }
705                push_token(&mut tokens, start, i, TokenType::StringLit);
706                continue;
707            }
708
709            // ----------------------------------------------------------
710            // Number literals
711            // ----------------------------------------------------------
712            if ch.is_ascii_digit() {
713                let start = i;
714                if ch == b'0' && i + 1 < len {
715                    match bytes[i + 1] {
716                        b'x' | b'X' => {
717                            i += 2;
718                            while i < len && (bytes[i].is_ascii_hexdigit() || bytes[i] == b'_') {
719                                i += 1;
720                            }
721                            // Consume optional suffix (u, ul, ull, l, ll, etc.)
722                            while i < len && bytes[i].is_ascii_alphabetic() {
723                                i += 1;
724                            }
725                            push_token(&mut tokens, start, i, TokenType::Number);
726                            continue;
727                        }
728                        b'b' | b'B' => {
729                            i += 2;
730                            while i < len
731                                && (bytes[i] == b'0' || bytes[i] == b'1' || bytes[i] == b'_')
732                            {
733                                i += 1;
734                            }
735                            push_token(&mut tokens, start, i, TokenType::Number);
736                            continue;
737                        }
738                        _ => {}
739                    }
740                }
741                while i < len && (bytes[i].is_ascii_digit() || bytes[i] == b'.' || bytes[i] == b'_')
742                {
743                    i += 1;
744                }
745                // Optional suffix: f, l, u, ul, ull, etc.
746                while i < len && bytes[i].is_ascii_alphabetic() {
747                    i += 1;
748                }
749                push_token(&mut tokens, start, i, TokenType::Number);
750                continue;
751            }
752
753            // ----------------------------------------------------------
754            // Identifiers, keywords, types, functions
755            // ----------------------------------------------------------
756            if ch.is_ascii_alphabetic() || ch == b'_' {
757                let start = i;
758                while i < len && is_ident_char(bytes[i]) {
759                    i += 1;
760                }
761                let word = &bytes[start..i];
762
763                // Function call heuristic: identifier followed by '('
764                if i < len && bytes[i] == b'(' && !word_in_list(word, C_KEYWORDS) {
765                    push_token(&mut tokens, start, i, TokenType::Function);
766                    continue;
767                }
768
769                if word_in_list(word, C_KEYWORDS) {
770                    push_token(&mut tokens, start, i, TokenType::Keyword);
771                } else if word_in_list(word, C_TYPES) {
772                    push_token(&mut tokens, start, i, TokenType::Type);
773                } else {
774                    push_token(&mut tokens, start, i, TokenType::Normal);
775                }
776                continue;
777            }
778
779            // ----------------------------------------------------------
780            // Operators
781            // ----------------------------------------------------------
782            if is_operator(ch) {
783                let start = i;
784                while i < len && is_operator(bytes[i]) {
785                    i += 1;
786                }
787                push_token(&mut tokens, start, i, TokenType::Operator);
788                continue;
789            }
790
791            // ----------------------------------------------------------
792            // Punctuation
793            // ----------------------------------------------------------
794            if is_punctuation(ch) {
795                push_token(&mut tokens, i, i + 1, TokenType::Punctuation);
796                i += 1;
797                continue;
798            }
799
800            // Whitespace / other
801            i += 1;
802        }
803
804        tokens
805    }
806}
807
808// ===========================================================================
809// Shell / Bash Highlighter
810// ===========================================================================
811
/// Syntax highlighter for shell (Bash) scripts.
///
/// Stateless unit struct: each line is tokenized independently.
pub struct ShHighlighter;
814
/// Shell control-flow and declaration keywords.
const SH_KEYWORDS: &[&[u8]] = &[
    b"if",
    b"then",
    b"elif",
    b"else",
    b"fi",
    b"for",
    b"in",
    b"do",
    b"done",
    b"while",
    b"until",
    b"case",
    b"esac",
    b"function",
    b"return",
    b"local",
    b"export",
    b"source",
    b"eval",
    b"exec",
    b"exit",
    b"break",
    b"continue",
    b"select",
];
841
/// Shell builtins, plus a few common external commands (`ls`, `grep`,
/// `sed`, `awk`, ...) included for nicer highlighting.
const SH_BUILTINS: &[&[u8]] = &[
    b"echo", b"cd", b"pwd", b"ls", b"cat", b"grep", b"sed", b"awk", b"find", b"test", b"read",
    b"set", b"unset", b"shift", b"trap", b"printf", b"declare", b"typeset", b"let",
];
846
847impl SyntaxHighlighter for ShHighlighter {
848    fn language(&self) -> Language {
849        Language::Shell
850    }
851
852    fn tokenize_line(&self, line: &str) -> Vec<SyntaxToken> {
853        let bytes = line.as_bytes();
854        let len = bytes.len();
855        let mut tokens: Vec<SyntaxToken> = Vec::new();
856        let mut i: usize = 0;
857
858        // Skip leading whitespace to detect comment lines.
859        let mut ws = 0;
860        while ws < len && (bytes[ws] == b' ' || bytes[ws] == b'\t') {
861            ws += 1;
862        }
863
864        while i < len {
865            let ch = bytes[i];
866
867            // ----------------------------------------------------------
868            // Comment: '#' (but not inside a string, and not #! shebang
869            // which we still highlight as a comment)
870            // ----------------------------------------------------------
871            if ch == b'#' {
872                push_token(&mut tokens, i, len, TokenType::Comment);
873                break;
874            }
875
876            // ----------------------------------------------------------
877            // Double-quoted string: "..." (allows $var expansion inside,
878            // but we just color the whole thing as a string for simplicity)
879            // ----------------------------------------------------------
880            if ch == b'"' {
881                let start = i;
882                i += 1;
883                while i < len {
884                    if bytes[i] == b'\\' {
885                        i += 2;
886                        continue;
887                    }
888                    if bytes[i] == b'"' {
889                        i += 1;
890                        break;
891                    }
892                    i += 1;
893                }
894                push_token(&mut tokens, start, i, TokenType::StringLit);
895                continue;
896            }
897
898            // ----------------------------------------------------------
899            // Single-quoted string: '...' (no escapes except '')
900            // ----------------------------------------------------------
901            if ch == b'\'' {
902                let start = i;
903                i += 1;
904                while i < len {
905                    if bytes[i] == b'\'' {
906                        i += 1;
907                        break;
908                    }
909                    i += 1;
910                }
911                push_token(&mut tokens, start, i, TokenType::StringLit);
912                continue;
913            }
914
915            // ----------------------------------------------------------
916            // Backtick command substitution: `...`
917            // ----------------------------------------------------------
918            if ch == b'`' {
919                let start = i;
920                i += 1;
921                while i < len {
922                    if bytes[i] == b'\\' {
923                        i += 2;
924                        continue;
925                    }
926                    if bytes[i] == b'`' {
927                        i += 1;
928                        break;
929                    }
930                    i += 1;
931                }
932                push_token(&mut tokens, start, i, TokenType::StringLit);
933                continue;
934            }
935
936            // ----------------------------------------------------------
937            // Variable / expansion: $var, ${var}, $(...), $((..))
938            // ----------------------------------------------------------
939            if ch == b'$' {
940                let start = i;
941                i += 1;
942                if i < len {
943                    match bytes[i] {
944                        b'{' => {
945                            // ${...}
946                            i += 1;
947                            while i < len && bytes[i] != b'}' {
948                                i += 1;
949                            }
950                            if i < len {
951                                i += 1;
952                            }
953                        }
954                        b'(' => {
955                            // $(...) or $((...))
956                            let mut depth: usize = 1;
957                            i += 1;
958                            while i < len && depth > 0 {
959                                if bytes[i] == b'(' {
960                                    depth += 1;
961                                } else if bytes[i] == b')' {
962                                    depth -= 1;
963                                }
964                                if depth > 0 {
965                                    i += 1;
966                                }
967                            }
968                            if i < len {
969                                i += 1; // skip closing ')'
970                            }
971                        }
972                        b'?' | b'!' | b'$' | b'#' | b'@' | b'*' | b'-' | b'0'..=b'9' => {
973                            // Special variable: $?, $$, $!, $#, $@, $*, $-, $0..$9
974                            i += 1;
975                        }
976                        _ => {
977                            // $VARIABLE_NAME
978                            while i < len && is_ident_char(bytes[i]) {
979                                i += 1;
980                            }
981                        }
982                    }
983                }
984                push_token(&mut tokens, start, i, TokenType::Macro);
985                continue;
986            }
987
988            // ----------------------------------------------------------
989            // Number literals
990            // ----------------------------------------------------------
991            if ch.is_ascii_digit() {
992                let start = i;
993                while i < len && bytes[i].is_ascii_digit() {
994                    i += 1;
995                }
996                push_token(&mut tokens, start, i, TokenType::Number);
997                continue;
998            }
999
1000            // ----------------------------------------------------------
1001            // Identifiers, keywords, builtins
1002            // ----------------------------------------------------------
1003            if ch.is_ascii_alphabetic() || ch == b'_' {
1004                let start = i;
1005                while i < len && (is_ident_char(bytes[i]) || bytes[i] == b'-') {
1006                    i += 1;
1007                }
1008                let word = &bytes[start..i];
1009
1010                if word_in_list(word, SH_KEYWORDS) {
1011                    push_token(&mut tokens, start, i, TokenType::Keyword);
1012                } else if word_in_list(word, SH_BUILTINS) {
1013                    push_token(&mut tokens, start, i, TokenType::Function);
1014                } else {
1015                    push_token(&mut tokens, start, i, TokenType::Normal);
1016                }
1017                continue;
1018            }
1019
1020            // ----------------------------------------------------------
1021            // Operators (shell-specific: |, ||, &&, ;, ;;, &, >, >>, <, etc.)
1022            // ----------------------------------------------------------
1023            if is_operator(ch) || ch == b'@' {
1024                let start = i;
1025                while i < len && is_operator(bytes[i]) {
1026                    i += 1;
1027                }
1028                push_token(&mut tokens, start, i, TokenType::Operator);
1029                continue;
1030            }
1031
1032            // ----------------------------------------------------------
1033            // Punctuation
1034            // ----------------------------------------------------------
1035            if is_punctuation(ch) {
1036                push_token(&mut tokens, i, i + 1, TokenType::Punctuation);
1037                i += 1;
1038                continue;
1039            }
1040
1041            // Whitespace / other
1042            i += 1;
1043        }
1044
1045        tokens
1046    }
1047}
1048
1049// ===========================================================================
1050// Tests
1051// ===========================================================================
1052
#[cfg(test)]
mod tests {
    use super::*;

    /// True if any token in `tokens` carries the given semantic type.
    /// Replaces the repeated `iter().find(..).is_some()` anti-pattern
    /// (clippy `search_is_some`) used throughout these tests.
    fn has_token(tokens: &[SyntaxToken], ty: TokenType) -> bool {
        tokens.iter().any(|t| t.token_type == ty)
    }

    // -- Language detection --------------------------------------------------

    #[test]
    fn test_detect_rust() {
        assert_eq!(detect_language("main.rs"), Language::Rust);
    }

    #[test]
    fn test_detect_c() {
        assert_eq!(detect_language("foo.c"), Language::C);
        assert_eq!(detect_language("bar.h"), Language::C);
    }

    #[test]
    fn test_detect_shell() {
        assert_eq!(detect_language("run.sh"), Language::Shell);
        assert_eq!(detect_language("setup.bash"), Language::Shell);
    }

    #[test]
    fn test_detect_unknown() {
        assert_eq!(detect_language("notes.txt"), Language::Unknown);
        assert_eq!(detect_language("Makefile"), Language::Unknown);
    }

    // -- Theme ---------------------------------------------------------------

    #[test]
    fn test_default_theme_nonzero() {
        let theme = default_theme();
        assert_ne!(theme.keyword_color, 0);
        assert_ne!(theme.comment_color, 0);
        assert_ne!(theme.string_color, 0);
    }

    #[test]
    fn test_get_token_color() {
        let theme = default_theme();
        assert_eq!(
            get_token_color(&TokenType::Keyword, &theme),
            theme.keyword_color
        );
        assert_eq!(
            get_token_color(&TokenType::Comment, &theme),
            theme.comment_color
        );
    }

    // -- Factory -------------------------------------------------------------

    #[test]
    fn test_create_highlighter_rust() {
        let hl = create_highlighter(Language::Rust);
        assert!(hl.is_some());
        assert_eq!(hl.unwrap().language(), Language::Rust);
    }

    #[test]
    fn test_create_highlighter_unknown() {
        assert!(create_highlighter(Language::Unknown).is_none());
    }

    // -- Rust tokenizer ------------------------------------------------------

    #[test]
    fn test_rust_keyword() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("fn main() {");
        assert!(tokens.len() >= 2);
        assert_eq!(tokens[0].token_type, TokenType::Keyword); // fn
        assert_eq!(tokens[1].token_type, TokenType::Function); // main
    }

    #[test]
    fn test_rust_comment() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("// this is a comment");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token_type, TokenType::Comment);
        assert_eq!(tokens[0].start, 0);
    }

    #[test]
    fn test_rust_string() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("let s = \"hello\";");
        assert!(has_token(&tokens, TokenType::StringLit));
    }

    #[test]
    fn test_rust_number_hex() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("let x = 0xFF;");
        assert!(has_token(&tokens, TokenType::Number));
    }

    #[test]
    fn test_rust_lifetime() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("fn foo<'a>(x: &'a str)");
        assert!(has_token(&tokens, TokenType::Lifetime));
    }

    #[test]
    fn test_rust_macro() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("println!(\"hi\");");
        assert_eq!(tokens[0].token_type, TokenType::Macro);
    }

    #[test]
    fn test_rust_attribute() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("#[derive(Debug)]");
        assert_eq!(tokens[0].token_type, TokenType::Attribute);
    }

    #[test]
    fn test_rust_type() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("let v: Vec<u32> = Vec::new();");
        assert!(has_token(&tokens, TokenType::Type));
    }

    // -- C tokenizer ---------------------------------------------------------

    #[test]
    fn test_c_preprocessor() {
        let hl = CHighlighter;
        let tokens = hl.tokenize_line("#include <stdio.h>");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token_type, TokenType::Attribute);
    }

    #[test]
    fn test_c_keyword() {
        let hl = CHighlighter;
        let tokens = hl.tokenize_line("if (x > 0) return 1;");
        assert_eq!(tokens[0].token_type, TokenType::Keyword); // if
    }

    #[test]
    fn test_c_string() {
        let hl = CHighlighter;
        let tokens = hl.tokenize_line("char *s = \"hello\";");
        assert!(has_token(&tokens, TokenType::StringLit));
    }

    #[test]
    fn test_c_function() {
        let hl = CHighlighter;
        let tokens = hl.tokenize_line("printf(\"hi\");");
        assert_eq!(tokens[0].token_type, TokenType::Function);
    }

    // -- Shell tokenizer -----------------------------------------------------

    #[test]
    fn test_sh_comment() {
        let hl = ShHighlighter;
        let tokens = hl.tokenize_line("# this is a comment");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token_type, TokenType::Comment);
    }

    #[test]
    fn test_sh_keyword() {
        let hl = ShHighlighter;
        let tokens = hl.tokenize_line("if [ -f file ]; then");
        assert_eq!(tokens[0].token_type, TokenType::Keyword); // if
    }

    #[test]
    fn test_sh_variable() {
        let hl = ShHighlighter;
        let tokens = hl.tokenize_line("echo $HOME");
        assert!(has_token(&tokens, TokenType::Macro));
    }

    #[test]
    fn test_sh_double_quoted() {
        let hl = ShHighlighter;
        let tokens = hl.tokenize_line("echo \"hello world\"");
        assert!(has_token(&tokens, TokenType::StringLit));
    }

    #[test]
    fn test_sh_single_quoted() {
        let hl = ShHighlighter;
        let tokens = hl.tokenize_line("echo 'hello world'");
        assert!(has_token(&tokens, TokenType::StringLit));
    }

    #[test]
    fn test_sh_builtin() {
        let hl = ShHighlighter;
        let tokens = hl.tokenize_line("echo hello");
        assert_eq!(tokens[0].token_type, TokenType::Function); // echo is a builtin
    }

    #[test]
    fn test_sh_expansion() {
        let hl = ShHighlighter;
        let tokens = hl.tokenize_line("echo ${PATH}");
        assert!(has_token(&tokens, TokenType::Macro));
    }

    // -- Edge cases ----------------------------------------------------------

    #[test]
    fn test_empty_line() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("    ");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_rust_escaped_string() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line(r#"let s = "he\"llo";"#);
        assert!(has_token(&tokens, TokenType::StringLit));
    }

    #[test]
    fn test_rust_block_comment() {
        let hl = RustHighlighter;
        let tokens = hl.tokenize_line("let x = /* comment */ 5;");
        assert!(has_token(&tokens, TokenType::Comment));
        assert!(has_token(&tokens, TokenType::Number));
    }

    #[test]
    fn test_token_spans_cover_text() {
        let hl = RustHighlighter;
        let line = "fn foo()";
        let tokens = hl.tokenize_line(line);
        // Verify that token spans reference valid byte offsets.
        for tok in &tokens {
            assert!(tok.start < line.len());
            assert!(tok.end <= line.len());
            assert!(tok.start < tok.end);
        }
    }
}