⚠️ VeridianOS Kernel Documentation - This is low-level kernel code. All functions are unsafe unless explicitly marked otherwise. no_std

veridian_kernel/virt/containers/
cgroups.rs

1//! Cgroup Memory and CPU Controllers - limits, usage tracking, OOM,
2//! hierarchical accounting, shares, quota/period, throttling, burst.
3
4use crate::error::KernelError;
5
6// ---------------------------------------------------------------------------
7// Cgroup Memory Controller
8// ---------------------------------------------------------------------------
9
/// Memory statistics counters.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MemoryStat {
    /// Resident set size in bytes.
    pub rss: u64,
    /// Page cache usage in bytes.
    pub cache: u64,
    /// Memory-mapped file usage in bytes.
    pub mapped_file: u64,
    /// Anonymous memory usage in bytes.
    pub anon: u64,
    /// Swap usage in bytes.
    pub swap: u64,
}

impl MemoryStat {
    /// Total memory usage (rss + cache), saturating at `u64::MAX`
    /// instead of wrapping on overflow.
    pub fn total(&self) -> u64 {
        let MemoryStat { rss, cache, .. } = *self;
        rss.checked_add(cache).unwrap_or(u64::MAX)
    }
}
31
/// OOM event information.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OomEvent {
    /// Number of OOM events triggered.
    pub oom_kill_count: u64,
    /// Whether OOM kill is enabled.
    pub oom_kill_enabled: bool,
    /// Whether the group is currently under OOM.
    pub under_oom: bool,
}

impl Default for OomEvent {
    /// A fresh group starts with OOM killing enabled, no recorded
    /// events, and no OOM condition in progress.
    fn default() -> Self {
        OomEvent {
            oom_kill_enabled: true,
            oom_kill_count: 0,
            under_oom: false,
        }
    }
}
52
/// Cgroup memory controller.
///
/// Tracks current and peak usage against optional hard and soft limits
/// (0 = unlimited for both). Per-category counters live in `stat`
/// ([`MemoryStat`]) and OOM state in `oom` ([`OomEvent`]). Only the
/// hard limit is enforced (by `charge`); the soft limit is advisory.
#[derive(Debug, Clone)]
pub struct CgroupMemoryController {
    /// Hard memory limit in bytes (0 = unlimited); enforced on charge.
    pub limit_hard: u64,
    /// Soft memory limit in bytes (0 = unlimited); advisory only.
    pub limit_soft: u64,
    /// Current usage in bytes (sum of rss and cache charges).
    pub usage_current: u64,
    /// Peak (high-watermark) usage in bytes.
    pub usage_peak: u64,
    /// Detailed memory statistics.
    pub stat: MemoryStat,
    /// OOM event state.
    pub oom: OomEvent,
    /// Parent cgroup ID for hierarchical accounting (0 = root).
    pub parent_id: u64,
    /// Unique cgroup ID.
    pub cgroup_id: u64,
}
73
74impl CgroupMemoryController {
75    pub fn new(cgroup_id: u64) -> Self {
76        Self {
77            limit_hard: 0,
78            limit_soft: 0,
79            usage_current: 0,
80            usage_peak: 0,
81            stat: MemoryStat::default(),
82            oom: OomEvent::default(),
83            parent_id: 0,
84            cgroup_id,
85        }
86    }
87
88    /// Set the hard limit. Returns error if current usage exceeds new limit.
89    pub fn set_hard_limit(&mut self, limit: u64) -> Result<(), KernelError> {
90        if limit > 0 && self.usage_current > limit {
91            // Trigger reclaim attempt
92            self.try_reclaim(self.usage_current.saturating_sub(limit));
93            if self.usage_current > limit {
94                return Err(KernelError::ResourceExhausted {
95                    resource: "cgroup memory",
96                });
97            }
98        }
99        self.limit_hard = limit;
100        Ok(())
101    }
102
103    /// Set the soft limit.
104    pub fn set_soft_limit(&mut self, limit: u64) {
105        self.limit_soft = limit;
106    }
107
108    /// Charge memory usage. Returns error if hard limit would be exceeded.
109    pub fn charge(&mut self, bytes: u64) -> Result<(), KernelError> {
110        let new_usage = self.usage_current.saturating_add(bytes);
111        if self.limit_hard > 0 && new_usage > self.limit_hard {
112            // Try reclaim first
113            self.try_reclaim(new_usage.saturating_sub(self.limit_hard));
114            let after_reclaim = self.usage_current.saturating_add(bytes);
115            if after_reclaim > self.limit_hard {
116                self.oom.under_oom = true;
117                self.oom.oom_kill_count = self.oom.oom_kill_count.saturating_add(1);
118                return Err(KernelError::OutOfMemory {
119                    requested: bytes as usize,
120                    available: self.limit_hard.saturating_sub(self.usage_current) as usize,
121                });
122            }
123        }
124        self.usage_current = self.usage_current.saturating_add(bytes);
125        if self.usage_current > self.usage_peak {
126            self.usage_peak = self.usage_current;
127        }
128        self.stat.rss = self.stat.rss.saturating_add(bytes);
129        Ok(())
130    }
131
132    /// Uncharge (release) memory usage.
133    pub fn uncharge(&mut self, bytes: u64) {
134        self.usage_current = self.usage_current.saturating_sub(bytes);
135        self.stat.rss = self.stat.rss.saturating_sub(bytes);
136        self.oom.under_oom = false;
137    }
138
139    /// Check if soft limit is exceeded (triggers reclaim pressure).
140    pub fn soft_limit_exceeded(&self) -> bool {
141        self.limit_soft > 0 && self.usage_current > self.limit_soft
142    }
143
144    /// Try to reclaim `target` bytes. Returns bytes reclaimed.
145    /// In a real implementation this would trigger page reclaim; here it
146    /// reclaims from cache.
147    fn try_reclaim(&mut self, target: u64) -> u64 {
148        let reclaimable = self.stat.cache;
149        let reclaimed = if reclaimable >= target {
150            target
151        } else {
152            reclaimable
153        };
154        self.stat.cache = self.stat.cache.saturating_sub(reclaimed);
155        self.usage_current = self.usage_current.saturating_sub(reclaimed);
156        reclaimed
157    }
158
159    /// Record a cache page addition.
160    pub fn add_cache(&mut self, bytes: u64) {
161        self.stat.cache = self.stat.cache.saturating_add(bytes);
162        self.usage_current = self.usage_current.saturating_add(bytes);
163        if self.usage_current > self.usage_peak {
164            self.usage_peak = self.usage_current;
165        }
166    }
167
168    /// Record a mapped file addition.
169    pub fn add_mapped_file(&mut self, bytes: u64) {
170        self.stat.mapped_file = self.stat.mapped_file.saturating_add(bytes);
171    }
172
173    /// Hierarchical usage including parent chain (simplified: just self).
174    pub fn hierarchical_usage(&self) -> u64 {
175        self.usage_current
176    }
177}
178
179// ---------------------------------------------------------------------------
180// Cgroup CPU Controller
181// ---------------------------------------------------------------------------
182
/// CPU bandwidth statistics.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CpuBandwidthStats {
    /// Number of times throttled.
    pub nr_throttled: u64,
    /// Total throttled time in nanoseconds.
    pub throttled_time_ns: u64,
    /// Number of scheduling periods elapsed.
    pub nr_periods: u64,
    /// Number of burst events recorded (a count, not a duration;
    /// analogous to `nr_bursts` in Linux `cpu.stat`).
    pub nr_bursts: u64,
    /// Total burst time used in nanoseconds.
    pub burst_time_ns: u64,
}
197
/// Cgroup CPU controller with shares and bandwidth limiting.
///
/// Combines weight-based fair scheduling (`shares`) with CFS-style
/// bandwidth control (`quota_us` consumed per `period_us`) and an
/// optional burst bank of unused runtime capped at `burst_us`.
#[derive(Debug, Clone)]
pub struct CgroupCpuController {
    /// CPU shares (weight-based fair scheduling, default 1024).
    pub shares: u32,
    /// CPU quota in microseconds per period (0 = unlimited).
    pub quota_us: u64,
    /// CPU period in microseconds (default 100000 = 100ms).
    pub period_us: u64,
    /// Burst capacity in microseconds (0 = no burst).
    pub burst_us: u64,
    /// Accumulated burst budget in nanoseconds: unused runtime banked
    /// from earlier periods, capped at `burst_us` (converted to ns).
    pub(crate) burst_budget_ns: u64,
    /// Runtime consumed in the current period in nanoseconds.
    runtime_consumed_ns: u64,
    /// Bandwidth statistics.
    pub stats: CpuBandwidthStats,
    /// Whether currently throttled (cleared when a new period begins).
    pub throttled: bool,
    /// Parent cgroup ID for hierarchical distribution (0 = root).
    pub parent_id: u64,
    /// Unique cgroup ID.
    pub cgroup_id: u64,
}
222
223impl CgroupCpuController {
224    pub fn new(cgroup_id: u64) -> Self {
225        Self {
226            shares: 1024,
227            quota_us: 0,
228            period_us: 100_000,
229            burst_us: 0,
230            burst_budget_ns: 0,
231            runtime_consumed_ns: 0,
232            stats: CpuBandwidthStats::default(),
233            throttled: false,
234            parent_id: 0,
235            cgroup_id,
236        }
237    }
238
239    /// Set CPU shares (weight). Minimum 2, maximum 262144.
240    pub fn set_shares(&mut self, shares: u32) -> Result<(), KernelError> {
241        if !(2..=262144).contains(&shares) {
242            return Err(KernelError::InvalidArgument {
243                name: "cpu.shares",
244                value: "out of range [2, 262144]",
245            });
246        }
247        self.shares = shares;
248        Ok(())
249    }
250
251    /// Set CPU bandwidth quota and period.
252    /// quota_us=0 means unlimited. Period must be >= 1000us and <= 1000000us.
253    pub fn set_bandwidth(&mut self, quota_us: u64, period_us: u64) -> Result<(), KernelError> {
254        if !(1000..=1_000_000).contains(&period_us) {
255            return Err(KernelError::InvalidArgument {
256                name: "cpu.cfs_period_us",
257                value: "out of range [1000, 1000000]",
258            });
259        }
260        if quota_us > 0 && quota_us < 1000 {
261            return Err(KernelError::InvalidArgument {
262                name: "cpu.cfs_quota_us",
263                value: "must be >= 1000 or 0 (unlimited)",
264            });
265        }
266        self.quota_us = quota_us;
267        self.period_us = period_us;
268        Ok(())
269    }
270
271    /// Set burst capacity in microseconds.
272    pub fn set_burst(&mut self, burst_us: u64) {
273        self.burst_us = burst_us;
274    }
275
276    /// Consume runtime. Returns true if the task is now throttled.
277    pub fn consume_runtime(&mut self, ns: u64) -> bool {
278        self.runtime_consumed_ns = self.runtime_consumed_ns.saturating_add(ns);
279
280        if self.quota_us == 0 {
281            return false; // unlimited
282        }
283
284        // Convert quota from us to ns: quota_us * 1000
285        let quota_ns = self.quota_us.saturating_mul(1000);
286        let effective_quota = quota_ns.saturating_add(self.burst_budget_ns);
287
288        if self.runtime_consumed_ns > effective_quota {
289            self.throttled = true;
290            self.stats.nr_throttled = self.stats.nr_throttled.saturating_add(1);
291            let overshoot = self.runtime_consumed_ns.saturating_sub(effective_quota);
292            self.stats.throttled_time_ns = self.stats.throttled_time_ns.saturating_add(overshoot);
293            true
294        } else {
295            false
296        }
297    }
298
299    /// Begin a new scheduling period. Refills runtime and handles burst.
300    pub fn new_period(&mut self) {
301        self.stats.nr_periods = self.stats.nr_periods.saturating_add(1);
302
303        if self.quota_us > 0 {
304            let quota_ns = self.quota_us.saturating_mul(1000);
305            // Any unused runtime becomes burst budget (up to burst limit)
306            if self.runtime_consumed_ns < quota_ns {
307                let unused = quota_ns.saturating_sub(self.runtime_consumed_ns);
308                let burst_limit_ns = self.burst_us.saturating_mul(1000);
309                self.burst_budget_ns = self
310                    .burst_budget_ns
311                    .saturating_add(unused)
312                    .min(burst_limit_ns);
313                if unused > 0 {
314                    self.stats.nr_bursts = self.stats.nr_bursts.saturating_add(1);
315                    self.stats.burst_time_ns = self.stats.burst_time_ns.saturating_add(unused);
316                }
317            } else {
318                // Used from burst budget
319                let overdraft = self.runtime_consumed_ns.saturating_sub(quota_ns);
320                self.burst_budget_ns = self.burst_budget_ns.saturating_sub(overdraft);
321            }
322        }
323
324        self.runtime_consumed_ns = 0;
325        self.throttled = false;
326    }
327
328    /// Calculate the effective CPU percentage (quota/period * 100).
329    /// Returns percentage * 100 (fixed-point with 2 decimal digits).
330    /// For example, quota=50000, period=100000 returns 5000 (50.00%).
331    pub fn effective_cpu_percent_x100(&self) -> u64 {
332        if self.quota_us == 0 || self.period_us == 0 {
333            return 0; // unlimited or invalid
334        }
335        // (quota_us * 10000) / period_us gives percent * 100
336        self.quota_us
337            .saturating_mul(10000)
338            .checked_div(self.period_us)
339            .unwrap_or(0)
340    }
341
342    /// Compute the weight-proportional share of CPU time for this cgroup
343    /// relative to a total weight sum. Returns nanoseconds per period.
344    pub fn proportional_runtime_ns(&self, total_shares: u32) -> u64 {
345        if total_shares == 0 {
346            return 0;
347        }
348        let period_ns = self.period_us.saturating_mul(1000);
349        // (shares * period_ns) / total_shares
350        let shares_u64 = self.shares as u64;
351        shares_u64
352            .saturating_mul(period_ns)
353            .checked_div(total_shares as u64)
354            .unwrap_or(0)
355    }
356}