diff --git a/crates/edit/benches/lib.rs b/crates/edit/benches/lib.rs index 18d817f822a..2141e9029ef 100644 --- a/crates/edit/benches/lib.rs +++ b/crates/edit/benches/lib.rs @@ -10,6 +10,7 @@ use edit::helpers::*; use edit::{buffer, glob, hash, json, oklab, simd, unicode}; use stdext::arena::{self, scratch_arena}; use stdext::collections::BVec; +use stdext::unicode::Utf8Chars; struct EditingTracePatch<'a>(usize, usize, &'a str); @@ -272,9 +273,7 @@ fn bench_unicode(c: &mut Criterion) { c.benchmark_group("unicode::Utf8Chars") .throughput(Throughput::Bytes(bytes.len() as u64)) .bench_function("next", |b| { - b.iter(|| { - unicode::Utf8Chars::new(bytes, 0).fold(0u32, |acc, ch| acc.wrapping_add(ch as u32)) - }) + b.iter(|| Utf8Chars::new(bytes, 0).fold(0u32, |acc, ch| acc.wrapping_add(ch as u32))) }); } diff --git a/crates/edit/src/buffer/mod.rs b/crates/edit/src/buffer/mod.rs index 51e4b38a46e..46350b77f01 100644 --- a/crates/edit/src/buffer/mod.rs +++ b/crates/edit/src/buffer/mod.rs @@ -36,6 +36,7 @@ use std::str; pub use gap_buffer::GapBuffer; use stdext::arena::{Arena, scratch_arena}; use stdext::collections::{BString, BVec}; +use stdext::unicode::Utf8Chars; use stdext::{ReplaceRange as _, arena_write_fmt, minmax, slice_as_uninit_mut, slice_copy_safe}; use crate::cell::SemiRefCell; @@ -45,7 +46,7 @@ use crate::framebuffer::{Framebuffer, IndexedColor}; use crate::helpers::*; use crate::oklab::StraightRgba; use crate::simd::memchr2; -use crate::unicode::{self, Cursor, MeasurementConfig, Utf8Chars}; +use crate::unicode::{self, Cursor, MeasurementConfig}; use crate::{icu, simd}; /// The margin template is used for line numbers. 
diff --git a/crates/edit/src/icu.rs b/crates/edit/src/icu.rs index c03e99da849..67578aaffe6 100644 --- a/crates/edit/src/icu.rs +++ b/crates/edit/src/icu.rs @@ -13,10 +13,10 @@ use std::{fmt, mem}; use stdext::arena::{Arena, scratch_arena}; use stdext::arena_format; use stdext::collections::{BString, BVec}; +use stdext::unicode::Utf8Chars; use crate::buffer::TextBuffer; use crate::sys; -use crate::unicode::Utf8Chars; pub(crate) const ILLEGAL_ARGUMENT_ERROR: Error = Error(1); // U_ILLEGAL_ARGUMENT_ERROR pub const ICU_MISSING_ERROR: Error = Error(0); diff --git a/crates/edit/src/unicode/measurement.rs b/crates/edit/src/unicode/measurement.rs index 806f8494901..aab03204589 100644 --- a/crates/edit/src/unicode/measurement.rs +++ b/crates/edit/src/unicode/measurement.rs @@ -2,8 +2,8 @@ // Licensed under the MIT License. use stdext::cold_path; +use stdext::unicode::Utf8Chars; -use super::Utf8Chars; use super::tables::*; use crate::document::ReadableDocument; use crate::helpers::{CoordType, Point}; diff --git a/crates/edit/src/unicode/mod.rs b/crates/edit/src/unicode/mod.rs index 20cf301c3e9..43bd25c361a 100644 --- a/crates/edit/src/unicode/mod.rs +++ b/crates/edit/src/unicode/mod.rs @@ -5,7 +5,5 @@ mod measurement; mod tables; -mod utf8; pub use measurement::*; -pub use utf8::*; diff --git a/crates/edit/src/vt.rs b/crates/edit/src/vt.rs index caf31633766..451d1a213ea 100644 --- a/crates/edit/src/vt.rs +++ b/crates/edit/src/vt.rs @@ -5,8 +5,9 @@ use std::time; +use stdext::unicode::Utf8Chars; + use crate::simd::memchr2; -use crate::unicode::Utf8Chars; /// The parser produces these tokens. pub enum Token<'parser, 'input> { diff --git a/crates/stdext/src/collections/string.rs b/crates/stdext/src/collections/string.rs index 23e32d2235c..229396c9072 100644 --- a/crates/stdext/src/collections/string.rs +++ b/crates/stdext/src/collections/string.rs @@ -112,12 +112,6 @@ impl<'a> BString<'a> { self.vec.is_empty() } - /// True if if the buffer is full. 
- #[inline] - pub fn is_full(&self) -> bool { - self.vec.is_full() - } - /// The raw UTF-8 bytes. #[inline] pub fn as_bytes(&self) -> &[u8] { diff --git a/crates/stdext/src/collections/vec.rs b/crates/stdext/src/collections/vec.rs index 18ef89cf5fe..1bddaa4b611 100644 --- a/crates/stdext/src/collections/vec.rs +++ b/crates/stdext/src/collections/vec.rs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +use std::hint::assert_unchecked; use std::iter::FusedIterator; use std::marker::PhantomData; use std::mem::MaybeUninit; @@ -109,12 +110,6 @@ impl<'a, T> BVec<'a, T> { self.len == 0 } - /// True if if the buffer is full. - #[inline] - pub fn is_full(&self) -> bool { - self.len == self.cap - } - /// Forcibly sets the length. /// /// # Safety @@ -186,28 +181,53 @@ impl<'a, T> BVec<'a, T> { /// Ensures space for at least `additional` more elements, with amortized growth. #[inline] pub fn reserve(&mut self, alloc: &'a dyn Allocator, additional: usize) { - if additional > self.cap - self.len { + let len = self.len; + let cap = self.cap; + if additional > cap - len { self.grow(alloc, self.cap, additional); } + unsafe { + // Right now the following asserts are somewhat useless, because they only work + // if grow() is inline(never). I don't know why that is either. But I'm leaving + // them here, in case we need them in the future - they don't hurt until then. + // First, we can tell the compiler that re-fetching self.len after grow() is unnecessary. + assert_unchecked(self.len == len); + // Next, we can assert that after reserve(4), we have room for 4 more elements. + // Naively you'd expect this to be `self.len + additional <= self.cap`, but LLVM doesn't + // work very well with `<=` bounds, so we use `<` here. It _must_ be `additional - 1`. + assert_unchecked(additional == 0 || self.len.unchecked_add(additional - 1) < self.cap); + } } /// Ensures space for at least `additional` more elements, without over-allocating. 
#[inline] pub fn reserve_exact(&mut self, alloc: &'a dyn Allocator, additional: usize) { - if additional > self.cap - self.len { + let len = self.len; + let cap = self.cap; + if additional > cap - len { self.grow(alloc, 0, additional); } + unsafe { + // See reserve(). + assert_unchecked(self.len == len); + assert_unchecked(additional == 0 || self.len.unchecked_add(additional - 1) < self.cap); + } } #[inline] fn reserve_one(&mut self, alloc: &'a dyn Allocator) { - if self.is_full() { - self.grow(alloc, self.cap, 1); + let len = self.len; + let cap = self.cap; + if len >= cap { + self.grow(alloc, cap, 1); + } + unsafe { + // See reserve(). + assert_unchecked(self.len == len); + assert_unchecked(self.len < self.cap); } } - // NOTE: I'm using dyn(amic dispatch) to avoid monomorphization bloat and more - // importantly because I counter-intuitively found it to boost performance by +20%. #[cold] fn grow(&mut self, alloc: &'a dyn Allocator, cap: usize, add: usize) { debug_assert!(add > 0, "growing by zero makes no sense"); @@ -247,6 +267,22 @@ impl<'a, T> BVec<'a, T> { } } + pub fn pop(&mut self) -> Option<T> { + if self.is_empty() { + return None; + } + unsafe { + self.len -= 1; + + // See: https://github.com/rust-lang/rust/issues/114334 + // This assert helps the optimizer understand that + // after a pop it can push once without reallocating. + assert_unchecked(self.len < self.cap); + + Some(self.as_ptr().add(self.len).read()) + } + } + /// Append the items from the iterator `iter`. 
/// /// By assuming that your "exact size iterator" returns an *exact* size, @@ -400,6 +436,7 @@ impl<'a, T: Copy> BVec<'a, T> { } } +#[cfg(windows)] unsafe extern "system" { fn MultiByteToWideChar( CodePage: u32, @@ -413,8 +450,11 @@ unsafe extern "system" { impl<'a> BVec<'a, u16> { pub fn push_encode_utf16(&mut self, alloc: &'a dyn Allocator, utf8: &[u8]) { + self.reserve(alloc, utf8.len()); // worst case ASCII: 1 byte per char + + // MultiByteToWideChar is ~2x faster than the UTF8 loop below and saves space. + #[cfg(windows)] unsafe { - self.reserve(alloc, utf8.len()); // worst case ASCII: 1 byte per char let dst = self.spare_mut_ptr() as *mut u16; let len = MultiByteToWideChar( 65001, @@ -426,6 +466,26 @@ impl<'a> BVec<'a, u16> { ); self.len += len.max(0) as usize; } + + #[cfg(not(windows))] + unsafe { + let beg = self.spare_mut_ptr(); + let mut dst = beg; + + for ch in crate::unicode::Utf8Chars::new(utf8, 0) { + if ch <= '\u{FFFF}' { + (*dst).write(ch as u16); + dst = dst.add(1); + } else { + let ch = ch as u32 - 0x10000; + (*dst.add(0)).write(0xD800 | ((ch >> 10) as u16)); + (*dst.add(1)).write(0xDC00 | ((ch as u16) & 0x3FF)); + dst = dst.add(2); + } + } + + self.len += dst.offset_from_unsigned(beg); + } } } diff --git a/crates/stdext/src/lib.rs b/crates/stdext/src/lib.rs index 5a94e39612b..30b59d54903 100644 --- a/crates/stdext/src/lib.rs +++ b/crates/stdext/src/lib.rs @@ -9,5 +9,6 @@ pub mod collections; mod helpers; pub mod simd; pub mod sys; +pub mod unicode; pub use helpers::*; diff --git a/crates/stdext/src/unicode/mod.rs b/crates/stdext/src/unicode/mod.rs new file mode 100644 index 00000000000..4722eb7ae99 --- /dev/null +++ b/crates/stdext/src/unicode/mod.rs @@ -0,0 +1,8 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Everything related to Unicode lives here. 
+ +mod utf8; + +pub use utf8::*; diff --git a/crates/edit/src/unicode/utf8.rs b/crates/stdext/src/unicode/utf8.rs similarity index 100% rename from crates/edit/src/unicode/utf8.rs rename to crates/stdext/src/unicode/utf8.rs