1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
use super::*;
use unicode_width::UnicodeWidthChar;
#[cfg(test)]
mod tests;
/// Finds all newlines, multi-byte characters, and non-narrow characters in a
/// SourceFile.
///
/// This function will use an SSE2 enhanced implementation if hardware support
/// is detected at runtime.
pub fn analyze_source_file(
    src: &str,
) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
    let mut lines = vec![RelativeBytePos::from_u32(0)];
    let mut multi_byte_chars = vec![];
    let mut non_narrow_chars = vec![];
    // Calls the right implementation, depending on hardware support available.
    analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars, &mut non_narrow_chars);
    // The code above optimistically registers a new line *after* each \n
    // it encounters. If that point is already outside the source_file, remove
    // it again.
    if let Some(&last_line_start) = lines.last() {
        let source_file_end = RelativeBytePos::from_usize(src.len());
        assert!(source_file_end >= last_line_start);
        if last_line_start == source_file_end {
            lines.pop();
        }
    }
    (lines, multi_byte_chars, non_narrow_chars)
}
cfg_match! {
    cfg(any(target_arch = "x86", target_arch = "x86_64")) => {
        fn analyze_source_file_dispatch(
            src: &str,
            lines: &mut Vec<RelativeBytePos>,
            multi_byte_chars: &mut Vec<MultiByteChar>,
            non_narrow_chars: &mut Vec<NonNarrowChar>,
        ) {
            if is_x86_feature_detected!("sse2") {
                unsafe {
                    analyze_source_file_sse2(src, lines, multi_byte_chars, non_narrow_chars);
                }
            } else {
                analyze_source_file_generic(
                    src,
                    src.len(),
                    RelativeBytePos::from_u32(0),
                    lines,
                    multi_byte_chars,
                    non_narrow_chars,
                );
            }
        }
        /// Checks 16 byte chunks of text at a time. If the chunk contains
        /// something other than printable ASCII characters and newlines, the
        /// function falls back to the generic implementation. Otherwise it uses
        /// SSE2 intrinsics to quickly find all newlines.
        #[target_feature(enable = "sse2")]
        unsafe fn analyze_source_file_sse2(
            src: &str,
            lines: &mut Vec<RelativeBytePos>,
            multi_byte_chars: &mut Vec<MultiByteChar>,
            non_narrow_chars: &mut Vec<NonNarrowChar>,
        ) {
            #[cfg(target_arch = "x86")]
            use std::arch::x86::*;
            #[cfg(target_arch = "x86_64")]
            use std::arch::x86_64::*;
            const CHUNK_SIZE: usize = 16;
            let src_bytes = src.as_bytes();
            let chunk_count = src.len() / CHUNK_SIZE;
            // This variable keeps track of where we should start decoding a
            // chunk. If a multi-byte character spans across chunk boundaries,
            // we need to skip that part in the next chunk because we already
            // handled it.
            let mut intra_chunk_offset = 0;
            for chunk_index in 0..chunk_count {
                let ptr = src_bytes.as_ptr() as *const __m128i;
                // We don't know if the pointer is aligned to 16 bytes, so we
                // use `loadu`, which supports unaligned loading.
                let chunk = unsafe { _mm_loadu_si128(ptr.add(chunk_index)) };
                // For character in the chunk, see if its byte value is < 0, which
                // indicates that it's part of a UTF-8 char.
                let multibyte_test = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)) };
                // Create a bit mask from the comparison results.
                let multibyte_mask = unsafe { _mm_movemask_epi8(multibyte_test) };
                // If the bit mask is all zero, we only have ASCII chars here:
                if multibyte_mask == 0 {
                    assert!(intra_chunk_offset == 0);
                    // Check if there are any control characters in the chunk. All
                    // control characters that we can encounter at this point have a
                    // byte value less than 32 or ...
                    let control_char_test0 = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)) };
                    let control_char_mask0 = unsafe { _mm_movemask_epi8(control_char_test0) };
                    // ... it's the ASCII 'DEL' character with a value of 127.
                    let control_char_test1 = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)) };
                    let control_char_mask1 = unsafe { _mm_movemask_epi8(control_char_test1) };
                    let control_char_mask = control_char_mask0 | control_char_mask1;
                    if control_char_mask != 0 {
                        // Check for newlines in the chunk
                        let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
                        let newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
                        if control_char_mask == newlines_mask {
                            // All control characters are newlines, record them
                            let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
                            let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
                            loop {
                                let index = newlines_mask.trailing_zeros();
                                if index >= CHUNK_SIZE as u32 {
                                    // We have arrived at the end of the chunk.
                                    break;
                                }
                                lines.push(RelativeBytePos(index) + output_offset);
                                // Clear the bit, so we can find the next one.
                                newlines_mask &= (!1) << index;
                            }
                            // We are done for this chunk. All control characters were
                            // newlines and we took care of those.
                            continue;
                        } else {
                            // Some of the control characters are not newlines,
                            // fall through to the slow path below.
                        }
                    } else {
                        // No control characters, nothing to record for this chunk
                        continue;
                    }
                }
                // The slow path.
                // There are control chars in here, fallback to generic decoding.
                let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
                intra_chunk_offset = analyze_source_file_generic(
                    &src[scan_start..],
                    CHUNK_SIZE - intra_chunk_offset,
                    RelativeBytePos::from_usize(scan_start),
                    lines,
                    multi_byte_chars,
                    non_narrow_chars,
                );
            }
            // There might still be a tail left to analyze
            let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
            if tail_start < src.len() {
                analyze_source_file_generic(
                    &src[tail_start..],
                    src.len() - tail_start,
                    RelativeBytePos::from_usize(tail_start),
                    lines,
                    multi_byte_chars,
                    non_narrow_chars,
                );
            }
        }
    }
    _ => {
        // The target (or compiler version) does not support SSE2 ...
        fn analyze_source_file_dispatch(
            src: &str,
            lines: &mut Vec<RelativeBytePos>,
            multi_byte_chars: &mut Vec<MultiByteChar>,
            non_narrow_chars: &mut Vec<NonNarrowChar>,
        ) {
            analyze_source_file_generic(
                src,
                src.len(),
                RelativeBytePos::from_u32(0),
                lines,
                multi_byte_chars,
                non_narrow_chars,
            );
        }
    }
}
// `scan_len` determines the number of bytes in `src` to scan. Note that the
// function can read past `scan_len` if a multi-byte character start within the
// range but extends past it. The overflow is returned by the function.
fn analyze_source_file_generic(
    src: &str,
    scan_len: usize,
    output_offset: RelativeBytePos,
    lines: &mut Vec<RelativeBytePos>,
    multi_byte_chars: &mut Vec<MultiByteChar>,
    non_narrow_chars: &mut Vec<NonNarrowChar>,
) -> usize {
    assert!(src.len() >= scan_len);
    let mut i = 0;
    let src_bytes = src.as_bytes();
    while i < scan_len {
        let byte = unsafe {
            // We verified that i < scan_len <= src.len()
            *src_bytes.get_unchecked(i)
        };
        // How much to advance in order to get to the next UTF-8 char in the
        // string.
        let mut char_len = 1;
        if byte < 32 {
            // This is an ASCII control character, it could be one of the cases
            // that are interesting to us.
            let pos = RelativeBytePos::from_usize(i) + output_offset;
            match byte {
                b'\n' => {
                    lines.push(pos + RelativeBytePos(1));
                }
                b'\t' => {
                    non_narrow_chars.push(NonNarrowChar::Tab(pos));
                }
                _ => {
                    non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
                }
            }
        } else if byte >= 127 {
            // The slow path:
            // This is either ASCII control character "DEL" or the beginning of
            // a multibyte char. Just decode to `char`.
            let c = src[i..].chars().next().unwrap();
            char_len = c.len_utf8();
            let pos = RelativeBytePos::from_usize(i) + output_offset;
            if char_len > 1 {
                assert!((2..=4).contains(&char_len));
                let mbc = MultiByteChar { pos, bytes: char_len as u8 };
                multi_byte_chars.push(mbc);
            }
            // Assume control characters are zero width.
            // FIXME: How can we decide between `width` and `width_cjk`?
            let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
            if char_width != 1 {
                non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
            }
        }
        i += char_len;
    }
    i - scan_len
}