diff --git a/Cargo.lock b/Cargo.lock index 89ac8c5bb..39c72e03a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "0.7.13" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] @@ -588,6 +588,7 @@ dependencies = [ "protobuf", "protobuf-codegen-pure", "rand 0.8.0", + "regex", "serde_json", "tempfile", "test-case", @@ -707,9 +708,9 @@ checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" [[package]] name = "memchr" -version = "2.3.3" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "memoffset" @@ -1050,14 +1051,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.4.2" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ "aho-corasick", "memchr", "regex-syntax", - "thread_local", ] [[package]] @@ -1071,9 +1071,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.21" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "remove_dir_all" @@ -1309,15 +1309,6 @@ dependencies = [ "syn", ] -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static", -] - [[package]] name = "time" version = "0.1.44" diff --git a/lib/Cargo.toml b/lib/Cargo.toml index edb501f08..bf6bb7b1c 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -30,6 +30,7 @@ pest = "2.1.3" pest_derive = "2.1.0" protobuf = { version = "2.22.1", features = ["with-bytes"] } rand = "0.8.0" +regex = "1.5.4" serde_json = "1.0.60" tempfile = "3.1.0" thiserror = "1.0.22" diff --git a/lib/src/gitignore.rs b/lib/src/gitignore.rs new file mode 100644 index 000000000..68ec918d5 --- /dev/null +++ b/lib/src/gitignore.rs @@ -0,0 +1,413 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use regex::{escape as regex_escape, Regex}; + +pub enum GitIgnoreParseError {} + +struct GitIgnoreLine { + is_negative: bool, + regex: Regex, +} + +impl GitIgnoreLine { + // Remove trailing spaces (unless backslash-escaped) + fn remove_trailing_space(input: &str) -> &str { + let mut trimmed_len = 0; + let mut non_space_seen = false; + let mut prev_was_space = false; + let mut in_escape = false; + for (i, c) in input.char_indices() { + if !prev_was_space && non_space_seen { + trimmed_len = i; + } + if c == ' ' { + if in_escape { + in_escape = false; + } else { + prev_was_space = true; + } + } else { + non_space_seen = true; + prev_was_space = false; + in_escape = c == '\\' + } + } + if !prev_was_space && non_space_seen { + trimmed_len = input.len(); + } + &input.split_at(trimmed_len).0 + } + + fn parse(prefix: &str, input: &str) -> Result, GitIgnoreParseError> { + if input.starts_with('#') { + return Ok(None); + } + + let input = GitIgnoreLine::remove_trailing_space(input); + // Remove leading "!" before checking for empty to match git's implementation + // (i.e. just "!" matching nothing, not everything). + let (is_negative, input) = match input.strip_prefix('!') { + None => (false, input), + Some(rest) => (true, rest), + }; + if input.is_empty() { + return Ok(None); + } + + let (matches_only_directory, input) = match input.strip_suffix('/') { + None => (false, input), + Some(rest) => (true, rest), + }; + let (mut is_rooted, input) = match input.strip_prefix('/') { + None => (false, input), + Some(rest) => (true, rest), + }; + is_rooted |= input.contains('/'); + + let mut regex = String::new(); + if is_rooted { + regex.insert_str(0, prefix); + regex.insert(0, '^'); + } else { + regex.insert_str(0, "(^|/)"); + } + + let components: Vec<_> = input.split('/').collect(); + for (i, component) in components.iter().enumerate() { + if *component == "**" { + if i == components.len() - 1 { + regex.push_str(".*"); + } else { + regex.push_str("(.*/)?"); + } + } else { + let mut in_escape = false; + let mut character_class: Option = None; + for c in component.chars() { + if in_escape { + in_escape = false; + if !matches!(c, ' ' | '#' | '!' | '?' | '\\' | '*') { + regex.push_str(®ex_escape("\\")); + } + regex.push_str(®ex_escape(&c.to_string())); + } else if c == '\\' { + in_escape = true; + } else if let Some(characters) = &mut character_class { + if c == ']' { + regex.push('['); + regex.push_str(&characters); + regex.push(']'); + character_class = None; + } else { + characters.push(c); + } + } else { + in_escape = false; + if c == '?' { + regex.push_str("[^/]"); + } else if c == '*' { + regex.push_str("[^/]*"); + } else if c == '[' { + character_class = Some(String::new()); + } else { + regex.push_str(®ex_escape(&c.to_string())); + } + } + } + if in_escape { + regex.push_str(®ex_escape("\\")); + } + if i < components.len() - 1 { + regex.push('/'); + } + } + } + if matches_only_directory { + regex.push_str("/.*"); + } else { + regex.push_str("(/.*|$)"); + } + let regex = Regex::new(®ex).unwrap(); + + Ok(Some(GitIgnoreLine { is_negative, regex })) + } + + fn matches_file(&self, path: &str) -> bool { + self.regex.is_match(path) + } +} + +pub struct GitIgnoreFile { + parent: Option>, + lines: Vec, +} + +impl GitIgnoreFile { + pub fn empty() -> Arc { + Arc::new(GitIgnoreFile { + parent: None, + lines: vec![], + }) + } + + pub fn chain( + self: &Arc, + prefix: &str, + input: &[u8], + ) -> Result, GitIgnoreParseError> { + let mut lines = vec![]; + for input_line in input.split(|b| *b == b'\n') { + // Skip non-utf8 lines + if let Ok(line_string) = String::from_utf8(input_line.to_vec()) { + if let Some(line) = GitIgnoreLine::parse(prefix, &line_string)? { + lines.push(line); + } + } + } + + Ok(Arc::new(GitIgnoreFile { + parent: Some(self.clone()), + lines, + })) + } + + fn all_lines_reversed<'a>(&'a self) -> Box + 'a> { + if let Some(parent) = &self.parent { + Box::new(self.lines.iter().rev().chain(parent.all_lines_reversed())) + } else { + Box::new(self.lines.iter().rev()) + } + } + + pub fn matches_file(&self, path: &str) -> bool { + // Later lines take precedence, so check them in reverse + for line in self.all_lines_reversed() { + if line.matches_file(path) { + return !line.is_negative; + } + } + false + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + fn matches_file(input: &[u8], path: &str) -> bool { + let file = GitIgnoreFile::empty().chain("", input).ok().unwrap(); + file.matches_file(path) + } + + #[test] + fn test_gitignore_empty_file() { + let file = GitIgnoreFile::empty(); + assert!(!file.matches_file("foo")); + } + + #[test] + fn test_gitignore_empty_file_with_prefix() { + let file = GitIgnoreFile::empty().chain("dir/", b"").ok().unwrap(); + assert!(!file.matches_file("dir/foo")); + } + + #[test] + fn test_gitignore_literal() { + let file = GitIgnoreFile::empty().chain("", b"foo\n").ok().unwrap(); + assert!(file.matches_file("foo")); + assert!(file.matches_file("dir/foo")); + assert!(file.matches_file("dir/subdir/foo")); + assert!(!file.matches_file("food")); + assert!(!file.matches_file("dir/food")); + } + + #[test] + fn test_gitignore_literal_with_prefix() { + let file = GitIgnoreFile::empty().chain("dir/", b"foo\n").ok().unwrap(); + // I consider it undefined whether a file in a parent directory matches, but + // let's test it anyway + assert!(file.matches_file("foo")); + assert!(file.matches_file("dir/foo")); + assert!(file.matches_file("dir/subdir/foo")); + } + + #[test] + fn test_gitignore_rooted_literal() { + let file = GitIgnoreFile::empty().chain("", b"/foo\n").ok().unwrap(); + assert!(file.matches_file("foo")); + assert!(!file.matches_file("dir/foo")); + } + + #[test] + fn test_gitignore_rooted_literal_with_prefix() { + let file = GitIgnoreFile::empty() + .chain("dir/", b"/foo\n") + .ok() + .unwrap(); + // I consider it undefined whether a file in a parent directory matches, but + // let's test it anyway + assert!(!file.matches_file("foo")); + assert!(file.matches_file("dir/foo")); + assert!(!file.matches_file("dir/subdir/foo")); + } + + #[test] + fn test_gitignore_deep_dir() { + let file = GitIgnoreFile::empty() + .chain("", b"/dir1/dir2/dir3\n") + .ok() + .unwrap(); + assert!(!file.matches_file("foo")); + assert!(!file.matches_file("dir1/foo")); + assert!(!file.matches_file("dir1/dir2/foo")); + assert!(file.matches_file("dir1/dir2/dir3/foo")); + assert!(file.matches_file("dir1/dir2/dir3/dir4/foo")); + } + + #[test] + fn test_gitignore_match_only_dir() { + let file = GitIgnoreFile::empty().chain("", b"/dir/\n").ok().unwrap(); + assert!(!file.matches_file("dir")); + assert!(file.matches_file("dir/foo")); + assert!(file.matches_file("dir/subdir/foo")); + } + + #[test] + fn test_gitignore_unusual_symbols() { + assert!(matches_file(b"\\*\n", "*")); + assert!(!matches_file(b"\\*\n", "foo")); + assert!(matches_file(b"\\\n", "\\")); + assert!(matches_file(b"\\!\n", "!")); + assert!(matches_file(b"\\?\n", "?")); + assert!(!matches_file(b"\\?\n", "x")); + // Invalid escapes are treated like literal backslashes + assert!(matches_file(b"\\w\n", "\\w")); + assert!(!matches_file(b"\\w\n", "w")); + } + + #[test] + fn test_gitignore_whitespace() { + assert!(!matches_file(b" \n", " ")); + assert!(matches_file(b"\\ \n", " ")); + assert!(matches_file(b" a\n", " a")); + assert!(matches_file(b"a b\n", "a b")); + assert!(matches_file(b"a b \n", "a b")); + assert!(!matches_file(b"a b \n", "a b ")); + assert!(matches_file(b"a b\\ \\ \n", "a b ")); + // It's unclear how this should be interpreted, but we count spaces before + // escaped spaces + assert!(matches_file(b"a b \\ \n", "a b ")); + } + + #[test] + fn test_gitignore_glob() { + assert!(!matches_file(b"*.o\n", "foo")); + assert!(matches_file(b"*.o\n", "foo.o")); + assert!(!matches_file(b"foo.?\n", "foo")); + assert!(!matches_file(b"foo.?\n", "foo.")); + assert!(matches_file(b"foo.?\n", "foo.o")); + } + + #[test] + fn test_gitignore_range() { + assert!(!matches_file(b"foo.[az]\n", "foo")); + assert!(matches_file(b"foo.[az]\n", "foo.a")); + assert!(!matches_file(b"foo.[az]\n", "foo.g")); + assert!(matches_file(b"foo.[az]\n", "foo.z")); + assert!(!matches_file(b"foo.[a-z]\n", "foo")); + assert!(matches_file(b"foo.[a-z]\n", "foo.a")); + assert!(matches_file(b"foo.[a-z]\n", "foo.g")); + assert!(matches_file(b"foo.[a-z]\n", "foo.z")); + assert!(matches_file(b"foo.[0-9a-fA-F]\n", "foo.5")); + assert!(matches_file(b"foo.[0-9a-fA-F]\n", "foo.c")); + assert!(matches_file(b"foo.[0-9a-fA-F]\n", "foo.E")); + assert!(!matches_file(b"foo.[0-9a-fA-F]\n", "foo._")); + } + + #[test] + fn test_gitignore_leading_dir_glob() { + assert!(matches_file(b"**/foo\n", "foo")); + assert!(matches_file(b"**/foo\n", "dir1/dir2/foo")); + assert!(matches_file(b"**/foo\n", "foo/file")); + assert!(matches_file(b"**/dir/foo\n", "dir/foo")); + assert!(matches_file(b"**/dir/foo\n", "dir1/dir2/dir/foo")); + } + + #[test] + fn test_gitignore_leading_dir_glob_with_prefix() { + let file = GitIgnoreFile::empty() + .chain("dir1/dir2/", b"**/foo\n") + .ok() + .unwrap(); + // I consider it undefined whether a file in a parent directory matches, but + // let's test it anyway + assert!(!file.matches_file("foo")); + assert!(file.matches_file("dir1/dir2/foo")); + assert!(!file.matches_file("dir1/dir2/bar")); + assert!(file.matches_file("dir1/dir2/sub1/sub2/foo")); + assert!(!file.matches_file("dir1/dir2/sub1/sub2/bar")); + } + + #[test] + fn test_gitignore_trailing_dir_glob() { + assert!(!matches_file(b"abc/**\n", "abc")); + assert!(matches_file(b"abc/**\n", "abc/file")); + assert!(matches_file(b"abc/**\n", "abc/dir/file")); + } + + #[test] + fn test_gitignore_internal_dir_glob() { + assert!(matches_file(b"a/**/b\n", "a/b")); + assert!(matches_file(b"a/**/b\n", "a/x/b")); + assert!(matches_file(b"a/**/b\n", "a/x/y/b")); + assert!(!matches_file(b"a/**/b\n", "ax/y/b")); + assert!(!matches_file(b"a/**/b\n", "a/x/yb")); + assert!(!matches_file(b"a/**/b\n", "ab")); + } + + #[test] + fn test_gitignore_internal_dir_glob_not_really() { + assert!(!matches_file(b"a/x**y/b\n", "a/b")); + assert!(matches_file(b"a/x**y/b\n", "a/xy/b")); + assert!(matches_file(b"a/x**y/b\n", "a/xzzzy/b")); + } + + #[test] + fn test_gitignore_line_ordering() { + assert!(matches_file(b"foo\n!foo/bar\n", "foo")); + assert!(!matches_file(b"foo\n!foo/bar\n", "foo/bar")); + assert!(matches_file(b"foo\n!foo/bar\n", "foo/baz")); + assert!(matches_file(b"foo\n!foo/bar\nfoo/bar/baz", "foo")); + assert!(!matches_file(b"foo\n!foo/bar\nfoo/bar/baz", "foo/bar")); + assert!(matches_file(b"foo\n!foo/bar\nfoo/bar/baz", "foo/bar/baz")); + assert!(!matches_file(b"foo\n!foo/bar\nfoo/bar/baz", "foo/bar/quux")); + } + + #[test] + fn test_gitignore_file_ordering() { + let file1 = GitIgnoreFile::empty().chain("", b"foo\n").ok().unwrap(); + let file2 = file1.chain("foo/", b"!bar").ok().unwrap(); + let file3 = file2.chain("foo/bar/", b"baz").ok().unwrap(); + assert!(file1.matches_file("foo")); + assert!(file1.matches_file("foo/bar")); + assert!(!file2.matches_file("foo/bar")); + assert!(file2.matches_file("foo/baz")); + assert!(file3.matches_file("foo/bar/baz")); + assert!(!file3.matches_file("foo/bar/qux")); + } +} diff --git a/lib/src/lib.rs b/lib/src/lib.rs index 40ae2682f..2d1454b94 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -33,6 +33,7 @@ pub mod evolution; pub mod files; pub mod git; pub mod git_store; +pub mod gitignore; pub mod index; pub mod index_store; pub mod local_store; diff --git a/lib/src/working_copy.rs b/lib/src/working_copy.rs index 1a22cabb0..2504ae20d 100644 --- a/lib/src/working_copy.rs +++ b/lib/src/working_copy.rs @@ -17,6 +17,7 @@ use std::collections::{BTreeMap, HashSet}; use std::convert::TryInto; use std::fs; use std::fs::{File, OpenOptions}; +use std::io::Read; #[cfg(unix)] use std::os::unix::fs::symlink; #[cfg(unix)] @@ -27,13 +28,13 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::UNIX_EPOCH; -use git2::{Repository, RepositoryInitOptions}; use protobuf::Message; use tempfile::NamedTempFile; use thiserror::Error; use crate::commit::Commit; use crate::commit_builder::CommitBuilder; +use crate::gitignore::GitIgnoreFile; use crate::lock::FileLock; use crate::repo::ReadonlyRepo; use crate::repo_path::{ @@ -282,6 +283,25 @@ impl TreeState { self.store.write_symlink(path, str_target).unwrap() } + fn try_chain_gitignore( + base: &Arc, + prefix: &str, + file: PathBuf, + ) -> Arc { + if file.is_file() { + let mut file = File::open(file).unwrap(); + let mut buf = Vec::new(); + file.read_to_end(&mut buf).unwrap(); + if let Ok(chained) = base.chain(prefix, &buf) { + chained + } else { + base.clone() + } + } else { + base.clone() + } + } + // Look for changes to the working copy. If there are any changes, create // a new tree from it and return it, and also update the dirstate on disk. // TODO: respect ignores @@ -289,33 +309,30 @@ impl TreeState { // We create a temporary git repo with the working copy shared with ours only // so we can use libgit2's .gitignore check. // TODO: We should probably have the caller pass in the home directory to the - // library crate instead of depending on $HOME directly here (as we do because - // git2::Repository::status_should_ignore() reads the .gitignore there). - // TODO: Do this more cleanly, perhaps by reading .gitignore files ourselves. - let git_repo_dir = tempfile::tempdir().unwrap(); - let mut git_repo_options = RepositoryInitOptions::new(); - git_repo_options.workdir_path(&self.working_copy_path); - // Repository::init_opts creates a ".git" file in the working copy, - // which is undesired. On Windows it's worse because that ".git" makes - // the next Repository::init_opts fail with "Permission Denied". - // Automatically remove it. - let _cleanup_dot_git = { - struct Cleanup(PathBuf); - impl Drop for Cleanup { - fn drop(&mut self) { - let _ = fs::remove_file(&self.0); - } - } - Cleanup(self.working_copy_path.join(".git")) - }; - let git_repo = Repository::init_opts(git_repo_dir.path(), &git_repo_options).unwrap(); + // library crate instead of depending on $HOME directly here. We should also + // have the caller (within the library crate) chain that the + // .jj/git/info/exclude file if we're inside a git-backed repo. + let home_dir = std::env::var("HOME"); + let home_dir_path = PathBuf::from(home_dir.unwrap()); + let mut git_ignore = GitIgnoreFile::empty(); + git_ignore = + TreeState::try_chain_gitignore(&git_ignore, "", home_dir_path.join(".gitignore")); - let mut work = vec![(DirRepoPath::root(), self.working_copy_path.clone())]; + let mut work = vec![( + DirRepoPath::root(), + self.working_copy_path.clone(), + git_ignore, + )]; let mut tree_builder = self.store.tree_builder(self.tree_id.clone()); let mut deleted_files: HashSet<&FileRepoPath> = self.file_states.keys().collect(); let mut modified_files = BTreeMap::new(); while !work.is_empty() { - let (dir, disk_dir) = work.pop().unwrap(); + let (dir, disk_dir, git_ignore) = work.pop().unwrap(); + let git_ignore = TreeState::try_chain_gitignore( + &git_ignore, + &dir.to_internal_string(), + disk_dir.join(".gitignore"), + ); for maybe_entry in disk_dir.read_dir().unwrap() { let entry = maybe_entry.unwrap(); let file_type = entry.file_type().unwrap(); @@ -327,7 +344,7 @@ impl TreeState { if file_type.is_dir() { let subdir = dir.join(&DirRepoPathComponent::from(name)); let disk_subdir = disk_dir.join(file_name); - work.push((subdir, disk_subdir)); + work.push((subdir, disk_subdir, git_ignore.clone())); } else { let file = dir.join(&FileRepoPathComponent::from(name)); let disk_file = disk_dir.join(file_name); @@ -338,7 +355,7 @@ impl TreeState { match self.file_states.get(&file) { None => { // untracked - if git_repo.status_should_ignore(&disk_file).unwrap() { + if git_ignore.matches_file(&file.to_internal_string()) { continue; } clean = false;