Utility for making rough, heuristic estimations of code similarity between
implementations.

The similarity algorithm is based on token sequence matching, like some other
software serving the same purpose. This approach has many limitations but may fit
some use cases.
diff --git a/assets/screenshot.png b/assets/screenshot.png new file mode 100644 index 0000000..e286060 Binary files /dev/null and b/assets/screenshot.png differ diff --git a/mousquet/Cargo.toml b/mousquet/Cargo.toml new file mode 100644 index 0000000..fb1cab6 --- /dev/null +++ b/mousquet/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "mousquet" +version = "0.1.0" +edition = "2024" + +[dependencies] diff --git a/mousquet/examples/differents.rs b/mousquet/examples/differents.rs new file mode 100644 index 0000000..0d0979f --- /dev/null +++ b/mousquet/examples/differents.rs @@ -0,0 +1,10 @@ +fn main() { + let source_a = include_str!("primes_1.py"); + let source_b = include_str!("primes_2.py"); + let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b); + for sim in sims.token_matches { + let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n"); + let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n"); + println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'"); + } +} diff --git a/mousquet/examples/primes_1.py b/mousquet/examples/primes_1.py new file mode 100644 index 0000000..ba4d934 --- /dev/null +++ b/mousquet/examples/primes_1.py @@ -0,0 +1,21 @@ + +# Generates the sequence of prime numbers up to a maximum number. +def primes(max = 999_999_999): + found = list[int]() # Known primes for subsequent dividability checks. + for value in range(2, max): + is_prime = True # Prime until proven otherwise. + for prime in found: + if value % prime == 0: + is_prime = False + break + if is_prime: + yield value + found.append(value) + + +def main(): + for p in primes(): + print(p) + + +if __name__ == "__main__": main() diff --git a/mousquet/examples/primes_1_ren.py b/mousquet/examples/primes_1_ren.py new file mode 100644 index 0000000..8bb8edf --- /dev/null +++ b/mousquet/examples/primes_1_ren.py @@ -0,0 +1,24 @@ +# original file ? 
+ + +def get_pr(limit = 999_999_999): + result = list[int]() + + for num in range(2, limit): + valid = True + for known in result: + if num % known == 0: + valid = False + break + + if valid: + yield num + result.append(num) + + +def main(): + for num in get_pr(): + print(num) + + +if __name__ == "__main__": main() diff --git a/mousquet/examples/primes_2.py b/mousquet/examples/primes_2.py new file mode 100644 index 0000000..26c7286 --- /dev/null +++ b/mousquet/examples/primes_2.py @@ -0,0 +1,24 @@ + +from typing import Generator + + +def prime_numbers(max = 999_999_999): + def rec_between(value: int, at: int, until: int) -> bool: + if at >= until: return True + if value % at == 0: return False + return rec_between(value, at + 1, until) + def rec(value: int) -> Generator[int]: + if value >= max: return + if rec_between(value, 2, value): yield value + for r in rec(value + 1): yield r + for r in rec(2): yield r + + +def print_all(): + for p in prime_numbers(): + print(p) + + +if __name__ == "__main__": print_all() + +# author: mb diff --git a/mousquet/examples/renamed.rs b/mousquet/examples/renamed.rs new file mode 100644 index 0000000..a8f67c6 --- /dev/null +++ b/mousquet/examples/renamed.rs @@ -0,0 +1,10 @@ +fn main() { + let source_a = include_str!("primes_1.py"); + let source_b = include_str!("primes_1_ren.py"); + let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b); + for sim in sims.token_matches { + let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n"); + let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n"); + println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'"); + } +} diff --git a/mousquet/examples/same.rs b/mousquet/examples/same.rs new file mode 100644 index 0000000..4d73f11 --- /dev/null +++ b/mousquet/examples/same.rs @@ -0,0 +1,10 @@ +fn main() { + let source_a = include_str!("primes_1.py"); + let source_b = include_str!("primes_1.py"); + let sims = 
mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b); + for sim in sims.token_matches { + let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n"); + let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n"); + println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'"); + } +} diff --git a/mousquet/examples/small.py b/mousquet/examples/small.py new file mode 100644 index 0000000..cc2ff96 --- /dev/null +++ b/mousquet/examples/small.py @@ -0,0 +1,2 @@ +def hello(): + print("Hello World") diff --git a/mousquet/examples/tokenize.rs b/mousquet/examples/tokenize.rs new file mode 100644 index 0000000..fe5e830 --- /dev/null +++ b/mousquet/examples/tokenize.rs @@ -0,0 +1,6 @@ +fn main() { + let source = include_str!("small.py"); + let language = mousquet::lang::python::LANG; + let tokens = (language.tokenizer)(source); + dbg!(&tokens, tokens.len()); +} diff --git a/mousquet/src/lang.rs b/mousquet/src/lang.rs new file mode 100644 index 0000000..fe51458 --- /dev/null +++ b/mousquet/src/lang.rs @@ -0,0 +1,39 @@ +use std::{fmt::Debug, ops::Range}; + +pub mod python; + +pub const PYTHON: Lang = python::LANG; +pub const ALL: &[Lang] = &[PYTHON]; + +#[derive(Debug, Clone, Copy)] +pub struct Lang { + pub id: &'static str, + pub tokenizer: fn(&str) -> Vec>, + pub ignored_token: &'static [&'static str], + pub ignored_token_content: &'static [&'static str], +} + +pub type Span = Range; +pub type Located = (Span, T); + +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct Token { + pub kind: &'static str, + pub content: String, +} + +impl Token { + pub fn of_kind(kind: &'static str) -> impl Fn(&str) -> Token { + move |content| Token { + kind, + content: content.into(), + } + } +} + +impl Debug for Token { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Token { kind, content } = self; + f.write_fmt(format_args!(r#"Token({kind},"{content}")"#)) + } +} diff --git a/mousquet/src/lang/python.rs 
b/mousquet/src/lang/python.rs new file mode 100644 index 0000000..f817b38 --- /dev/null +++ b/mousquet/src/lang/python.rs @@ -0,0 +1,171 @@ +use crate::lang::{Lang, Located, Token}; + +pub const LANG: Lang = Lang { + id: "python", + tokenizer, + ignored_token: &["sp"], + ignored_token_content: &["id"], +}; + +pub fn tokenizer(text: &str) -> Vec> { + Tokenizer(text, text.len()).collect() +} + +pub struct Tokenizer<'s>(&'s str, usize); + +impl<'s> Iterator for Tokenizer<'s> { + type Item = Located; + fn next(&mut self) -> Option { + if self.0.is_empty() { + return None; + } + self.skip_comment(); + let start = self.1 - self.0.len(); + let result = self + .parse_space() + .or_else(|| self.parse_op()) + .or_else(|| self.parse_str()) + .or_else(|| self.parse_ident()) + .or_else(|| self.parse_unknown()); + let end = self.1 - self.0.len(); + result.map(|r| ((start..end), r)) + } +} + +impl<'s> Tokenizer<'s> { + fn skip_comment(&mut self) { + while self.0.starts_with("#") { + let line_length = self.0.find("\n").unwrap_or(self.0.len()); + self.0 = &self.0[line_length..]; + } + } + + fn try_take(&mut self, word: &str) -> Option<&str> { + if self.0.strip_prefix(word).is_some() { + let (word, rest) = self.0.split_at(word.len()); + self.0 = rest; + Some(word) + } else { + None + } + } + + fn parse_space(&mut self) -> Option { + [" ", "\n", "\t", "\r"] + .iter() + .filter_map(|op| self.try_take(op).map(Token::of_kind("sp"))) + .next() + } + + fn parse_op(&mut self) -> Option { + OPERATORS + .iter() + .filter_map(|op| self.try_take(op).map(Token::of_kind("op"))) + .next() + } + + fn parse_str(&mut self) -> Option { + let (open, close) = STR_STARTS.iter().find(|(s, _)| self.0.starts_with(s))?; + let mut content_length = 0; + loop { + let prefix_length = open.len() + content_length; + let remainder = match self.0.get(prefix_length..) 
{ + None => break, + Some("") => break, + Some(r) => r, + }; + if remainder.starts_with("\\") { + content_length += 2; + continue; + } + if remainder.starts_with(close) { + let length = open.len() + content_length + close.len(); + let content = &self.0[..length]; + return self.try_take(content).map(Token::of_kind("str")); + } + content_length += 1; + } + None + } + + fn parse_ident(&mut self) -> Option { + let forbidden = " \n\t\r!-@*/&%^+<=>|~()[]{}:;,."; + let length = self.0.chars().take_while(|c| !forbidden.contains(*c)).count(); + self.try_take(&self.0[..length]).map(|content| { + let kind = match KEYWORDS.contains(&content) { + true => "kw", + false => "id", + }; + Token::of_kind(kind)(content) + }) + } + + fn parse_unknown(&mut self) -> Option { + let next_break = self.0.find(' ').unwrap_or(self.0.len()); + let content = self.try_take(&self.0[..next_break]).unwrap(); + Some(Token::of_kind("unk")(content)) + } +} + +/// Ordered by size then alphabetically. +const OPERATORS: &[&str] = &[ + "**=", // + "//=", // + "<<=", // + ">>=", // + "-=", // + "!=", // + "[]", // + "@=", // + "**", // + "*=", // + "//", // + "/=", // + "&=", // + "%=", // + "^=", // + "+=", // + "<<", // + "<=", // + "==", // + ">=", // + ">>", // + "|=", // + "-", // + "@", // + "*", // + "/", // + "&", // + "%", // + "^", // + "+", // + "<", // + "=", // + ">", // + "|", // + "~", // + "(", // + ")", // + "[", // + "]", // + "{", // + "}", // + ":", // + ";", // + ",", // + ".", // +]; + +const KEYWORDS: &[&str] = &[ + "def", "and", "or", "not", "for", "while", "in", "try", "raise", "except", "yield", "return", "import", "from", + "as", +]; + +const STR_STARTS: &[(&str, &str)] = &[ + (r#"r""""#, r#"""""#), + (r#"b""""#, r#"""""#), + (r#"""""#, r#"""""#), + (r#"r""#, r#"""#), + (r#"b""#, r#"""#), + (r#"""#, r#"""#), +]; diff --git a/mousquet/src/lcs.rs b/mousquet/src/lcs.rs new file mode 100644 index 0000000..5d8d5bf --- /dev/null +++ b/mousquet/src/lcs.rs @@ -0,0 +1,46 @@ +use 
crate::lang::Span; + +pub fn longuest_common_section(a: &[T], b: &[T]) -> Option<(Span, Span)> { + let max_size = a.len().min(b.len()); + for size in (1..=max_size).rev() { + for a_start in 0..=(a.len() - size) { + let a_span = a_start..(a_start + size); + let a_section = &a[a_span.clone()]; + for b_start in 0..=(b.len() - size) { + let b_span = b_start..(b_start + size); + let b_section = &b[b_span.clone()]; + if a_section == b_section { + return Some((a_span, b_span)); + } + } + } + } + None +} + +#[test] +fn test_longuest_common_section() { + fn illustrate<'a>((a, b): (&'a [i32], &'a [i32]), (sa, sb): (Span, Span)) -> (&'a [i32], &'a [i32]) { + (&a[sa], &b[sb]) + } + + fn case(a: [i32; A], b: [i32; B], expected: [i32; E]) { + let res = longuest_common_section(&a, &b).unwrap(); + let ill = illustrate((&a, &b), res); + let exp: (&[i32], &[i32]) = (&expected, &expected); + assert_eq!(ill, exp); + } + + case( + /*****/ [1, 2, 3, 4, 5, 6, 7, 8, 9], + /**/ [8, 9, 2, 3, 4], + /********/ [2, 3, 4], + ); + + case( + // + [1, 2, 3, 4, 5, 6], + [1, 2, 3, 4, 5, 6], + [1, 2, 3, 4, 5, 6], + ); +} diff --git a/mousquet/src/lib.rs b/mousquet/src/lib.rs new file mode 100644 index 0000000..e92950b --- /dev/null +++ b/mousquet/src/lib.rs @@ -0,0 +1,92 @@ +use std::ops::Range; + +pub mod lang; +pub mod lcs; + +use crate::lang::{Lang, Located, Span, Token}; + +pub fn similarity(lang: Lang, source_a: &str, source_b: &str) -> Similarity { + let tokens_a = (lang.tokenizer)(source_a); + let tokens_b = (lang.tokenizer)(source_b); + + let exact_matches = Vec::new(); + // TODO + + let mut token_matches = Vec::new(); + { + let tokens_a = tokens_a.clone(); + let tokens_b = tokens_b.clone(); + let (tokens_a, comparables_a) = comparable_parts_of(&lang, &tokens_a); + let (tokens_b, comparables_b) = comparable_parts_of(&lang, &tokens_b); + let mut segments_a = vec![(tokens_a, comparables_a)]; + let mut segments_b = vec![(tokens_b, comparables_b)]; + + let length_threshold = 6; + while let 
Some(biggest_common_segment) = segments_a + .iter() + .enumerate() + .flat_map(|(segment_index_a, (_, segment_a))| { + segments_b + .iter() + .enumerate() + .filter_map(move |(segment_index_b, (_, segment_b))| { + let common = lcs::longuest_common_section(segment_a, segment_b)?; + Some(((segment_index_a, segment_index_b), common)) + }) + }) + .filter(|(_, (range_a, _))| range_a.len() > length_threshold) + .max_by_key(|(_, (range_a, _))| range_a.len()) + { + let ((segment_index_a, segment_index_b), (token_range_a, token_range_b)) = biggest_common_segment; + let segment_a = segments_a.remove(segment_index_a); + let segment_b = segments_b.remove(segment_index_b); + + let (tokens_l, tokens_a, tokens_r) = slice_range(segment_a.0, token_range_a.clone()); + let (compas_l, _comps_a, compas_r) = slice_range(segment_a.1, token_range_a); + segments_a.extend_from_slice(&[(tokens_l, compas_l), (tokens_r, compas_r)]); + + let (tokens_l, tokens_b, tokens_r) = slice_range(segment_b.0, token_range_b.clone()); + let (compas_l, _comps_b, compas_r) = slice_range(segment_b.1, token_range_b); + segments_b.extend_from_slice(&[(tokens_l, compas_l), (tokens_r, compas_r)]); + + let (first, last) = (tokens_a.first().unwrap(), tokens_a.last().unwrap()); + let character_span_a = first.0.start..last.0.end; + let (first, last) = (tokens_b.first().unwrap(), tokens_b.last().unwrap()); + let character_span_b = first.0.start..last.0.end; + token_matches.push(Match(character_span_a, character_span_b)); + } + } + + Similarity { + exact_matches, + token_matches, + } +} + +fn slice_range(mut items: Vec, range: Span) -> (Vec, Vec, Vec) { + let end = items.split_off(range.end); + let middle = items.split_off(range.start); + let start = items; + (start, middle, end) +} + +type TokenAndContent = (&'static str, Option); +fn comparable_parts_of(lang: &Lang, tokens: &[(Range, Token)]) -> (Vec>, Vec) { + tokens + .iter() + .filter_map(|token @ (_, Token { kind, content })| match kind { + k if 
lang.ignored_token.contains(k) => None, + k if lang.ignored_token_content.contains(k) => Some(((token.clone()), (*k, None))), + k => Some((token.clone(), (k, Some(content.clone())))), + }) + .collect() +} + +#[derive(Debug, Clone)] +pub struct Similarity { + pub exact_matches: Vec, + pub token_matches: Vec, +} + +#[derive(Debug, Clone)] +pub struct Match(pub Range, pub Range); diff --git a/mousquetaire/Cargo.toml b/mousquetaire/Cargo.toml new file mode 100644 index 0000000..bc6eb51 --- /dev/null +++ b/mousquetaire/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "mousquetaire" +version = "0.1.0" +edition = "2024" + +[dependencies] +mousquet = { path = "../mousquet" } diff --git a/mousquetaire/src/main.rs b/mousquetaire/src/main.rs new file mode 100644 index 0000000..84d2875 --- /dev/null +++ b/mousquetaire/src/main.rs @@ -0,0 +1,61 @@ +use std::{env::args, fs}; + +use mousquet::lang::Span; + +pub struct Args { + file_a: String, + file_b: String, +} + +impl Args { + pub fn parse() -> Self { + let [_, file_a, file_b] = args() + .collect::>() + .try_into() + .expect("Usage: mousquet "); + Self { file_a, file_b } + } +} + +fn main() { + let Args { file_a, file_b } = Args::parse(); + let source_a = fs::read_to_string(&file_a).unwrap(); + let source_b = fs::read_to_string(&file_b).unwrap(); + let similarities = mousquet::similarity(mousquet::lang::PYTHON, &source_a, &source_b); + + let mut similarities_in_a: Vec<_> = similarities.token_matches.iter().map(|s| s.0.clone()).collect(); + let mut similarities_in_b: Vec<_> = similarities.token_matches.iter().map(|s| s.1.clone()).collect(); + similarities_in_a.sort_by_key(|s| s.start); + similarities_in_b.sort_by_key(|s| s.start); + println!(); + print_file_with_similarities(file_a, source_a, similarities_in_a); + print_file_with_similarities(file_b, source_b, similarities_in_b); +} + +fn print_file_with_similarities(file_name: String, file_content: String, sorted_similarities: Vec) { + 
println!("┌────────────────────────────────────────"); + println!("│ File '{file_name}':"); + println!("├────────────────────────────────────────"); + print!("│"); + let mut prev_end = 0; + for sim in sorted_similarities { + let before = &file_content[prev_end..sim.start]; + let inside = &file_content[sim.start..sim.end]; + prev_end = sim.end; + print_formatted_text(before, "│ ", (BLUE, RESET)); + print_formatted_text(inside, "│ ", (YELLOW, RESET)); + } + print_formatted_text(&file_content[prev_end..], "│ ", (BLUE, RESET)); + println!(); + println!("└────────────────────────────────────────"); +} + +fn print_formatted_text(text: &str, prefix: &str, color: (&str, &str)) { + let (col_start, col_end) = color; + let prefixed = text.replace("\n", &format!("{col_end}\n{prefix}{col_start}")); + print!("{col_start}{prefixed}{col_end}"); +} + +const YELLOW: &str = "\x1b[0;33m"; +const BLUE: &str = "\x1b[0;34m"; +const RESET: &str = "\x1b[0m"; diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..6917979 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,3 @@ + +max_width = 120 +hard_tabs = true