Matthieu Jolimaitre 2025-08-22 12:29:55 +02:00
commit 0f8550b517
21 changed files with 584 additions and 0 deletions

.gitignore vendored Normal file (+1)

@@ -0,0 +1 @@
/target

Cargo.lock generated Normal file (+14)

@@ -0,0 +1,14 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4

[[package]]
name = "mousquet"
version = "0.1.0"

[[package]]
name = "mousquetaire"
version = "0.1.0"
dependencies = [
 "mousquet",
]

Cargo.toml Normal file (+2)

@@ -0,0 +1,2 @@
[workspace]
members = ["mousquet", "mousquetaire"]

README.md Normal file (+35)

@@ -0,0 +1,35 @@
# Mousquet
Utility for making rough, heuristic estimates of code similarity between
implementations.

The similarity algorithm is based on token sequence matching, like some other
software serving the same purpose. This approach has many limitations but may
fit some use cases.
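In essence, both sources are tokenized and the comparison looks for long runs
of identical tokens rather than identical characters. A minimal sketch of that
idea (not this crate's actual API; the whitespace "tokenizer" and all names
below are illustrative only):

```rust
// Sketch: find the longest run of identical tokens shared by two token
// sequences. Returns (start in a, start in b, run length).
fn longest_common_run(a: &[&str], b: &[&str]) -> Option<(usize, usize, usize)> {
	let max = a.len().min(b.len());
	// Try lengths from largest to smallest; the first hit is the longest.
	for len in (1..=max).rev() {
		for i in 0..=a.len() - len {
			for j in 0..=b.len() - len {
				if a[i..i + len] == b[j..j + len] {
					return Some((i, j, len));
				}
			}
		}
	}
	None
}

fn main() {
	// A naive whitespace split stands in for a real lexer.
	let a: Vec<&str> = "def f ( x ) : return x + 1".split_whitespace().collect();
	let b: Vec<&str> = "def g ( y ) : return y + 1".split_whitespace().collect();
	if let Some((i, j, len)) = longest_common_run(&a, &b) {
		println!("{len} tokens match at a[{i}..] / b[{j}..]: {:?}", &a[i..i + len]);
	}
}
```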
## Example
```bash
$ mousquetaire 'examples/primes_1.py' 'examples/primes_2.py'
```
![screenshot](./assets/screenshot.png)
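The `mousquet` crate can also be used as a library. A minimal sketch mirroring
the example programs in `mousquet/examples/` (the file names `a.py` and `b.py`
are placeholders):

```rust
fn main() {
	let source_a = std::fs::read_to_string("a.py").unwrap();
	let source_b = std::fs::read_to_string("b.py").unwrap();
	let sims = mousquet::similarity(mousquet::lang::PYTHON, &source_a, &source_b);
	// Each match carries a character span into each source.
	for mousquet::Match(span_a, span_b) in sims.token_matches {
		println!("a[{span_a:?}] matches b[{span_b:?}]");
	}
}
```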
## Build

### Dependencies

- cargo
  - Install cargo through rustup
    - `pacman -S rustup`
    - `curl --proto '=https' --tlsv1.2 -sSf 'https://sh.rustup.rs' | sh`
  - Use any toolchain
    - `rustup default stable`

### Building
```bash
cargo build --release
```
Since the crates share a workspace, find the binary at
`target/release/mousquetaire`.

BIN
assets/screenshot.png Normal file (78 KiB)

mousquet/Cargo.toml Normal file (+6)

@@ -0,0 +1,6 @@
[package]
name = "mousquet"
version = "0.1.0"
edition = "2024"
[dependencies]


@@ -0,0 +1,10 @@
fn main() {
	let source_a = include_str!("primes_1.py");
	let source_b = include_str!("primes_2.py");
	let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b);
	for sim in sims.token_matches {
		let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n");
		let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n");
		println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'");
	}
}

mousquet/examples/primes_1.py Normal file (+21)

@@ -0,0 +1,21 @@
# Generates the sequence of prime numbers up to a maximum number.
def primes(max = 999_999_999):
	found = list[int]() # Known primes for subsequent dividability checks.
	for value in range(2, max):
		is_prime = True # Prime until proven otherwise.
		for prime in found:
			if value % prime == 0:
				is_prime = False
				break
		if is_prime:
			yield value
			found.append(value)

def main():
	for p in primes():
		print(p)

if __name__ == "__main__": main()

mousquet/examples/primes_1_ren.py Normal file (+24)

@@ -0,0 +1,24 @@
# original file ?
def get_pr(limit = 999_999_999):
	result = list[int]()
	for num in range(2, limit):
		valid = True
		for known in result:
			if num % known == 0:
				valid = False
				break
		if valid:
			yield num
			result.append(num)

def main():
	for num in get_pr():
		print(num)

if __name__ == "__main__": main()

mousquet/examples/primes_2.py Normal file (+24)

@@ -0,0 +1,24 @@
from typing import Generator

def prime_numbers(max = 999_999_999):
	def rec_between(value: int, at: int, until: int) -> bool:
		if at >= until: return True
		if value % at == 0: return False
		return rec_between(value, at + 1, until)
	def rec(value: int) -> Generator[int]:
		if value >= max: return
		if rec_between(value, 2, value): yield value
		for r in rec(value + 1): yield r
	for r in rec(2): yield r

def print_all():
	for p in prime_numbers():
		print(p)

if __name__ == "__main__": print_all()
# author: mb


@@ -0,0 +1,10 @@
fn main() {
	let source_a = include_str!("primes_1.py");
	let source_b = include_str!("primes_1_ren.py");
	let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b);
	for sim in sims.token_matches {
		let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n");
		let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n");
		println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'");
	}
}

mousquet/examples/same.rs Normal file (+10)

@@ -0,0 +1,10 @@
fn main() {
	let source_a = include_str!("primes_1.py");
	let source_b = include_str!("primes_1.py");
	let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b);
	for sim in sims.token_matches {
		let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n");
		let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n");
		println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'");
	}
}

mousquet/examples/small.py Normal file (+2)

@@ -0,0 +1,2 @@
def hello():
	print("Hello World")


@@ -0,0 +1,6 @@
fn main() {
	let source = include_str!("small.py");
	let language = mousquet::lang::python::LANG;
	let tokens = (language.tokenizer)(source);
	dbg!(&tokens, tokens.len());
}

mousquet/src/lang.rs Normal file (+39)

@@ -0,0 +1,39 @@
use std::{fmt::Debug, ops::Range};

pub mod python;

pub const PYTHON: Lang = python::LANG;
pub const ALL: &[Lang] = &[PYTHON];

#[derive(Debug, Clone, Copy)]
pub struct Lang {
	pub id: &'static str,
	pub tokenizer: fn(&str) -> Vec<Located<Token>>,
	pub ignored_token: &'static [&'static str],
	pub ignored_token_content: &'static [&'static str],
}

pub type Span = Range<usize>;
pub type Located<T> = (Span, T);

#[derive(Clone, PartialEq, Eq, Hash)]
pub struct Token {
	pub kind: &'static str,
	pub content: String,
}

impl Token {
	pub fn of_kind(kind: &'static str) -> impl Fn(&str) -> Token {
		move |content| Token {
			kind,
			content: content.into(),
		}
	}
}

impl Debug for Token {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		let Token { kind, content } = self;
		f.write_fmt(format_args!(r#"Token({kind},"{content}")"#))
	}
}

mousquet/src/lang/python.rs Normal file (+171)

@@ -0,0 +1,171 @@
use crate::lang::{Lang, Located, Token};

pub const LANG: Lang = Lang {
	id: "python",
	tokenizer,
	ignored_token: &["sp"],
	ignored_token_content: &["id"],
};

pub fn tokenizer(text: &str) -> Vec<Located<Token>> {
	Tokenizer(text, text.len()).collect()
}

/// Remaining input, and total input length (used to compute token spans).
pub struct Tokenizer<'s>(&'s str, usize);

impl<'s> Iterator for Tokenizer<'s> {
	type Item = Located<Token>;

	fn next(&mut self) -> Option<Self::Item> {
		if self.0.is_empty() {
			return None;
		}
		self.skip_comment();
		let start = self.1 - self.0.len();
		let result = self
			.parse_space()
			.or_else(|| self.parse_op())
			.or_else(|| self.parse_str())
			.or_else(|| self.parse_ident())
			.or_else(|| self.parse_unknown());
		let end = self.1 - self.0.len();
		result.map(|r| ((start..end), r))
	}
}

impl<'s> Tokenizer<'s> {
	fn skip_comment(&mut self) {
		while self.0.starts_with("#") {
			let line_length = self.0.find("\n").unwrap_or(self.0.len());
			self.0 = &self.0[line_length..];
		}
	}

	/// Consumes `word` from the front of the input if present.
	fn try_take(&mut self, word: &str) -> Option<&str> {
		if self.0.strip_prefix(word).is_some() {
			let (word, rest) = self.0.split_at(word.len());
			self.0 = rest;
			Some(word)
		} else {
			None
		}
	}

	fn parse_space(&mut self) -> Option<Token> {
		[" ", "\n", "\t", "\r"]
			.iter()
			.filter_map(|op| self.try_take(op).map(Token::of_kind("sp")))
			.next()
	}

	fn parse_op(&mut self) -> Option<Token> {
		OPERATORS
			.iter()
			.filter_map(|op| self.try_take(op).map(Token::of_kind("op")))
			.next()
	}

	fn parse_str(&mut self) -> Option<Token> {
		let (open, close) = STR_STARTS.iter().find(|(s, _)| self.0.starts_with(s))?;
		let mut content_length = 0;
		loop {
			let prefix_length = open.len() + content_length;
			let remainder = match self.0.get(prefix_length..) {
				None => break,
				Some("") => break,
				Some(r) => r,
			};
			// Jump over escape sequences such as `\"` so they cannot close the string.
			if remainder.starts_with("\\") {
				content_length += 2;
				continue;
			}
			if remainder.starts_with(close) {
				let length = open.len() + content_length + close.len();
				let content = &self.0[..length];
				return self.try_take(content).map(Token::of_kind("str"));
			}
			content_length += 1;
		}
		None
	}

	/// Reads an identifier or keyword, up to the next separator character.
	fn parse_ident(&mut self) -> Option<Token> {
		let forbidden = " \n\t\r!-@*/&%^+<=>|~()[]{}:;,.";
		// Measure in bytes so the slice below stays on a character boundary.
		let length = self
			.0
			.chars()
			.take_while(|c| !forbidden.contains(*c))
			.map(char::len_utf8)
			.sum::<usize>();
		if length == 0 {
			// Yielding an empty token here would never advance the input.
			return None;
		}
		self.try_take(&self.0[..length]).map(|content| {
			let kind = match KEYWORDS.contains(&content) {
				true => "kw",
				false => "id",
			};
			Token::of_kind(kind)(content)
		})
	}

	fn parse_unknown(&mut self) -> Option<Token> {
		let next_break = self.0.find(' ').unwrap_or(self.0.len());
		let content = self.try_take(&self.0[..next_break]).unwrap();
		Some(Token::of_kind("unk")(content))
	}
}
/// Ordered by size then alphabetically.
const OPERATORS: &[&str] = &[
	"**=", //
	"//=", //
	"<<=", //
	">>=", //
	"-=", //
	"!=", //
	"[]", //
	"@=", //
	"**", //
	"*=", //
	"//", //
	"/=", //
	"&=", //
	"%=", //
	"^=", //
	"+=", //
	"<<", //
	"<=", //
	"==", //
	">=", //
	">>", //
	"|=", //
	"-", //
	"@", //
	"*", //
	"/", //
	"&", //
	"%", //
	"^", //
	"+", //
	"<", //
	"=", //
	">", //
	"|", //
	"~", //
	"(", //
	")", //
	"[", //
	"]", //
	"{", //
	"}", //
	":", //
	";", //
	",", //
	".", //
];

const KEYWORDS: &[&str] = &[
	"def", "and", "or", "not", "for", "while", "in", "try", "raise", "except", "yield", "return", "import", "from",
	"as",
];

const STR_STARTS: &[(&str, &str)] = &[
	(r#"r""""#, r#"""""#),
	(r#"b""""#, r#"""""#),
	(r#"""""#, r#"""""#),
	(r#"r""#, r#"""#),
	(r#"b""#, r#"""#),
	(r#"""#, r#"""#),
];

mousquet/src/lcs.rs Normal file (+46)

@@ -0,0 +1,46 @@
use crate::lang::Span;

/// Returns the spans of the longest slice that `a` and `b` have in common, if any.
pub fn longest_common_section<T: Eq>(a: &[T], b: &[T]) -> Option<(Span, Span)> {
	let max_size = a.len().min(b.len());
	// Try sizes from largest to smallest, so the first hit is the longest.
	for size in (1..=max_size).rev() {
		for a_start in 0..=(a.len() - size) {
			let a_span = a_start..(a_start + size);
			let a_section = &a[a_span.clone()];
			for b_start in 0..=(b.len() - size) {
				let b_span = b_start..(b_start + size);
				let b_section = &b[b_span.clone()];
				if a_section == b_section {
					return Some((a_span, b_span));
				}
			}
		}
	}
	None
}

#[test]
fn test_longest_common_section() {
	fn illustrate<'a>((a, b): (&'a [i32], &'a [i32]), (sa, sb): (Span, Span)) -> (&'a [i32], &'a [i32]) {
		(&a[sa], &b[sb])
	}
	fn case<const A: usize, const B: usize, const E: usize>(a: [i32; A], b: [i32; B], expected: [i32; E]) {
		let res = longest_common_section(&a, &b).unwrap();
		let ill = illustrate((&a, &b), res);
		let exp: (&[i32], &[i32]) = (&expected, &expected);
		assert_eq!(ill, exp);
	}
	case(
		/*****/ [1, 2, 3, 4, 5, 6, 7, 8, 9],
		/**/ [8, 9, 2, 3, 4],
		/********/ [2, 3, 4],
	);
	case(
		//
		[1, 2, 3, 4, 5, 6],
		[1, 2, 3, 4, 5, 6],
		[1, 2, 3, 4, 5, 6],
	);
}

mousquet/src/lib.rs Normal file (+92)

@@ -0,0 +1,92 @@
use std::ops::Range;

pub mod lang;
pub mod lcs;

use crate::lang::{Lang, Located, Span, Token};

pub fn similarity(lang: Lang, source_a: &str, source_b: &str) -> Similarity {
	let tokens_a = (lang.tokenizer)(source_a);
	let tokens_b = (lang.tokenizer)(source_b);

	// TODO: exact (character-level) matching is not implemented yet.
	let exact_matches = Vec::new();

	let mut token_matches = Vec::new();
	{
		let tokens_a = tokens_a.clone();
		let tokens_b = tokens_b.clone();
		let (tokens_a, comparables_a) = comparable_parts_of(&lang, &tokens_a);
		let (tokens_b, comparables_b) = comparable_parts_of(&lang, &tokens_b);
		let mut segments_a = vec![(tokens_a, comparables_a)];
		let mut segments_b = vec![(tokens_b, comparables_b)];
		let length_threshold = 6;
		// Greedily extract the longest common token run over all remaining
		// segment pairs, then split both segments around it and repeat.
		while let Some(biggest_common_segment) = segments_a
			.iter()
			.enumerate()
			.flat_map(|(segment_index_a, (_, segment_a))| {
				segments_b
					.iter()
					.enumerate()
					.filter_map(move |(segment_index_b, (_, segment_b))| {
						let common = lcs::longest_common_section(segment_a, segment_b)?;
						Some(((segment_index_a, segment_index_b), common))
					})
			})
			.filter(|(_, (range_a, _))| range_a.len() > length_threshold)
			.max_by_key(|(_, (range_a, _))| range_a.len())
		{
			let ((segment_index_a, segment_index_b), (token_range_a, token_range_b)) = biggest_common_segment;
			let segment_a = segments_a.remove(segment_index_a);
			let segment_b = segments_b.remove(segment_index_b);
			let (tokens_l, tokens_a, tokens_r) = slice_range(segment_a.0, token_range_a.clone());
			let (compas_l, _comps_a, compas_r) = slice_range(segment_a.1, token_range_a);
			segments_a.extend_from_slice(&[(tokens_l, compas_l), (tokens_r, compas_r)]);
			let (tokens_l, tokens_b, tokens_r) = slice_range(segment_b.0, token_range_b.clone());
			let (compas_l, _comps_b, compas_r) = slice_range(segment_b.1, token_range_b);
			segments_b.extend_from_slice(&[(tokens_l, compas_l), (tokens_r, compas_r)]);
			// Convert the matched token runs back to character spans.
			let (first, last) = (tokens_a.first().unwrap(), tokens_a.last().unwrap());
			let character_span_a = first.0.start..last.0.end;
			let (first, last) = (tokens_b.first().unwrap(), tokens_b.last().unwrap());
			let character_span_b = first.0.start..last.0.end;
			token_matches.push(Match(character_span_a, character_span_b));
		}
	}
	Similarity {
		exact_matches,
		token_matches,
	}
}

/// Splits `items` into the parts before, inside, and after `range`.
fn slice_range<T>(mut items: Vec<T>, range: Span) -> (Vec<T>, Vec<T>, Vec<T>) {
	let end = items.split_off(range.end);
	let middle = items.split_off(range.start);
	let start = items;
	(start, middle, end)
}

type TokenAndContent = (&'static str, Option<String>);

/// Keeps only the tokens relevant for comparison, erasing the content of
/// tokens (such as identifiers) whose exact text should not matter.
fn comparable_parts_of(lang: &Lang, tokens: &[(Range<usize>, Token)]) -> (Vec<Located<Token>>, Vec<TokenAndContent>) {
	tokens
		.iter()
		.filter_map(|token @ (_, Token { kind, content })| match kind {
			k if lang.ignored_token.contains(k) => None,
			k if lang.ignored_token_content.contains(k) => Some(((token.clone()), (*k, None))),
			k => Some((token.clone(), (k, Some(content.clone())))),
		})
		.collect()
}

#[derive(Debug, Clone)]
pub struct Similarity {
	pub exact_matches: Vec<Match>,
	pub token_matches: Vec<Match>,
}

#[derive(Debug, Clone)]
pub struct Match(pub Range<usize>, pub Range<usize>);

mousquetaire/Cargo.toml Normal file (+7)

@@ -0,0 +1,7 @@
[package]
name = "mousquetaire"
version = "0.1.0"
edition = "2024"
[dependencies]
mousquet = { path = "../mousquet" }

mousquetaire/src/main.rs Normal file (+61)

@@ -0,0 +1,61 @@
use std::{env::args, fs};

use mousquet::lang::Span;

pub struct Args {
	file_a: String,
	file_b: String,
}

impl Args {
	pub fn parse() -> Self {
		let [_, file_a, file_b] = args()
			.collect::<Vec<_>>()
			.try_into()
			.expect("Usage: mousquetaire <file_a> <file_b>");
		Self { file_a, file_b }
	}
}

fn main() {
	let Args { file_a, file_b } = Args::parse();
	let source_a = fs::read_to_string(&file_a).unwrap();
	let source_b = fs::read_to_string(&file_b).unwrap();
	let similarities = mousquet::similarity(mousquet::lang::PYTHON, &source_a, &source_b);
	let mut similarities_in_a: Vec<_> = similarities.token_matches.iter().map(|s| s.0.clone()).collect();
	let mut similarities_in_b: Vec<_> = similarities.token_matches.iter().map(|s| s.1.clone()).collect();
	similarities_in_a.sort_by_key(|s| s.start);
	similarities_in_b.sort_by_key(|s| s.start);
	println!();
	print_file_with_similarities(file_a, source_a, similarities_in_a);
	print_file_with_similarities(file_b, source_b, similarities_in_b);
}

fn print_file_with_similarities(file_name: String, file_content: String, sorted_similarities: Vec<Span>) {
	println!("┌────────────────────────────────────────");
	println!("│ File '{file_name}':");
	println!("├────────────────────────────────────────");
	print!("");
	let mut prev_end = 0;
	// Alternate between unmatched (blue) and matched (yellow) spans.
	for sim in sorted_similarities {
		let before = &file_content[prev_end..sim.start];
		let inside = &file_content[sim.start..sim.end];
		prev_end = sim.end;
		print_formatted_text(before, "", (BLUE, RESET));
		print_formatted_text(inside, "", (YELLOW, RESET));
	}
	print_formatted_text(&file_content[prev_end..], "", (BLUE, RESET));
	println!();
	println!("└────────────────────────────────────────");
}

/// Prints `text` in the given color, re-applying the color and `prefix` after each newline.
fn print_formatted_text(text: &str, prefix: &str, color: (&str, &str)) {
	let (col_start, col_end) = color;
	let prefixed = text.replace("\n", &format!("{col_end}\n{prefix}{col_start}"));
	print!("{col_start}{prefixed}{col_end}");
}

const YELLOW: &str = "\x1b[0;33m";
const BLUE: &str = "\x1b[0;34m";
const RESET: &str = "\x1b[0m";

rustfmt.toml Normal file (+3)

@@ -0,0 +1,3 @@
max_width = 120
hard_tabs = true