commit 0f8550b517: init

21 changed files with 584 additions and 0 deletions

.gitignore (new file, vendored)
@@ -0,0 +1 @@
/target

Cargo.lock (new file, generated)
@@ -0,0 +1,14 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4

[[package]]
name = "mousquet"
version = "0.1.0"

[[package]]
name = "mousquetaire"
version = "0.1.0"
dependencies = [
 "mousquet",
]

Cargo.toml (new file)
@@ -0,0 +1,2 @@
[workspace]
members = ["mousquet", "mousquetaire"]

README.md (new file)
@@ -0,0 +1,35 @@
# Mousquet

Utility for making rough, heuristic estimates of code similarity between
implementations.

The similarity algorithm is based on token sequence matching, like some other
software serving the same purpose. This approach has many limitations but may
fit some use cases.

## Example

```bash
$ mousquetaire 'examples/primes_1.py' 'examples/primes_2.py'
```



## Build

### Dependencies

- cargo
  - Install cargo through rustup
    - `pacman -S rustup`
    - `curl --proto '=https' --tlsv1.2 -sSf 'https://sh.rustup.rs' | sh`
  - Use any toolchain
    - `rustup default stable`

### Compile

```bash
cargo build --release
```

Find the binary at `target/release/mousquetaire`.
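
The README's pitch maps onto a single library call. As a rough usage sketch, assuming the `mousquet` crate introduced in this commit is on the dependency path (the `mousquet/examples/*.rs` files below do the same thing against real fixtures):

```rust
// Sketch: compare two Python sources and print matched character spans.
fn main() {
    let a = "def f(x):\n    return x + 1\n";
    let b = "def g(y):\n    return y + 1\n";
    let sims = mousquet::similarity(mousquet::lang::PYTHON, a, b);
    for m in sims.token_matches {
        // Each Match holds a character range into each source.
        println!("a[{:?}] ~ b[{:?}]", m.0, m.1);
    }
}
```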

assets/screenshot.png (new binary file, 78 KiB, not shown)

mousquet/Cargo.toml (new file)
@@ -0,0 +1,6 @@
[package]
name = "mousquet"
version = "0.1.0"
edition = "2024"

[dependencies]

mousquet/examples/differents.rs (new file)
@@ -0,0 +1,10 @@
fn main() {
	let source_a = include_str!("primes_1.py");
	let source_b = include_str!("primes_2.py");
	let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b);
	for sim in sims.token_matches {
		let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n");
		let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n");
		println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'");
	}
}

mousquet/examples/primes_1.py (new file)
@@ -0,0 +1,21 @@

# Generates the sequence of prime numbers up to a maximum number.
def primes(max = 999_999_999):
    found = list[int]()  # Known primes for subsequent divisibility checks.
    for value in range(2, max):
        is_prime = True  # Prime until proven otherwise.
        for prime in found:
            if value % prime == 0:
                is_prime = False
                break
        if is_prime:
            yield value
            found.append(value)


def main():
    for p in primes():
        print(p)


if __name__ == "__main__": main()

mousquet/examples/primes_1_ren.py (new file)
@@ -0,0 +1,24 @@
# original file ?


def get_pr(limit = 999_999_999):
    result = list[int]()

    for num in range(2, limit):
        valid = True
        for known in result:
            if num % known == 0:
                valid = False
                break

        if valid:
            yield num
            result.append(num)


def main():
    for num in get_pr():
        print(num)


if __name__ == "__main__": main()

mousquet/examples/primes_2.py (new file)
@@ -0,0 +1,24 @@

from typing import Generator


def prime_numbers(max = 999_999_999):
    def rec_between(value: int, at: int, until: int) -> bool:
        if at >= until: return True
        if value % at == 0: return False
        return rec_between(value, at + 1, until)
    def rec(value: int) -> Generator[int]:
        if value >= max: return
        if rec_between(value, 2, value): yield value
        for r in rec(value + 1): yield r
    for r in rec(2): yield r


def print_all():
    for p in prime_numbers():
        print(p)


if __name__ == "__main__": print_all()

# author: mb

mousquet/examples/renamed.rs (new file)
@@ -0,0 +1,10 @@
fn main() {
	let source_a = include_str!("primes_1.py");
	let source_b = include_str!("primes_1_ren.py");
	let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b);
	for sim in sims.token_matches {
		let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n");
		let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n");
		println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'");
	}
}

mousquet/examples/same.rs (new file)
@@ -0,0 +1,10 @@
fn main() {
	let source_a = include_str!("primes_1.py");
	let source_b = include_str!("primes_1.py");
	let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b);
	for sim in sims.token_matches {
		let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n");
		let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n");
		println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'");
	}
}

mousquet/examples/small.py (new file)
@@ -0,0 +1,2 @@
def hello():
    print("Hello World")

mousquet/examples/tokenize.rs (new file)
@@ -0,0 +1,6 @@
fn main() {
	let source = include_str!("small.py");
	let language = mousquet::lang::python::LANG;
	let tokens = (language.tokenizer)(source);
	dbg!(&tokens, tokens.len());
}

mousquet/src/lang.rs (new file)
@@ -0,0 +1,39 @@
use std::{fmt::Debug, ops::Range};

pub mod python;

pub const PYTHON: Lang = python::LANG;
pub const ALL: &[Lang] = &[PYTHON];

#[derive(Debug, Clone, Copy)]
pub struct Lang {
	pub id: &'static str,
	pub tokenizer: fn(&str) -> Vec<Located<Token>>,
	pub ignored_token: &'static [&'static str],
	pub ignored_token_content: &'static [&'static str],
}

pub type Span = Range<usize>;
pub type Located<T> = (Span, T);

#[derive(Clone, PartialEq, Eq, Hash)]
pub struct Token {
	pub kind: &'static str,
	pub content: String,
}

impl Token {
	pub fn of_kind(kind: &'static str) -> impl Fn(&str) -> Token {
		move |content| Token {
			kind,
			content: content.into(),
		}
	}
}

impl Debug for Token {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		let Token { kind, content } = self;
		f.write_fmt(format_args!(r#"Token({kind},"{content}")"#))
	}
}
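
`Token::of_kind` is a curried constructor: it fixes `kind` and hands back a closure that stamps any text with it, which is what lets the tokenizer below write `.map(Token::of_kind("op"))`. A minimal sketch, assuming the crate is importable:

```rust
use mousquet::lang::Token;

fn main() {
    // of_kind("op") is a Fn(&str) -> Token with the kind baked in.
    let op = Token::of_kind("op");
    let (plus, star) = (op("+"), op("*"));
    assert_eq!(plus.kind, "op");
    assert_eq!(star.content, "*");
    println!("{plus:?} {star:?}"); // prints: Token(op,"+") Token(op,"*")
}
```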

mousquet/src/lang/python.rs (new file)
@@ -0,0 +1,171 @@
use crate::lang::{Lang, Located, Token};

pub const LANG: Lang = Lang {
	id: "python",
	tokenizer,
	ignored_token: &["sp"],
	ignored_token_content: &["id"],
};

pub fn tokenizer(text: &str) -> Vec<Located<Token>> {
	Tokenizer(text, text.len()).collect()
}

pub struct Tokenizer<'s>(&'s str, usize);

impl<'s> Iterator for Tokenizer<'s> {
	type Item = Located<Token>;
	fn next(&mut self) -> Option<Self::Item> {
		if self.0.is_empty() {
			return None;
		}
		self.skip_comment();
		let start = self.1 - self.0.len();
		let result = self
			.parse_space()
			.or_else(|| self.parse_op())
			.or_else(|| self.parse_str())
			.or_else(|| self.parse_ident())
			.or_else(|| self.parse_unknown());
		let end = self.1 - self.0.len();
		result.map(|r| ((start..end), r))
	}
}

impl<'s> Tokenizer<'s> {
	fn skip_comment(&mut self) {
		while self.0.starts_with("#") {
			let line_length = self.0.find("\n").unwrap_or(self.0.len());
			self.0 = &self.0[line_length..];
		}
	}

	fn try_take(&mut self, word: &str) -> Option<&str> {
		if self.0.strip_prefix(word).is_some() {
			let (word, rest) = self.0.split_at(word.len());
			self.0 = rest;
			Some(word)
		} else {
			None
		}
	}

	fn parse_space(&mut self) -> Option<Token> {
		[" ", "\n", "\t", "\r"]
			.iter()
			.filter_map(|op| self.try_take(op).map(Token::of_kind("sp")))
			.next()
	}

	fn parse_op(&mut self) -> Option<Token> {
		OPERATORS
			.iter()
			.filter_map(|op| self.try_take(op).map(Token::of_kind("op")))
			.next()
	}

	fn parse_str(&mut self) -> Option<Token> {
		let (open, close) = STR_STARTS.iter().find(|(s, _)| self.0.starts_with(s))?;
		let mut content_length = 0;
		loop {
			let prefix_length = open.len() + content_length;
			let remainder = match self.0.get(prefix_length..) {
				None => break,
				Some("") => break,
				Some(r) => r,
			};
			if remainder.starts_with("\\") {
				content_length += 2;
				continue;
			}
			if remainder.starts_with(close) {
				let length = open.len() + content_length + close.len();
				let content = &self.0[..length];
				return self.try_take(content).map(Token::of_kind("str"));
			}
			content_length += 1;
		}
		None
	}

	fn parse_ident(&mut self) -> Option<Token> {
		let forbidden = " \n\t\r!-@*/&%^+<=>|~()[]{}:;,.";
		let length = self.0.chars().take_while(|c| !forbidden.contains(*c)).count();
		self.try_take(&self.0[..length]).map(|content| {
			let kind = match KEYWORDS.contains(&content) {
				true => "kw",
				false => "id",
			};
			Token::of_kind(kind)(content)
		})
	}

	fn parse_unknown(&mut self) -> Option<Token> {
		let next_break = self.0.find(' ').unwrap_or(self.0.len());
		let content = self.try_take(&self.0[..next_break]).unwrap();
		Some(Token::of_kind("unk")(content))
	}
}

/// Ordered by size then alphabetically.
const OPERATORS: &[&str] = &[
	"**=", //
	"//=", //
	"<<=", //
	">>=", //
	"-=", //
	"!=", //
	"[]", //
	"@=", //
	"**", //
	"*=", //
	"//", //
	"/=", //
	"&=", //
	"%=", //
	"^=", //
	"+=", //
	"<<", //
	"<=", //
	"==", //
	">=", //
	">>", //
	"|=", //
	"-", //
	"@", //
	"*", //
	"/", //
	"&", //
	"%", //
	"^", //
	"+", //
	"<", //
	"=", //
	">", //
	"|", //
	"~", //
	"(", //
	")", //
	"[", //
	"]", //
	"{", //
	"}", //
	":", //
	";", //
	",", //
	".", //
];

const KEYWORDS: &[&str] = &[
	"def", "and", "or", "not", "for", "while", "in", "try", "raise", "except", "yield", "return", "import", "from",
	"as",
];

const STR_STARTS: &[(&str, &str)] = &[
	(r#"r""""#, r#"""""#),
	(r#"b""""#, r#"""""#),
	(r#"""""#, r#"""""#),
	(r#"r""#, r#"""#),
	(r#"b""#, r#"""#),
	(r#"""#, r#"""#),
];
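
One detail worth pausing on: `parse_op` takes the first entry of `OPERATORS` that matches, so the list's size-descending order is what implements maximal munch. If single-character operators came first, `**=` would tokenize as three tokens. A standalone sketch of that failure mode (the `first_op` helper is hypothetical, not part of the crate):

```rust
// Hypothetical helper mirroring parse_op's "first match wins" scan.
fn first_op<'a>(ops: &[&'a str], input: &str) -> Option<&'a str> {
    ops.iter().copied().find(|op| input.starts_with(op))
}

fn main() {
    let longest_first = ["**=", "**", "*"]; // the order OPERATORS uses
    let shortest_first = ["*", "**", "**="];
    assert_eq!(first_op(&longest_first, "**= b"), Some("**="));
    assert_eq!(first_op(&shortest_first, "**= b"), Some("*")); // wrong split
}
```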

mousquet/src/lcs.rs (new file)
@@ -0,0 +1,46 @@
use crate::lang::Span;

pub fn longest_common_section<T: Eq>(a: &[T], b: &[T]) -> Option<(Span, Span)> {
	let max_size = a.len().min(b.len());
	for size in (1..=max_size).rev() {
		for a_start in 0..=(a.len() - size) {
			let a_span = a_start..(a_start + size);
			let a_section = &a[a_span.clone()];
			for b_start in 0..=(b.len() - size) {
				let b_span = b_start..(b_start + size);
				let b_section = &b[b_span.clone()];
				if a_section == b_section {
					return Some((a_span, b_span));
				}
			}
		}
	}
	None
}

#[test]
fn test_longest_common_section() {
	fn illustrate<'a>((a, b): (&'a [i32], &'a [i32]), (sa, sb): (Span, Span)) -> (&'a [i32], &'a [i32]) {
		(&a[sa], &b[sb])
	}

	fn case<const A: usize, const B: usize, const E: usize>(a: [i32; A], b: [i32; B], expected: [i32; E]) {
		let res = longest_common_section(&a, &b).unwrap();
		let ill = illustrate((&a, &b), res);
		let exp: (&[i32], &[i32]) = (&expected, &expected);
		assert_eq!(ill, exp);
	}

	case(
		/*****/ [1, 2, 3, 4, 5, 6, 7, 8, 9],
		/**/ [8, 9, 2, 3, 4],
		/********/ [2, 3, 4],
	);

	case(
		//
		[1, 2, 3, 4, 5, 6],
		[1, 2, 3, 4, 5, 6],
		[1, 2, 3, 4, 5, 6],
	);
}
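
Despite living in a module named `lcs`, `longest_common_section` finds the longest common contiguous run (a substring over tokens, not a subsequence), by brute force: it tries every window size from largest to smallest, so the first hit is guaranteed maximal. A usage sketch on plain integer slices, assuming the crate is importable:

```rust
use mousquet::lcs::longest_common_section;

fn main() {
    let a = [9, 9, 1, 2, 3, 4];
    let b = [1, 2, 3, 7, 7];
    // The longest contiguous run shared by both slices is [1, 2, 3].
    let (span_a, span_b) = longest_common_section(&a, &b).unwrap();
    assert_eq!(&a[span_a], &[1, 2, 3][..]);
    assert_eq!(&b[span_b], &[1, 2, 3][..]);
}
```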

mousquet/src/lib.rs (new file)
@@ -0,0 +1,92 @@
use std::ops::Range;

pub mod lang;
pub mod lcs;

use crate::lang::{Lang, Located, Span, Token};

pub fn similarity(lang: Lang, source_a: &str, source_b: &str) -> Similarity {
	let tokens_a = (lang.tokenizer)(source_a);
	let tokens_b = (lang.tokenizer)(source_b);

	let exact_matches = Vec::new();
	// TODO

	let mut token_matches = Vec::new();
	{
		let tokens_a = tokens_a.clone();
		let tokens_b = tokens_b.clone();
		let (tokens_a, comparables_a) = comparable_parts_of(&lang, &tokens_a);
		let (tokens_b, comparables_b) = comparable_parts_of(&lang, &tokens_b);
		let mut segments_a = vec![(tokens_a, comparables_a)];
		let mut segments_b = vec![(tokens_b, comparables_b)];

		let length_threshold = 6;
		while let Some(biggest_common_segment) = segments_a
			.iter()
			.enumerate()
			.flat_map(|(segment_index_a, (_, segment_a))| {
				segments_b
					.iter()
					.enumerate()
					.filter_map(move |(segment_index_b, (_, segment_b))| {
						let common = lcs::longest_common_section(segment_a, segment_b)?;
						Some(((segment_index_a, segment_index_b), common))
					})
			})
			.filter(|(_, (range_a, _))| range_a.len() > length_threshold)
			.max_by_key(|(_, (range_a, _))| range_a.len())
		{
			let ((segment_index_a, segment_index_b), (token_range_a, token_range_b)) = biggest_common_segment;
			let segment_a = segments_a.remove(segment_index_a);
			let segment_b = segments_b.remove(segment_index_b);

			let (tokens_l, tokens_a, tokens_r) = slice_range(segment_a.0, token_range_a.clone());
			let (compas_l, _comps_a, compas_r) = slice_range(segment_a.1, token_range_a);
			segments_a.extend_from_slice(&[(tokens_l, compas_l), (tokens_r, compas_r)]);

			let (tokens_l, tokens_b, tokens_r) = slice_range(segment_b.0, token_range_b.clone());
			let (compas_l, _comps_b, compas_r) = slice_range(segment_b.1, token_range_b);
			segments_b.extend_from_slice(&[(tokens_l, compas_l), (tokens_r, compas_r)]);

			let (first, last) = (tokens_a.first().unwrap(), tokens_a.last().unwrap());
			let character_span_a = first.0.start..last.0.end;
			let (first, last) = (tokens_b.first().unwrap(), tokens_b.last().unwrap());
			let character_span_b = first.0.start..last.0.end;
			token_matches.push(Match(character_span_a, character_span_b));
		}
	}

	Similarity {
		exact_matches,
		token_matches,
	}
}

fn slice_range<T>(mut items: Vec<T>, range: Span) -> (Vec<T>, Vec<T>, Vec<T>) {
	let end = items.split_off(range.end);
	let middle = items.split_off(range.start);
	let start = items;
	(start, middle, end)
}

type TokenAndContent = (&'static str, Option<String>);
fn comparable_parts_of(lang: &Lang, tokens: &[(Range<usize>, Token)]) -> (Vec<Located<Token>>, Vec<TokenAndContent>) {
	tokens
		.iter()
		.filter_map(|token @ (_, Token { kind, content })| match kind {
			k if lang.ignored_token.contains(k) => None,
			k if lang.ignored_token_content.contains(k) => Some(((token.clone()), (*k, None))),
			k => Some((token.clone(), (*k, Some(content.clone())))),
		})
		.collect()
}

#[derive(Debug, Clone)]
pub struct Similarity {
	pub exact_matches: Vec<Match>,
	pub token_matches: Vec<Match>,
}

#[derive(Debug, Clone)]
pub struct Match(pub Range<usize>, pub Range<usize>);
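
The loop above is greedy: it repeatedly extracts the biggest common run longer than `length_threshold` tokens, then re-inserts the unmatched flanks as fresh segments, so no two reported matches can overlap. `slice_range` is the splitting primitive; a small demonstration of its behavior (a local copy, since the helper is private to the crate):

```rust
// Local copy of lib.rs's private slice_range, for illustration only.
fn slice_range<T>(mut items: Vec<T>, range: std::ops::Range<usize>) -> (Vec<T>, Vec<T>, Vec<T>) {
    let end = items.split_off(range.end);      // items keeps [..range.end]
    let middle = items.split_off(range.start); // items keeps [..range.start]
    (items, middle, end)
}

fn main() {
    let (left, mid, right) = slice_range(vec!['a', 'b', 'c', 'd', 'e'], 1..3);
    assert_eq!((left, mid, right), (vec!['a'], vec!['b', 'c'], vec!['d', 'e']));
}
```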

mousquetaire/Cargo.toml (new file)
@@ -0,0 +1,7 @@
[package]
name = "mousquetaire"
version = "0.1.0"
edition = "2024"

[dependencies]
mousquet = { path = "../mousquet" }

mousquetaire/src/main.rs (new file)
@@ -0,0 +1,61 @@
use std::{env::args, fs};

use mousquet::lang::Span;

pub struct Args {
	file_a: String,
	file_b: String,
}

impl Args {
	pub fn parse() -> Self {
		let [_, file_a, file_b] = args()
			.collect::<Vec<_>>()
			.try_into()
			.expect("Usage: mousquetaire <file_a> <file_b>");
		Self { file_a, file_b }
	}
}

fn main() {
	let Args { file_a, file_b } = Args::parse();
	let source_a = fs::read_to_string(&file_a).unwrap();
	let source_b = fs::read_to_string(&file_b).unwrap();
	let similarities = mousquet::similarity(mousquet::lang::PYTHON, &source_a, &source_b);

	let mut similarities_in_a: Vec<_> = similarities.token_matches.iter().map(|s| s.0.clone()).collect();
	let mut similarities_in_b: Vec<_> = similarities.token_matches.iter().map(|s| s.1.clone()).collect();
	similarities_in_a.sort_by_key(|s| s.start);
	similarities_in_b.sort_by_key(|s| s.start);
	println!();
	print_file_with_similarities(file_a, source_a, similarities_in_a);
	print_file_with_similarities(file_b, source_b, similarities_in_b);
}

fn print_file_with_similarities(file_name: String, file_content: String, sorted_similarities: Vec<Span>) {
	println!("┌────────────────────────────────────────");
	println!("│ File '{file_name}':");
	println!("├────────────────────────────────────────");
	print!("│");
	let mut prev_end = 0;
	for sim in sorted_similarities {
		let before = &file_content[prev_end..sim.start];
		let inside = &file_content[sim.start..sim.end];
		prev_end = sim.end;
		print_formatted_text(before, "│ ", (BLUE, RESET));
		print_formatted_text(inside, "│ ", (YELLOW, RESET));
	}
	print_formatted_text(&file_content[prev_end..], "│ ", (BLUE, RESET));
	println!();
	println!("└────────────────────────────────────────");
}

fn print_formatted_text(text: &str, prefix: &str, color: (&str, &str)) {
	let (col_start, col_end) = color;
	let prefixed = text.replace("\n", &format!("{col_end}\n{prefix}{col_start}"));
	print!("{col_start}{prefixed}{col_end}");
}

const YELLOW: &str = "\x1b[0;33m";
const BLUE: &str = "\x1b[0;34m";
const RESET: &str = "\x1b[0m";
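
`print_formatted_text` keeps the `│ ` gutter uncolored by closing the color before every newline, printing the prefix, and reopening the color afterwards. A minimal sketch of that substitution, outside the binary:

```rust
fn main() {
    let (yellow, reset) = ("\x1b[0;33m", "\x1b[0m");
    // Same replace as print_formatted_text: color off, newline, gutter, color on.
    let out = "one\ntwo".replace('\n', &format!("{reset}\n│ {yellow}"));
    println!("{yellow}{out}{reset}");
}
```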

rustfmt.toml (new file)
@@ -0,0 +1,3 @@

max_width = 120
hard_tabs = true