Matthieu Jolimaitre 2025-08-22 12:29:55 +02:00
commit 0f8550b517
21 changed files with 584 additions and 0 deletions

.gitignore vendored Normal file (+1)

@@ -0,0 +1 @@
/target

Cargo.lock generated Normal file (+14)

@@ -0,0 +1,14 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4

[[package]]
name = "mousquet"
version = "0.1.0"

[[package]]
name = "mousquetaire"
version = "0.1.0"
dependencies = [
 "mousquet",
]

Cargo.toml Normal file (+2)

@@ -0,0 +1,2 @@
[workspace]
members = ["mousquet", "mousquetaire"]

README.md Normal file (+35)

@@ -0,0 +1,35 @@
# Mousquet
Utility for making rough, heuristic estimates of code similarity between
implementations.

The similarity algorithm is based on token sequence matching, like some other
software serving the same purpose. This approach has many limitations but may
fit some use cases.
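In essence, both sources are tokenized and the comparison looks for long runs
of identical tokens rather than identical characters. A minimal sketch of that
idea (not this crate's actual API; the whitespace "tokenizer" and all names
below are illustrative only):

```rust
// Sketch: find the longest run of identical tokens shared by two token
// sequences. Returns (start in a, start in b, run length).
fn longest_common_run(a: &[&str], b: &[&str]) -> Option<(usize, usize, usize)> {
	let max = a.len().min(b.len());
	// Try lengths from largest to smallest; the first hit is the longest.
	for len in (1..=max).rev() {
		for i in 0..=a.len() - len {
			for j in 0..=b.len() - len {
				if a[i..i + len] == b[j..j + len] {
					return Some((i, j, len));
				}
			}
		}
	}
	None
}

fn main() {
	// A naive whitespace split stands in for a real lexer.
	let a: Vec<&str> = "def f ( x ) : return x + 1".split_whitespace().collect();
	let b: Vec<&str> = "def g ( y ) : return y + 1".split_whitespace().collect();
	if let Some((i, j, len)) = longest_common_run(&a, &b) {
		println!("{len} tokens match at a[{i}..] / b[{j}..]: {:?}", &a[i..i + len]);
	}
}
```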
## Example
```bash
$ mousquetaire 'examples/primes_1.py' 'examples/primes_2.py'
```
![screenshot](./assets/screenshot.png)
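The `mousquet` crate can also be used as a library. A minimal sketch mirroring
the example programs in `mousquet/examples/` (the file names `a.py` and `b.py`
are placeholders):

```rust
fn main() {
	let source_a = std::fs::read_to_string("a.py").unwrap();
	let source_b = std::fs::read_to_string("b.py").unwrap();
	let sims = mousquet::similarity(mousquet::lang::PYTHON, &source_a, &source_b);
	// Each match carries a character span into each source.
	for mousquet::Match(span_a, span_b) in sims.token_matches {
		println!("a[{span_a:?}] matches b[{span_b:?}]");
	}
}
```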
## Build

### Dependencies

- cargo
  - Install cargo through rustup
    - `pacman -S rustup`
    - `curl --proto '=https' --tlsv1.2 -sSf 'https://sh.rustup.rs' | sh`
  - Use any toolchain
    - `rustup default stable`

### Building
```bash
cargo build --release
```
Since the crates share a workspace, find the binary at
`target/release/mousquetaire`.

BIN
assets/screenshot.png Normal file (78 KiB)

mousquet/Cargo.toml Normal file (+6)

@@ -0,0 +1,6 @@
[package]
name = "mousquet"
version = "0.1.0"
edition = "2024"
[dependencies]


@@ -0,0 +1,10 @@
fn main() {
	let source_a = include_str!("primes_1.py");
	let source_b = include_str!("primes_2.py");
	let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b);
	for sim in sims.token_matches {
		let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n");
		let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n");
		println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'");
	}
}

mousquet/examples/primes_1.py Normal file (+21)

@@ -0,0 +1,21 @@
# Generates the sequence of prime numbers up to a maximum number.
def primes(max = 999_999_999):
	found = list[int]() # Known primes for subsequent dividability checks.
	for value in range(2, max):
		is_prime = True # Prime until proven otherwise.
		for prime in found:
			if value % prime == 0:
				is_prime = False
				break
		if is_prime:
			yield value
			found.append(value)

def main():
	for p in primes():
		print(p)

if __name__ == "__main__": main()

mousquet/examples/primes_1_ren.py Normal file (+24)

@@ -0,0 +1,24 @@
# original file ?
def get_pr(limit = 999_999_999):
	result = list[int]()
	for num in range(2, limit):
		valid = True
		for known in result:
			if num % known == 0:
				valid = False
				break
		if valid:
			yield num
			result.append(num)

def main():
	for num in get_pr():
		print(num)

if __name__ == "__main__": main()

mousquet/examples/primes_2.py Normal file (+24)

@@ -0,0 +1,24 @@
from typing import Generator

def prime_numbers(max = 999_999_999):
	def rec_between(value: int, at: int, until: int) -> bool:
		if at >= until: return True
		if value % at == 0: return False
		return rec_between(value, at + 1, until)
	def rec(value: int) -> Generator[int]:
		if value >= max: return
		if rec_between(value, 2, value): yield value
		for r in rec(value + 1): yield r
	for r in rec(2): yield r

def print_all():
	for p in prime_numbers():
		print(p)

if __name__ == "__main__": print_all()
# author: mb


@@ -0,0 +1,10 @@
fn main() {
	let source_a = include_str!("primes_1.py");
	let source_b = include_str!("primes_1_ren.py");
	let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b);
	for sim in sims.token_matches {
		let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n");
		let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n");
		println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'");
	}
}

mousquet/examples/same.rs Normal file (+10)

@@ -0,0 +1,10 @@
fn main() {
	let source_a = include_str!("primes_1.py");
	let source_b = include_str!("primes_1.py");
	let sims = mousquet::similarity(mousquet::lang::PYTHON, source_a, source_b);
	for sim in sims.token_matches {
		let text_in_a = &source_a[sim.0.clone()].to_string().replace("\n", "\\n");
		let text_in_b = &source_b[sim.1.clone()].to_string().replace("\n", "\\n");
		println!("Found similarity {sim:?}\n\ta: '{text_in_a}'\n\tb: '{text_in_b}'");
	}
}

mousquet/examples/small.py Normal file (+2)

@@ -0,0 +1,2 @@
def hello():
	print("Hello World")


@@ -0,0 +1,6 @@
fn main() {
	let source = include_str!("small.py");
	let language = mousquet::lang::python::LANG;
	let tokens = (language.tokenizer)(source);
	dbg!(&tokens, tokens.len());
}

mousquet/src/lang.rs Normal file (+39)

@@ -0,0 +1,39 @@
use std::{fmt::Debug, ops::Range};

pub mod python;

pub const PYTHON: Lang = python::LANG;
pub const ALL: &[Lang] = &[PYTHON];

#[derive(Debug, Clone, Copy)]
pub struct Lang {
	pub id: &'static str,
	pub tokenizer: fn(&str) -> Vec<Located<Token>>,
	pub ignored_token: &'static [&'static str],
	pub ignored_token_content: &'static [&'static str],
}

pub type Span = Range<usize>;
pub type Located<T> = (Span, T);

#[derive(Clone, PartialEq, Eq, Hash)]
pub struct Token {
	pub kind: &'static str,
	pub content: String,
}

impl Token {
	pub fn of_kind(kind: &'static str) -> impl Fn(&str) -> Token {
		move |content| Token {
			kind,
			content: content.into(),
		}
	}
}

impl Debug for Token {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		let Token { kind, content } = self;
		f.write_fmt(format_args!(r#"Token({kind},"{content}")"#))
	}
}

mousquet/src/lang/python.rs Normal file (+171)

@@ -0,0 +1,171 @@
use crate::lang::{Lang, Located, Token};

pub const LANG: Lang = Lang {
	id: "python",
	tokenizer,
	ignored_token: &["sp"],
	ignored_token_content: &["id"],
};

pub fn tokenizer(text: &str) -> Vec<Located<Token>> {
	Tokenizer(text, text.len()).collect()
}

/// Remaining input, and total input length (used to compute token spans).
pub struct Tokenizer<'s>(&'s str, usize);

impl<'s> Iterator for Tokenizer<'s> {
	type Item = Located<Token>;

	fn next(&mut self) -> Option<Self::Item> {
		if self.0.is_empty() {
			return None;
		}
		self.skip_comment();
		let start = self.1 - self.0.len();
		let result = self
			.parse_space()
			.or_else(|| self.parse_op())
			.or_else(|| self.parse_str())
			.or_else(|| self.parse_ident())
			.or_else(|| self.parse_unknown());
		let end = self.1 - self.0.len();
		result.map(|r| ((start..end), r))
	}
}

impl<'s> Tokenizer<'s> {
	fn skip_comment(&mut self) {
		while self.0.starts_with("#") {
			let line_length = self.0.find("\n").unwrap_or(self.0.len());
			self.0 = &self.0[line_length..];
		}
	}

	/// Consumes `word` from the front of the input if present.
	fn try_take(&mut self, word: &str) -> Option<&str> {
		if self.0.strip_prefix(word).is_some() {
			let (word, rest) = self.0.split_at(word.len());
			self.0 = rest;
			Some(word)
		} else {
			None
		}
	}

	fn parse_space(&mut self) -> Option<Token> {
		[" ", "\n", "\t", "\r"]
			.iter()
			.filter_map(|op| self.try_take(op).map(Token::of_kind("sp")))
			.next()
	}

	fn parse_op(&mut self) -> Option<Token> {
		OPERATORS
			.iter()
			.filter_map(|op| self.try_take(op).map(Token::of_kind("op")))
			.next()
	}

	fn parse_str(&mut self) -> Option<Token> {
		let (open, close) = STR_STARTS.iter().find(|(s, _)| self.0.starts_with(s))?;
		let mut content_length = 0;
		loop {
			let prefix_length = open.len() + content_length;
			let remainder = match self.0.get(prefix_length..) {
				None => break,
				Some("") => break,
				Some(r) => r,
			};
			// Jump over escape sequences such as `\"` so they cannot close the string.
			if remainder.starts_with("\\") {
				content_length += 2;
				continue;
			}
			if remainder.starts_with(close) {
				let length = open.len() + content_length + close.len();
				let content = &self.0[..length];
				return self.try_take(content).map(Token::of_kind("str"));
			}
			content_length += 1;
		}
		None
	}

	/// Reads an identifier or keyword, up to the next separator character.
	fn parse_ident(&mut self) -> Option<Token> {
		let forbidden = " \n\t\r!-@*/&%^+<=>|~()[]{}:;,.";
		// Measure in bytes so the slice below stays on a character boundary.
		let length = self
			.0
			.chars()
			.take_while(|c| !forbidden.contains(*c))
			.map(char::len_utf8)
			.sum::<usize>();
		if length == 0 {
			// Yielding an empty token here would never advance the input.
			return None;
		}
		self.try_take(&self.0[..length]).map(|content| {
			let kind = match KEYWORDS.contains(&content) {
				true => "kw",
				false => "id",
			};
			Token::of_kind(kind)(content)
		})
	}

	fn parse_unknown(&mut self) -> Option<Token> {
		let next_break = self.0.find(' ').unwrap_or(self.0.len());
		let content = self.try_take(&self.0[..next_break]).unwrap();
		Some(Token::of_kind("unk")(content))
	}
}
/// Ordered by size then alphabetically.
const OPERATORS: &[&str] = &[
	"**=", //
	"//=", //
	"<<=", //
	">>=", //
	"-=", //
	"!=", //
	"[]", //
	"@=", //
	"**", //
	"*=", //
	"//", //
	"/=", //
	"&=", //
	"%=", //
	"^=", //
	"+=", //
	"<<", //
	"<=", //
	"==", //
	">=", //
	">>", //
	"|=", //
	"-", //
	"@", //
	"*", //
	"/", //
	"&", //
	"%", //
	"^", //
	"+", //
	"<", //
	"=", //
	">", //
	"|", //
	"~", //
	"(", //
	")", //
	"[", //
	"]", //
	"{", //
	"}", //
	":", //
	";", //
	",", //
	".", //
];

const KEYWORDS: &[&str] = &[
	"def", "and", "or", "not", "for", "while", "in", "try", "raise", "except", "yield", "return", "import", "from",
	"as",
];

const STR_STARTS: &[(&str, &str)] = &[
	(r#"r""""#, r#"""""#),
	(r#"b""""#, r#"""""#),
	(r#"""""#, r#"""""#),
	(r#"r""#, r#"""#),
	(r#"b""#, r#"""#),
	(r#"""#, r#"""#),
];

mousquet/src/lcs.rs Normal file (+46)

@@ -0,0 +1,46 @@
use crate::lang::Span;

/// Returns the spans of the longest slice that `a` and `b` have in common, if any.
pub fn longest_common_section<T: Eq>(a: &[T], b: &[T]) -> Option<(Span, Span)> {
	let max_size = a.len().min(b.len());
	// Try sizes from largest to smallest, so the first hit is the longest.
	for size in (1..=max_size).rev() {
		for a_start in 0..=(a.len() - size) {
			let a_span = a_start..(a_start + size);
			let a_section = &a[a_span.clone()];
			for b_start in 0..=(b.len() - size) {
				let b_span = b_start..(b_start + size);
				let b_section = &b[b_span.clone()];
				if a_section == b_section {
					return Some((a_span, b_span));
				}
			}
		}
	}
	None
}

#[test]
fn test_longest_common_section() {
	fn illustrate<'a>((a, b): (&'a [i32], &'a [i32]), (sa, sb): (Span, Span)) -> (&'a [i32], &'a [i32]) {
		(&a[sa], &b[sb])
	}
	fn case<const A: usize, const B: usize, const E: usize>(a: [i32; A], b: [i32; B], expected: [i32; E]) {
		let res = longest_common_section(&a, &b).unwrap();
		let ill = illustrate((&a, &b), res);
		let exp: (&[i32], &[i32]) = (&expected, &expected);
		assert_eq!(ill, exp);
	}
	case(
		/*****/ [1, 2, 3, 4, 5, 6, 7, 8, 9],
		/**/ [8, 9, 2, 3, 4],
		/********/ [2, 3, 4],
	);
	case(
		//
		[1, 2, 3, 4, 5, 6],
		[1, 2, 3, 4, 5, 6],
		[1, 2, 3, 4, 5, 6],
	);
}

mousquet/src/lib.rs Normal file (+92)

@@ -0,0 +1,92 @@
use std::ops::Range;

pub mod lang;
pub mod lcs;

use crate::lang::{Lang, Located, Span, Token};

pub fn similarity(lang: Lang, source_a: &str, source_b: &str) -> Similarity {
	let tokens_a = (lang.tokenizer)(source_a);
	let tokens_b = (lang.tokenizer)(source_b);

	// TODO: exact (character-level) matching is not implemented yet.
	let exact_matches = Vec::new();

	let mut token_matches = Vec::new();
	{
		let tokens_a = tokens_a.clone();
		let tokens_b = tokens_b.clone();
		let (tokens_a, comparables_a) = comparable_parts_of(&lang, &tokens_a);
		let (tokens_b, comparables_b) = comparable_parts_of(&lang, &tokens_b);
		let mut segments_a = vec![(tokens_a, comparables_a)];
		let mut segments_b = vec![(tokens_b, comparables_b)];
		let length_threshold = 6;
		// Greedily extract the longest common token run over all remaining
		// segment pairs, then split both segments around it and repeat.
		while let Some(biggest_common_segment) = segments_a
			.iter()
			.enumerate()
			.flat_map(|(segment_index_a, (_, segment_a))| {
				segments_b
					.iter()
					.enumerate()
					.filter_map(move |(segment_index_b, (_, segment_b))| {
						let common = lcs::longest_common_section(segment_a, segment_b)?;
						Some(((segment_index_a, segment_index_b), common))
					})
			})
			.filter(|(_, (range_a, _))| range_a.len() > length_threshold)
			.max_by_key(|(_, (range_a, _))| range_a.len())
		{
			let ((segment_index_a, segment_index_b), (token_range_a, token_range_b)) = biggest_common_segment;
			let segment_a = segments_a.remove(segment_index_a);
			let segment_b = segments_b.remove(segment_index_b);
			let (tokens_l, tokens_a, tokens_r) = slice_range(segment_a.0, token_range_a.clone());
			let (compas_l, _comps_a, compas_r) = slice_range(segment_a.1, token_range_a);
			segments_a.extend_from_slice(&[(tokens_l, compas_l), (tokens_r, compas_r)]);
			let (tokens_l, tokens_b, tokens_r) = slice_range(segment_b.0, token_range_b.clone());
			let (compas_l, _comps_b, compas_r) = slice_range(segment_b.1, token_range_b);
			segments_b.extend_from_slice(&[(tokens_l, compas_l), (tokens_r, compas_r)]);
			// Convert the matched token runs back to character spans.
			let (first, last) = (tokens_a.first().unwrap(), tokens_a.last().unwrap());
			let character_span_a = first.0.start..last.0.end;
			let (first, last) = (tokens_b.first().unwrap(), tokens_b.last().unwrap());
			let character_span_b = first.0.start..last.0.end;
			token_matches.push(Match(character_span_a, character_span_b));
		}
	}
	Similarity {
		exact_matches,
		token_matches,
	}
}

/// Splits `items` into the parts before, inside, and after `range`.
fn slice_range<T>(mut items: Vec<T>, range: Span) -> (Vec<T>, Vec<T>, Vec<T>) {
	let end = items.split_off(range.end);
	let middle = items.split_off(range.start);
	let start = items;
	(start, middle, end)
}

type TokenAndContent = (&'static str, Option<String>);

/// Keeps only the tokens relevant for comparison, erasing the content of
/// tokens (such as identifiers) whose exact text should not matter.
fn comparable_parts_of(lang: &Lang, tokens: &[(Range<usize>, Token)]) -> (Vec<Located<Token>>, Vec<TokenAndContent>) {
	tokens
		.iter()
		.filter_map(|token @ (_, Token { kind, content })| match kind {
			k if lang.ignored_token.contains(k) => None,
			k if lang.ignored_token_content.contains(k) => Some(((token.clone()), (*k, None))),
			k => Some((token.clone(), (k, Some(content.clone())))),
		})
		.collect()
}

#[derive(Debug, Clone)]
pub struct Similarity {
	pub exact_matches: Vec<Match>,
	pub token_matches: Vec<Match>,
}

#[derive(Debug, Clone)]
pub struct Match(pub Range<usize>, pub Range<usize>);

mousquetaire/Cargo.toml Normal file (+7)

@@ -0,0 +1,7 @@
[package]
name = "mousquetaire"
version = "0.1.0"
edition = "2024"
[dependencies]
mousquet = { path = "../mousquet" }

mousquetaire/src/main.rs Normal file (+61)

@@ -0,0 +1,61 @@
use std::{env::args, fs};

use mousquet::lang::Span;

pub struct Args {
	file_a: String,
	file_b: String,
}

impl Args {
	pub fn parse() -> Self {
		let [_, file_a, file_b] = args()
			.collect::<Vec<_>>()
			.try_into()
			.expect("Usage: mousquetaire <file_a> <file_b>");
		Self { file_a, file_b }
	}
}

fn main() {
	let Args { file_a, file_b } = Args::parse();
	let source_a = fs::read_to_string(&file_a).unwrap();
	let source_b = fs::read_to_string(&file_b).unwrap();
	let similarities = mousquet::similarity(mousquet::lang::PYTHON, &source_a, &source_b);
	let mut similarities_in_a: Vec<_> = similarities.token_matches.iter().map(|s| s.0.clone()).collect();
	let mut similarities_in_b: Vec<_> = similarities.token_matches.iter().map(|s| s.1.clone()).collect();
	similarities_in_a.sort_by_key(|s| s.start);
	similarities_in_b.sort_by_key(|s| s.start);
	println!();
	print_file_with_similarities(file_a, source_a, similarities_in_a);
	print_file_with_similarities(file_b, source_b, similarities_in_b);
}

fn print_file_with_similarities(file_name: String, file_content: String, sorted_similarities: Vec<Span>) {
	println!("┌────────────────────────────────────────");
	println!("│ File '{file_name}':");
	println!("├────────────────────────────────────────");
	print!("");
	let mut prev_end = 0;
	// Alternate between unmatched (blue) and matched (yellow) spans.
	for sim in sorted_similarities {
		let before = &file_content[prev_end..sim.start];
		let inside = &file_content[sim.start..sim.end];
		prev_end = sim.end;
		print_formatted_text(before, "", (BLUE, RESET));
		print_formatted_text(inside, "", (YELLOW, RESET));
	}
	print_formatted_text(&file_content[prev_end..], "", (BLUE, RESET));
	println!();
	println!("└────────────────────────────────────────");
}

/// Prints `text` in the given color, re-applying the color and `prefix` after each newline.
fn print_formatted_text(text: &str, prefix: &str, color: (&str, &str)) {
	let (col_start, col_end) = color;
	let prefixed = text.replace("\n", &format!("{col_end}\n{prefix}{col_start}"));
	print!("{col_start}{prefixed}{col_end}");
}

const YELLOW: &str = "\x1b[0;33m";
const BLUE: &str = "\x1b[0;34m";
const RESET: &str = "\x1b[0m";

rustfmt.toml Normal file (+3)

@@ -0,0 +1,3 @@
max_width = 120
hard_tabs = true