From 8d092f5c9827978a265655157eac8b5d333c00c1 Mon Sep 17 00:00:00 2001
From: "Wesley R. Elsberry"
Date: Tue, 4 Nov 2025 23:07:36 -0500
Subject: [PATCH] Initial commit

---
 .cargo/config.toml          |   8 +
 .github/workflows/build.yml |  33 +++++
 Cargo.toml                  |  36 +++++
 README.md                   |  16 +-
 scripts/plot_eval.py        |  44 ++++++
 src/bib.rs                  |  67 +++++++++
 src/eval.rs                 | 284 ++++++++++++++++++++++++++++++++++++
 src/llm.rs                  | 118 +++++++++++++++
 src/main.rs                 |  61 ++++++++
 src/net.rs                  | 155 ++++++++++++++++++++
 src/score.rs                |  50 +++++++
 tests/bib_roundtrip.rs      |  29 ++++
 12 files changed, 899 insertions(+), 2 deletions(-)
 create mode 100644 .cargo/config.toml
 create mode 100644 .github/workflows/build.yml
 create mode 100644 Cargo.toml
 create mode 100644 scripts/plot_eval.py
 create mode 100644 src/bib.rs
 create mode 100644 src/eval.rs
 create mode 100644 src/llm.rs
 create mode 100644 src/main.rs
 create mode 100644 src/net.rs
 create mode 100644 src/score.rs
 create mode 100644 tests/bib_roundtrip.rs

diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000..d0cae3b
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,8 @@
+# Tri-target friendly config.
+# For Linux static builds:
+#   rustup target add x86_64-unknown-linux-musl
+#   sudo apt-get install -y musl-tools
+#   cargo build --release --target x86_64-unknown-linux-musl
+#
+# Future: Cosmopolitan experiments (advanced)
+# You can try setting cosmocc as linker and using APE post-link steps.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..4e93a9e
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,33 @@
+name: build
+on: [push, pull_request]
+jobs:
+  build:
+    strategy:
+      matrix:
+        include:
+          - os: ubuntu-latest
+            target: x86_64-unknown-linux-gnu
+          - os: ubuntu-latest
+            target: x86_64-unknown-linux-musl
+          - os: windows-latest
+            target: x86_64-pc-windows-msvc
+          - os: macos-latest
+            target: x86_64-apple-darwin
+          - os: macos-latest
+            target: aarch64-apple-darwin
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dtolnay/rust-toolchain@stable
+        with:
+          targets: ${{ matrix.target }}
+      - name: Install musl tools
+        if: matrix.target == 'x86_64-unknown-linux-musl'
+        run: sudo apt-get update && sudo apt-get install -y musl-tools
+      - name: Build
+        run: cargo build --release --target ${{ matrix.target }}
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: veribib-${{ matrix.target }}
+          path: target/${{ matrix.target }}/release/*
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..2ceddf7
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,36 @@
+[package]
+name = "veribib"
+version = "0.3.0"
+edition = "2021"
+license = "MIT"
+description = "VeriBib-rs: verify/disambiguate citations via Crossref/OpenAlex, output annotated BibTeX."
+
+[dependencies]
+biblatex = "0.9" # Not yet used by the code (parsing is regex-based); if the build fails, try `biblatex = "0.8"` or switch to `bibtex-parser`
+clap = { version = "4.5", features = ["derive"] }
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+anyhow = "1.0"
+thiserror = "1.0"
+reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] }
+webpki-roots = "0.26"
+tokio = { version = "1.38", features = ["rt-multi-thread", "macros"] }
+percent-encoding = "2.3"
+regex = "1.10"
+
+[features]
+llm = []
+
+[profile.release]
+lto = true
+codegen-units = 1
+opt-level = "z"
+strip = "symbols"
+
+# src/eval.rs and tests/ address the crate as a library (`veribib::…`),
+# so expose one alongside the two binaries.
+[lib]
+name = "veribib"
+path = "src/lib.rs"
+
+[[bin]]
+name = "veribib"
+path = "src/main.rs"
+
+[[bin]]
+name = "veribib-eval"
+path = "src/eval.rs"
diff --git a/README.md b/README.md
index e229ceb..859e986 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,15 @@
-# VeriBib-rs
+# VeriBib-rs (tri-target Rust)
 
-VeriBib-rs is a Rust implementation of the VeriBib citation checking tool.
\ No newline at end of file
+- Crossref + OpenAlex lookups
+- Portable TLS (reqwest + rustls)
+- Emits annotated BibTeX (`x_status`, `x_confidence`, …)
+- GitHub Actions matrix for Windows/macOS/Linux
+- Future note on Cosmopolitan APE in `.cargo/config.toml`
+
+## Build
+See `.github/workflows/build.yml` or run locally per target.
+
+## Use
+```
+./veribib --string "Evans 1960" --context "bottlenose dolphin echolocation" --out out.bib
+```
diff --git a/scripts/plot_eval.py b/scripts/plot_eval.py
new file mode 100644
index 0000000..9c9f472
--- /dev/null
+++ b/scripts/plot_eval.py
@@ -0,0 +1,44 @@
+
+import sys, csv
+import matplotlib.pyplot as plt
+
+def main(csv_path, out_prefix="curves"):
+    thr = []
+    prec = []
+    rec = []
+    with open(csv_path, newline="") as f:
+        r = csv.DictReader(f)
+        for row in r:
+            thr.append(float(row["threshold"]))
+            prec.append(float(row["precision"]))
+            rec.append(float(row["recall"]))
+
+    # Precision-Recall
+    plt.figure()
+    plt.plot(rec, prec, marker='o')
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.title('Precision-Recall Curve')
+    plt.grid(True)
+    plt.tight_layout()
+    plt.savefig(f"{out_prefix}_pr.png", dpi=160)
+
+    # Precision & Recall vs Threshold
+    plt.figure()
+    plt.plot(thr, prec, marker='o', label="Precision")
+    plt.plot(thr, rec, marker='o', label="Recall")
+    plt.xlabel('Confidence threshold')
+    plt.ylabel('Score')
+    plt.title('Precision/Recall vs Threshold')
+    plt.legend()
+    plt.grid(True)
+    plt.tight_layout()
+    plt.savefig(f"{out_prefix}_thr.png", dpi=160)
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python scripts/plot_eval.py metrics.csv [out_prefix]")
+        sys.exit(1)
+    csv_path = sys.argv[1]
+    out_prefix = sys.argv[2] if len(sys.argv) > 2 else "curves"
+    main(csv_path, out_prefix)
diff --git a/src/bib.rs b/src/bib.rs
new file mode 100644
index 0000000..e1b9317
--- /dev/null
+++ b/src/bib.rs
@@ -0,0 +1,67 @@
+use std::fmt::Write as _;
+use regex::Regex;
+use crate::net::Candidate;
+
+#[derive(Clone, Default)]
+pub struct MinimalBibFields {
+    pub title: Option<String>,
+    pub author: Option<String>,
+    pub year: Option<String>,
+    pub journal: Option<String>,
+    pub booktitle: Option<String>,
+    pub doi: Option<String>,
+}
+
+pub fn parse_minimal_bib_fields(entry: &str) -> MinimalBibFields {
+    fn grab(entry: &str, key: &str) -> Option<String> {
+        // Match `key = {...}` or `key = "..."`, case-insensitively, across lines.
+        let pat = format!(r#"(?is)\b{}\s*=\s*[{{"](.*?)[}}"]"#, key);
+        let re = Regex::new(&pat).ok()?;
+        let cap = re.captures(entry)?;
+        Some(cap.get(1)?.as_str().trim().to_string())
+    }
+    MinimalBibFields {
+        title: grab(entry, "title"),
+        author: grab(entry, "author"),
+        year: grab(entry, "year"),
+        journal: grab(entry, "journal"),
+        booktitle: grab(entry, "booktitle"),
+        doi: grab(entry, "doi"),
+    }
+}
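+
+// For illustration only (hypothetical confidence value), `bib_entry` below emits
+// entries shaped like this, with `x_*` fields carrying the verification metadata:
+//
+//   @article{veribib,
+//    author = {William E. Evans},
+//    title = {Some Observations on the Echolocation of the Bottlenose Dolphin},
+//    year = {1960},
+//    doi = {10.0000/example},
+//    x_status = {high_confidence},
+//    x_confidence = {0.82},
+//    x_source = {crossref},
+//    x_query = {Evans 1960},
+//    x_context = {bottlenose dolphin echolocation}
+//   }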
"journal"), + booktitle: grab(entry, "booktitle"), + doi: grab(entry, "doi"), + } +} + +pub fn bib_entry( + key: &str, + meta: &Candidate, + status: &str, + confidence: f64, + query: &str, + context: &str, + alternates: &[Candidate], +) -> String { + let mut out = String::new(); + let _ = writeln!(&mut out, "@article{{{key},"); + let _ = writeln!(&mut out, " author = {{{}}},", meta.authors.join(" and ")); + let _ = writeln!(&mut out, " title = {{{}}},", meta.title.replace('{', "\{").replace('}', "\}")); + if !meta.venue.is_empty() { let _ = writeln!(&mut out, " journal = {{{}}},", meta.venue.replace('{', "\{").replace('}', "\}")); } + if let Some(y) = meta.year { let _ = writeln!(&mut out, " year = {{{}}},", y); } + if let Some(doi) = &meta.doi { let _ = writeln!(&mut out, " doi = {{{}}},", doi); } + let _ = writeln!(&mut out, " x_status = {{{}}},", status); + let _ = writeln!(&mut out, " x_confidence = {{{:.2}}},", confidence); + let _ = writeln!(&mut out, " x_source = {{{}}},", meta.source); + let _ = writeln!(&mut out, " x_query = {{{}}},", query.replace('{', "\{").replace('}', "\}")); + let _ = writeln!(&mut out, " x_context = {{{}}},", context.replace('{', "\{").replace('}', "\}")); + if !alternates.is_empty() { + let alt = alternates.iter() + .map(|a| format!("{}|{}|{}|{}", a.doi.clone().unwrap_or_default(), a.title, a.authors.get(0).cloned().unwrap_or_default(), a.year.map(|y| y.to_string()).unwrap_or_default())) + .collect::>().join(" || "); + let _ = writeln!(&mut out, " x_alternates = {{{}}},", alt.replace('{', "\{").replace('}', "\}")); + } + if out.ends_with(", +") { out.truncate(out.len()-2); out.push('\n'); } + out.push_str("}\n"); + out +} + +pub fn error_entry(msg: &str) -> String { + format!("@misc{{veribib_error,\n x_status = {{not_found}},\n x_confidence = {{0.0}},\n x_source = {{error}},\n x_query = {{{}}}\n}}\n", msg.replace('{', "\{").replace('}', "\}")) +} diff --git a/src/eval.rs b/src/eval.rs new file mode 100644 index 0000000..19c114b --- /dev/null +++ b/src/eval.rs @@ -0,0 +1,284 @@ + +use clap::Parser; +use std::path::PathBuf; +use serde::Deserialize; +use std::io::Write; + +#[derive(Parser, Debug)] +#[command(author, version, about = "VeriBib-rs evaluation harness")] +struct Args { + #[arg(long)] gold: PathBuf, + /// Top-k for DOI containment metric + #[arg(long, default_value_t = 3)] k: usize, + /// Enable LLM assistance (requires building with `--features llm`) + #[arg(long, default_value_t = false)] llm: bool, + /// OpenAI-compatible base URL (e.g., http://localhost:11434/v1) or Ollama native base (http://localhost:11434) + #[arg(long)] llm_base_url: Option, + /// Model name (e.g., llama3.1) + #[arg(long)] llm_model: Option, + /// Write overall metrics CSV here (optional) + #[arg(long)] csv: Option, + /// Write per-item CSV here (optional) + #[arg(long)] items_csv: Option, +} + +#[derive(Deserialize)] +struct GoldRec { + #[serde(default)] input_type: String, // "string" or "bib" + #[serde(default)] query: String, + #[serde(default)] context: String, + #[serde(default)] bib: Option, + #[serde(default)] expected: Option, +} +#[derive(Deserialize, Default, Clone)] +struct Expected { + doi: Option, + title: Option, + first_author: Option, + year: Option, +} + +#[tokio::main(flavor = "multi_thread")] +async fn main() -> anyhow::Result<()> { + let args = Args::parse(); + let raw = std::fs::read_to_string(&args.gold)?; + let mut items = Vec::::new(); + for line in raw.lines().filter(|l| !l.trim().is_empty()) { + items.push(serde_json::from_str::(line)?); + } + + 
+
+#[tokio::main(flavor = "multi_thread")]
+async fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+    let raw = std::fs::read_to_string(&args.gold)?;
+    let mut items = Vec::<GoldRec>::new();
+    for line in raw.lines().filter(|l| !l.trim().is_empty()) {
+        items.push(serde_json::from_str::<GoldRec>(line)?);
+    }
+
+    #[cfg(feature = "llm")]
+    let llm_cfg = if args.llm {
+        Some(veribib::llm::LlmConfig {
+            base_url: args.llm_base_url.clone().unwrap_or_default(),
+            model: args.llm_model.clone().unwrap_or_else(|| "llama3.1".to_string()),
+        })
+    } else { None };
+
+    if !cfg!(feature = "llm") && args.llm {
+        eprintln!("--llm requested, but this binary was built without the `llm` feature; ignoring.");
+    }
+
+    let mut n = 0usize;
+    let mut doi_exact_1 = 0usize;
+    let mut aty_1 = 0usize;
+    let mut topk_contains = 0usize;
+    let mut status_counts: std::collections::BTreeMap<String, usize> = Default::default();
+
+    // Per-item rows
+    let mut rows: Vec<Vec<String>> = Vec::new();
+
+    // For threshold curves
+    let thresholds: Vec<f64> = (0..=20).map(|i| i as f64 / 20.0).collect();
+    let mut tp_at: Vec<usize> = vec![0; thresholds.len()];
+    let mut fp_at: Vec<usize> = vec![0; thresholds.len()];
+    let mut fn_at: Vec<usize> = vec![0; thresholds.len()];
+
+    // Helper to normalize to lowercase
+    let lc = |s: &str| s.to_lowercase();
+
+    for (idx, rec) in items.into_iter().enumerate() {
+        n += 1;
+        let ctx = if rec.context.is_empty() { None } else { Some(rec.context.as_str()) };
+
+        // Build input
+        let input = if rec.input_type == "bib" {
+            let b = rec.bib.clone().unwrap_or_default();
+            let m = veribib::bib::MinimalBibFields {
+                title: b.get("title").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                author: b.get("author").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                year: b.get("year").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                journal: b.get("journal").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                booktitle: b.get("booktitle").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                doi: b.get("doi").and_then(|v| v.as_str()).map(|s| s.to_string()),
+            };
+            veribib::net::InputItem::BibLike(m)
+        } else {
+            veribib::net::InputItem::FreeString(rec.query.clone())
+        };
+
+        // Run process_item to generate BibTeX, then parse status and confidence.
+        // (`#[cfg]` is not allowed on call arguments on stable Rust, so the call
+        // is duplicated per feature configuration.)
+        #[cfg(feature = "llm")]
+        let result = veribib::net::process_item(input, ctx, llm_cfg.as_ref()).await;
+        #[cfg(not(feature = "llm"))]
+        let result = veribib::net::process_item(input, ctx).await;
+        let bib = result.unwrap_or_else(|e| veribib::bib::error_entry(&format!("{}", e)));
+
+        let status = extract_field(&bib, "x_status").unwrap_or_else(|| "unknown".into());
+        let confidence = extract_field(&bib, "x_confidence").and_then(|s| s.parse::<f64>().ok()).unwrap_or(0.0);
+        let top1_doi = extract_field(&bib, "doi").unwrap_or_default();
+        let top1_title = extract_field(&bib, "title").unwrap_or_default();
+
+        *status_counts.entry(status.clone()).or_default() += 1;
+
+        // Re-fetch candidates cheaply for metrics (same as the earlier harness)
+        let q = if rec.input_type == "bib" {
+            rec.bib.as_ref().and_then(|b| b.get("title").and_then(|v| v.as_str()).map(|s| s.to_string())).unwrap_or_else(|| rec.query.clone())
+        } else { rec.query.clone() };
+
+        let qf = veribib::score::QueryFields::from_free_string(&q);
+        let mut cands = Vec::<veribib::net::Candidate>::new();
+        // Crossref
+        {
+            let base_query = q.clone();
+            let year = qf.year;
+            let url = if let Some(y) = year {
+                format!("https://api.crossref.org/works?query.bibliographic={}&rows=5&filter=from-pub-date:{y}-01-01,until-pub-date:{y}-12-31",
+                    percent_encoding::utf8_percent_encode(&base_query, percent_encoding::NON_ALPHANUMERIC))
+            } else {
+                format!("https://api.crossref.org/works?query.bibliographic={}&rows=5",
+                    percent_encoding::utf8_percent_encode(&base_query, percent_encoding::NON_ALPHANUMERIC))
+            };
+            if let Ok(resp) = reqwest::get(&url).await {
+                if let Ok(msg) = resp.json::<veribib::net::CrossrefMessage>().await {
+                    cands.extend(msg.message.items.iter().map(|it| veribib::net::map_crossref(it)));
+                }
+            }
+            // OpenAlex
+            let qq = format!("{} {}", &base_query, ctx.unwrap_or("")).trim().to_string();
+            let url = if let Some(y) = year {
+                format!("https://api.openalex.org/works?search={}&per_page=5&from_publication_date={y}-01-01&to_publication_date={y}-12-31",
+                    percent_encoding::utf8_percent_encode(&qq, percent_encoding::NON_ALPHANUMERIC))
+            } else {
+                format!("https://api.openalex.org/works?search={}&per_page=5",
+                    percent_encoding::utf8_percent_encode(&qq, percent_encoding::NON_ALPHANUMERIC))
+            };
+            if let Ok(resp) = reqwest::get(&url).await {
+                if let Ok(rs) = resp.json::<veribib::net::OpenAlexResults>().await {
+                    cands.extend(rs.results.iter().map(|it| veribib::net::map_openalex(it)));
+                }
+            }
+        }
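+        // For a query like "Evans 1960" (year detected as 1960), the Crossref URL
+        // built above comes out as, illustratively:
+        //   https://api.crossref.org/works?query.bibliographic=Evans%201960&rows=5&filter=from-pub-date:1960-01-01,until-pub-date:1960-12-31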
+        // Rank
+        cands.sort_by(|a, b| {
+            let sa = veribib::score::score_candidate(&qf, ctx, a);
+            let sb = veribib::score::score_candidate(&qf, ctx, b);
+            sb.partial_cmp(&sa).unwrap_or(std::cmp::Ordering::Equal)
+        });
+
+        // Metrics
+        let mut m_doi_exact_1 = 0usize;
+        let mut m_aty_1 = 0usize;
+        let mut m_topk_contains = 0usize;
+        let mut top1_score = 0.0;
+        if let Some(top) = cands.get(0) {
+            top1_score = veribib::score::score_candidate(&qf, ctx, top);
+        }
+
+        if let Some(exp) = rec.expected.clone() {
+            if let Some(doi) = exp.doi.as_ref().map(|s| lc(s)) {
+                if let Some(top) = cands.get(0) {
+                    if top.doi.clone().unwrap_or_default().to_lowercase() == doi { m_doi_exact_1 = 1; }
+                }
+                let topk = cands.iter().take(args.k).any(|c| c.doi.clone().unwrap_or_default().to_lowercase() == doi);
+                if topk { m_topk_contains = 1; }
+            }
+            let mut ok = true;
+            if let Some(t) = exp.title.as_ref().map(|s| lc(s)) {
+                if let Some(top) = cands.get(0) {
+                    let tt = lc(&top.title);
+                    ok &= tt.contains(&t) || (t.contains(&tt) && t.len() < 25);
+                }
+            }
+            if let Some(a) = exp.first_author.as_ref().map(|s| lc(s)) {
+                if let Some(top) = cands.get(0) {
+                    let a0 = top.authors.get(0).map(|s| s.split_whitespace().last().unwrap_or("").to_lowercase()).unwrap_or_default();
+                    ok &= a0 == a;
+                }
+            }
+            if let Some(y) = exp.year.as_ref() {
+                if let Some(top) = cands.get(0) {
+                    ok &= top.year.map(|yy| yy.to_string()) == Some(y.clone());
+                }
+            }
+            if ok { m_aty_1 = 1; }
+        }
+
+        doi_exact_1 += m_doi_exact_1;
+        aty_1 += m_aty_1;
+        topk_contains += m_topk_contains;
+
+        // Hallucination heuristic: top-1 DOI present but does not resolve (HTTP != 2xx)
+        let mut doi_resolves = 0usize;
+        let mut hallucination = 0usize;
+        if !top1_doi.is_empty() {
+            let url = format!("https://doi.org/{}", top1_doi);
+            if let Ok(resp) = reqwest::Client::new().head(&url).send().await {
+                if resp.status().is_success() { doi_resolves = 1; }
+            }
+            if doi_resolves == 0 && status != "not_found" {
+                hallucination = 1;
+            }
+        }
+
+        // Threshold bookkeeping (only for items with an expected DOI, so precision/recall have a ground truth)
+        if rec.expected.as_ref().and_then(|e| e.doi.as_ref()).is_some() {
+            let is_tp = m_doi_exact_1 == 1;
+            for (i, thr) in thresholds.iter().enumerate() {
+                if confidence >= *thr {
+                    if is_tp { tp_at[i] += 1; } else { fp_at[i] += 1; }
+                } else if is_tp {
+                    fn_at[i] += 1;
+                }
+            }
+        }
+
+        // Per-item row
+        rows.push(vec![
+            idx.to_string(),
+            rec.input_type.clone(),
+            q.replace(',', " "),
+            rec.expected.as_ref().and_then(|e| e.doi.clone()).unwrap_or_default(),
+            top1_doi.clone(),
+            format!("{:.3}", top1_score),
+            format!("{:.3}", confidence),
+            status.clone(),
+            m_doi_exact_1.to_string(),
+            m_aty_1.to_string(),
+            m_topk_contains.to_string(),
+            doi_resolves.to_string(),
+            hallucination.to_string(),
+            top1_title.replace(',', " "),
+        ]);
+    }
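+
+    // Worked example for the threshold sweep (hypothetical numbers): with 10 gold
+    // items carrying an expected DOI, suppose 7 are correct at rank 1; at some
+    // threshold, 5 of those 7 clear it (tp=5, fn=2) and 1 of the 3 wrong items also
+    // clears it (fp=1). Then precision = 5/6 ≈ 0.83 and recall = 5/7 ≈ 0.71 in the
+    // CSV written below. Wrong items below the threshold are ignored.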
+
+    let nn = n as f64;
+    println!("VeriBib-rs Evaluation Summary");
+    println!("Gold items: {}", n);
+    println!("doi_exact@1: {}/{} = {:.3}", doi_exact_1, n, doi_exact_1 as f64 / nn);
+    println!("title_author_year_match@1: {}/{} = {:.3}", aty_1, n, aty_1 as f64 / nn);
+    println!("top{} contains expected doi: {}/{} = {:.3}", args.k, topk_contains, n, topk_contains as f64 / nn);
+    println!("Status breakdown:");
+    for (k, v) in status_counts.iter() { println!("  {}: {}", k, v); }
+
+    // Write CSVs if requested
+    if let Some(path) = args.items_csv.as_ref() {
+        let mut w = std::fs::File::create(path)?;
+        writeln!(&mut w, "id,input_type,query,expected_doi,top1_doi,top1_score,x_confidence,x_status,doi_exact1,aty1,topk_contains,doi_resolves,hallucination,top1_title")?;
+        for r in rows {
+            writeln!(&mut w, "{}", r.join(","))?;
+        }
+        println!("Wrote per-item CSV: {}", path.display());
+    }
+
+    if let Some(path) = args.csv.as_ref() {
+        let mut w = std::fs::File::create(path)?;
+        writeln!(&mut w, "threshold,precision,recall,tp,fp,fn")?;
+        for i in 0..thresholds.len() {
+            let tp = tp_at[i] as f64;
+            let fp = fp_at[i] as f64;
+            let fnv = fn_at[i] as f64;
+            let prec = if tp + fp > 0.0 { tp / (tp + fp) } else { 1.0 };
+            let rec = if tp + fnv > 0.0 { tp / (tp + fnv) } else { 0.0 };
+            writeln!(&mut w, "{:.2},{:.4},{:.4},{},{},{}", thresholds[i], prec, rec, tp_at[i], fp_at[i], fn_at[i])?;
+        }
+        println!("Wrote threshold metrics CSV: {}", path.display());
+    }
+
+    Ok(())
+}
+
+fn extract_field(bib: &str, key: &str) -> Option<String> {
+    let pat = format!(r"(?im)^\s*{}\s*=\s*\{{(.*?)\}},?\s*$", regex::escape(key));
+    let re = regex::Regex::new(&pat).ok()?;
+    let cap = re.captures(bib)?;
+    Some(cap.get(1)?.as_str().to_string())
+}
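
Note: `src/eval.rs` and `tests/bib_roundtrip.rs` address the crate as a library
(`veribib::…`), which requires a library target alongside the two binaries. A
minimal `src/lib.rs` along these lines would provide it (a suggested companion
file, not reflected in the diffstat above):

diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,5 @@
+pub mod bib;
+pub mod net;
+pub mod score;
+#[cfg(feature = "llm")]
+pub mod llm;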
}"; + let user = serde_json::json!({ + "task": "extract_bibliographic_clues", + "input": { "free_text": free_text, "context": context } + }).to_string(); + + let messages = vec![ + Message { role: "system", content: sys }, + Message { role: "user", content: &user }, + ]; + + if base_url.trim_end_matches('/').ends_with("/v1") { + // OpenAI-compatible + let req = ChatRequest { model, messages, temperature: 0.0 }; + let url = format!("{}/chat/completions", base_url.trim_end_matches('/')); + let resp = reqwest::Client::new() + .post(&url) + .json(&req) + .send().await.ok()?; + let data: OpenAIResponse = resp.json().await.ok()?; + let content = data.choices.get(0)?.message.content.clone(); + serde_json::from_str::(&content).ok() + } else { + // Ollama native + let req = OllamaChatReq { + model, + messages, + stream: false, + options: OllamaOptions { temperature: 0.0 }, + }; + let url = format!("{}/api/chat", base_url.trim_end_matches('/')); + let resp = reqwest::Client::new().post(&url).json(&req).send().await.ok()?; + #[derive(Deserialize)] struct OllamaResp { message: ChoiceMsg } + let data: OllamaResp = resp.json().await.ok()?; + serde_json::from_str::(&data.message.content).ok() + } +} + +pub async fn rerank_indices(base_url: &str, model: &str, brief: &serde_json::Value) -> Option> { + let sys = "You rerank candidate citations. Respond with a JSON array of indices (0..N-1) sorted best->worst. Nothing else."; + let user = serde_json::json!({ + "task":"rerank_candidates", + "input": brief + }).to_string(); + + let messages = vec![ + Message { role: "system", content: sys }, + Message { role: "user", content: &user }, + ]; + + if base_url.trim_end_matches('/').ends_with("/v1") { + let req = ChatRequest { model, messages, temperature: 0.0 }; + let url = format!("{}/chat/completions", base_url.trim_end_matches('/')); + let resp = reqwest::Client::new().post(&url).json(&req).send().await.ok()?; + let data: OpenAIResponse = resp.json().await.ok()?; + let content = data.choices.get(0)?.message.content.clone(); + serde_json::from_str::>(&content).ok() + } else { + let req = OllamaChatReq { + model, + messages, + stream: false, + options: OllamaOptions { temperature: 0.0 }, + }; + let url = format!("{}/api/chat", base_url.trim_end_matches('/')); + let resp = reqwest::Client::new().post(&url).json(&req).send().await.ok()?; + #[derive(Deserialize)] struct OllamaResp { message: ChoiceMsg } + let data: OllamaResp = resp.json().await.ok()?; + serde_json::from_str::>(&data.message.content).ok() + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..d902c0a --- /dev/null +++ b/src/main.rs @@ -0,0 +1,61 @@ +use clap::Parser; +use std::path::PathBuf; + +mod net; +mod score; +mod bib; +#[cfg(feature = "llm")] mod llm; + +#[derive(Parser, Debug)] +#[command(author, version, about = "VeriBib-rs: Crossref+OpenAlex → annotated BibTeX")] +struct Args { + #[arg(long)] string: Option, + #[arg(long)] list: Option, + #[arg(long)] bib: Option, + #[arg(long)] context: Option, + #[arg(long)] out: PathBuf, + #[arg(long, default_value_t = false)] llm: bool, + #[arg(long)] llm_base_url: Option, + #[arg(long)] llm_model: Option, +} + +#[tokio::main(flavor = "multi_thread")] +async fn main() -> anyhow::Result<()> { + let args = Args::parse(); + let mut items: Vec = Vec::new(); + + if let Some(s) = args.string.as_ref() { items.push(net::InputItem::FreeString(s.clone())); } + if let Some(list) = args.list.as_ref() { + let content = std::fs::read_to_string(list)?; + for line in content.lines().map(|s| 
+
+pub async fn extract_hints(base_url: &str, model: &str, free_text: &str, context: &str) -> Option<serde_json::Value> {
+    let sys = "You are a meticulous citation analyst. NEVER invent DOIs. When unsure, say null. Respond with STRICT JSON: {\"title\":...,\"authors\":[...],\"year\":...,\"venue\":...,\"keywords\":[...]}";
+    let user = serde_json::json!({
+        "task": "extract_bibliographic_clues",
+        "input": { "free_text": free_text, "context": context }
+    }).to_string();
+
+    let messages = vec![
+        Message { role: "system", content: sys },
+        Message { role: "user", content: &user },
+    ];
+
+    if base_url.trim_end_matches('/').ends_with("/v1") {
+        // OpenAI-compatible
+        let req = ChatRequest { model, messages, temperature: 0.0 };
+        let url = format!("{}/chat/completions", base_url.trim_end_matches('/'));
+        let resp = reqwest::Client::new()
+            .post(&url)
+            .json(&req)
+            .send().await.ok()?;
+        let data: OpenAIResponse = resp.json().await.ok()?;
+        let content = data.choices.get(0)?.message.content.clone();
+        serde_json::from_str::<serde_json::Value>(&content).ok()
+    } else {
+        // Ollama native
+        let req = OllamaChatReq {
+            model,
+            messages,
+            stream: false,
+            options: OllamaOptions { temperature: 0.0 },
+        };
+        let url = format!("{}/api/chat", base_url.trim_end_matches('/'));
+        let resp = reqwest::Client::new().post(&url).json(&req).send().await.ok()?;
+        #[derive(Deserialize)] struct OllamaResp { message: ChoiceMsg }
+        let data: OllamaResp = resp.json().await.ok()?;
+        serde_json::from_str::<serde_json::Value>(&data.message.content).ok()
+    }
+}
+
+pub async fn rerank_indices(base_url: &str, model: &str, brief: &serde_json::Value) -> Option<Vec<usize>> {
+    let sys = "You rerank candidate citations. Respond with a JSON array of indices (0..N-1) sorted best->worst. Nothing else.";
+    let user = serde_json::json!({
+        "task": "rerank_candidates",
+        "input": brief
+    }).to_string();
+
+    let messages = vec![
+        Message { role: "system", content: sys },
+        Message { role: "user", content: &user },
+    ];
+
+    if base_url.trim_end_matches('/').ends_with("/v1") {
+        let req = ChatRequest { model, messages, temperature: 0.0 };
+        let url = format!("{}/chat/completions", base_url.trim_end_matches('/'));
+        let resp = reqwest::Client::new().post(&url).json(&req).send().await.ok()?;
+        let data: OpenAIResponse = resp.json().await.ok()?;
+        let content = data.choices.get(0)?.message.content.clone();
+        serde_json::from_str::<Vec<usize>>(&content).ok()
+    } else {
+        let req = OllamaChatReq {
+            model,
+            messages,
+            stream: false,
+            options: OllamaOptions { temperature: 0.0 },
+        };
+        let url = format!("{}/api/chat", base_url.trim_end_matches('/'));
+        let resp = reqwest::Client::new().post(&url).json(&req).send().await.ok()?;
+        #[derive(Deserialize)] struct OllamaResp { message: ChoiceMsg }
+        let data: OllamaResp = resp.json().await.ok()?;
+        serde_json::from_str::<Vec<usize>>(&data.message.content).ok()
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..d902c0a
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,61 @@
+use clap::Parser;
+use std::path::PathBuf;
+
+use veribib::{bib, net};
+#[cfg(feature = "llm")]
+use veribib::llm;
+
+#[derive(Parser, Debug)]
+#[command(author, version, about = "VeriBib-rs: Crossref+OpenAlex → annotated BibTeX")]
+struct Args {
+    #[arg(long)] string: Option<String>,
+    #[arg(long)] list: Option<PathBuf>,
+    #[arg(long)] bib: Option<PathBuf>,
+    #[arg(long)] context: Option<String>,
+    #[arg(long)] out: PathBuf,
+    #[arg(long, default_value_t = false)] llm: bool,
+    #[arg(long)] llm_base_url: Option<String>,
+    #[arg(long)] llm_model: Option<String>,
+}
+
+#[tokio::main(flavor = "multi_thread")]
+async fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+    let mut items: Vec<net::InputItem> = Vec::new();
+
+    if let Some(s) = args.string.as_ref() { items.push(net::InputItem::FreeString(s.clone())); }
+    if let Some(list) = args.list.as_ref() {
+        let content = std::fs::read_to_string(list)?;
+        for line in content.lines().map(|s| s.trim()).filter(|s| !s.is_empty()) {
+            items.push(net::InputItem::FreeString(line.to_string()));
+        }
+    }
+    if let Some(bibpath) = args.bib.as_ref() {
+        let raw = std::fs::read_to_string(bibpath)?;
+        for chunk in raw.split('@').filter(|c| !c.trim().is_empty()) {
+            let entry = format!("@{}", chunk);
+            let fields = bib::parse_minimal_bib_fields(&entry);
+            items.push(net::InputItem::BibLike(fields));
+        }
+    }
+    if items.is_empty() { eprintln!("No input items."); std::process::exit(1); }
+
+    #[cfg(feature = "llm")]
+    let llm_cfg = if args.llm { Some(llm::LlmConfig { base_url: args.llm_base_url.clone().unwrap_or_default(), model: args.llm_model.clone().unwrap_or_else(|| "llama3.1".to_string()) }) } else { None };
+    if !cfg!(feature = "llm") && args.llm {
+        eprintln!("--llm requested, but this binary was built without the `llm` feature; ignoring.");
+    }
+
+    let mut out = String::new();
+    for it in items {
+        // `#[cfg]` is not allowed on call arguments on stable Rust, so duplicate the call.
+        #[cfg(feature = "llm")]
+        let entry = net::process_item(it, args.context.as_deref(), llm_cfg.as_ref()).await;
+        #[cfg(not(feature = "llm"))]
+        let entry = net::process_item(it, args.context.as_deref()).await;
+        out.push_str(&entry.unwrap_or_else(|e| bib::error_entry(&format!("{}", e))));
+    }
+    std::fs::write(&args.out, out)?;
+    println!("Wrote {}", args.out.display());
+    Ok(())
+}
diff --git a/src/net.rs b/src/net.rs
new file mode 100644
index 0000000..322b716
--- /dev/null
+++ b/src/net.rs
@@ -0,0 +1,155 @@
+use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
+use serde::Deserialize;
+
+use crate::score::{score_candidate, QueryFields};
+use crate::bib::{bib_entry, MinimalBibFields};
+
+pub enum InputItem { FreeString(String), BibLike(MinimalBibFields) }
+
+// The raw-API types and mappers are `pub` because the eval harness
+// (src/eval.rs) re-fetches candidates through them.
+#[derive(Deserialize, Debug)] pub struct CrossrefMessage { pub message: CrossrefWorks }
+#[derive(Deserialize, Debug)] pub struct CrossrefWorks { pub items: Vec<CrossrefItem> }
+#[derive(Deserialize, Debug, Clone)]
+pub struct CrossrefItem {
+    #[serde(default)] title: Vec<String>,
+    #[serde(default)] author: Vec<Person>,
+    #[serde(default)] issued: DateParts,
+    #[serde(default, rename = "container-title")] container_title: Vec<String>,
+    #[serde(rename = "DOI")] doi: Option<String>,
+    #[serde(default)] subject: Vec<String>,
+}
+#[derive(Deserialize, Debug, Clone)] struct Person { given: Option<String>, family: Option<String> }
+#[derive(Deserialize, Debug, Clone, Default)] struct DateParts { #[serde(rename = "date-parts", default)] date_parts: Vec<Vec<i32>> }
+
+#[derive(Deserialize, Debug)] pub struct OpenAlexResults { pub results: Vec<OpenAlexItem> }
+#[derive(Deserialize, Debug, Clone)]
+pub struct OpenAlexItem {
+    title: Option<String>,
+    #[serde(default)] authorships: Vec<OpenAlexAuthorship>,
+    publication_year: Option<i32>,
+    #[serde(default)] host_venue: Option<OpenAlexVenue>,
+    doi: Option<String>,
+    #[serde(default)] concepts: Vec<OpenAlexConcept>,
+}
+#[derive(Deserialize, Debug, Clone)] struct OpenAlexAuthorship { author: OpenAlexAuthor }
+#[derive(Deserialize, Debug, Clone)] struct OpenAlexAuthor { display_name: String }
+#[derive(Deserialize, Debug, Clone, Default)] struct OpenAlexVenue { display_name: String }
+#[derive(Deserialize, Debug, Clone, Default)] struct OpenAlexConcept { display_name: String }
+
+pub fn map_crossref(it: &CrossrefItem) -> Candidate {
+    Candidate {
+        title: it.title.get(0).cloned().unwrap_or_default(),
+        authors: it.author.iter().map(|a| format!("{} {}", a.given.clone().unwrap_or_default(), a.family.clone().unwrap_or_default()).trim().to_string()).collect(),
+        year: it.issued.date_parts.get(0).and_then(|v| v.get(0)).cloned(),
+        venue: it.container_title.get(0).cloned().unwrap_or_default(),
+        doi: it.doi.clone(),
+        concepts: it.subject.clone(),
+        source: "crossref".to_string(),
+    }
+}
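+
+// Illustrative sketch of the Crossref `/works` item fields consumed above
+// (abridged; real responses carry many more fields):
+//
+//   {"title": ["..."], "author": [{"given": "William E.", "family": "Evans"}],
+//    "issued": {"date-parts": [[1960]]}, "container-title": ["..."],
+//    "DOI": "10.0000/example", "subject": ["Acoustics"]}
+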
+pub fn map_openalex(it: &OpenAlexItem) -> Candidate {
+    Candidate {
+        title: it.title.clone().unwrap_or_default(),
+        authors: it.authorships.iter().map(|a| a.author.display_name.clone()).collect(),
+        year: it.publication_year,
+        venue: it.host_venue.as_ref().map(|v| v.display_name.clone()).unwrap_or_default(),
+        doi: it.doi.clone().map(|d| d.replace("https://doi.org/", "")),
+        concepts: it.concepts.iter().map(|c| c.display_name.clone()).collect(),
+        source: "openalex".to_string(),
+    }
+}
+
+#[derive(Clone)]
+pub struct Candidate {
+    pub title: String,
+    pub authors: Vec<String>,
+    pub year: Option<i32>,
+    pub venue: String,
+    pub doi: Option<String>,
+    pub concepts: Vec<String>,
+    pub source: String,
+}
+
+pub async fn process_item(
+    item: InputItem,
+    context: Option<&str>,
+    // Accepted but not yet consulted; `llm::extract_hints` and
+    // `llm::rerank_indices` are the intended hooks.
+    #[cfg(feature = "llm")] _llm_cfg: Option<&crate::llm::LlmConfig>,
+) -> anyhow::Result<String> {
+    let (q, base_query) = match item {
+        InputItem::FreeString(s) => (QueryFields::from_free_string(&s), s),
+        InputItem::BibLike(b) => {
+            let q = QueryFields {
+                title: b.title.clone(),
+                authors: b.author.map(|a| vec![a]).unwrap_or_default(),
+                year: b.year.and_then(|y| y.parse::<i32>().ok()),
+                venue: b.journal.or(b.booktitle),
+            };
+            let bq = if let Some(ref d) = b.doi {
+                d.clone()
+            } else {
+                format!("{} {} {}", q.title.clone().unwrap_or_default(), q.authors.get(0).cloned().unwrap_or_default(), q.year.map(|y| y.to_string()).unwrap_or_default()).trim().to_string()
+            };
+            (q, bq)
+        }
+    };
+
+    let mut cands = Vec::<Candidate>::new();
+    let year = q.year;
+
+    // Crossref
+    {
+        let query = base_query.clone();
+        let url = if let Some(y) = year {
+            format!("https://api.crossref.org/works?query.bibliographic={}&rows=5&filter=from-pub-date:{y}-01-01,until-pub-date:{y}-12-31",
+                utf8_percent_encode(&query, NON_ALPHANUMERIC))
+        } else {
+            format!("https://api.crossref.org/works?query.bibliographic={}&rows=5",
+                utf8_percent_encode(&query, NON_ALPHANUMERIC))
+        };
+        if let Ok(resp) = reqwest::get(&url).await {
+            if let Ok(msg) = resp.json::<CrossrefMessage>().await {
+                cands.extend(msg.message.items.iter().map(map_crossref));
+            }
+        }
+    }
+    // OpenAlex
+    {
+        let query = format!("{} {}", base_query, context.unwrap_or("")).trim().to_string();
+        let url = if let Some(y) = year {
+            format!("https://api.openalex.org/works?search={}&per_page=5&from_publication_date={y}-01-01&to_publication_date={y}-12-31",
+                utf8_percent_encode(&query, NON_ALPHANUMERIC))
+        } else {
+            format!("https://api.openalex.org/works?search={}&per_page=5",
+                utf8_percent_encode(&query, NON_ALPHANUMERIC))
+        };
+        if let Ok(resp) = reqwest::get(&url).await {
+            if let Ok(rs) = resp.json::<OpenAlexResults>().await {
+                cands.extend(rs.results.iter().map(map_openalex));
+            }
+        }
+    }
+
+    let mut scored = cands.clone();
+    scored.sort_by(|a, b| {
+        let sa = score_candidate(&q, context, a);
+        let sb = score_candidate(&q, context, b);
+        sb.partial_cmp(&sa).unwrap_or(std::cmp::Ordering::Equal)
+    });
+
+    if let Some(best) = scored.get(0) {
+        let s = score_candidate(&q, context, best);
+        let status = if best.doi.is_some() && s >= 0.95 { "exact" }
+            else if s >= 0.75 { "high_confidence" }
+            else { "ambiguous" };
+        let alts = scored.get(1..std::cmp::min(4, scored.len())).unwrap_or(&[]).to_vec();
+        Ok(bib_entry("veribib", best, status, s, &base_query, context.unwrap_or(""), &alts))
+    } else {
+        let placeholder = Candidate {
+            title: q.title.clone().unwrap_or_default(),
+            authors: q.authors.clone(),
+            year: q.year,
+            venue: q.venue.clone().unwrap_or_default(),
+            doi: None, concepts: vec![], source: "none".into(),
+        };
+        Ok(bib_entry("not_found", &placeholder, "not_found", 0.0, &base_query, context.unwrap_or(""), &[]))
+    }
+}
diff --git a/src/score.rs b/src/score.rs
new file mode 100644
index 0000000..ae38887
--- /dev/null
+++ b/src/score.rs
@@ -0,0 +1,50 @@
+use std::collections::HashSet;
+use regex::Regex;
+
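+// Additive scoring rubric (see score_candidate below): title token overlap ≥ 0.7
+// earns 0.40, matching first-author surname 0.25, exact year 0.20 (off-by-one
+// year 0.10), venue match 0.10, context/concept overlap 0.05; capped at 1.0.
+// net::process_item maps the best score to x_status: ≥ 0.95 with a DOI is
+// "exact", ≥ 0.75 "high_confidence", otherwise "ambiguous".
+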
+#[derive(Clone)]
+pub struct QueryFields {
+    pub title: Option<String>,
+    pub authors: Vec<String>,
+    pub year: Option<i32>,
+    pub venue: Option<String>,
+}
+
+impl QueryFields {
+    pub fn from_free_string(s: &str) -> Self {
+        // Heuristic: take the first plausible publication year (1800-2099),
+        // rather than any 4-digit substring (which would also match DOI fragments).
+        let year = Regex::new(r"\b(1[89]\d{2}|20\d{2})\b").ok()
+            .and_then(|re| re.find(s))
+            .and_then(|m| m.as_str().parse::<i32>().ok());
+        QueryFields { title: Some(s.to_string()), authors: vec![], year, venue: None }
+    }
+}
+
+fn norm(s: &str) -> String { s.to_lowercase().split_whitespace().collect::<Vec<_>>().join(" ") }
+fn first_surname(v: &[String]) -> Option<String> { v.get(0).map(|s| s.split_whitespace().last().unwrap_or("").to_lowercase()) }
+
+pub fn score_candidate(q: &QueryFields, ctx: Option<&str>, c: &crate::net::Candidate) -> f64 {
+    let mut sc = 0.0;
+    if let Some(ref t) = q.title {
+        let (tn, cn) = (norm(t), norm(&c.title));
+        let tq: HashSet<&str> = tn.split_whitespace().collect();
+        let tc: HashSet<&str> = cn.split_whitespace().collect();
+        if !tq.is_empty() {
+            let inter = tq.intersection(&tc).count() as f64 / tq.len() as f64;
+            if inter >= 0.7 { sc += 0.40; }
+        }
+    }
+    if let (Some(qa0), Some(ca0)) = (first_surname(&q.authors), first_surname(&c.authors)) {
+        if !qa0.is_empty() && qa0 == ca0 { sc += 0.25; }
+    }
+    if let (Some(qy), Some(cy)) = (q.year, c.year) {
+        let dy = (qy - cy).abs();
+        sc += if dy == 0 { 0.20 } else if dy == 1 { 0.10 } else { 0.0 };
+    }
+    if let Some(vq) = q.venue.as_ref() {
+        if norm(vq) == norm(&c.venue) { sc += 0.10; }
+    }
+    if let Some(ctx) = ctx {
+        let ctx_norm = norm(ctx);
+        let ctoks: HashSet<&str> = ctx_norm.split_whitespace().collect();
+        let conc: Vec<String> = c.concepts.iter().map(|s| norm(s)).collect();
+        if !ctoks.is_empty() && conc.iter().any(|cc| ctoks.contains(cc.as_str())) { sc += 0.05; }
+    }
+    sc.min(1.0)
+}
diff --git a/tests/bib_roundtrip.rs b/tests/bib_roundtrip.rs
new file mode 100644
index 0000000..bf7586b
--- /dev/null
+++ b/tests/bib_roundtrip.rs
@@ -0,0 +1,29 @@
+
+// Integration tests address the crate through its library target.
+use veribib::{bib, net};
+
+#[test]
+fn bib_round_trip_minimal() {
+    let src = r#"
+@article{evans1960,
+  author = {William E. Evans},
+  title = {Some Observations on the Echolocation of the Bottlenose Dolphin},
+  year = {1960},
+  journal = {Journal of the Acoustical Society of America},
+  doi = {10.0000/example}
+}
+"#;
+    let f = bib::parse_minimal_bib_fields(src);
+    assert!(f.author.clone().unwrap().contains("Evans"));
+    let cand = net::Candidate {
+        title: f.title.clone().unwrap_or_default(),
+        authors: vec![f.author.unwrap_or_default()],
+        year: f.year.and_then(|y| y.parse::<i32>().ok()),
+        venue: f.journal.unwrap_or_default(),
+        doi: f.doi,
+        concepts: vec![],
+        source: "test".into(),
+    };
+    let out = bib::bib_entry("evans1960", &cand, "exact", 0.99, "Evans 1960", "echolocation", &[]);
+    assert!(out.contains("@article{evans1960"));
+    assert!(out.contains("x_status = {exact}"));
+}
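
For reference, the eval harness and the plot script compose like this (paths and
file names illustrative):

    cargo build --release
    ./target/release/veribib-eval --gold gold.jsonl --csv metrics.csv --items-csv items.csv
    python scripts/plot_eval.py metrics.csv curves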