Initial commit
parent ee79001025 · commit 8d092f5c98
.cargo/config.toml
@ -0,0 +1,8 @@
# Tri-target friendly config.
# For Linux static builds:
# rustup target add x86_64-unknown-linux-musl
# sudo apt-get install -y musl-tools
# cargo build --release --target x86_64-unknown-linux-musl
#
# Future: Cosmopolitan experiments (advanced)
# You can try setting cosmocc as linker and using APE post-link steps.
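The commit ships this file as comments only. An actual per-target linker override in the same file would use standard Cargo config syntax; a minimal sketch (not part of this commit — `musl-gcc` is the linker installed by `musl-tools`):

```toml
# Hypothetical .cargo/config.toml stanza: pin a linker for the musl target.
[target.x86_64-unknown-linux-musl]
linker = "musl-gcc"
```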
.github/workflows/build.yml
@ -0,0 +1,33 @@
name: build
on: [push, pull_request]
jobs:
  build:
    strategy:
      matrix:
        include:
          - os: ubuntu-latest
            target: x86_64-unknown-linux-gnu
          - os: ubuntu-latest
            target: x86_64-unknown-linux-musl
          - os: windows-latest
            target: x86_64-pc-windows-msvc
          - os: macos-latest
            target: x86_64-apple-darwin
          - os: macos-latest
            target: aarch64-apple-darwin
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
        with:
          targets: ${{ matrix.target }}
      - name: Install musl tools
        if: matrix.target == 'x86_64-unknown-linux-musl'
        run: sudo apt-get update && sudo apt-get install -y musl-tools
      - name: Build
        run: cargo build --release --target ${{ matrix.target }}
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: veribib-${{ matrix.target }}
          path: target/${{ matrix.target }}/release/*
Cargo.toml
@ -0,0 +1,36 @@
[package]
name = "veribib"
version = "0.3.0"
edition = "2021"
license = "MIT"
description = "VeriBib-rs: verify/disambiguate citations via Crossref/OpenAlex, output annotated BibTeX."

[dependencies]
anyhow = "1.0"   # both binaries use anyhow::Result in their main functions
biblatex = "0.9" # If build fails, try `biblatex = "0.8"` or switch to `bibtex-parser`
clap = { version = "4.5", features = ["derive"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
thiserror = "1.0"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] }
webpki-roots = "0.26"
tokio = { version = "1.38", features = ["rt-multi-thread", "macros"] }
percent-encoding = "2.3"
regex = "1.10"

[features]
llm = []

[profile.release]
lto = true
codegen-units = 1
opt-level = "z"
strip = "symbols"

[[bin]]
name = "veribib"
path = "src/main.rs"

# NOTE: eval.rs and the integration test refer to the crate as `veribib::…`,
# which also requires a library target (a src/lib.rs re-exporting bib, net, score, llm).
[[bin]]
name = "veribib-eval"
path = "src/eval.rs"
README.md
@ -1,3 +1,15 @@
# VeriBib-rs (tri-target Rust)
VeriBib-rs is a Rust implementation of the VeriBib citation checking tool.
- Crossref + OpenAlex lookups
- Portable TLS (reqwest+rustls)
- Emits annotated BibTeX (`x_status`, `x_confidence`, …)
- GitHub Actions matrix for Win/macOS/Linux
- Future note for Cosmopolitan APE in `.cargo/config.toml`

## Build
See `.github/workflows/build.yml` or run locally per target.

## Use
```
./veribib --string "Evans 1960" --context "bottlenose dolphin echolocation" --out out.bib
```
scripts/plot_eval.py
@ -0,0 +1,44 @@
import sys, csv
import matplotlib.pyplot as plt

def main(csv_path, out_prefix="curves"):
    thr = []
    prec = []
    rec = []
    with open(csv_path, newline="") as f:
        r = csv.DictReader(f)
        for row in r:
            thr.append(float(row["threshold"]))
            prec.append(float(row["precision"]))
            rec.append(float(row["recall"]))

    # Precision-Recall
    plt.figure()
    plt.plot(rec, prec, marker='o')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"{out_prefix}_pr.png", dpi=160)

    # Precision & Recall vs Threshold
    plt.figure()
    plt.plot(thr, prec, marker='o', label="Precision")
    plt.plot(thr, rec, marker='o', label="Recall")
    plt.xlabel('Confidence threshold')
    plt.ylabel('Score')
    plt.title('Precision/Recall vs Threshold')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"{out_prefix}_thr.png", dpi=160)

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python scripts/plot_eval.py metrics.csv [out_prefix]")
        sys.exit(1)
    csv_path = sys.argv[1]
    out_prefix = sys.argv[2] if len(sys.argv) > 2 else "curves"
    main(csv_path, out_prefix)
src/bib.rs
@ -0,0 +1,67 @@
use std::fmt::Write as _;
use regex::Regex;
use crate::net::Candidate;

#[derive(Clone, Default)]
pub struct MinimalBibFields {
    pub title: Option<String>,
    pub author: Option<String>,
    pub year: Option<String>,
    pub journal: Option<String>,
    pub booktitle: Option<String>,
    pub doi: Option<String>,
}

pub fn parse_minimal_bib_fields(entry: &str) -> MinimalBibFields {
    fn grab(entry: &str, key: &str) -> Option<String> {
        // Match `key = {value}` or `key = "value"`, case-insensitively, across lines.
        let pat = format!(r#"(?is)\b{}\s*=\s*[{{"](.*?)[}}"]"#, key);
        let re = Regex::new(&pat).ok()?;
        let cap = re.captures(entry)?;
        Some(cap.get(1)?.as_str().trim().to_string())
    }
    MinimalBibFields {
        title: grab(entry, "title"),
        author: grab(entry, "author"),
        year: grab(entry, "year"),
        journal: grab(entry, "journal"),
        booktitle: grab(entry, "booktitle"),
        doi: grab(entry, "doi"),
    }
}

// Escape braces so field values cannot break out of their BibTeX group.
fn esc(s: &str) -> String {
    s.replace('{', "\\{").replace('}', "\\}")
}

pub fn bib_entry(
    key: &str,
    meta: &Candidate,
    status: &str,
    confidence: f64,
    query: &str,
    context: &str,
    alternates: &[Candidate],
) -> String {
    let mut out = String::new();
    let _ = writeln!(&mut out, "@article{{{key},");
    let _ = writeln!(&mut out, "  author = {{{}}},", meta.authors.join(" and "));
    let _ = writeln!(&mut out, "  title = {{{}}},", esc(&meta.title));
    if !meta.venue.is_empty() { let _ = writeln!(&mut out, "  journal = {{{}}},", esc(&meta.venue)); }
    if let Some(y) = meta.year { let _ = writeln!(&mut out, "  year = {{{}}},", y); }
    if let Some(doi) = &meta.doi { let _ = writeln!(&mut out, "  doi = {{{}}},", doi); }
    let _ = writeln!(&mut out, "  x_status = {{{}}},", status);
    let _ = writeln!(&mut out, "  x_confidence = {{{:.2}}},", confidence);
    let _ = writeln!(&mut out, "  x_source = {{{}}},", meta.source);
    let _ = writeln!(&mut out, "  x_query = {{{}}},", esc(query));
    let _ = writeln!(&mut out, "  x_context = {{{}}},", esc(context));
    if !alternates.is_empty() {
        let alt = alternates.iter()
            .map(|a| format!("{}|{}|{}|{}",
                a.doi.clone().unwrap_or_default(),
                a.title,
                a.authors.first().cloned().unwrap_or_default(),
                a.year.map(|y| y.to_string()).unwrap_or_default()))
            .collect::<Vec<_>>().join(" || ");
        let _ = writeln!(&mut out, "  x_alternates = {{{}}},", esc(&alt));
    }
    // Drop the trailing comma on the last field.
    if out.ends_with(",\n") { out.truncate(out.len() - 2); out.push('\n'); }
    out.push_str("}\n");
    out
}

pub fn error_entry(msg: &str) -> String {
    format!(
        "@misc{{veribib_error,\n  x_status = {{not_found}},\n  x_confidence = {{0.0}},\n  x_source = {{error}},\n  x_query = {{{}}}\n}}\n",
        esc(msg)
    )
}
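For orientation, `bib_entry` called on a fully populated `Candidate` (the Evans fixture exercised by the integration test at the end of this commit) emits an entry of this shape, with the trailing comma stripped from the last field:

```
@article{evans1960,
  author = {William E. Evans},
  title = {Some Observations on the Echolocation of the Bottlenose Dolphin},
  journal = {Journal of the Acoustical Society of America},
  year = {1960},
  doi = {10.0000/example},
  x_status = {exact},
  x_confidence = {0.99},
  x_source = {test},
  x_query = {Evans 1960},
  x_context = {echolocation}
}
```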
src/eval.rs
@ -0,0 +1,284 @@
use clap::Parser;
use std::path::PathBuf;
use serde::Deserialize;
use std::io::Write;

#[derive(Parser, Debug)]
#[command(author, version, about = "VeriBib-rs evaluation harness")]
struct Args {
    #[arg(long)] gold: PathBuf,
    /// Top-k for DOI containment metric
    #[arg(long, default_value_t = 3)] k: usize,
    /// Enable LLM assistance (requires building with `--features llm`)
    #[arg(long, default_value_t = false)] llm: bool,
    /// OpenAI-compatible base URL (e.g., http://localhost:11434/v1) or Ollama native base (http://localhost:11434)
    #[arg(long)] llm_base_url: Option<String>,
    /// Model name (e.g., llama3.1)
    #[arg(long)] llm_model: Option<String>,
    /// Write overall metrics CSV here (optional)
    #[arg(long)] csv: Option<PathBuf>,
    /// Write per-item CSV here (optional)
    #[arg(long)] items_csv: Option<PathBuf>,
}

#[derive(Deserialize)]
struct GoldRec {
    #[serde(default)] input_type: String, // "string" or "bib"
    #[serde(default)] query: String,
    #[serde(default)] context: String,
    #[serde(default)] bib: Option<serde_json::Value>,
    #[serde(default)] expected: Option<Expected>,
}
#[derive(Deserialize, Default, Clone)]
struct Expected {
    doi: Option<String>,
    title: Option<String>,
    first_author: Option<String>,
    year: Option<String>,
}

#[tokio::main(flavor = "multi_thread")]
async fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    let raw = std::fs::read_to_string(&args.gold)?;
    let mut items = Vec::<GoldRec>::new();
    for line in raw.lines().filter(|l| !l.trim().is_empty()) {
        items.push(serde_json::from_str::<GoldRec>(line)?);
    }

    #[cfg(feature = "llm")]
    let llm_cfg = if args.llm {
        Some(veribib::llm::LlmConfig {
            base_url: args.llm_base_url.clone().unwrap_or_default(),
            model: args.llm_model.clone().unwrap_or_else(|| "llama3.1".to_string()),
        })
    } else { None };

    let mut n = 0usize;
    let mut doi_exact_1 = 0usize;
    let mut aty_1 = 0usize;
    let mut topk_contains = 0usize;
    let mut status_counts: std::collections::BTreeMap<String, usize> = Default::default();

    // Per-item rows
    let mut rows: Vec<Vec<String>> = Vec::new();

    // For threshold curves
    let thresholds: Vec<f64> = (0..=20).map(|i| i as f64 / 20.0).collect();
    let mut tp_at: Vec<usize> = vec![0; thresholds.len()];
    let mut fp_at: Vec<usize> = vec![0; thresholds.len()];
    let mut fn_at: Vec<usize> = vec![0; thresholds.len()];

    // Lowercase helper for comparisons
    let lc = |s: &str| s.to_lowercase();

    for (idx, rec) in items.into_iter().enumerate() {
        n += 1;
        let ctx = if rec.context.is_empty() { None } else { Some(rec.context.as_str()) };

        // Build input
        let input = if rec.input_type == "bib" {
            let b = rec.bib.clone().unwrap_or_default();
            let m = veribib::bib::MinimalBibFields {
                title: b.get("title").and_then(|v| v.as_str()).map(|s| s.to_string()),
                author: b.get("author").and_then(|v| v.as_str()).map(|s| s.to_string()),
                year: b.get("year").and_then(|v| v.as_str()).map(|s| s.to_string()),
                journal: b.get("journal").and_then(|v| v.as_str()).map(|s| s.to_string()),
                booktitle: b.get("booktitle").and_then(|v| v.as_str()).map(|s| s.to_string()),
                doi: b.get("doi").and_then(|v| v.as_str()).map(|s| s.to_string()),
            };
            veribib::net::InputItem::BibLike(m)
        } else {
            veribib::net::InputItem::FreeString(rec.query.clone())
        };

        // Run process_item to generate BibTeX, then parse status and confidence.
        // (cfg attributes on call arguments are not stable, so gate the whole call.)
        #[cfg(feature = "llm")]
        let bib = veribib::net::process_item(input, ctx, llm_cfg.as_ref())
            .await.unwrap_or_else(|e| veribib::bib::error_entry(&format!("{}", e)));
        #[cfg(not(feature = "llm"))]
        let bib = veribib::net::process_item(input, ctx)
            .await.unwrap_or_else(|e| veribib::bib::error_entry(&format!("{}", e)));

        let status = extract_field(&bib, "x_status").unwrap_or_else(|| "unknown".into());
        let confidence = extract_field(&bib, "x_confidence").and_then(|s| s.parse::<f64>().ok()).unwrap_or(0.0);
        let top1_doi = extract_field(&bib, "doi").unwrap_or_default();
        let top1_title = extract_field(&bib, "title").unwrap_or_default();

        *status_counts.entry(status.clone()).or_default() += 1;

        // Re-fetch candidates cheaply for metrics (same as earlier harness)
        let q = if rec.input_type == "bib" {
            rec.bib.as_ref().and_then(|b| b.get("title").and_then(|v| v.as_str()).map(|s| s.to_string())).unwrap_or_else(|| rec.query.clone())
        } else { rec.query.clone() };

        let qf = veribib::score::QueryFields::from_free_string(&q);
        let mut cands = Vec::<veribib::net::Candidate>::new();
        {
            // Crossref
            let base_query = q.clone();
            let year = qf.year;
            let url = if let Some(y) = year {
                format!("https://api.crossref.org/works?query.bibliographic={}&rows=5&filter=from-pub-date:{y}-01-01,until-pub-date:{y}-12-31",
                    percent_encoding::utf8_percent_encode(&base_query, percent_encoding::NON_ALPHANUMERIC))
            } else {
                format!("https://api.crossref.org/works?query.bibliographic={}&rows=5",
                    percent_encoding::utf8_percent_encode(&base_query, percent_encoding::NON_ALPHANUMERIC))
            };
            if let Ok(resp) = reqwest::get(&url).await {
                if let Ok(msg) = resp.json::<veribib::net::CrossrefMessage>().await {
                    cands.extend(msg.message.items.iter().map(|it| veribib::net::map_crossref(it)));
                }
            }
            // OpenAlex
            let qq = format!("{} {}", &base_query, ctx.unwrap_or("")).trim().to_string();
            let url = if let Some(y) = year {
                format!("https://api.openalex.org/works?search={}&per_page=5&from_publication_date={y}-01-01&to_publication_date={y}-12-31",
                    percent_encoding::utf8_percent_encode(&qq, percent_encoding::NON_ALPHANUMERIC))
            } else {
                format!("https://api.openalex.org/works?search={}&per_page=5",
                    percent_encoding::utf8_percent_encode(&qq, percent_encoding::NON_ALPHANUMERIC))
            };
            if let Ok(resp) = reqwest::get(&url).await {
                if let Ok(rs) = resp.json::<veribib::net::OpenAlexResults>().await {
                    cands.extend(rs.results.iter().map(|it| veribib::net::map_openalex(it)));
                }
            }
        }
        // Rank
        cands.sort_by(|a, b| {
            let sa = veribib::score::score_candidate(&qf, ctx, a);
            let sb = veribib::score::score_candidate(&qf, ctx, b);
            sb.partial_cmp(&sa).unwrap_or(std::cmp::Ordering::Equal)
        });

        // Metrics
        let mut m_doi_exact_1 = 0usize;
        let mut m_aty_1 = 0usize;
        let mut m_topk_contains = 0usize;
        let mut top1_score = 0.0;
        if let Some(top) = cands.first() {
            top1_score = veribib::score::score_candidate(&qf, ctx, top);
        }

        if let Some(exp) = rec.expected.clone() {
            if let Some(doi) = exp.doi.as_ref().map(|s| lc(s)) {
                if let Some(top) = cands.first() {
                    if top.doi.clone().unwrap_or_default().to_lowercase() == doi { m_doi_exact_1 = 1; }
                }
                let topk = cands.iter().take(args.k).any(|c| c.doi.clone().unwrap_or_default().to_lowercase() == doi);
                if topk { m_topk_contains = 1; }
            }
            let mut ok = true;
            if let Some(t) = exp.title.as_ref().map(|s| lc(s)) {
                if let Some(top) = cands.first() {
                    let tt = lc(&top.title);
                    ok &= tt.contains(&t) || (t.contains(&tt) && t.len() < 25);
                }
            }
            if let Some(a) = exp.first_author.as_ref().map(|s| lc(s)) {
                if let Some(top) = cands.first() {
                    let a0 = top.authors.first().map(|s| s.split_whitespace().last().unwrap_or("").to_lowercase()).unwrap_or_default();
                    ok &= a0 == a;
                }
            }
            if let Some(y) = exp.year.as_ref() {
                if let Some(top) = cands.first() {
                    ok &= top.year.map(|yy| yy.to_string()) == Some(y.clone());
                }
            }
            if ok { m_aty_1 = 1; }
        }

        doi_exact_1 += m_doi_exact_1;
        aty_1 += m_aty_1;
        topk_contains += m_topk_contains;

        // Hallucination heuristic: top1 DOI present but doesn't resolve (non-2xx after redirects)
        let mut doi_resolves = 0usize;
        let mut hallucination = 0usize;
        if !top1_doi.is_empty() {
            let url = format!("https://doi.org/{}", top1_doi);
            if let Ok(resp) = reqwest::Client::new().head(&url).send().await {
                if resp.status().is_success() { doi_resolves = 1; }
            }
            if doi_resolves == 0 && status != "not_found" {
                hallucination = 1;
            }
        }

        // Threshold bookkeeping (only for items with an expected DOI, so precision/recall have a ground truth)
        if rec.expected.as_ref().and_then(|e| e.doi.as_ref()).is_some() {
            let is_tp = m_doi_exact_1 == 1;
            for (i, thr) in thresholds.iter().enumerate() {
                if confidence >= *thr {
                    if is_tp { tp_at[i] += 1; } else { fp_at[i] += 1; }
                } else if is_tp {
                    fn_at[i] += 1;
                }
            }
        }

        // Per-item row (commas stripped so the CSV stays well-formed)
        rows.push(vec![
            idx.to_string(),
            rec.input_type.clone(),
            q.replace(',', " "),
            rec.expected.as_ref().and_then(|e| e.doi.clone()).unwrap_or_default(),
            top1_doi.clone(),
            format!("{:.3}", top1_score),
            format!("{:.3}", confidence),
            status.clone(),
            m_doi_exact_1.to_string(),
            m_aty_1.to_string(),
            m_topk_contains.to_string(),
            doi_resolves.to_string(),
            hallucination.to_string(),
            top1_title.replace(',', " "),
        ]);
    }

    let nn = n as f64;
    println!("VeriBib-rs Evaluation Summary");
    println!("Gold items: {}", n);
    println!("doi_exact@1: {}/{} = {:.3}", doi_exact_1, n, doi_exact_1 as f64 / nn);
    println!("title_author_year_match@1: {}/{} = {:.3}", aty_1, n, aty_1 as f64 / nn);
    println!("top{} contains expected doi: {}/{} = {:.3}", args.k, topk_contains, n, topk_contains as f64 / nn);
    println!("Status breakdown:");
    for (k, v) in status_counts.iter() { println!("  {}: {}", k, v); }

    // Write CSVs if requested
    if let Some(path) = args.items_csv.as_ref() {
        let mut w = std::fs::File::create(path)?;
        writeln!(&mut w, "id,input_type,query,expected_doi,top1_doi,top1_score,x_confidence,x_status,doi_exact1,aty1,topk_contains,doi_resolves,hallucination,top1_title")?;
        for r in rows {
            writeln!(&mut w, "{}", r.join(","))?;
        }
        println!("Wrote per-item CSV: {}", path.display());
    }

    if let Some(path) = args.csv.as_ref() {
        let mut w = std::fs::File::create(path)?;
        writeln!(&mut w, "threshold,precision,recall,tp,fp,fn")?;
        for i in 0..thresholds.len() {
            let tp = tp_at[i] as f64;
            let fp = fp_at[i] as f64;
            let fnv = fn_at[i] as f64;
            let prec = if tp + fp > 0.0 { tp / (tp + fp) } else { 1.0 };
            let rec = if tp + fnv > 0.0 { tp / (tp + fnv) } else { 0.0 };
            writeln!(&mut w, "{:.2},{:.4},{:.4},{},{},{}", thresholds[i], prec, rec, tp_at[i], fp_at[i], fn_at[i])?;
        }
        println!("Wrote threshold metrics CSV: {}", path.display());
    }

    Ok(())
}

fn extract_field(bib: &str, key: &str) -> Option<String> {
    let pat = format!(r"(?im)^\s*{}\s*=\s*\{{(.*?)\}},?\s*$", regex::escape(key));
    let re = regex::Regex::new(&pat).ok()?;
    let cap = re.captures(bib)?;
    Some(cap.get(1)?.as_str().to_string())
}
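The `--gold` file is JSON Lines, one `GoldRec` per line. A minimal sketch of one record and its parse, assuming `GoldRec`/`Expected` are in scope (the DOI is the same placeholder used by the integration test; omitted `Option` fields deserialize to `None`):

```rust
#[test]
fn parses_gold_line() {
    // Illustrative gold line; field names mirror GoldRec/Expected above.
    let line = r#"{"input_type":"string","query":"Evans 1960","context":"bottlenose dolphin echolocation","expected":{"doi":"10.0000/example","first_author":"Evans","year":"1960"}}"#;
    let rec: GoldRec = serde_json::from_str(line).unwrap();
    assert_eq!(rec.expected.unwrap().year.as_deref(), Some("1960"));
}
```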
src/llm.rs
@ -0,0 +1,118 @@
use serde::{Deserialize, Serialize};

#[derive(Clone, Debug)]
pub struct LlmConfig {
    pub base_url: String,
    pub model: String,
}

#[derive(Serialize)]
struct ChatRequest<'a> {
    model: &'a str,
    messages: Vec<Message<'a>>,
    temperature: f32,
}

#[derive(Serialize)]
struct Message<'a> {
    role: &'a str,
    content: &'a str,
}

#[derive(Deserialize)]
struct OpenAIResponse {
    choices: Vec<Choice>,
}
#[derive(Deserialize)]
struct Choice {
    message: ChoiceMsg,
}
#[derive(Deserialize)]
struct ChoiceMsg {
    content: String,
}

// Ollama native
#[derive(Serialize)]
struct OllamaChatReq<'a> {
    model: &'a str,
    messages: Vec<Message<'a>>,
    stream: bool,
    options: OllamaOptions,
}
#[derive(Serialize)]
struct OllamaOptions { temperature: f32 }

pub async fn extract_hints(base_url: &str, model: &str, free_text: &str, context: &str) -> Option<serde_json::Value> {
    let sys = "You are a meticulous citation analyst. NEVER invent DOIs. When unsure, say null. Respond with STRICT JSON: {\"title\":...,\"authors\":[...],\"year\":...,\"venue\":...,\"keywords\":[...] }";
    let user = serde_json::json!({
        "task": "extract_bibliographic_clues",
        "input": { "free_text": free_text, "context": context }
    }).to_string();

    let messages = vec![
        Message { role: "system", content: sys },
        Message { role: "user", content: &user },
    ];

    if base_url.trim_end_matches('/').ends_with("/v1") {
        // OpenAI-compatible
        let req = ChatRequest { model, messages, temperature: 0.0 };
        let url = format!("{}/chat/completions", base_url.trim_end_matches('/'));
        let resp = reqwest::Client::new()
            .post(&url)
            .json(&req)
            .send().await.ok()?;
        let data: OpenAIResponse = resp.json().await.ok()?;
        let content = data.choices.get(0)?.message.content.clone();
        serde_json::from_str::<serde_json::Value>(&content).ok()
    } else {
        // Ollama native
        let req = OllamaChatReq {
            model,
            messages,
            stream: false,
            options: OllamaOptions { temperature: 0.0 },
        };
        let url = format!("{}/api/chat", base_url.trim_end_matches('/'));
        let resp = reqwest::Client::new().post(&url).json(&req).send().await.ok()?;
        #[derive(Deserialize)] struct OllamaResp { message: ChoiceMsg }
        let data: OllamaResp = resp.json().await.ok()?;
        serde_json::from_str::<serde_json::Value>(&data.message.content).ok()
    }
}

pub async fn rerank_indices(base_url: &str, model: &str, brief: &serde_json::Value) -> Option<Vec<usize>> {
    let sys = "You rerank candidate citations. Respond with a JSON array of indices (0..N-1) sorted best->worst. Nothing else.";
    let user = serde_json::json!({
        "task": "rerank_candidates",
        "input": brief
    }).to_string();

    let messages = vec![
        Message { role: "system", content: sys },
        Message { role: "user", content: &user },
    ];

    if base_url.trim_end_matches('/').ends_with("/v1") {
        let req = ChatRequest { model, messages, temperature: 0.0 };
        let url = format!("{}/chat/completions", base_url.trim_end_matches('/'));
        let resp = reqwest::Client::new().post(&url).json(&req).send().await.ok()?;
        let data: OpenAIResponse = resp.json().await.ok()?;
        let content = data.choices.get(0)?.message.content.clone();
        serde_json::from_str::<Vec<usize>>(&content).ok()
    } else {
        let req = OllamaChatReq {
            model,
            messages,
            stream: false,
            options: OllamaOptions { temperature: 0.0 },
        };
        let url = format!("{}/api/chat", base_url.trim_end_matches('/'));
        let resp = reqwest::Client::new().post(&url).json(&req).send().await.ok()?;
        #[derive(Deserialize)] struct OllamaResp { message: ChoiceMsg }
        let data: OllamaResp = resp.json().await.ok()?;
        serde_json::from_str::<Vec<usize>>(&data.message.content).ok()
    }
}
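A minimal invocation sketch for `extract_hints`, assuming a `veribib` library target built with `--features llm` and a local Ollama; the base URL and model name mirror the CLI defaults above:

```rust
// Sketch: ask a local model for bibliographic clues.
// A base URL ending in /v1 routes through the OpenAI-compatible branch.
#[tokio::main]
async fn main() {
    let hints = veribib::llm::extract_hints(
        "http://localhost:11434/v1",
        "llama3.1",
        "Evans 1960",
        "bottlenose dolphin echolocation",
    ).await;
    // None on transport or JSON-parse failure, Some(value) otherwise.
    println!("{:?}", hints);
}
```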
src/main.rs
@ -0,0 +1,61 @@
use clap::Parser;
use std::path::PathBuf;

mod net;
mod score;
mod bib;
#[cfg(feature = "llm")] mod llm;

#[derive(Parser, Debug)]
#[command(author, version, about = "VeriBib-rs: Crossref+OpenAlex → annotated BibTeX")]
struct Args {
    #[arg(long)] string: Option<String>,
    #[arg(long)] list: Option<PathBuf>,
    #[arg(long)] bib: Option<PathBuf>,
    #[arg(long)] context: Option<String>,
    #[arg(long)] out: PathBuf,
    #[arg(long, default_value_t = false)] llm: bool,
    #[arg(long)] llm_base_url: Option<String>,
    #[arg(long)] llm_model: Option<String>,
}

#[tokio::main(flavor = "multi_thread")]
async fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    let mut items: Vec<net::InputItem> = Vec::new();

    if let Some(s) = args.string.as_ref() { items.push(net::InputItem::FreeString(s.clone())); }
    if let Some(list) = args.list.as_ref() {
        let content = std::fs::read_to_string(list)?;
        for line in content.lines().map(|s| s.trim()).filter(|s| !s.is_empty()) {
            items.push(net::InputItem::FreeString(line.to_string()));
        }
    }
    if let Some(bibpath) = args.bib.as_ref() {
        let raw = std::fs::read_to_string(bibpath)?;
        // Naive entry split on '@'; a literal '@' inside a field value will break an entry.
        for chunk in raw.split('@').filter(|c| !c.trim().is_empty()) {
            let entry = format!("@{}", chunk);
            let fields = bib::parse_minimal_bib_fields(&entry);
            items.push(net::InputItem::BibLike(fields));
        }
    }
    if items.is_empty() { eprintln!("No input items."); std::process::exit(1); }

    #[cfg(feature = "llm")]
    let llm_cfg = if args.llm {
        Some(llm::LlmConfig {
            base_url: args.llm_base_url.clone().unwrap_or_default(),
            model: args.llm_model.clone().unwrap_or_else(|| "llama3.1".to_string()),
        })
    } else { None };

    let mut out = String::new();
    for it in items {
        // cfg attributes on call arguments are not stable, so gate the whole call.
        #[cfg(feature = "llm")]
        let entry = net::process_item(it, args.context.as_deref(), llm_cfg.as_ref()).await;
        #[cfg(not(feature = "llm"))]
        let entry = net::process_item(it, args.context.as_deref()).await;
        out.push_str(&entry.unwrap_or_else(|e| bib::error_entry(&format!("{}", e))));
    }
    std::fs::write(&args.out, out)?;
    println!("Wrote {}", args.out.display());
    Ok(())
}
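Batch input works the same way as the README's single-string example: `--list refs.txt` reads one free-form citation per line (blank lines skipped), and `--bib refs.bib` re-checks an existing BibTeX file entry by entry; the file names here are illustrative.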
src/net.rs
@ -0,0 +1,155 @@
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
use serde::Deserialize;

use crate::score::{score_candidate, QueryFields};
use crate::bib::{bib_entry, MinimalBibFields};

pub enum InputItem { FreeString(String), BibLike(MinimalBibFields) }

// These response types and mappers are `pub` because the eval harness
// (src/eval.rs) re-fetches candidates through them.
#[derive(Deserialize, Debug)] pub struct CrossrefMessage { pub message: CrossrefWorks }
#[derive(Deserialize, Debug)] pub struct CrossrefWorks { pub items: Vec<CrossrefItem> }
#[derive(Deserialize, Debug, Clone)]
pub struct CrossrefItem {
    #[serde(default)] title: Vec<String>,
    #[serde(default)] author: Vec<Person>,
    #[serde(default)] issued: DateParts,
    #[serde(default, rename = "container-title")] container_title: Vec<String>,
    #[serde(rename = "DOI")] doi: Option<String>,
    #[serde(default)] subject: Vec<String>,
}
#[derive(Deserialize, Debug, Clone)] struct Person { given: Option<String>, family: Option<String> }
#[derive(Deserialize, Debug, Clone, Default)] struct DateParts { #[serde(rename = "date-parts", default)] date_parts: Vec<Vec<i32>> }

#[derive(Deserialize, Debug)] pub struct OpenAlexResults { pub results: Vec<OpenAlexItem> }
#[derive(Deserialize, Debug, Clone)]
pub struct OpenAlexItem {
    title: Option<String>,
    #[serde(default)] authorships: Vec<OpenAlexAuthorship>,
    publication_year: Option<i32>,
    #[serde(default)] host_venue: Option<OpenAlexVenue>,
    doi: Option<String>,
    #[serde(default)] concepts: Vec<OpenAlexConcept>,
}
#[derive(Deserialize, Debug, Clone)] struct OpenAlexAuthorship { author: OpenAlexAuthor }
#[derive(Deserialize, Debug, Clone)] struct OpenAlexAuthor { display_name: String }
#[derive(Deserialize, Debug, Clone, Default)] struct OpenAlexVenue { display_name: String }
#[derive(Deserialize, Debug, Clone, Default)] struct OpenAlexConcept { display_name: String }

pub fn map_crossref(it: &CrossrefItem) -> Candidate {
    Candidate {
        title: it.title.get(0).cloned().unwrap_or_default(),
        authors: it.author.iter().map(|a| format!("{} {}", a.given.clone().unwrap_or_default(), a.family.clone().unwrap_or_default()).trim().to_string()).collect(),
        year: it.issued.date_parts.get(0).and_then(|v| v.get(0)).cloned(),
        venue: it.container_title.get(0).cloned().unwrap_or_default(),
        doi: it.doi.clone(),
        concepts: it.subject.clone(),
        source: "crossref".to_string(),
    }
}
pub fn map_openalex(it: &OpenAlexItem) -> Candidate {
    Candidate {
        title: it.title.clone().unwrap_or_default(),
        authors: it.authorships.iter().map(|a| a.author.display_name.clone()).collect(),
        year: it.publication_year,
        venue: it.host_venue.as_ref().map(|v| v.display_name.clone()).unwrap_or_default(),
        doi: it.doi.clone().map(|d| d.replace("https://doi.org/", "")),
        concepts: it.concepts.iter().map(|c| c.display_name.clone()).collect(),
        source: "openalex".to_string(),
    }
}

#[derive(Clone)]
pub struct Candidate {
    pub title: String,
    pub authors: Vec<String>,
    pub year: Option<i32>,
    pub venue: String,
    pub doi: Option<String>,
    pub concepts: Vec<String>,
    pub source: String,
}

pub async fn process_item(
    item: InputItem,
    context: Option<&str>,
    // The LLM hook is plumbed through but not yet consulted here.
    #[cfg(feature = "llm")] _llm_cfg: Option<&crate::llm::LlmConfig>,
) -> anyhow::Result<String> {
    let (q, base_query) = match item {
        InputItem::FreeString(s) => (QueryFields::from_free_string(&s), s),
        InputItem::BibLike(b) => {
            let q = QueryFields {
                title: b.title.clone(),
                authors: b.author.map(|a| vec![a]).unwrap_or_default(),
                year: b.year.and_then(|y| y.parse::<i32>().ok()),
                venue: b.journal.or(b.booktitle),
            };
            // Prefer the DOI as the query when present; otherwise title + first author + year.
            let bq = if let Some(ref d) = b.doi {
                d.clone()
            } else {
                format!("{} {} {}",
                    q.title.clone().unwrap_or_default(),
                    q.authors.get(0).cloned().unwrap_or_default(),
                    q.year.map(|y| y.to_string()).unwrap_or_default()
                ).trim().to_string()
            };
            (q, bq)
        }
    };

    let mut cands = Vec::<Candidate>::new();
    let year = q.year;

    // Crossref
    {
        let query = base_query.clone();
        let url = if let Some(y) = year {
            format!("https://api.crossref.org/works?query.bibliographic={}&rows=5&filter=from-pub-date:{y}-01-01,until-pub-date:{y}-12-31",
                utf8_percent_encode(&query, NON_ALPHANUMERIC))
        } else {
            format!("https://api.crossref.org/works?query.bibliographic={}&rows=5",
                utf8_percent_encode(&query, NON_ALPHANUMERIC))
        };
        if let Ok(resp) = reqwest::get(&url).await {
            if let Ok(msg) = resp.json::<CrossrefMessage>().await {
                cands.extend(msg.message.items.iter().map(map_crossref));
            }
        }
    }
    // OpenAlex
    {
        let query = format!("{} {}", base_query, context.unwrap_or("")).trim().to_string();
        let url = if let Some(y) = year {
            format!("https://api.openalex.org/works?search={}&per_page=5&from_publication_date={y}-01-01&to_publication_date={y}-12-31",
                utf8_percent_encode(&query, NON_ALPHANUMERIC))
        } else {
            format!("https://api.openalex.org/works?search={}&per_page=5",
                utf8_percent_encode(&query, NON_ALPHANUMERIC))
        };
        if let Ok(resp) = reqwest::get(&url).await {
            if let Ok(rs) = resp.json::<OpenAlexResults>().await {
                cands.extend(rs.results.iter().map(map_openalex));
            }
        }
    }

    let mut scored = cands.clone();
    scored.sort_by(|a, b| {
        let sa = score_candidate(&q, context, a);
        let sb = score_candidate(&q, context, b);
        sb.partial_cmp(&sa).unwrap_or(std::cmp::Ordering::Equal)
    });

    if let Some(best) = scored.get(0) {
        let s = score_candidate(&q, context, best);
        let status = if best.doi.is_some() && s >= 0.95 { "exact" }
            else if s >= 0.75 { "high_confidence" }
            else { "ambiguous" };
        let alts = scored.get(1..std::cmp::min(4, scored.len())).unwrap_or(&[]).to_vec();
        Ok(bib_entry("veribib", best, status, s, &base_query, context.unwrap_or(""), &alts))
    } else {
        let placeholder = Candidate {
            title: q.title.clone().unwrap_or_default(),
            authors: q.authors.clone(),
            year: q.year,
            venue: q.venue.clone().unwrap_or_default(),
            doi: None, concepts: vec![], source: "none".into(),
        };
        Ok(bib_entry("not_found", &placeholder, "not_found", 0.0, &base_query, context.unwrap_or(""), &[]))
    }
}
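A minimal driver sketch for `process_item`, assuming the default (non-llm) build and a `veribib` library target; it performs live Crossref/OpenAlex requests:

```rust
// Sketch: resolve one free-form citation and print the annotated BibTeX entry.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let entry = veribib::net::process_item(
        veribib::net::InputItem::FreeString("Evans 1960".to_string()),
        Some("bottlenose dolphin echolocation"),
        // a build with `--features llm` takes an extra Option<&LlmConfig> here
    ).await?;
    print!("{entry}");
    Ok(())
}
```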
src/score.rs
@ -0,0 +1,50 @@
use std::collections::HashSet;

#[derive(Clone)]
pub struct QueryFields {
    pub title: Option<String>,
    pub authors: Vec<String>,
    pub year: Option<i32>,
    pub venue: Option<String>,
}

impl QueryFields {
    pub fn from_free_string(s: &str) -> Self {
        // Crude year sniffing: first 4-character window that parses as an integer.
        let year = s.chars().collect::<Vec<_>>().windows(4).find_map(|w| {
            let ss: String = w.iter().collect();
            ss.parse::<i32>().ok()
        });
        QueryFields { title: Some(s.to_string()), authors: vec![], year, venue: None }
    }
}

fn norm(s: &str) -> String { s.to_lowercase().split_whitespace().collect::<Vec<_>>().join(" ") }
fn first_surname(v: &[String]) -> Option<String> {
    v.first().map(|s| s.split_whitespace().last().unwrap_or("").to_lowercase())
}

pub fn score_candidate(q: &QueryFields, ctx: Option<&str>, c: &crate::net::Candidate) -> f64 {
    let mut sc = 0.0;
    // Title token overlap: +0.40 when ≥70% of query tokens appear in the candidate title.
    // (Bind the normalized strings first so the token sets don't borrow temporaries.)
    if let Some(ref t) = q.title {
        let tn = norm(t);
        let cn = norm(&c.title);
        let tq: HashSet<&str> = tn.split_whitespace().collect();
        let tc: HashSet<&str> = cn.split_whitespace().collect();
        if !tq.is_empty() {
            let inter = tq.intersection(&tc).count() as f64 / tq.len() as f64;
            if inter >= 0.7 { sc += 0.40; }
        }
    }
    // First-author surname match: +0.25.
    if let (Some(qa0), Some(ca0)) = (first_surname(&q.authors), first_surname(&c.authors)) {
        if !qa0.is_empty() && qa0 == ca0 { sc += 0.25; }
    }
    // Year: +0.20 exact, +0.10 off by one.
    if let (Some(qy), Some(cy)) = (q.year, c.year) {
        let dy = (qy - cy).abs();
        sc += if dy == 0 { 0.20 } else if dy == 1 { 0.10 } else { 0.0 };
    }
    // Venue: +0.10 on normalized equality.
    if let Some(vq) = q.venue.as_ref() {
        if norm(vq) == norm(&c.venue) { sc += 0.10; }
    }
    // Context vs. candidate concepts: +0.05 on any token overlap.
    if let Some(ctx) = ctx {
        let ctoks: HashSet<String> = norm(ctx).split_whitespace().map(str::to_string).collect();
        let conc: HashSet<String> = c.concepts.iter().map(|s| norm(s)).collect();
        if !ctoks.is_empty() && ctoks.intersection(&conc).next().is_some() { sc += 0.05; }
    }
    sc.min(1.0)
}
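A worked scoring sketch, assuming a `veribib` library target exposing `net` and `score` (all values illustrative):

```rust
use veribib::{net::Candidate, score::{score_candidate, QueryFields}};

fn main() {
    let q = QueryFields::from_free_string("echolocation bottlenose dolphin 1960");
    let c = Candidate {
        title: "Echolocation of the bottlenose dolphin".into(),
        authors: vec!["William E. Evans".into()],
        year: Some(1960),
        venue: String::new(),
        doi: None,
        concepts: vec![],
        source: "test".into(),
    };
    // Title-token overlap is 3/4 ≥ 0.7 → +0.40; "1960" is sniffed from the
    // query and matches exactly → +0.20. No query authors, venue, or context.
    let s = score_candidate(&q, None, &c);
    assert!((s - 0.60).abs() < 1e-9);
}
```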
@ -0,0 +1,29 @@
// Integration test: exercises the public API, so refer to the crate as `veribib`.

#[test]
fn bib_round_trip_minimal() {
    let src = r#"
@article{evans1960,
  author = {William E. Evans},
  title = {Some Observations on the Echolocation of the Bottlenose Dolphin},
  year = {1960},
  journal = {Journal of the Acoustical Society of America},
  doi = {10.0000/example}
}
"#;
    let f = veribib::bib::parse_minimal_bib_fields(src);
    assert!(f.author.as_deref().unwrap().contains("Evans"));
    let cand = veribib::net::Candidate {
        title: f.title.clone().unwrap_or_default(),
        authors: vec![f.author.unwrap_or_default()],
        year: f.year.and_then(|y| y.parse::<i32>().ok()),
        venue: f.journal.unwrap_or_default(),
        doi: f.doi,
        concepts: vec![],
        source: "test".into(),
    };
    let out = veribib::bib::bib_entry("evans1960", &cand, "exact", 0.99, "Evans 1960", "echolocation", &[]);
    assert!(out.contains("@article{evans1960"));
    assert!(out.contains("x_status = {exact}"));
}