# CiteGeist/tests/test_resolver_identifiers.py
#
# 202 lines
# 6.9 KiB
# Python

"""Tests for identifier resolution and normalization."""
from __future__ import annotations
import pytest
from citegeist.resolver import (
IdentifierExtractor,
IdentifierNormalizer,
IdentifierResolver,
extract_identifiers,
normalize_identifier,
get_primary_identifier,
resolve_identifiers,
)
class TestIdentifierExtractor:
    """Exercise ``IdentifierExtractor.extract`` on dictionaries of entry fields."""

    def test_extract_from_entry(self):
        """Identifier fields are returned while the title field is left out."""
        entry = {
            'doi': '10.1234/example',
            'title': 'Test Title',
            'author': 'John Doe',
            'pmid': '123456',
        }

        found = IdentifierExtractor.extract(entry)

        assert 'doi' in found
        assert found['doi'] == '10.1234/example'
        assert 'pmid' in found
        assert found['pmid'] == '123456'
        # A title is descriptive metadata, so it must not be reported as an identifier.
        assert 'title' not in found

    def test_extract_multiple_identifiers(self):
        """All four identifier schemes in the entry come back with their input values."""
        expected = {
            'doi': '10.1234/example',
            'pmid': '123456',
            'arxiv': '2310.12345',
            'isbn': '978-0-123456-78-9',
        }

        # Hand the extractor its own copy so the expectation table stays independent.
        found = IdentifierExtractor.extract(dict(expected))

        assert len(found) == 4
        for scheme, value in expected.items():
            assert found[scheme] == value
class TestIdentifierNormalizer:
    """Exercise the per-scheme and generic normalizers of ``IdentifierNormalizer``."""

    def test_normalize_doi(self):
        """An upper-case DOI suffix is lowered; a non-DOI string yields ``None``."""
        normalize = IdentifierNormalizer.normalize_doi

        assert normalize('10.1234/EXAMPLE') == '10.1234/example'
        assert normalize('10.1234/test') == '10.1234/test'
        assert normalize('invalid') is None

    def test_normalize_pmid(self):
        """Digit-only PMIDs pass through unchanged; a non-numeric string yields ``None``."""
        normalize = IdentifierNormalizer.normalize_pmid

        assert normalize('12345') == '12345'
        assert normalize('1234567') == '1234567'
        assert normalize('invalid') is None

    def test_normalize_pmcid(self):
        """PMC-prefixed values are lower-cased; an unprefixed string yields ``None``."""
        normalize = IdentifierNormalizer.normalize_pmcid

        assert normalize('PMC12345') == 'pmc12345'
        assert normalize('PMCabcdef') == 'pmcabcdef'
        assert normalize('invalid') is None

    def test_normalize_arxiv(self):
        """A trailing version suffix is dropped from an arXiv id; junk yields ``None``."""
        normalize = IdentifierNormalizer.normalize_arxiv

        assert normalize('2310.12345') == '2310.12345'
        assert normalize('2310.12345v1') == '2310.12345'
        assert normalize('INVALID') is None

    def test_normalize_orcid(self):
        """Only the hyphen-separated ORCID form is accepted."""
        normalize = IdentifierNormalizer.normalize_orcid

        assert normalize('0000-0001-2345-6789') == '0000-0001-2345-6789'
        # Space-separated groups do not match the canonical ORCID layout and are rejected.
        assert normalize('0000 0001 2345 6789') is None
        assert normalize('invalid') is None

    def test_normalize_identifier(self):
        """The generic entry point returns ``(scheme, value)`` or ``None`` for an unknown scheme."""
        doi_pair = IdentifierNormalizer.normalize_identifier('doi', '10.1234/test')
        assert doi_pair == ('doi', '10.1234/test')

        pmid_pair = IdentifierNormalizer.normalize_identifier('pmid', '12345')
        assert pmid_pair == ('pmid', '12345')

        unknown = IdentifierNormalizer.normalize_identifier('invalid', 'value')
        assert unknown is None
class TestIdentifierResolver:
    """Exercise ``IdentifierResolver`` resolution, prioritisation and scheme lookup."""

    def test_resolve_with_doi(self):
        """An entry carrying a DOI resolves to at least one ``'doi'`` item."""
        entry = {'doi': '10.1234/example', 'title': 'Test Title'}

        resolved = IdentifierResolver.resolve(entry)

        assert len(resolved) >= 1
        # The first element of each resolved item is its scheme name.
        schemes = [item[0] for item in resolved]
        assert 'doi' in schemes

    def test_resolve_with_multiple_identifiers(self):
        """An entry with several identifiers resolves to two or more items, DOI included."""
        entry = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'arxiv': '2310.12345',
        }

        resolved = IdentifierResolver.resolve(entry)

        assert len(resolved) >= 2
        schemes = [item[0] for item in resolved]
        assert 'doi' in schemes

    def test_resolve_without_identifiers(self):
        """With no identifier fields, resolution still yields a ``'title'`` item."""
        entry = {'title': 'Test Title', 'author': 'John Doe'}

        resolved = IdentifierResolver.resolve(entry)

        # The title fingerprint is the expected fallback when nothing else is available.
        assert len(resolved) >= 1
        schemes = [item[0] for item in resolved]
        assert 'title' in schemes

    def test_get_primary_identifier(self):
        """The DOI is chosen as primary when DOI, PMID and title are all present."""
        entry = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'title': 'Test Title',
        }

        primary = IdentifierResolver.get_primary_identifier(entry)

        assert primary is not None
        # DOI is expected to outrank the other schemes.
        assert primary[0] == 'doi'

    def test_get_scheme_value(self):
        """Present schemes return their value; an absent scheme returns ``None``."""
        entry = {
            'doi': '10.1234/example',
            'pmid': '12345',
        }

        assert IdentifierResolver.get_scheme_value('doi', entry) == '10.1234/example'
        assert IdentifierResolver.get_scheme_value('pmid', entry) == '12345'
        assert IdentifierResolver.get_scheme_value('isbn', entry) is None
class TestConvenienceFunctions:
    """Exercise the module-level helper functions imported from ``citegeist.resolver``."""

    def test_extract_identifiers(self):
        """``extract_identifiers`` reports both the DOI and the PMID of an entry."""
        found = extract_identifiers({'doi': '10.1234/example', 'pmid': '12345'})

        assert 'doi' in found
        assert 'pmid' in found

    def test_normalize_identifier(self):
        """``normalize_identifier`` returns the ``(scheme, value)`` pair for a DOI."""
        pair = normalize_identifier('doi', '10.1234/test')

        assert pair == ('doi', '10.1234/test')

    def test_get_primary_identifier(self):
        """``get_primary_identifier`` returns the DOI pair for a DOI-only entry."""
        entry = {'doi': '10.1234/example'}

        assert get_primary_identifier(entry) == ('doi', '10.1234/example')

    def test_resolve_identifiers(self):
        """``resolve_identifiers`` yields a non-empty result for a DOI-only entry."""
        entry = {'doi': '10.1234/example'}

        outcome = resolve_identifiers(entry)

        assert len(outcome) > 0