202 lines
6.9 KiB
Python
202 lines
6.9 KiB
Python
"""Tests for identifier resolution and normalization."""
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from citegeist.resolver import (
|
|
IdentifierExtractor,
|
|
IdentifierNormalizer,
|
|
IdentifierResolver,
|
|
extract_identifiers,
|
|
normalize_identifier,
|
|
get_primary_identifier,
|
|
resolve_identifiers,
|
|
)
|
|
|
|
|
|
class TestIdentifierExtractor:
    """Checks for ``IdentifierExtractor.extract`` on entry field mappings."""

    def test_extract_from_entry(self):
        """Identifier fields are extracted from a mixed entry; the title is not."""
        entry = {
            'doi': '10.1234/example',
            'title': 'Test Title',
            'author': 'John Doe',
            'pmid': '123456',
        }
        wanted = (
            ('doi', '10.1234/example'),
            ('pmid', '123456'),
        )

        found = IdentifierExtractor.extract(entry)

        for scheme, value in wanted:
            assert scheme in found
            assert found[scheme] == value
        # A title is descriptive metadata, not an identifier.
        assert 'title' not in found

    def test_extract_multiple_identifiers(self):
        """Four identifier fields in, the same four identifiers out."""
        wanted = {
            'doi': '10.1234/example',
            'pmid': '123456',
            'arxiv': '2310.12345',
            'isbn': '978-0-123456-78-9',
        }
        # Hand the extractor its own copy so the expectations stay untouched.
        entry = dict(wanted)

        found = IdentifierExtractor.extract(entry)

        assert len(found) == 4
        for scheme, value in wanted.items():
            assert found[scheme] == value
|
|
|
|
|
|
class TestIdentifierNormalizer:
    """Checks for the per-scheme and generic ``IdentifierNormalizer`` methods."""

    def test_normalize_doi(self):
        """DOIs come back lower-cased; a non-DOI string yields ``None``."""
        accepted = (
            ('10.1234/EXAMPLE', '10.1234/example'),
            ('10.1234/test', '10.1234/test'),
        )
        for raw, normalized in accepted:
            assert IdentifierNormalizer.normalize_doi(raw) == normalized
        assert IdentifierNormalizer.normalize_doi('invalid') is None

    def test_normalize_pmid(self):
        """Numeric PMIDs pass through unchanged; a non-numeric string yields ``None``."""
        for raw in ('12345', '1234567'):
            assert IdentifierNormalizer.normalize_pmid(raw) == raw
        assert IdentifierNormalizer.normalize_pmid('invalid') is None

    def test_normalize_pmcid(self):
        """PMC-prefixed values come back lower-cased; other strings yield ``None``."""
        accepted = (
            ('PMC12345', 'pmc12345'),
            ('PMCabcdef', 'pmcabcdef'),
        )
        for raw, normalized in accepted:
            assert IdentifierNormalizer.normalize_pmcid(raw) == normalized
        assert IdentifierNormalizer.normalize_pmcid('invalid') is None

    def test_normalize_arxiv(self):
        """The version suffix is dropped from arXiv ids; junk yields ``None``."""
        accepted = (
            ('2310.12345', '2310.12345'),
            ('2310.12345v1', '2310.12345'),
        )
        for raw, normalized in accepted:
            assert IdentifierNormalizer.normalize_arxiv(raw) == normalized
        assert IdentifierNormalizer.normalize_arxiv('INVALID') is None

    def test_normalize_orcid(self):
        """Only the hyphenated ORCID form is accepted."""
        canonical = '0000-0001-2345-6789'
        assert IdentifierNormalizer.normalize_orcid(canonical) == canonical
        rejected = (
            # Space-separated groups do not match the canonical format.
            '0000 0001 2345 6789',
            'invalid',
        )
        for raw in rejected:
            assert IdentifierNormalizer.normalize_orcid(raw) is None

    def test_normalize_identifier(self):
        """The generic entry point returns ``(scheme, value)`` or ``None``."""
        normalize = IdentifierNormalizer.normalize_identifier

        assert normalize('doi', '10.1234/test') == ('doi', '10.1234/test')

        assert normalize('pmid', '12345') == ('pmid', '12345')

        # An unrecognised scheme name is expected to produce no result.
        assert normalize('invalid', 'value') is None
|
|
|
|
|
|
class TestIdentifierResolver:
    """Checks for ``IdentifierResolver``: resolve, primary identifier, scheme lookup."""

    @staticmethod
    def _entries_for(scheme, resolved):
        """Return every item of *resolved* whose first element equals *scheme*."""
        return list(filter(lambda item: item[0] == scheme, resolved))

    def test_resolve_with_doi(self):
        """An entry carrying a DOI resolves to at least one ``doi`` item."""
        entry = {'doi': '10.1234/example', 'title': 'Test Title'}

        outcome = IdentifierResolver.resolve(entry)

        assert len(outcome) >= 1
        assert len(self._entries_for('doi', outcome)) > 0

    def test_resolve_with_multiple_identifiers(self):
        """Several identifiers give two or more items, a ``doi`` one among them."""
        entry = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'arxiv': '2310.12345',
        }

        outcome = IdentifierResolver.resolve(entry)

        assert len(outcome) >= 2
        assert len(self._entries_for('doi', outcome)) > 0

    def test_resolve_without_identifiers(self):
        """With no identifier fields, a ``title`` item is still produced."""
        entry = {'title': 'Test Title', 'author': 'John Doe'}

        outcome = IdentifierResolver.resolve(entry)

        # At minimum the title fingerprint should be present.
        assert len(outcome) >= 1
        assert len(self._entries_for('title', outcome)) > 0

    def test_get_primary_identifier(self):
        """The DOI is chosen as primary when a PMID and a title are also present."""
        entry = {
            'doi': '10.1234/example',
            'pmid': '12345',
            'title': 'Test Title',
        }

        chosen = IdentifierResolver.get_primary_identifier(entry)

        assert chosen is not None
        # The DOI is expected to take first priority.
        assert chosen[0] == 'doi'

    def test_get_scheme_value(self):
        """Lookup by scheme returns the stored value, or ``None`` when absent."""
        entry = {
            'doi': '10.1234/example',
            'pmid': '12345',
        }
        lookup = IdentifierResolver.get_scheme_value

        assert lookup('doi', entry) == '10.1234/example'

        assert lookup('pmid', entry) == '12345'

        # The entry has no ISBN field at all.
        assert lookup('isbn', entry) is None
|
|
|
|
|
|
class TestConvenienceFunctions:
    """Smoke checks for the module-level helper functions of the resolver."""

    def test_extract_identifiers(self):
        """``extract_identifiers`` reports both identifier schemes of the entry."""
        entry = {'doi': '10.1234/example', 'pmid': '12345'}

        found = extract_identifiers(entry)

        for scheme in ('doi', 'pmid'):
            assert scheme in found

    def test_normalize_identifier(self):
        """``normalize_identifier`` returns a ``(scheme, value)`` pair for a DOI."""
        assert normalize_identifier('doi', '10.1234/test') == ('doi', '10.1234/test')

    def test_get_primary_identifier(self):
        """``get_primary_identifier`` returns the DOI pair of a DOI-only entry."""
        entry = {'doi': '10.1234/example'}

        assert get_primary_identifier(entry) == ('doi', '10.1234/example')

    def test_resolve_identifiers(self):
        """``resolve_identifiers`` yields a non-empty result for a DOI-only entry."""
        entry = {'doi': '10.1234/example'}

        outcome = resolve_identifiers(entry)

        assert len(outcome) > 0
|