Skip to the content.

Examples

This section contains practical examples of using PyEuropePMC for various tasks.

Table of Contents

Query Builder Examples

Basic Query Building

from pyeuropepmc import QueryBuilder

# Shared builder used by the query examples.
# NOTE(review): the same `qb` instance is reused for query1, query2 and query3;
# presumably build() does not carry terms over from one query into the next —
# confirm against the QueryBuilder implementation.
qb = QueryBuilder()

# Simple keyword search
query1 = qb.keyword("machine learning").build()
print(query1)  # "machine learning"

# Field-specific search
query2 = qb.field("author", "Smith J").build()
print(query2)  # "AUTH:Smith J"

# Boolean operators
# and_() is called between the two keyword() calls it joins.
query3 = qb.keyword("cancer").and_().keyword("therapy").build()
print(query3)  # "cancer AND therapy"

Advanced Query Patterns

# Complex query with multiple conditions
# Four clauses chained with and_(): a keyword passed with field="title", a
# plain keyword, a publication-year range and a minimum citation count.
complex_query = (qb
    .keyword("CRISPR", field="title")
    .and_()
    .keyword("gene editing")
    .and_()
    .date_range(start_year=2018, end_year=2023)
    .and_()
    .citation_count(min_count=50)
    .build())

print(complex_query)
# "(TITLE:CRISPR) AND gene editing AND (PUB_YEAR:[2018 TO 2023]) AND (CITED:[50 TO *])"

Citation and Date Filtering

# High-impact papers from specific period
# citation_count() is given both a lower and an upper bound here, while
# date_range() is given only a start year.
high_impact_query = (qb
    .keyword("artificial intelligence")
    .and_()
    .citation_count(min_count=100, max_count=1000)
    .and_()
    .date_range(start_year=2020)
    .build())

# Recent papers with open access
# field() is called with a boolean value (True) for "open_access".
recent_oa_query = (qb
    .keyword("COVID-19")
    .and_()
    .field("open_access", True)
    .and_()
    .date_range(start_year=2020)
    .build())

OR Logic and Grouping

# OR logic for synonyms
# or_() is placed between the keyword() calls, mirroring and_() above.
synonym_query = (qb
    .keyword("machine learning")
    .or_()
    .keyword("artificial intelligence")
    .or_()
    .keyword("deep learning")
    .build())

# Grouped sub-queries
# Each sub-query is assembled on a fresh QueryBuilder() and is NOT finalised
# with build(); the builder objects themselves are handed to group() below.
diseases = QueryBuilder().keyword("cancer").or_().keyword("tumor").or_().keyword("neoplasm")
therapies = QueryBuilder().keyword("therapy").or_().keyword("treatment").or_().keyword("intervention")

# The two groups and a pub_year field (integer value) are joined with and_().
combined_query = (qb
    .group(diseases)
    .and_()
    .group(therapies)
    .and_()
    .field("pub_year", 2023)
    .build())

Field-Specific Searches

# Six single-field queries, each made from one field(name, value) call on the
# shared `qb` followed directly by build().

# Author and affiliation searches
author_query = qb.field("author", "John Smith").build()
affiliation_query = qb.field("affiliation", "Harvard University").build()

# Journal and publication type
journal_query = qb.field("journal", "Nature").build()
review_query = qb.field("pub_type", "review").build()

# MeSH terms and keywords
mesh_query = qb.field("mesh", "Gene Therapy").build()
keyword_query = qb.field("keyword", "CRISPR").build()

PMC and DOI Searches

# pmcid() and accession_type() are dedicated helper methods; the DOI lookup
# goes through the generic field() call instead.

# PMC ID search (automatically adds PMC prefix)
pmc_query = qb.pmcid("1234567").build()  # "PMCID:PMC1234567"

# DOI search
doi_query = qb.field("doi", "10.1038/nature12345").build()

# Accession type search (automatically lowercased)
accession_query = qb.accession_type("PDB").build()  # "ACCESSION_TYPE:pdb"

Citation Network Queries

# Find papers that cite a specific article
# cites() is given the article id as a string plus a `source` keyword ("med").
citing_query = qb.cites("8521067", source="med").build()

# Find highly cited papers
# Only a lower bound is passed to citation_count() here.
highly_cited = (qb
    .keyword("neural networks")
    .and_()
    .citation_count(min_count=500)
    .build())

Query Persistence and Translation

# Save query to file
# Besides the path, save() receives a `platform` label and a list of author
# records (dicts with "name" and "ORCID" keys).
qb.save("my_search.json",
        platform="pubmed",
        authors=[{"name": "Researcher Name", "ORCID": "0000-0000-0000-0001"}])

# Load query from file
loaded_qb = QueryBuilder.from_file("my_search.json")

# Translate to different platforms
# NOTE(review): `pubmed_query` is simply build() on the reloaded builder; which
# syntax build() emits after saving with platform="pubmed" is not visible
# here — confirm the variable name matches the actual output.
pubmed_query = loaded_qb.build()
wos_query = loaded_qb.translate("wos")  # Web of Science syntax
ebsco_query = loaded_qb.translate("ebsco")  # EBSCO syntax

Systematic Review Integration

from pyeuropepmc.utils.search_logging import start_search

# Start systematic review log
log = start_search("AI in Healthcare Review", executed_by="Dr. Smith")

# Build comprehensive search
comprehensive_search = (qb
    .keyword("artificial intelligence")
    .and_()
    .keyword("healthcare")
    .and_()
    .date_range(start_year=2018)
    .and_()
    .field("open_access", True)
    .build())

# Log the search for systematic review
# NOTE(review): `filters` and `results_returned` are typed in by hand; nothing
# in this snippet derives them from the built query or from an executed
# search, so they must be kept in sync manually.
qb.log_to_search(
    search_log=log,
    database="Europe PMC",
    filters={
        "date_range": "2018+",
        "open_access": True,
        "keywords": ["artificial intelligence", "healthcare"]
    },
    results_returned=250,
    notes="Comprehensive search for AI in healthcare literature"
)

# Save the review log
log.save("systematic_review_searches.json")

Query Evaluation and Optimization

# Evaluate search effectiveness
# Records are keyed by an id; each one carries a "title" and a "colrev_status"
# of either "rev_included" or "rev_excluded".
test_records = {
    "r1": {"title": "AI in cancer diagnosis", "colrev_status": "rev_included"},
    "r2": {"title": "Machine learning for drug discovery", "colrev_status": "rev_included"},
    "r3": {"title": "Weather prediction models", "colrev_status": "rev_excluded"}
}

# The result is indexed by the keys 'recall', 'precision' and 'f1_score', each
# formatted below as a number with two decimals.
evaluation = qb.evaluate(test_records)
print(f"Recall: {evaluation['recall']:.2f}")
print(f"Precision: {evaluation['precision']:.2f}")
print(f"F1 Score: {evaluation['f1_score']:.2f}")

Custom Field Transformations

# Use transform parameter for custom value processing
# `transform` is given a callable: the lambda prepends "PMC" unless the value
# (as a string) already starts with it, and str.lower lower-cases its argument.
custom_query = (qb
    .field("pmcid", "1234567", transform=lambda x: f"PMC{x}" if not str(x).startswith("PMC") else str(x))
    .and_()
    .field("accession_type", "GENBANK", transform=str.lower)
    .build())

print(custom_query)  # "PMCID:PMC1234567 AND ACCESSION_TYPE:genbank"

Fetching Article by ID

# NOTE(review): `client` is used here for the first time but is never created
# anywhere above on this page — presumably an `EuropePMC()` instance like the
# one constructed in the error-handling example; confirm and add the setup.

# Fetch by PubMed ID
article = client.fetch_by_id(pmid="12345678")
print(f"Title: {article.title}")
print(f"Abstract: {article.abstract}")

# Fetch by PMC ID
# Exactly one identifier keyword (pmid, pmcid or doi) is passed per call.
article = client.fetch_by_id(pmcid="PMC1234567")

# Fetch by DOI
article = client.fetch_by_id(doi="10.1038/nature12345")

Search Examples

Advanced Search Queries

# These calls pass hand-written query strings (FIELD:value terms, quoted
# phrases, [a TO b] ranges) straight to client.search() instead of going
# through QueryBuilder.

# Search with Boolean operators
results = client.search("(cancer OR tumor) AND therapy", limit=20)

# Search in specific fields
# Multi-word values are wrapped in escaped double quotes inside the string.
results = client.search("AUTH:\"Smith J\" AND JOURNAL:\"Nature\"")

# Search with date range
results = client.search("CRISPR AND PUB_YEAR:[2020 TO 2023]")

# Search by MeSH terms
results = client.search("MESH:\"Gene Therapy\"")

Filtering and Sorting

# Each call adds keyword arguments to client.search(): `source` to restrict
# the source, `sort` to choose the ordering, and `limit` for the result count.

# Filter by source
results = client.search(
    "machine learning",
    source="PMC",  # Only PMC articles
    limit=15
)

# Sort by date (newest first)
results = client.search(
    "artificial intelligence",
    sort="date",
    limit=10
)

# Sort by citation count
results = client.search(
    "deep learning",
    sort="cited",
    limit=5
)

Pagination

# Manual paging: `limit` is the page size and `offset` advances by that same
# amount (25) between the two calls.

# Get first page
page1 = client.search("cancer", limit=25, offset=0)

# Get second page
page2 = client.search("cancer", limit=25, offset=25)

# Iterate through all results
def get_all_results(query, batch_size=100):
    """Collect every result for ``query`` by paging through ``client.search``.

    Pages are requested with ``limit=batch_size`` and an ``offset`` that grows
    by ``batch_size`` each round; paging stops at the first empty page.

    Args:
        query: Search string passed unchanged to ``client.search``.
        batch_size: Number of results requested per page.

    Returns:
        A list holding the results of all pages in the order fetched.
    """
    # Imported here because nothing on this page imports ``time``; without it
    # the sleep below raises NameError right after the first page.
    import time

    offset = 0
    all_results = []

    while True:
        results = client.search(
            query,
            limit=batch_size,
            offset=offset
        )

        # An empty page marks the end of the result set.
        if not results:
            break

        all_results.extend(results)
        offset += batch_size

        # Optional: add delay to respect rate limits
        time.sleep(1)

    return all_results

# Get all articles about "bioinformatics"
# NOTE(review): get_all_results() keeps paging until search() returns an empty
# page and pauses one second between pages, so a broad term like this one may
# run for a long time — confirm before using as-is.
all_articles = get_all_results("bioinformatics")
print(f"Found {len(all_articles)} articles")

Data Processing Examples

Extracting Author Networks

import collections
from itertools import combinations

def build_author_network(articles):
    """Build co-authorship network from articles.

    Args:
        articles: Iterable of objects exposing an ``authors`` attribute (a
            sequence of sortable author identifiers, or ``None``).

    Returns:
        A ``collections.defaultdict(int)`` mapping each sorted author pair
        ``(a, b)`` to the number of articles on which both appear.
    """
    collaborations = collections.defaultdict(int)

    for article in articles:
        # Records without an author list (None or empty) contribute no pairs
        # instead of crashing on len(None).
        authors = article.authors or ()

        # combinations() yields nothing for fewer than two authors, so no
        # explicit length check is needed.
        for author1, author2 in combinations(authors, 2):
            # Sort so (A, B) and (B, A) are counted under the same key.
            pair = tuple(sorted((author1, author2)))
            collaborations[pair] += 1

    return collaborations

# Search for articles in a specific field
articles = client.search("computational biology", limit=100)

# Build collaboration network
network = build_author_network(articles)

# Find most frequent collaborations
# Items are (pair, count) tuples; sorting on x[1] in reverse puts the highest
# counts first, and the slice keeps at most ten of them.
top_collaborations = sorted(
    network.items(),
    key=lambda x: x[1],
    reverse=True
)[:10]

print("Top collaborations:")
# Each key is a two-element tuple, unpacked directly in the loop header.
for (author1, author2), count in top_collaborations:
    print(f"{author1} <-> {author2}: {count} papers")

Journal Impact Analysis

import collections

def analyze_journals(articles):
    """Group articles by journal and collect simple per-journal statistics.

    Args:
        articles: Iterable of objects exposing ``journal`` and ``pub_year``.

    Returns:
        A ``collections.defaultdict`` keyed by journal. Each value is a dict
        with ``'count'`` (number of articles), ``'years'`` (set of the
        ``pub_year`` values seen) and ``'articles'`` (the article objects in
        input order).
    """
    def _empty_stats():
        # Fresh record for a journal seen for the first time.
        return {'count': 0, 'years': set(), 'articles': []}

    journal_stats = collections.defaultdict(_empty_stats)

    for article in articles:
        entry = journal_stats[article.journal]
        entry['count'] += 1
        entry['years'].add(article.pub_year)
        entry['articles'].append(article)

    return journal_stats

# Analyze AI research journals
ai_articles = client.search("artificial intelligence", limit=200)
journal_analysis = analyze_journals(ai_articles)

# Sort by publication count
sorted_journals = sorted(
    journal_analysis.items(),
    key=lambda x: x[1]['count'],
    reverse=True
)

print("Top journals for AI research:")
for journal, stats in sorted_journals[:10]:
    # NOTE(review): min()/max() need mutually comparable values; this assumes
    # every article has a usable pub_year (a mix of None and numbers would
    # raise TypeError) — confirm against the article model.
    year_range = f"{min(stats['years'])}-{max(stats['years'])}"
    print(f"{journal}: {stats['count']} articles ({year_range})")

Citation Analysis

def analyze_citations(pmid_list):
    """Collect citing and referenced articles for each PMID.

    For every id, ``client.fetch_citations`` and ``client.fetch_references``
    are called (each with ``limit=100``). Any exception raised while
    processing an id is printed and that id is left out of the result.

    Args:
        pmid_list: Iterable of PubMed ids.

    Returns:
        A dict mapping each successfully processed id to a dict with the keys
        ``'cited_by_count'``, ``'references_count'``, ``'citations'`` and
        ``'references'``. The counts are the lengths of the fetched results.
    """
    citation_data = {}

    for pmid in pmid_list:
        try:
            # Articles that cite this one, then the ones it cites itself.
            citing = client.fetch_citations(pmid=pmid, limit=100)
            cited = client.fetch_references(pmid=pmid, limit=100)

            entry = {
                'cited_by_count': len(citing),
                'references_count': len(cited),
                'citations': citing,
                'references': cited
            }
            citation_data[pmid] = entry
        except Exception as exc:
            # Best effort: report the failure and move on to the next id.
            print(f"Error processing {pmid}: {exc}")

    return citation_data

# Example: Analyze top papers in a field
top_papers = client.search("machine learning", sort="cited", limit=10)
# Articles whose pmid is falsy are dropped before the citation lookup.
pmid_list = [article.pmid for article in top_papers if article.pmid]

citation_analysis = analyze_citations(pmid_list)

# Print citation statistics
# Both counts are len() of results fetched with limit=100 inside
# analyze_citations(), so each figure printed here is at most what that
# request returned.
for pmid, data in citation_analysis.items():
    print(f"PMID {pmid}:")
    print(f"  Cited by: {data['cited_by_count']} articles")
    print(f"  References: {data['references_count']} articles")
    print()

Advanced Use Cases

Research Trend Analysis

import matplotlib.pyplot as plt
from collections import defaultdict

def analyze_research_trends(keywords, years):
    """Count search results per keyword and publication year.

    One search is issued per (keyword, year) pair using the query
    ``"<keyword> AND PUB_YEAR:<year>"``. The recorded value is the length of
    the returned results from a ``limit=1000`` request (so it presumably
    cannot exceed that limit — confirm against the client). A failed search
    is printed and recorded as 0.

    Args:
        keywords: Iterable of search terms.
        years: Iterable of years; it is iterated once per keyword.

    Returns:
        A nested ``defaultdict``: ``trend_data[keyword][year] -> int``.
    """
    def _year_counter():
        # Inner mapping year -> count, defaulting to 0.
        return defaultdict(int)

    trend_data = defaultdict(_year_counter)

    for keyword in keywords:
        for year in years:
            query = f"{keyword} AND PUB_YEAR:{year}"
            try:
                hits = client.search(query, limit=1000)
                trend_data[keyword][year] = len(hits)
            except Exception as exc:
                print(f"Error for {keyword} in {year}: {exc}")
                trend_data[keyword][year] = 0

    return trend_data

# Analyze AI/ML trends
keywords = ["artificial intelligence", "machine learning", "deep learning"]
# range() excludes its end value, so this covers 2015 through 2023.
years = range(2015, 2024)

trends = analyze_research_trends(keywords, years)

# Plot trends
# One line per keyword: years on the x axis (sorted), counts on the y axis.
plt.figure(figsize=(12, 6))
for keyword in keywords:
    years_list = sorted(trends[keyword].keys())
    counts = [trends[keyword][year] for year in years_list]
    plt.plot(years_list, counts, marker='o', label=keyword)

plt.xlabel("Year")
plt.ylabel("Number of Publications")
plt.title("AI/ML Research Trends")
plt.legend()
plt.grid(True)
plt.show()

Multi-source Data Integration

def comprehensive_search(query, max_results=1000):
    """Run the same query against several sources and merge the results.

    The sources ``"MED"``, ``"PMC"``, ``"AGR"`` and ``"CBA"`` are searched in
    that order. Each request asks for an equal share of ``max_results``,
    capped at 250. Every returned article gets its ``source`` attribute set
    to the source it came from. A source whose search fails is reported with
    ``print`` and skipped.

    Args:
        query: Search string passed to ``client.search``.
        max_results: Overall budget that is split evenly across the sources.

    Returns:
        A single list with the results from all sources that succeeded.
    """
    sources = ("MED", "PMC", "AGR", "CBA")
    merged = []

    for source in sources:
        try:
            found = client.search(
                query,
                source=source,
                limit=min(max_results // len(sources), 250)
            )

            # Add source information to each result
            for article in found:
                article.source = source

            merged.extend(found)
        except Exception as exc:
            # Best effort: one failing source must not abort the others.
            print(f"Error searching {source}: {exc}")

    return merged

# Search across all sources
comprehensive_results = comprehensive_search("COVID-19 treatment")

# Analyze by source
# Plain dict used as a counter; getattr() falls back to 'Unknown' for any
# result that carries no `source` attribute.
source_counts = {}
for article in comprehensive_results:
    source = getattr(article, 'source', 'Unknown')
    source_counts[source] = source_counts.get(source, 0) + 1

print("Results by source:")
for source, count in source_counts.items():
    print(f"{source}: {count} articles")

Export to Different Formats

import json
import csv
import pandas as pd

def export_results(articles, format='json', filename=None):
    """Export search results to various formats.

    Args:
        articles: Iterable of article objects exposing ``title``, ``authors``,
            ``journal``, ``pub_year``, ``pmid`` and ``doi`` (plus ``abstract``
            for the JSON export).
        format: One of ``'json'``, ``'csv'`` or ``'dataframe'``. The name
            shadows the builtin but is part of the public signature.
        filename: Destination path. Optional for ``'json'``, required for
            ``'csv'``, unused for ``'dataframe'``.

    Returns:
        ``'json'``: a list of dicts (also written to ``filename`` if given).
        ``'csv'``: ``None``; the rows are written to ``filename``.
        ``'dataframe'``: a ``pandas.DataFrame`` with one row per article.

    Raises:
        ValueError: If ``format`` is not supported, or if ``format='csv'`` is
            requested without a ``filename`` (previously both cases silently
            did nothing and returned ``None``).
    """
    def _joined_authors(article):
        # Flatten the author list into one cell; missing authors become ''.
        return '; '.join(article.authors) if article.authors else ''

    if format == 'json':
        # JSON keeps the author list as-is and is the only format with the abstract.
        data = [
            {
                'title': article.title,
                'authors': article.authors,
                'journal': article.journal,
                'year': article.pub_year,
                'pmid': article.pmid,
                'doi': article.doi,
                'abstract': article.abstract
            }
            for article in articles
        ]

        if filename:
            # Explicit encoding so the output does not depend on the locale.
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
        return data

    if format == 'csv':
        if not filename:
            raise ValueError("filename is required when format='csv'")

        # newline='' is what the csv module expects for files it writes.
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Title', 'Authors', 'Journal', 'Year', 'PMID', 'DOI'])
            writer.writerows(
                [
                    article.title,
                    _joined_authors(article),
                    article.journal,
                    article.pub_year,
                    article.pmid,
                    article.doi
                ]
                for article in articles
            )
        return None

    if format == 'dataframe':
        return pd.DataFrame([
            {
                'title': article.title,
                'authors': _joined_authors(article),
                'journal': article.journal,
                'year': article.pub_year,
                'pmid': article.pmid,
                'doi': article.doi
            }
            for article in articles
        ])

    raise ValueError(f"Unsupported export format: {format!r}")

# Example usage
results = client.search("bioinformatics", limit=50)

# The JSON and CSV calls write to the given files and their return values are
# ignored; the dataframe call passes no filename and uses the return value.

# Export to JSON
export_results(results, format='json', filename='bioinformatics_articles.json')

# Export to CSV
export_results(results, format='csv', filename='bioinformatics_articles.csv')

# Create DataFrame for analysis
df = export_results(results, format='dataframe')
print(df.head())

Error Handling Examples

from pyeuropepmc import EuropePMC, APIError, RateLimitError

def robust_search(query, max_retries=3):
    """Search with robust error handling.

    A fresh ``EuropePMC`` client runs ``search(query, limit=100)`` up to
    ``max_retries`` times.

    Args:
        query: Search string passed to ``client.search``.
        max_retries: Maximum number of attempts.

    Returns:
        The search results of the first successful attempt. An empty list is
        returned when the attempts run out on ``RateLimitError`` (or when
        ``max_retries`` is 0).

    Raises:
        APIError: Re-raised if it occurs on the final attempt.
        Exception: Any other error is likewise re-raised on the final attempt.
    """
    # Imported here because nothing on this page imports ``time``; without it
    # the first retry would fail with NameError instead of backing off.
    import time

    client = EuropePMC()

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            return client.search(query, limit=100)

        except RateLimitError:
            print(f"Rate limit hit, waiting before retry {attempt + 1}")
            # No retry follows the last attempt, so waiting would only delay
            # the (empty) return.
            if not is_last_attempt:
                time.sleep(2 ** attempt)  # Exponential backoff

        except APIError as e:
            print(f"API error: {e}")
            if is_last_attempt:
                raise
            time.sleep(1)

        except Exception as e:
            print(f"Unexpected error: {e}")
            if is_last_attempt:
                raise
            time.sleep(1)

    return []

# Use robust search
# robust_search() re-raises API and unexpected errors on its final attempt;
# that is what the except clause below reports. When the attempts run out on
# rate limiting it returns an empty list instead, so the success message can
# also appear with 0 articles.
try:
    results = robust_search("complex query here")
    print(f"Successfully retrieved {len(results)} articles")
except Exception as e:
    print(f"Failed after all retries: {e}")

For more examples, check out the examples directory in the repository.