Examples

This section contains practical examples of using PyEuropePMC for various tasks.

Basic Examples
Query Builder Examples
Search Examples
Data Processing Examples
Advanced Use Cases

Query Builder Examples

Basic Query Building

from pyeuropepmc import QueryBuilder

qb = QueryBuilder()

# Simple keyword search
query1 = qb.keyword("machine learning").build()
print(query1)  # "machine learning"

# Field-specific search
query2 = qb.field("author", "Smith J").build()
print(query2)  # "AUTH:Smith J"

# Boolean operators
query3 = qb.keyword("cancer").and_().keyword("therapy").build()
print(query3)  # "cancer AND therapy"

Advanced Query Patterns

# Complex query with multiple conditions
complex_query = (qb
    .keyword("CRISPR", field="title")
    .and_()
    .keyword("gene editing")
    .and_()
    .date_range(start_year=2018, end_year=2023)
    .and_()
    .citation_count(min_count=50)
    .build())

print(complex_query)
# "(TITLE:CRISPR) AND gene editing AND (PUB_YEAR:[2018 TO 2023]) AND (CITED:[50 TO *])"

Citation and Date Filtering

# High-impact papers from specific period
high_impact_query = (qb
    .keyword("artificial intelligence")
    .and_()
    .citation_count(min_count=100, max_count=1000)
    .and_()
    .date_range(start_year=2020)
    .build())

# Recent papers with open access
recent_oa_query = (qb
    .keyword("COVID-19")
    .and_()
    .field("open_access", True)
    .and_()
    .date_range(start_year=2020)
    .build())

OR Logic and Grouping

# OR logic for synonyms
synonym_query = (qb
    .keyword("machine learning")
    .or_()
    .keyword("artificial intelligence")
    .or_()
    .keyword("deep learning")
    .build())

# Grouped sub-queries
diseases = QueryBuilder().keyword("cancer").or_().keyword("tumor").or_().keyword("neoplasm")
therapies = QueryBuilder().keyword("therapy").or_().keyword("treatment").or_().keyword("intervention")

combined_query = (qb
    .group(diseases)
    .and_()
    .group(therapies)
    .and_()
    .field("pub_year", 2023)
    .build())

Field-Specific Searches

# Author and affiliation searches
author_query = qb.field("author", "John Smith").build()
affiliation_query = qb.field("affiliation", "Harvard University").build()

# Journal and publication type
journal_query = qb.field("journal", "Nature").build()
review_query = qb.field("pub_type", "review").build()

# MeSH terms and keywords
mesh_query = qb.field("mesh", "Gene Therapy").build()
keyword_query = qb.field("keyword", "CRISPR").build()

PMC and DOI Searches

# PMC ID search (automatically adds PMC prefix)
pmc_query = qb.pmcid("1234567").build()  # "PMCID:PMC1234567"

# DOI search
doi_query = qb.field("doi", "10.1038/nature12345").build()

# Accession type search (automatically lowercased)
accession_query = qb.accession_type("PDB").build()  # "ACCESSION_TYPE:pdb"

Citation Network Queries

# Find papers that cite a specific article
citing_query = qb.cites("8521067", source="med").build()

# Find highly cited papers
highly_cited = (qb
    .keyword("neural networks")
    .and_()
    .citation_count(min_count=500)
    .build())

Query Persistence and Translation

# Save query to file
qb.save("my_search.json",
        platform="pubmed",
        authors=[{"name": "Researcher Name", "ORCID": "0000-0000-0000-0001"}])

# Load query from file
loaded_qb = QueryBuilder.from_file("my_search.json")

# Translate to different platforms
pubmed_query = loaded_qb.build()
wos_query = loaded_qb.translate("wos")  # Web of Science syntax
ebsco_query = loaded_qb.translate("ebsco")  # EBSCO syntax

Systematic Review Integration

from pyeuropepmc.utils.search_logging import start_search

# Start systematic review log
log = start_search("AI in Healthcare Review", executed_by="Dr. Smith")

# Build comprehensive search
comprehensive_search = (qb
    .keyword("artificial intelligence")
    .and_()
    .keyword("healthcare")
    .and_()
    .date_range(start_year=2018)
    .and_()
    .field("open_access", True)
    .build())

# Log the search for systematic review
qb.log_to_search(
    search_log=log,
    database="Europe PMC",
    filters={
        "date_range": "2018+",
        "open_access": True,
        "keywords": ["artificial intelligence", "healthcare"]
    },
    results_returned=250,
    notes="Comprehensive search for AI in healthcare literature"
)

# Save the review log
log.save("systematic_review_searches.json")

Query Evaluation and Optimization

# Evaluate search effectiveness
test_records = {
    "r1": {"title": "AI in cancer diagnosis", "colrev_status": "rev_included"},
    "r2": {"title": "Machine learning for drug discovery", "colrev_status": "rev_included"},
    "r3": {"title": "Weather prediction models", "colrev_status": "rev_excluded"}
}

evaluation = qb.evaluate(test_records)
print(f"Recall: {evaluation['recall']:.2f}")
print(f"Precision: {evaluation['precision']:.2f}")
print(f"F1 Score: {evaluation['f1_score']:.2f}")

Custom Field Transformations

# Use transform parameter for custom value processing
custom_query = (qb
    .field("pmcid", "1234567", transform=lambda x: f"PMC{x}" if not str(x).startswith("PMC") else str(x))
    .and_()
    .field("accession_type", "GENBANK", transform=str.lower)
    .build())

print(custom_query)  # "PMCID:PMC1234567 AND ACCESSION_TYPE:genbank"

Fetching Article by ID

# NOTE(review): `client` is not created in this snippet; presumably it is an
# existing client instance such as `EuropePMC()` (as constructed in the
# "Error Handling Examples" section) — confirm before running.

# Fetch by PubMed ID
article = client.fetch_by_id(pmid="12345678")
print(f"Title: {article.title}")
print(f"Abstract: {article.abstract}")

# Fetch by PMC ID
article = client.fetch_by_id(pmcid="PMC1234567")

# Fetch by DOI
article = client.fetch_by_id(doi="10.1038/nature12345")

Search Examples

Advanced Search Queries

# Search with Boolean operators
results = client.search("(cancer OR tumor) AND therapy", limit=20)

# Search in specific fields
results = client.search("AUTH:\"Smith J\" AND JOURNAL:\"Nature\"")

# Search with date range
results = client.search("CRISPR AND PUB_YEAR:[2020 TO 2023]")

# Search by MeSH terms
results = client.search("MESH:\"Gene Therapy\"")

Filtering and Sorting

# Filter by source
results = client.search(
    "machine learning",
    source="PMC",  # Only PMC articles
    limit=15
)

# Sort by date (newest first)
results = client.search(
    "artificial intelligence",
    sort="date",
    limit=10
)

# Sort by citation count
results = client.search(
    "deep learning",
    sort="cited",
    limit=5
)

Pagination

# Get first page
page1 = client.search("cancer", limit=25, offset=0)

# Get second page
page2 = client.search("cancer", limit=25, offset=25)

# Iterate through all results
def get_all_results(query, batch_size=100):
    """Collect every result for ``query`` by paging through ``client.search``.

    Pages of ``batch_size`` results are requested at increasing offsets until
    the client returns an empty page.

    Args:
        query: Search query passed unchanged to ``client.search``.
        batch_size: Number of results requested per page; must be positive.

    Returns:
        A list holding the results of all pages, in the order returned.

    Raises:
        ValueError: If ``batch_size`` is not positive (the offset would never
            advance and the loop could not terminate).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    # Local import keeps the example self-contained: nothing else in this
    # snippet brings `time` into scope.
    import time

    offset = 0
    all_results = []

    while True:
        results = client.search(
            query,
            limit=batch_size,
            offset=offset
        )

        # An empty page signals that we have paged past the last result
        if not results:
            break

        all_results.extend(results)
        offset += batch_size

        # Optional: add delay to respect rate limits
        time.sleep(1)

    return all_results

# Get all articles about "bioinformatics"
all_articles = get_all_results("bioinformatics")
print(f"Found {len(all_articles)} articles")

Data Processing Examples

Extracting Author Networks

import collections
from itertools import combinations

def build_author_network(articles):
    """Build a co-authorship network from articles.

    Every unordered pair of authors appearing together on an article counts
    as one collaboration for that pair.

    Args:
        articles: Iterable of objects with an ``authors`` attribute holding a
            sequence of mutually sortable author values, or ``None``/empty
            when no author data is available.

    Returns:
        A ``collections.defaultdict(int)`` mapping each sorted
        ``(author_a, author_b)`` tuple to the number of articles the pair
        appears on together.
    """
    collaborations = collections.defaultdict(int)

    for article in articles:
        # Missing author data (None or empty) simply contributes no pairs;
        # a single author also yields no combinations, so no length check
        # is needed.
        authors = article.authors or ()
        for author1, author2 in combinations(authors, 2):
            # Sort so (A, B) and (B, A) are counted under the same key
            pair = tuple(sorted((author1, author2)))
            collaborations[pair] += 1

    return collaborations

# Search for articles in a specific field
articles = client.search("computational biology", limit=100)

# Build collaboration network
network = build_author_network(articles)

# Find most frequent collaborations
top_collaborations = sorted(
    network.items(),
    key=lambda x: x[1],
    reverse=True
)[:10]

print("Top collaborations:")
for (author1, author2), count in top_collaborations:
    print(f"{author1} <-> {author2}: {count} papers")

Journal Impact Analysis

import collections

def analyze_journals(articles):
    """Group articles by journal and summarise each group.

    Args:
        articles: Iterable of objects exposing ``journal`` and ``pub_year``
            attributes.

    Returns:
        A ``collections.defaultdict`` keyed by journal. Each value is a dict
        with ``'count'`` (number of articles), ``'years'`` (set of the
        ``pub_year`` values seen) and ``'articles'`` (the article objects,
        in input order).
    """
    def _blank_entry():
        # Fresh containers per journal so entries never share state
        return {
            'count': 0,
            'years': set(),
            'articles': []
        }

    per_journal = collections.defaultdict(_blank_entry)

    for item in articles:
        entry = per_journal[item.journal]
        entry['count'] += 1
        entry['years'].add(item.pub_year)
        entry['articles'].append(item)

    return per_journal

# Analyze AI research journals
ai_articles = client.search("artificial intelligence", limit=200)
journal_analysis = analyze_journals(ai_articles)

# Sort by publication count
sorted_journals = sorted(
    journal_analysis.items(),
    key=lambda x: x[1]['count'],
    reverse=True
)

print("Top journals for AI research:")
for journal, stats in sorted_journals[:10]:
    year_range = f"{min(stats['years'])}-{max(stats['years'])}"
    print(f"{journal}: {stats['count']} articles ({year_range})")

Citation Analysis

def analyze_citations(pmid_list):
    """Gather citing articles and references for each PMID.

    For every PMID, ``client.fetch_citations`` and ``client.fetch_references``
    are each called with ``limit=100``. A PMID whose lookup raises is
    reported on stdout and left out of the result.

    Args:
        pmid_list: Iterable of PubMed IDs to look up.

    Returns:
        A dict keyed by PMID. Each value holds ``'cited_by_count'`` and
        ``'references_count'`` (lengths of the returned collections) plus the
        collections themselves under ``'citations'`` and ``'references'``.
    """
    summary = {}

    for pmid in pmid_list:
        try:
            # Articles that cite this one
            incoming = client.fetch_citations(pmid=pmid, limit=100)
            # Articles this one refers to
            outgoing = client.fetch_references(pmid=pmid, limit=100)

            summary[pmid] = dict(
                cited_by_count=len(incoming),
                references_count=len(outgoing),
                citations=incoming,
                references=outgoing,
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next PMID
            print(f"Error processing {pmid}: {e}")

    return summary

# Example: Analyze top papers in a field
top_papers = client.search("machine learning", sort="cited", limit=10)
pmid_list = [article.pmid for article in top_papers if article.pmid]

citation_analysis = analyze_citations(pmid_list)

# Print citation statistics
for pmid, data in citation_analysis.items():
    print(f"PMID {pmid}:")
    print(f"  Cited by: {data['cited_by_count']} articles")
    print(f"  References: {data['references_count']} articles")
    print()

Advanced Use Cases

Research Trend Analysis

import matplotlib.pyplot as plt
from collections import defaultdict

def analyze_research_trends(keywords, years):
    """Count search results per keyword and publication year.

    For each keyword/year combination the query
    ``"<keyword> AND PUB_YEAR:<year>"`` is sent to ``client.search`` with
    ``limit=1000`` and the number of returned results is recorded. A failing
    search is reported on stdout and recorded as 0.

    Args:
        keywords: Iterable of search terms.
        years: Iterable of publication years.

    Returns:
        A nested ``defaultdict`` such that ``result[keyword][year]`` is the
        number of results returned for that combination.
    """
    trend_data = defaultdict(lambda: defaultdict(int))

    for keyword in keywords:
        for year in years:
            query = f"{keyword} AND PUB_YEAR:{year}"
            try:
                hits = len(client.search(query, limit=1000))
            except Exception as e:
                print(f"Error for {keyword} in {year}: {e}")
                hits = 0
            trend_data[keyword][year] = hits

    return trend_data

# Analyze AI/ML trends
keywords = ["artificial intelligence", "machine learning", "deep learning"]
years = range(2015, 2024)

trends = analyze_research_trends(keywords, years)

# Plot trends
plt.figure(figsize=(12, 6))
for keyword in keywords:
    years_list = sorted(trends[keyword].keys())
    counts = [trends[keyword][year] for year in years_list]
    plt.plot(years_list, counts, marker='o', label=keyword)

plt.xlabel("Year")
plt.ylabel("Number of Publications")
plt.title("AI/ML Research Trends")
plt.legend()
plt.grid(True)
plt.show()

Multi-source Data Integration

def comprehensive_search(query, max_results=1000):
    """Run the same query against several Europe PMC sources.

    The sources ``MED``, ``PMC``, ``AGR`` and ``CBA`` are searched in turn.
    Each search is limited to an equal share of ``max_results``, capped at
    250 per source. Every returned article gets its ``source`` attribute set
    to the source it came from. A source whose search fails is reported on
    stdout and skipped.

    Args:
        query: Search query passed to ``client.search``.
        max_results: Overall result budget split evenly across the sources.

    Returns:
        A list with the articles of all successfully searched sources, in
        source order.
    """
    source_codes = ("MED", "PMC", "AGR", "CBA")
    collected = []

    for source in source_codes:
        try:
            per_source_limit = min(max_results // len(source_codes), 250)
            hits = client.search(query, source=source, limit=per_source_limit)

            # Tag every article with the source it was retrieved from
            for hit in hits:
                hit.source = source

            collected.extend(hits)
        except Exception as e:
            print(f"Error searching {source}: {e}")

    return collected

# Search across all sources
comprehensive_results = comprehensive_search("COVID-19 treatment")

# Analyze by source
source_counts = {}
for article in comprehensive_results:
    source = getattr(article, 'source', 'Unknown')
    source_counts[source] = source_counts.get(source, 0) + 1

print("Results by source:")
for source, count in source_counts.items():
    print(f"{source}: {count} articles")

Export to Different Formats

import json
import csv
import pandas as pd

def _article_row(article):
    """Return the flat column values shared by the CSV and DataFrame exports."""
    return {
        'title': article.title,
        # Authors are flattened to one string; missing authors become ''
        'authors': '; '.join(article.authors) if article.authors else '',
        'journal': article.journal,
        'year': article.pub_year,
        'pmid': article.pmid,
        'doi': article.doi,
    }


def export_results(articles, format='json', filename=None):
    """Export search results to various formats.

    Args:
        articles: Iterable of article objects exposing ``title``, ``authors``,
            ``journal``, ``pub_year``, ``pmid`` and ``doi`` (plus ``abstract``
            for the JSON export).
        format: One of ``'json'``, ``'csv'`` or ``'dataframe'``.
        filename: Optional output path for the ``'json'`` and ``'csv'``
            formats; it is not used for ``'dataframe'``.

    Returns:
        * ``'json'``: list of dicts, one per article (also written to
          ``filename`` when given).
        * ``'csv'``: list of rows, header row first (also written to
          ``filename`` when given).
        * ``'dataframe'``: a ``pandas.DataFrame`` with one row per article.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    if format == 'json':
        data = [
            {
                'title': article.title,
                'authors': article.authors,
                'journal': article.journal,
                'year': article.pub_year,
                'pmid': article.pmid,
                'doi': article.doi,
                'abstract': article.abstract,
            }
            for article in articles
        ]

        if filename:
            # Explicit encoding keeps the output independent of the platform default
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
        return data

    if format == 'csv':
        rows = [['Title', 'Authors', 'Journal', 'Year', 'PMID', 'DOI']]
        rows.extend(list(_article_row(article).values()) for article in articles)

        if filename:
            # newline='' lets the csv module control line endings itself
            with open(filename, 'w', newline='', encoding='utf-8') as f:
                csv.writer(f).writerows(rows)
        return rows

    if format == 'dataframe':
        # Imported lazily: only this branch needs pandas
        import pandas as pd

        return pd.DataFrame([_article_row(article) for article in articles])

    # Fail loudly instead of silently returning None for a misspelled format
    raise ValueError(
        f"Unsupported format {format!r}; expected 'json', 'csv' or 'dataframe'"
    )

# Example usage
results = client.search("bioinformatics", limit=50)

# Export to JSON
export_results(results, format='json', filename='bioinformatics_articles.json')

# Export to CSV
export_results(results, format='csv', filename='bioinformatics_articles.csv')

# Create DataFrame for analysis
df = export_results(results, format='dataframe')
print(df.head())

Error Handling Examples

from pyeuropepmc import EuropePMC, APIError, RateLimitError

def robust_search(query, max_retries=3):
    """Search with robust error handling.

    Makes up to ``max_retries`` attempts to run ``client.search(query,
    limit=100)`` on a fresh ``EuropePMC`` client.

    * ``RateLimitError``: wait with exponential backoff (1s, 2s, 4s, ...)
      and retry; if the last attempt is still rate limited, an empty list
      is returned.
    * ``APIError`` or any other exception: wait one second and retry; on
      the last attempt the exception is re-raised.

    Args:
        query: Search query passed to ``client.search``.
        max_retries: Maximum number of attempts.

    Returns:
        The search results of the first successful attempt, or ``[]`` when
        no attempt succeeded without raising.
    """
    # Local import keeps the example self-contained: nothing else in this
    # snippet brings `time` into scope.
    import time

    client = EuropePMC()

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            results = client.search(query, limit=100)
            return results

        except RateLimitError:
            if is_last_attempt:
                # No retry follows, so announcing one and sleeping would be
                # misleading and only delay the caller.
                print(f"Rate limit hit, giving up after {max_retries} attempts")
                break
            print(f"Rate limit hit, waiting before retry {attempt + 1}")
            time.sleep(2 ** attempt)  # Exponential backoff

        except APIError as e:
            print(f"API error: {e}")
            if is_last_attempt:
                raise
            time.sleep(1)

        except Exception as e:
            print(f"Unexpected error: {e}")
            if is_last_attempt:
                raise
            time.sleep(1)

    return []

# Use robust search
try:
    results = robust_search("complex query here")
    print(f"Successfully retrieved {len(results)} articles")
except Exception as e:
    print(f"Failed after all retries: {e}")

For more examples, check out the examples directory in the repository.

Examples

Robust Python toolkit for scientific literature analysis from Europe PMC

Examples

Table of Contents

Query Builder Examples

Basic Query Building

Advanced Query Patterns

Citation and Date Filtering

OR Logic and Grouping

Field-Specific Searches

PMC and DOI Searches

Citation Network Queries

Query Persistence and Translation

Systematic Review Integration

Query Evaluation and Optimization

Custom Field Transformations

Fetching Article by ID

Search Examples

Advanced Search Queries

Filtering and Sorting

Pagination

Data Processing Examples

Extracting Author Networks

Journal Impact Analysis

Citation Analysis

Advanced Use Cases

Research Trend Analysis

Multi-source Data Integration

Export to Different Formats

Error Handling Examples