Filter Health Data by Demographic Variables — sus_data_filter_demographics

Filters health data based on demographic characteristics such as sex, race, age range, education level, region, and municipality. This function complements sus_data_filter_cid() by enabling stratified analyses by population subgroups.

Usage

sus_data_filter_demographics(
  df,
  sex = NULL,
  race = NULL,
  age_range = NULL,
  education = NULL,
  region = NULL,
  city = NULL,
  municipality_code = NULL,
  drop_ignored = FALSE,
  backend = "arrow",
  use_cache = TRUE,
  cache_dir = "~/.climasus4r_cache/spatial",
  lang = "pt",
  verbose = TRUE
)

Arguments

df

A climasus_df object containing health data (output of the climasus4r pipeline).

sex

Character vector specifying sex categories to include. Accepts values in English, Portuguese, or Spanish (e.g., "Male" in English, or "Masculino", which is the same in Portuguese and Spanish). If NULL (default), includes all sexes.

race

Character vector specifying race/color categories to include. Accepts IBGE standard categories in multiple languages. If NULL (default), includes all races.

age_range

Numeric vector of length 2 specifying the age range c(min_age, max_age). Use Inf for no upper limit. If NULL (default), includes all ages.

education

Character vector specifying education levels to include. If NULL (default), includes all education levels.

region

A string indicating a predefined group of states or regions (supports multilingual names in PT, EN, and ES). If NULL (default), no region filter is applied. See Details for the available groups.

city

Character vector of municipality names (e.g., "Sao Paulo", "Natal") or IBGE codes (6 or 7-digit, e.g., "3550308", "2408102"). Case-insensitive; accents are normalised for matching. Partial typos trigger fuzzy suggestions. If NULL (default), no additional municipality filter is applied. Results are merged (union) with any codes in municipality_code.

municipality_code

Character or numeric vector specifying municipality codes (IBGE 6 or 7-digit codes) to include. If NULL (default), includes all municipalities.

drop_ignored

Logical. If TRUE, explicitly removes rows where demographic variables (sex, race, education) contain missing values (NA) or DATASUS ignored codes (e.g., "9", "Ignorado"). Default is FALSE.

backend

Character string specifying the data processing backend. Use "arrow" for out-of-memory, lazy processing (recommended for large datasets), or "tibble" for in-memory processing (recommended for small to medium datasets).

"arrow": operations are performed lazily using the Apache Arrow engine, avoiding loading the full dataset into memory. Ideal for large files (e.g., Parquet, Feather) and high-performance workflows.
"tibble": data is fully loaded into memory as a tibble and processed eagerly using dplyr. Simpler and more predictable, but may be slow or fail for large datasets.

The default in the function signature is "arrow". If not specified, the function may automatically choose the backend based on the input data type.

use_cache

Logical. If TRUE (default), uses cached spatial data to avoid re-downloads and improve performance. Only relevant when city is provided.

cache_dir

Character string specifying the directory to store cached files. Default is "~/.climasus4r_cache/spatial".

lang

Character string specifying the language for messages. Options: "en" (English), "pt" (Portuguese, default), "es" (Spanish).

verbose

Logical. If TRUE (default), prints a summary of the filtering steps.

Value

A climasus_df filtered by all specified demographic criteria.

Details

The function automatically detects column names in different languages and standardisations. It handles both original DATASUS column names and standardised names from sus_data_standardize().

Sex categories (case-insensitive):

English: "Male", "Female"
Portuguese: "Masculino", "Feminino"
Spanish: "Masculino", "Femenino"

Race/Color categories (IBGE standard):

English: "White", "Black", "Yellow", "Brown", "Indigenous"
Portuguese: "Branca", "Preta", "Amarela", "Parda", "Indigena"
Spanish: "Blanca", "Negra", "Amarilla", "Parda", "Indigena"

IBGE Macro-regions:

"norte": c("AC", "AP", "AM", "PA", "RO", "RR", "TO")
"nordeste": c("AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE")
"centro_oeste": c("DF", "GO", "MT", "MS")
"sudeste": c("ES", "MG", "RJ", "SP")
"sul": c("PR", "RS", "SC")

Biomes (Ecological Borders):

"amazonia_legal": c("AC", "AP", "AM", "PA", "RO", "RR", "MT", "MA", "TO")
"mata_atlantica": c("AL", "BA", "CE", "ES", "GO", "MA", "MG", "MS", "PB", "PE", "PI", "PR", "RJ", "RN", "RS", "SC", "SE", "SP")
"caatinga": c("AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE", "MG")
"cerrado": c("BA", "DF", "GO", "MA", "MG", "MS", "MT", "PA", "PI", "PR", "RO", "SP", "TO")
"pantanal": c("MT", "MS")
"pampa": c("RS")

Hydrography & Climate:

"bacia_amazonia": c("AC", "AM", "AP", "MT", "PA", "RO", "RR")
"bacia_sao_francisco": c("AL", "BA", "DF", "GO", "MG", "PE", "SE")
"bacia_parana": c("GO", "MG", "MS", "PR", "SP")
"bacia_tocantins": c("GO", "MA", "PA", "TO")
"semi_arido": c("AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE", "MG")

Health, Agriculture & Geopolitics:

"matopiba": c("MA", "TO", "PI", "BA")
"arco_desmatamento": c("RO", "AC", "AM", "PA", "MT", "MA")
"dengue_hyperendemic": c("GO", "MS", "MT", "PR", "RJ", "SP")
"sudene": c("AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE", "MG", "ES")
"fronteira_brasil": c("AC", "AM", "AP", "MT", "MS", "PA", "PR", "RO", "RR", "RS", "SC")

Examples

if (FALSE) { # \dontrun{
library(climasus4r)

# Filter by sex only
df_women <- sus_data_filter_demographics(df, sex = "Female")

# Filter by age range (elderly, 65+)
df_elderly <- sus_data_filter_demographics(df, age_range = c(65, Inf))

# Filter by city name (with accent tolerance)
df_natal <- sus_data_filter_demographics(df, city = "Natal", lang = "pt")

# Filter by multiple city names
df_capitals <- sus_data_filter_demographics(
  df,
  city = c("Sao Paulo", "Rio de Janeiro", "Fortaleza"), #Usar acentos se preferir
  lang = "pt"
)

# Mix: city names + explicit codes (union)
df_subset <- sus_data_filter_demographics(
  df,
  city            = "Natal",
  municipality_code = "3550308",
  lang            = "pt"
)

# Complex filtering
df_children <- sus_data_filter_demographics(
  df,
  age_range = c(0, 5),
  region    = "Norte",
  lang      = "pt"
)
} # }