Read and manipulate a tabular-data-resource

library(fr)

The {fr} package comes with an example frictionless tabular-data-resource (tdr) named hamilton_poverty_2020. On disk, a tdr is a folder containing a data CSV file (the folder and the CSV file are both named after the tdr) and a tabular-data-resource.yaml file, which contains the metadata descriptors:

fs::dir_tree(fs::path_package("fr", "hamilton_poverty_2020"), recurse = TRUE)
#> /tmp/Rtmp6QuCYM/Rinstb80634905c2/fr/hamilton_poverty_2020
#> ├── hamilton_poverty_2020.csv
#> └── tabular-data-resource.yaml

Read the hamilton_poverty_2020 tdr into R by specifying the location of either the tabular-data-resource.yaml file or a folder containing a tabular-data-resource.yaml file:

d_fr <- read_fr_tdr(fs::path_package("fr", "hamilton_poverty_2020"))

Print the returned fr_tdr (frictionless tabular-data-resource) object to view all of the table-specific metadata descriptors and the underlying data:

d_fr
#> hamilton_poverty_2020
#> - version: 0.0.1
#> - title: Hamilton County Poverty Rates in 2020
#> # A tibble: 226 × 3
#>    census_tract_id_2020  year fraction_poverty
#>    <chr>                <dbl>            <dbl>
#>  1 39061021508           2020            0.057
#>  2 39061021421           2020            0.031
#>  3 39061023300           2020            0.03 
#>  4 39061002000           2020            0.098
#>  5 39061002500           2020            0.442
#>  6 39061007700           2020            0.603
#>  7 39061009902           2020            0.15 
#>  8 39061010700           2020            0.15 
#>  9 39061023902           2020            0.013
#> 10 39061022301           2020            0.247
#> # ℹ 216 more rows

Print the schema property to view the field-specific metadata (the type, title, and description of each field):

S7::prop(d_fr, "schema")
#> census_tract_id_2020
#> - type: string
#> - title: Census Tract Identifier
#> - description: refers to 2020 vintage census tracts identifiers
#> year
#> - type: integer
#> - title: Year
#> - description: The year of the 5-year ACS estimates (e.g., the 2019 ACS covers
#> 2015 - 2019)
#> fraction_poverty
#> - type: number
#> - title: Fraction of Households in Poverty
#> - description: Fraction of households with income below poverty level within
#> the past 12 months

fr_tdr objects can be used almost anywhere that the underlying data frame can be used, because functions usually coerce their inputs into data frames with as.data.frame, which works with fr_tdr objects:

lm(fraction_poverty ~ year, data = d_fr)
#> 
#> Call:
#> lm(formula = fraction_poverty ~ year, data = d_fr)
#> 
#> Coefficients:
#> (Intercept)         year  
#>      0.1729           NA

Accessor functions ([, [[, $) work as they do with data frames and tibbles:

head(d_fr$fraction_poverty)
#> [1] 0.057 0.031 0.030 0.098 0.442 0.603

In some cases, an fr_tdr object needs to be separated into its data and metadata so that the data can be manipulated before the metadata is rejoined. For example, calling dplyr::mutate() directly on an fr_tdr object fails:

d_fr |>
  dplyr::mutate(high_poverty = fraction_poverty > median(fraction_poverty))
#> Error in `vec_data()`:
#> ! `x` must be a vector, not a <fr_tdr/data.frame/S7_object> object.

In this case, explicitly drop the metadata attributes by converting the fr_tdr object to a tibble or data frame using as_tibble, as_data_frame, or as.data.frame, and then use as_fr_tdr() with the original fr_tdr object specified as a template to convert the result back to an fr_tdr object:

d_fr |>
  tibble::as_tibble() |>
  dplyr::mutate(high_poverty = fraction_poverty > median(fraction_poverty)) |>
  as_fr_tdr(.template = d_fr)
#> hamilton_poverty_2020
#> - version: 0.0.1
#> - title: Hamilton County Poverty Rates in 2020
#> # A tibble: 226 × 4
#>    census_tract_id_2020  year fraction_poverty high_poverty
#>    <chr>                <dbl>            <dbl> <lgl>       
#>  1 39061021508           2020            0.057 FALSE       
#>  2 39061021421           2020            0.031 FALSE       
#>  3 39061023300           2020            0.03  FALSE       
#>  4 39061002000           2020            0.098 FALSE       
#>  5 39061002500           2020            0.442 TRUE        
#>  6 39061007700           2020            0.603 TRUE        
#>  7 39061009902           2020            0.15  TRUE        
#>  8 39061010700           2020            0.15  TRUE        
#>  9 39061023902           2020            0.013 FALSE       
#> 10 39061022301           2020            0.247 TRUE        
#> # ℹ 216 more rows

Shortcuts are provided for some functions from {dplyr} (see dplyr_methods() for a full list):

d_fr |>
  fr_mutate(high_poverty = fraction_poverty > median(fraction_poverty)) |>
  fr_select(-year) |>
  fr_arrange(desc(fraction_poverty))
#> hamilton_poverty_2020
#> - version: 0.0.1
#> - title: Hamilton County Poverty Rates in 2020
#> # A tibble: 226 × 3
#>    census_tract_id_2020 fraction_poverty high_poverty
#>    <chr>                           <dbl> <lgl>       
#>  1 39061008502                     0.754 TRUE        
#>  2 39061026300                     0.734 TRUE        
#>  3 39061026900                     0.69  TRUE        
#>  4 39061007700                     0.603 TRUE        
#>  5 39061022700                     0.599 TRUE        
#>  6 39061003000                     0.592 TRUE        
#>  7 39061002901                     0.576 TRUE        
#>  8 39061006600                     0.561 TRUE        
#>  9 39061008000                     0.556 TRUE        
#> 10 39061009300                     0.54  TRUE        
#> # ℹ 216 more rows

More complicated dplyr functions (e.g., group_by() and friends) as well as functions from other packages that do not coerce their inputs to data.frame objects will need to use the pattern above. Below is a simple example for dplyr::left_join():

library(dplyr, warn.conflicts = FALSE)

d_fr <- update_field(d_fr, "fraction_poverty", description = "the poverty fraction")

d_extant <-
  d_fr |>
  fr_mutate(score = 1 + fraction_poverty) |>
  fr_select(-fraction_poverty, -year) |>
  as_tibble()

d_fr_new <-
  left_join(
    as_tibble(d_fr),
    d_extant,
    by = join_by(census_tract_id_2020 == census_tract_id_2020)
  ) |>
  as_fr_tdr(.template = d_fr) |>
  update_field("score", description = "the score")

d_fr_new
#> hamilton_poverty_2020
#> - version: 0.0.1
#> - title: Hamilton County Poverty Rates in 2020
#> # A tibble: 226 × 4
#>    census_tract_id_2020  year fraction_poverty score
#>    <chr>                <dbl>            <dbl> <dbl>
#>  1 39061021508           2020            0.057  1.06
#>  2 39061021421           2020            0.031  1.03
#>  3 39061023300           2020            0.03   1.03
#>  4 39061002000           2020            0.098  1.10
#>  5 39061002500           2020            0.442  1.44
#>  6 39061007700           2020            0.603  1.60
#>  7 39061009902           2020            0.15   1.15
#>  8 39061010700           2020            0.15   1.15
#>  9 39061023902           2020            0.013  1.01
#> 10 39061022301           2020            0.247  1.25
#> # ℹ 216 more rows

S7::prop(d_fr_new, "schema")
#> census_tract_id_2020
#> - type: string
#> - title: Census Tract Identifier
#> - description: refers to 2020 vintage census tracts identifiers
#> year
#> - type: integer
#> - title: Year
#> - description: The year of the 5-year ACS estimates (e.g., the 2019 ACS covers
#> 2015 - 2019)
#> fraction_poverty
#> - type: number
#> - title: Fraction of Households in Poverty
#> - description: the poverty fraction
#> score
#> - type: number
#> - description: the score