| Title: | Record Linkage and Epidemiological Case Definitions in 'R' |
|---|---|
| Description: | An R package for iterative and batched record linkage, and applying epidemiological case definitions. 'diyar' can be used for deterministic and probabilistic record linkage, or multistage record linkage combining both approaches. It features the implementation of nested match criteria, and mechanisms to address missing data and conflicting matches during stepwise record linkage. Case definitions are implemented by assigning records to groups based on match criteria such as person or place, and overlapping time or duration of events e.g. sample collection dates or periods of hospital stays. Matching records are assigned a unique group ID. Index and duplicate records are removed or further analysed as required. |
| Authors: | Olisaeloka Nsonwu |
| Maintainer: | Olisaeloka Nsonwu <[email protected]> |
| License: | GPL-3 |
| Version: | 0.5.1.9012 |
| Built: | 2026-05-16 15:36:30 UTC |
| Source: | https://github.com/olisansonwu/diyar |
Vectorised approach to group operations.
bys_count(by, unique.var = NULL) bys_rank(..., by = NULL, from_last = FALSE) bys_position(..., by = NULL, from_last = FALSE, ordered = TRUE) bys_val(..., val, by = NULL, from_last = FALSE, na.last = TRUE) bys_nval( ..., val, by = NULL, from_last = FALSE, n = 1, nmax = FALSE, na.last = TRUE ) bys_min(val, by = NULL, na.rm = TRUE) bys_max(val, by = NULL, na.rm = TRUE) bys_sum(val, by = NULL, na.rm = TRUE) bys_prod(val, by = NULL, na.rm = TRUE) bys_cummin(val, by = NULL) bys_cummax(val, by = NULL) bys_cumsum(val, by = NULL) bys_cumprod(val, by = NULL) bys_lag(..., val, by = NULL, n = 1, from_last = FALSE) bys_lead(..., val, by = NULL, n = 1, from_last = FALSE) bys(..., val, by = NULL, from_last = FALSE, na.last = TRUE) bys_shift(..., val, by = NULL, n = 1, from_last = FALSE) bys_func(..., val, by = NULL, from_last = FALSE, func = sum)bys_count(by, unique.var = NULL) bys_rank(..., by = NULL, from_last = FALSE) bys_position(..., by = NULL, from_last = FALSE, ordered = TRUE) bys_val(..., val, by = NULL, from_last = FALSE, na.last = TRUE) bys_nval( ..., val, by = NULL, from_last = FALSE, n = 1, nmax = FALSE, na.last = TRUE ) bys_min(val, by = NULL, na.rm = TRUE) bys_max(val, by = NULL, na.rm = TRUE) bys_sum(val, by = NULL, na.rm = TRUE) bys_prod(val, by = NULL, na.rm = TRUE) bys_cummin(val, by = NULL) bys_cummax(val, by = NULL) bys_cumsum(val, by = NULL) bys_cumprod(val, by = NULL) bys_lag(..., val, by = NULL, n = 1, from_last = FALSE) bys_lead(..., val, by = NULL, n = 1, from_last = FALSE) bys(..., val, by = NULL, from_last = FALSE, na.last = TRUE) bys_shift(..., val, by = NULL, n = 1, from_last = FALSE) bys_func(..., val, by = NULL, from_last = FALSE, func = sum)
by |
|
... |
|
from_last |
|
ordered |
If |
val |
|
n |
|
nmax |
|
na.rm |
If |
[atomic]
x <- data.frame( group = c(2, 2, 1, 2, 1, 1, 1, 2, 1, 1), value = c(13, 14, 20, 9, 2, 1, 8, 18, 3, 17)) bys_count(x$group) bys_position(x$value, by = x$group, from_last = TRUE) bys_rank(by = x$group, val = x$value, from_last = TRUE) bys_val(x$value, by = x$group, val = x$value, from_last = TRUE) bys_nval(x$value, by = x$group, val = x$value, from_last = TRUE, n = 2) bys_min(by = x$group, val = x$value) bys_max(by = x$group, val = x$value) bys_sum(by = x$group, val = x$value) bys_prod(by = x$group, val = x$value) bys_cummin(by = x$group, val = x$value) bys_cummax(by = x$group, val = x$value) bys_cumsum(by = x$group, val = x$value) bys_cumprod(by = x$group, val = x$value) bys_lag(by = x$group, val = x$value) bys_lead(by = x$group, val = x$value)x <- data.frame( group = c(2, 2, 1, 2, 1, 1, 1, 2, 1, 1), value = c(13, 14, 20, 9, 2, 1, 8, 18, 3, 17)) bys_count(x$group) bys_position(x$value, by = x$group, from_last = TRUE) bys_rank(by = x$group, val = x$value, from_last = TRUE) bys_val(x$value, by = x$group, val = x$value, from_last = TRUE) bys_nval(x$value, by = x$group, val = x$value, from_last = TRUE, n = 2) bys_min(by = x$group, val = x$value) bys_max(by = x$group, val = x$value) bys_sum(by = x$group, val = x$value) bys_prod(by = x$group, val = x$value) bys_cummin(by = x$group, val = x$value) bys_cummax(by = x$group, val = x$value) bys_cumsum(by = x$group, val = x$value) bys_cumprod(by = x$group, val = x$value) bys_lag(by = x$group, val = x$value) bys_lead(by = x$group, val = x$value)
Vectorised approach to group operations.
bys_count_legacy(by, unique.var = NULL) bys_rank_legacy(..., by = NULL, from_last = FALSE) bys_position_legacy(val, by = NULL, from_last = FALSE, ordered = TRUE) bys_val_legacy(..., val, by = NULL, from_last = FALSE) bys_nval_legacy(..., val, by = NULL, from_last = FALSE, n = 1, nmax = FALSE) bys_min_legacy(val, by = NULL, na.rm = TRUE) bys_max_legacy(val, by = NULL, na.rm = TRUE) bys_sum_legacy(val, by = NULL, na.rm = TRUE, cumulative = FALSE) bys_prod_legacy(val, by = NULL, na.rm = TRUE, cumulative = FALSE) bys_cummin_legacy(val, by = NULL, na.rm = TRUE) bys_cummax_legacy(val, by = NULL, na.rm = FALSE) bys_cumsum_legacy(val, by = NULL, na.rm = TRUE) bys_cumprod_legacy(val, by = NULL, na.rm = TRUE) bys_lag_legacy(val, by = NULL, n = 1) bys_lead_legacy(val, by = NULL, n = 1)bys_count_legacy(by, unique.var = NULL) bys_rank_legacy(..., by = NULL, from_last = FALSE) bys_position_legacy(val, by = NULL, from_last = FALSE, ordered = TRUE) bys_val_legacy(..., val, by = NULL, from_last = FALSE) bys_nval_legacy(..., val, by = NULL, from_last = FALSE, n = 1, nmax = FALSE) bys_min_legacy(val, by = NULL, na.rm = TRUE) bys_max_legacy(val, by = NULL, na.rm = TRUE) bys_sum_legacy(val, by = NULL, na.rm = TRUE, cumulative = FALSE) bys_prod_legacy(val, by = NULL, na.rm = TRUE, cumulative = FALSE) bys_cummin_legacy(val, by = NULL, na.rm = TRUE) bys_cummax_legacy(val, by = NULL, na.rm = FALSE) bys_cumsum_legacy(val, by = NULL, na.rm = TRUE) bys_cumprod_legacy(val, by = NULL, na.rm = TRUE) bys_lag_legacy(val, by = NULL, n = 1) bys_lead_legacy(val, by = NULL, n = 1)
by |
|
... |
|
from_last |
|
val |
|
ordered |
If |
n |
|
nmax |
|
na.rm |
If |
[atomic]
x <- data.frame( group = c(2, 2, 1, 2, 1, 1, 1, 2, 1, 1), value = c(13, 14, 20, 9, 2, 1, 8, 18, 3, 17)) bys_count_legacy(x$group) bys_position_legacy(x$value, by = x$group, from_last = TRUE) bys_rank_legacy(by = x$group, val = x$value, from_last = TRUE) bys_val_legacy(x$value, by = x$group, val = x$value, from_last = TRUE) bys_nval_legacy(x$value, by = x$group, val = x$value, from_last = TRUE, n = 2) bys_min_legacy(by = x$group, val = x$value) bys_max_legacy(by = x$group, val = x$value) bys_sum_legacy(by = x$group, val = x$value) bys_prod_legacy(by = x$group, val = x$value) bys_cummin_legacy(by = x$group, val = x$value) bys_cummax_legacy(by = x$group, val = x$value) bys_cumsum_legacy(by = x$group, val = x$value) bys_cumprod_legacy(by = x$group, val = x$value) bys_lag_legacy(by = x$group, val = x$value) bys_lead_legacy(by = x$group, val = x$value)x <- data.frame( group = c(2, 2, 1, 2, 1, 1, 1, 2, 1, 1), value = c(13, 14, 20, 9, 2, 1, 8, 18, 3, 17)) bys_count_legacy(x$group) bys_position_legacy(x$value, by = x$group, from_last = TRUE) bys_rank_legacy(by = x$group, val = x$value, from_last = TRUE) bys_val_legacy(x$value, by = x$group, val = x$value, from_last = TRUE) bys_nval_legacy(x$value, by = x$group, val = x$value, from_last = TRUE, n = 2) bys_min_legacy(by = x$group, val = x$value) bys_max_legacy(by = x$group, val = x$value) bys_sum_legacy(by = x$group, val = x$value) bys_prod_legacy(by = x$group, val = x$value) bys_cummin_legacy(by = x$group, val = x$value) bys_cummax_legacy(by = x$group, val = x$value) bys_cumsum_legacy(by = x$group, val = x$value) bys_cumprod_legacy(by = x$group, val = x$value) bys_lag_legacy(by = x$group, val = x$value) bys_lead_legacy(by = x$group, val = x$value)
Numeric codes for unique combinations of vectors.
combi(..., ordered = FALSE)combi(..., ordered = FALSE)
... |
|
ordered |
|
numeric
x <- c("A", "B", "A", "C", "B", "B") y <- c("X", "X", "Z", "Z", "X", "Z") combi(x, y)x <- c("A", "B", "A", "C", "B", "B") y <- c("X", "X", "Z", "Z", "X", "Z") combi(x, y)
Returns a sort order after sorting by a vector within another vector.
custom_sort(..., decreasing = FALSE, unique = FALSE)custom_sort(..., decreasing = FALSE, unique = FALSE)
... |
Sequence of |
decreasing |
Sort order. Passed to |
unique |
If |
numeric sort order.
a <- c(1, 1, 1, 2, 2) b <- c(2, 3, 2, 1, 1) custom_sort(a, b) custom_sort(b, a) custom_sort(b, a, unique = TRUE)a <- c(1, 1, 1, 2, 2) b <- c(2, 3, 2, 1, 1) custom_sort(a, b) custom_sort(b, a) custom_sort(b, a, unique = TRUE)
d_report
## S3 method for class 'd_report' plot( x, ..., metric = c("cumulative_duration", "duration", "max_memory", "records_checked", "records_skipped", "records_assigned") ) ## S3 method for class 'd_report' as.list(x, ...) ## S3 method for class 'd_report' as.data.frame(x, ...)## S3 method for class 'd_report' plot( x, ..., metric = c("cumulative_duration", "duration", "max_memory", "records_checked", "records_skipped", "records_assigned") ) ## S3 method for class 'd_report' as.list(x, ...) ## S3 method for class 'd_report' as.data.frame(x, ...)
x |
|
... |
Arguments passed to other methods |
metric |
Report information |
Unlink records from an episode (epid), record group (pid) or pane (pane) object.
delink(x, lgk, ...) ## S3 method for class 'epid' delink(x, lgk, ...) ## S3 method for class 'pane' delink(x, lgk, ...) ## S3 method for class 'pid' delink(x, lgk, ...)delink(x, lgk, ...) ## S3 method for class 'epid' delink(x, lgk, ...) ## S3 method for class 'pane' delink(x, lgk, ...) ## S3 method for class 'pid' delink(x, lgk, ...)
x |
|
lgk |
|
... |
Other arguments. |
ep <- episodes(1:8) unlinked_ep <- delink(ep, ep@sn %in% c(3, 8)) ep; unlinked_ep pn <- partitions(1:8, length.out = 2, separate = TRUE) unlinked_pn <- delink(pn, pn@.Data == 5) pn; unlinked_pn pd <- links(list(c(1, 1, 1, NA, NA), c(NA, NA, 2, 2, 2))) unlinked_pd <- delink(pd, pd@pid_cri == 1) pd; unlinked_pd # A warning is given if an index record is unlinked as this will lead to seemingly impossible links. ep2 <- episodes(1:8, 2, episode_type = "rolling") unlinked_ep2 <- delink(ep2, ep2@sn %in% c(3, 5)) schema(ep2, custom_label = decode(ep2@case_nm), seed = 2) schema(unlinked_ep2, custom_label = decode(unlinked_ep2@case_nm), seed = 2)ep <- episodes(1:8) unlinked_ep <- delink(ep, ep@sn %in% c(3, 8)) ep; unlinked_ep pn <- partitions(1:8, length.out = 2, separate = TRUE) unlinked_pn <- delink(pn, pn@.Data == 5) pn; unlinked_pn pd <- links(list(c(1, 1, 1, NA, NA), c(NA, NA, 2, 2, 2))) unlinked_pd <- delink(pd, pd@pid_cri == 1) pd; unlinked_pd # A warning is given if an index record is unlinked as this will lead to seemingly impossible links. ep2 <- episodes(1:8, 2, episode_type = "rolling") unlinked_ep2 <- delink(ep2, ep2@sn %in% c(3, 5)) schema(ep2, custom_label = decode(ep2@case_nm), seed = 2) schema(unlinked_ep2, custom_label = decode(unlinked_ep2@case_nm), seed = 2)
diyar
Encode and decode character and numeric values.
encode(x, ...) decode(x, ...) ## Default S3 method: encode(x, ...) ## S3 method for class 'd_label' encode(x, ...) ## Default S3 method: decode(x, ...) ## S3 method for class 'd_label' decode(x, ...) ## S3 method for class 'd_label' rep(x, ...) ## S3 method for class 'd_label' x[i, ..., drop = TRUE] ## S3 method for class 'd_label' x[[i, ..., drop = TRUE]]encode(x, ...) decode(x, ...) ## Default S3 method: encode(x, ...) ## S3 method for class 'd_label' encode(x, ...) ## Default S3 method: decode(x, ...) ## S3 method for class 'd_label' decode(x, ...) ## S3 method for class 'd_label' rep(x, ...) ## S3 method for class 'd_label' x[i, ..., drop = TRUE] ## S3 method for class 'd_label' x[[i, ..., drop = TRUE]]
x |
|
... |
Other arguments. |
i |
i |
drop |
drop |
To minimise memory usage, most components of pid, epid and pane are integer objects with labels.
encode() and decode() translate these codes and labels as required.
d_label; atomic
cds <- encode(rep(LETTERS[1:5], 3)) cds nms <- decode(cds) nmscds <- encode(rep(LETTERS[1:5], 3)) cds nms <- decode(cds) nms
epid object: S4 objects storing the result of episodes.
is.epid(x) as.epid(x, ...) ## S3 method for class 'epid' format(x, ...) ## S3 method for class 'epid' unique(x, ...) ## S3 method for class 'epid' summary(object, ...) ## S3 method for class 'epid_summary' print(x, ...) ## S3 method for class 'epid' as.data.frame(x, ..., decode = TRUE) ## S3 method for class 'epid' as.list(x, ..., decode = TRUE) ## S4 method for signature 'epid' show(object) ## S4 method for signature 'epid' rep(x, ...) ## S4 method for signature 'epid' x[i, j, ..., drop = TRUE] ## S4 method for signature 'epid' x[[i, j, ..., exact = TRUE]] ## S4 method for signature 'epid' c(x, ...)is.epid(x) as.epid(x, ...) ## S3 method for class 'epid' format(x, ...) ## S3 method for class 'epid' unique(x, ...) ## S3 method for class 'epid' summary(object, ...) ## S3 method for class 'epid_summary' print(x, ...) ## S3 method for class 'epid' as.data.frame(x, ..., decode = TRUE) ## S3 method for class 'epid' as.list(x, ..., decode = TRUE) ## S4 method for signature 'epid' show(object) ## S4 method for signature 'epid' rep(x, ...) ## S4 method for signature 'epid' x[i, j, ..., drop = TRUE] ## S4 method for signature 'epid' x[[i, j, ..., exact = TRUE]] ## S4 method for signature 'epid' c(x, ...)
x |
x |
... |
... |
object |
object |
decode |
If |
i |
i |
j |
j |
drop |
drop |
exact |
exact |
sn: Unique record identifier.
.Data: Unique episode identifier.
wind_id: Unique reference ID for each match.
wind_nm: Type of window i.e. "Case" or "Recurrence".
case_nm: Record type in regards to case assignment.
dist_wind_index: Unit difference between each record and its window's reference record.
dist_epid_index: Unit difference between each record and its episode's reference record.
epid_dataset: Data sources in each episode.
epid_interval: The start and end dates of each episode. A number_line object.
epid_length: The duration or length of (epid_interval).
epid_total: The number of records in each episode.
iteration: The iteration when a record was matched to its group (.Data).
options: Some options passed to the instance of episodes.
# A test for `epid` objects ep <- episodes(date = 1) is.epid(ep); is.epid(2) ep <- episodes(date = 1) is.epid(ep); is.epid(2)# A test for `epid` objects ep <- episodes(date = 1) is.epid(ep); is.epid(2) ep <- episodes(date = 1) is.epid(ep); is.epid(2)
Dated events (records) within a certain duration of an index event are assigned to a unique group.
Each group has a unique ID and is described as an "episode".
"episodes" can be "fixed" or "rolling" ("recurring").
Each episode has a "Case" and/or "Recurrent" record
while all other records within the group are either "Duplicates" of
the "Case" or "Recurrent" event.
episodes( date, case_length = Inf, episode_type = "fixed", recurrence_length = case_length, episode_unit = "days", strata = NULL, sn = NULL, episodes_max = Inf, rolls_max = Inf, case_overlap_methods = 8, recurrence_overlap_methods = case_overlap_methods, skip_if_b4_lengths = FALSE, data_source = NULL, data_links = "ANY", custom_sort = NULL, skip_order = Inf, reference_event = "last_record", case_for_recurrence = FALSE, from_last = FALSE, group_stats = c("case_nm", "wind", "epid_interval"), display = "none", case_sub_criteria = NULL, recurrence_sub_criteria = case_sub_criteria, case_length_total = 1, recurrence_length_total = case_length_total, skip_unique_strata = TRUE, splits_by_strata = 1, batched = "semi" ) links_wf_episodes( date, case_length = Inf, episode_type = "fixed", strata = NULL, sn = NULL, display = "none" ) episodes_af_shift( date, case_length = Inf, sn = NULL, strata = NULL, group_stats = FALSE, episode_type = "fixed", data_source = NULL, episode_unit = "days", data_links = "ANY", display = "none" )episodes( date, case_length = Inf, episode_type = "fixed", recurrence_length = case_length, episode_unit = "days", strata = NULL, sn = NULL, episodes_max = Inf, rolls_max = Inf, case_overlap_methods = 8, recurrence_overlap_methods = case_overlap_methods, skip_if_b4_lengths = FALSE, data_source = NULL, data_links = "ANY", custom_sort = NULL, skip_order = Inf, reference_event = "last_record", case_for_recurrence = FALSE, from_last = FALSE, group_stats = c("case_nm", "wind", "epid_interval"), display = "none", case_sub_criteria = NULL, recurrence_sub_criteria = case_sub_criteria, case_length_total = 1, recurrence_length_total = case_length_total, skip_unique_strata = TRUE, splits_by_strata = 1, batched = "semi" ) links_wf_episodes( date, case_length = Inf, episode_type = "fixed", strata = NULL, sn = NULL, display = "none" ) episodes_af_shift( date, case_length = Inf, sn = NULL, strata = NULL, group_stats = FALSE, episode_type = "fixed", data_source = NULL, 
episode_unit = "days", data_links = "ANY", display = "none" )
date |
|
case_length |
|
episode_type |
|
recurrence_length |
|
episode_unit |
|
strata |
|
sn |
|
episodes_max |
|
rolls_max |
|
case_overlap_methods |
|
recurrence_overlap_methods |
|
skip_if_b4_lengths |
|
data_source |
|
data_links |
|
custom_sort |
|
skip_order |
|
reference_event |
|
case_for_recurrence |
|
from_last |
|
group_stats |
|
display |
|
case_sub_criteria |
|
recurrence_sub_criteria |
|
case_length_total |
|
recurrence_length_total |
|
skip_unique_strata |
|
splits_by_strata |
|
batched |
|
episodes() links dated records (events) that
are within a set duration of each other in iterations.
Every record is linked to a unique group (episode; epid object).
These episodes represent occurrences of interest as specified by the function's arguments and defined by a case definition.
Two main types of episodes are possible:
"fixed" - An episode where all events are within a fixed duration of an index event.
"rolling" - An episode where all events are within a recurring duration of an index event.
Every record in each episode is categorised as one of the following;
"Case" - Index event of the episode (without a nested match criteria).
"Case_CR" - Index event of the episode (with a nested match criteria).
"Duplicate_C" - Duplicate of the index event.
"Recurrent" - Recurrence of the index event (without a nested match criteria).
"Recurrent_CR" - Recurrence of the index event (with a nested match criteria).
"Duplicate_R" - Duplicate of the recurrent event.
"Skipped" - Skipped records.
If data_links is supplied, every element of the list must be named "l" (links) or "g" (groups).
Unnamed elements are assumed to be "l".
If named "l", groups without records from every listed data_source will be unlinked.
If named "g", groups without records from any listed data_source will be unlinked.
All records with a missing (NA) strata or date are skipped.
Wrapper functions or alternative implementations of episodes() for specific use cases or benefits:
episodes_wf_repeats() - Identical records are excluded from the main analysis.
episodes_af_shift() - A mostly vectorised approach.
links_wf_episodes() - The same functionality achieved with links.
See vignette("episodes") for further details.
epid; list
episodes_wf_repeats; custom_sort;
sub_criteria; epid_length;
epid_window; partitions;
links; overlaps;
data(infections) data(hospital_admissions) # One 16-day (15-day difference) fixed episode per type of infection episodes(date = infections$date, strata = infections$infection, case_length = 15, episodes_max = 1, episode_type = "fixed") # Multiple 16-day episodes with an 11-day recurrence period episodes(date = infections$date, strata = NULL, case_length = 15, episodes_max = Inf, episode_type = "rolling", recurrence_length = 10) # Overlapping periods of hospital stays dfr <- hospital_admissions[2:3] dfr$admin_period <- number_line(dfr$admin_dt,dfr$discharge_dt) dfr$ep <- episodes(date = dfr$admin_period, strata = NULL, case_length = index_window(dfr$admin_period), case_overlap_methods = "inbetween") dfr as.data.frame(dfr$ep)data(infections) data(hospital_admissions) # One 16-day (15-day difference) fixed episode per type of infection episodes(date = infections$date, strata = infections$infection, case_length = 15, episodes_max = 1, episode_type = "fixed") # Multiple 16-day episodes with an 11-day recurrence period episodes(date = infections$date, strata = NULL, case_length = 15, episodes_max = Inf, episode_type = "rolling", recurrence_length = 10) # Overlapping periods of hospital stays dfr <- hospital_admissions[2:3] dfr$admin_period <- number_line(dfr$admin_dt,dfr$discharge_dt) dfr$ep <- episodes(date = dfr$admin_period, strata = NULL, case_length = index_window(dfr$admin_period), case_overlap_methods = "inbetween") dfr as.data.frame(dfr$ep)
episodes_wf_repeats is a wrapper function of episodes.
It's designed to be more efficient with larger datasets.
Duplicate records which do not affect the case definition are excluded prior to episode tracking.
The resulting episode identifiers are then recycled for the duplicate records.
episodes_wf_repeats(..., duplicates_recovered = "ANY")episodes_wf_repeats(..., duplicates_recovered = "ANY")
... |
Arguments passed to |
duplicates_recovered |
|
reframe |
|
episodes_wf_repeats() reduces or re-frames a dataset to
the minimum dataset required to implement a case definition.
This leads to the same outcome but with the benefit of a shorter processing time.
The duplicates_recovered argument determines which identifiers are recycled.
Selecting the "with_sub_criteria" option means that only identifiers resulting from a matched sub_criteria ("Case_CR" and "Recurrent_CR") are recycled.
However, if "without_sub_criteria" is selected, then only identifiers that do not result from a matched sub_criteria ("Case" and "Recurrent") are recycled.
Excluded duplicates of "Duplicate_C" and "Duplicate_R" are always recycled.
The reframe argument will either reframe or subset a sub_criteria.
Both will require slightly different functions for match_funcs or equal_funcs.
epid; list
# With 2,000 duplicate records of 20 events, # `episodes_wf_repeats()` will take less time than `episodes()` dates <- seq(from = as.Date("2019-04-01"), to = as.Date("2019-04-20"), by = 1) dates <- rep(dates, 2000) system.time( ep1 <- episodes(dates, 1) ) system.time( ep2 <- episodes_wf_repeats(dates, 1) ) # Both lead to the same outcome. all(ep1 == ep2)# With 2,000 duplicate records of 20 events, # `episodes_wf_repeats()` will take less time than `episodes()` dates <- seq(from = as.Date("2019-04-01"), to = as.Date("2019-04-20"), by = 1) dates <- rep(dates, 2000) system.time( ep1 <- episodes(dates, 1) ) system.time( ep2 <- episodes_wf_repeats(dates, 1) ) # Both lead to the same outcome. all(ep1 == ep2)
Deterministic and probabilistic record linkage. Assign unique identifiers to records based on partial, nested or calculated probabilities.
links_af_probabilistic( attribute, blocking_attribute = NULL, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, u_probability = NULL, score_threshold = 1, repeats_allowed = FALSE, permutations_allowed = FALSE, data_source = NULL, ignore_same_source = TRUE, display = "none" ) links_wf_probabilistic( attribute, blocking_attribute = NULL, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, u_probability = NULL, score_threshold = 1, id_1 = NULL, id_2 = NULL, return_weights = FALSE, ... ) prob_score_range(attribute, m_probability = 0.95, u_probability = NULL)links_af_probabilistic( attribute, blocking_attribute = NULL, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, u_probability = NULL, score_threshold = 1, repeats_allowed = FALSE, permutations_allowed = FALSE, data_source = NULL, ignore_same_source = TRUE, display = "none" ) links_wf_probabilistic( attribute, blocking_attribute = NULL, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, u_probability = NULL, score_threshold = 1, id_1 = NULL, id_2 = NULL, return_weights = FALSE, ... ) prob_score_range(attribute, m_probability = 0.95, u_probability = NULL)
attribute |
|
blocking_attribute |
|
cmp_func |
|
attr_threshold |
|
probabilistic |
|
m_probability |
|
u_probability |
|
score_threshold |
|
repeats_allowed |
|
permutations_allowed |
|
data_source |
|
ignore_same_source |
|
display |
|
id_1 |
|
id_2 |
|
return_weights |
If |
... |
Arguments passed to |
links_wf_probabilistic() - A wrapper function of links with a
specific sub_criteria to achieve probabilistic record linkage.
It excludes functionalities for the nested and multi-stage linkage.
links_wf_probabilistic() requires a score_threshold in advance.
To help with this, prob_score_range() can be used to return the range of scores attainable for a given set of attributes, m- and u-probabilities.
Additionally, id_1 and id_2 can be used to link specific record pairs, aiding the review of potential scores.
links_af_probabilistic() - A simpler version of links.
It excludes functionalities for the batched, nested and multi-stage linkage.
links_af_probabilistic() requires a score_threshold in advance,
however, since it exports the match weights, the score_threshold
can be changed after the analysis.
pid; list
Fellegi, I. P., & Sunter, A. B. (1969). A Theory for Record Linkage. Journal of the American Statistical Association, 64(328), 1183 - 1210. https://doi.org/10.1080/01621459.1969.10501049
Asher, J., Resnick, D., Brite, J., Brackbill, R., & Cone, J. (2020). An Introduction to Probabilistic Record Linkage with a Focus on Linkage Processing for WTC Registries. International journal of environmental research and public health, 17(18), 6937. https://doi.org/10.3390/ijerph17186937.
See vignette("links") for more information.
data(patient_records) # Weighted (probabilistic) comparison of forename, middlename and surname criteria_1 <- as.list(patient_records[c("forename", "middlename", "surname")]) # Possible scores when m-probability is 0.95 prob_scores <- prob_score_range(attribute = criteria_1, m_probability = 0.95, u_probability = NULL) ## Not run: # Probabilistic record linkage with 'links_af_probabilistic()' pids_1a <- links_af_probabilistic(attribute = criteria_1, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, score_threshold = prob_scores$mid_scorce, display = "stats") # Equivalent with 'links_wf_probabilistic()' pids_1b <- links_wf_probabilistic(attribute = criteria_1, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, score_threshold = prob_scores$mid_scorce, display = "progress", recursive = TRUE, check_duplicates = TRUE) # Less thorough but faster equivalent with `links_wf_probabilistic()` pids_1c <- links_wf_probabilistic(attribute = criteria_1, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, score_threshold = prob_scores$mid_scorce, display = "progress", recursive = FALSE, check_duplicates = FALSE) # Each implementation can lead to different results summary(pids_1a$pid) summary(pids_1b$pid) summary(pids_1c$pid) ## End(Not run) # Weighted (non-probabilistic) comparison of forename, middlename and age difference criteria_2 <- as.list(patient_records[c("forename", "middlename", "dateofbirth")]) age_diff <- function(x, y){ diff <- abs(as.numeric(x) - as.numeric(y)) wgt <- diff %in% 0:(365 * 10) & !is.na(diff) wgt } pids_2a <- links_af_probabilistic(attribute = criteria_2, blocking_attribute = patient_records$surname, cmp_func = c(exact_match, exact_match, age_diff), score_threshold = number_line(3, 5), probabilistic = FALSE, display = "stats") # Larger weights can be assigned to particular attributes through `cmp_func` # For example, a smaller age difference can 
contribute a higher score (e.g 0 to 3) age_diff_2 <- function(x, y){ diff <- as.numeric(abs(x - y)) wgt <- diff %in% 0:(365 * 10) & !is.na(diff) wgt[wgt] <- match(as.numeric(cut(diff[wgt], 3)), 3:1) wgt } pids_2b <- links_af_probabilistic(attribute = criteria_2, blocking_attribute = patient_records$surname, cmp_func = c(exact_match, exact_match, age_diff_2), score_threshold = number_line(3, 5), probabilistic = FALSE, display = "stats") head(pids_2a$pid_weights, 10) head(pids_2b$pid_weights, 10)data(patient_records) # Weighted (probabilistic) comparison of forename, middlename and surname criteria_1 <- as.list(patient_records[c("forename", "middlename", "surname")]) # Possible scores when m-probability is 0.95 prob_scores <- prob_score_range(attribute = criteria_1, m_probability = 0.95, u_probability = NULL) ## Not run: # Probabilistic record linkage with 'links_af_probabilistic()' pids_1a <- links_af_probabilistic(attribute = criteria_1, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, score_threshold = prob_scores$mid_scorce, display = "stats") # Equivalent with 'links_wf_probabilistic()' pids_1b <- links_wf_probabilistic(attribute = criteria_1, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, score_threshold = prob_scores$mid_scorce, display = "progress", recursive = TRUE, check_duplicates = TRUE) # Less thorough but faster equivalent with `links_wf_probabilistic()` pids_1c <- links_wf_probabilistic(attribute = criteria_1, cmp_func = exact_match, attr_threshold = 1, probabilistic = TRUE, m_probability = 0.95, score_threshold = prob_scores$mid_scorce, display = "progress", recursive = FALSE, check_duplicates = FALSE) # Each implementation can lead to different results summary(pids_1a$pid) summary(pids_1b$pid) summary(pids_1c$pid) ## End(Not run) # Weighted (non-probabilistic) comparison of forename, middlename and age difference criteria_2 <- as.list(patient_records[c("forename", 
"middlename", "dateofbirth")]) age_diff <- function(x, y){ diff <- abs(as.numeric(x) - as.numeric(y)) wgt <- diff %in% 0:(365 * 10) & !is.na(diff) wgt } pids_2a <- links_af_probabilistic(attribute = criteria_2, blocking_attribute = patient_records$surname, cmp_func = c(exact_match, exact_match, age_diff), score_threshold = number_line(3, 5), probabilistic = FALSE, display = "stats") # Larger weights can be assigned to particular attributes through `cmp_func` # For example, a smaller age difference can contribute a higher score (e.g 0 to 3) age_diff_2 <- function(x, y){ diff <- as.numeric(abs(x - y)) wgt <- diff %in% 0:(365 * 10) & !is.na(diff) wgt[wgt] <- match(as.numeric(cut(diff[wgt], 3)), 3:1) wgt } pids_2b <- links_af_probabilistic(attribute = criteria_2, blocking_attribute = patient_records$surname, cmp_func = c(exact_match, exact_match, age_diff_2), score_threshold = number_line(3, 5), probabilistic = FALSE, display = "stats") head(pids_2a$pid_weights, 10) head(pids_2b$pid_weights, 10)
Assign records to unique groups based on an ordered set of match criteria.
links( criteria, sub_criteria = NULL, sn = NULL, strata = NULL, data_source = NULL, data_links = "ANY", display = "none", group_stats = FALSE, expand = TRUE, shrink = FALSE, recursive = "none", check_duplicates = FALSE, tie_sort = NULL, batched = "yes", repeats_allowed = FALSE, permutations_allowed = FALSE, ignore_same_source = FALSE, stepwise_method = "expand_with_priority" )links( criteria, sub_criteria = NULL, sn = NULL, strata = NULL, data_source = NULL, data_links = "ANY", display = "none", group_stats = FALSE, expand = TRUE, shrink = FALSE, recursive = "none", check_duplicates = FALSE, tie_sort = NULL, batched = "yes", repeats_allowed = FALSE, permutations_allowed = FALSE, ignore_same_source = FALSE, stepwise_method = "expand_with_priority" )
criteria |
|
sub_criteria |
|
sn |
|
strata |
|
data_source |
|
data_links |
|
display |
|
group_stats |
|
expand |
|
shrink |
|
recursive |
|
check_duplicates |
|
tie_sort |
|
batched |
|
repeats_allowed |
|
permutations_allowed |
|
ignore_same_source |
|
The priority of matches decreases with each subsequent stage of the linkage process.
Therefore, the attributes in criteria should be in an order of decreasing relevance.
Records with missing data (NA) for a criteria are
skipped at the respective stage, while records with
a missing (NA) strata are skipped from every stage.
If a record is skipped from a stage, another attempt will be made to match the record at the next stage. If a record is still unmatched by the last stage, it is assigned a unique group ID.
A sub_criteria adds nested match criteria
to each stage of the linkage process. If used, only
records with a matching criteria and sub_criteria are linked.
In links, each sub_criteria must
be linked to a criteria. This is done by adding each sub_criteria
to a named element of a list - "cr" concatenated with
the corresponding stage's number.
For example, 3 sub_criteria linked to
criteria 1, 5 and 13 will be; list(cr1 = sub_criteria(...), cr5 = sub_criteria(...), cr13 = sub_criteria(...))
Any unlinked sub_criteria will be ignored.
Every element in data_links must be named "l" (links) or "g" (groups).
Unnamed elements of data_links will be assumed to be "l".
If named "l", groups without records from every listed data_source will be unlinked.
If named "g", groups without records from any listed data_source will be unlinked.
See vignette("links") for more information.
pid; list
links_af_probabilistic; episodes;
predefined_tests; sub_criteria
data(patient_records) dfr <- patient_records # An exact match on surname followed by an exact match on forename stages <- as.list(dfr[c("surname", "forename")]) p1 <- links(criteria = stages) # An exact match on forename followed by an exact match on surname p2 <- links(criteria = rev(stages)) # Nested matches # Same sex OR birth year m.cri.1 <- sub_criteria( format(dfr$dateofbirth, "%Y"), dfr$sex, operator = "or") # Same middle name AND a 10 year age difference age_diff <- function(x, y){ diff <- abs(as.numeric(x) - as.numeric(y)) wgt <- diff %in% 0:10 & !is.na(diff) wgt } m.cri.2 <- sub_criteria( format(dfr$dateofbirth, "%Y"), dfr$middlename, operator = "and", match_funcs = c(age_diff, exact_match)) # Nested match criteria 'm.cri.1' OR 'm.cri.2' n.cri <- sub_criteria( m.cri.1, m.cri.2, operator = "or") # Record linkage with additional match criteria p3 <- links( criteria = stages, sub_criteria = list(cr1 = m.cri.1, cr2 = m.cri.2)) # Record linkage with additonal nested match criteria p4 <- links( criteria = stages, sub_criteria = list(cr1 = n.cri, cr2 = n.cri)) dfr$p1 <- p1; dfr$p2 <- p2 dfr$p3 <- p3; dfr$p4 <- p4 head(dfr)data(patient_records) dfr <- patient_records # An exact match on surname followed by an exact match on forename stages <- as.list(dfr[c("surname", "forename")]) p1 <- links(criteria = stages) # An exact match on forename followed by an exact match on surname p2 <- links(criteria = rev(stages)) # Nested matches # Same sex OR birth year m.cri.1 <- sub_criteria( format(dfr$dateofbirth, "%Y"), dfr$sex, operator = "or") # Same middle name AND a 10 year age difference age_diff <- function(x, y){ diff <- abs(as.numeric(x) - as.numeric(y)) wgt <- diff %in% 0:10 & !is.na(diff) wgt } m.cri.2 <- sub_criteria( format(dfr$dateofbirth, "%Y"), dfr$middlename, operator = "and", match_funcs = c(age_diff, exact_match)) # Nested match criteria 'm.cri.1' OR 'm.cri.2' n.cri <- sub_criteria( m.cri.1, m.cri.2, operator = "or") # Record linkage with additional match 
criteria p3 <- links( criteria = stages, sub_criteria = list(cr1 = m.cri.1, cr2 = m.cri.2)) # Record linkage with additonal nested match criteria p4 <- links( criteria = stages, sub_criteria = list(cr1 = n.cri, cr2 = n.cri)) dfr$p1 <- p1; dfr$p2 <- p2 dfr$p3 <- p3; dfr$p4 <- p4 head(dfr)
A convenience function to format atomic vectors as a written list.
listr(x, sep = ", ", conj = " and ", lim = Inf)listr(x, sep = ", ", conj = " and ", lim = Inf)
x |
|
sep |
Separator. |
conj |
Final separator. |
lim |
Elements to include in the list. Other elements are abbreviated to |
character.
listr(1:5) listr(1:5, sep = "; ") listr(1:5, sep = "; ", conj = " and") listr(1:5, sep = "; ", conj = " and", lim = 2)listr(1:5) listr(1:5, sep = "; ") listr(1:5, sep = "; ", conj = " and") listr(1:5, sep = "; ", conj = " and", lim = 2)
Convert an edge list to record identifiers.
make_ids(x_pos, y_pos, id_length = max(x_pos, y_pos))make_ids(x_pos, y_pos, id_length = max(x_pos, y_pos))
x_pos |
|
y_pos |
|
id_length |
Length of the record identifier. |
Record groups from non-recursive links have the lowest record ID (sn) in the set as their group ID.
list
make_ids(x_pos = rep(7, 7), y_pos = 1:7) make_ids(x_pos = c(1, 6), y_pos = 6:7) make_ids(x_pos = 1:5, y_pos = c(1, 1, 2, 3, 4))make_ids(x_pos = rep(7, 7), y_pos = 1:7) make_ids(x_pos = c(1, 6), y_pos = 6:7) make_ids(x_pos = 1:5, y_pos = c(1, 1, 2, 3, 4))
Combinations and permutations of record-sets.
sets(n, r, permutations_allowed = TRUE, repeats_allowed = TRUE) make_sets( x, r, strata = NULL, permutations_allowed = TRUE, repeats_allowed = TRUE ) make_pairs( x, strata = NULL, repeats_allowed = TRUE, permutations_allowed = FALSE ) make_pairs_wf_source(..., data_source = NULL)sets(n, r, permutations_allowed = TRUE, repeats_allowed = TRUE) make_sets( x, r, strata = NULL, permutations_allowed = TRUE, repeats_allowed = TRUE ) make_pairs( x, strata = NULL, repeats_allowed = TRUE, permutations_allowed = FALSE ) make_pairs_wf_source(..., data_source = NULL)
n |
|
r |
|
permutations_allowed |
|
repeats_allowed |
|
x |
|
strata |
Subsets of |
... |
Arguments passed to |
data_source |
|
sets() - Create r-set combinations or permutations of n observations.
make_sets() - Create r-set combinations or permutations of vector x.
make_pairs() - Create 2-set combinations or permutations of vector x.
make_pairs_wf_source() - Create 2-set combinations or permutations of vector x that are from different sources (data_source).
A list of a vector's elements and corresponding indexes.
sets(4, 2) sets(4, 2, repeats_allowed = FALSE, permutations_allowed = FALSE) make_sets(month.abb[1:4], 2) make_sets(month.abb[1:4], 3) make_pairs(month.abb[1:4]) make_pairs(month.abb[1:4], strata = c(1, 1, 2, 2)) make_pairs_wf_source(month.abb[1:4], data_source = c(1, 1, 2, 2))sets(4, 2) sets(4, 2, repeats_allowed = FALSE, permutations_allowed = FALSE) make_sets(month.abb[1:4], 2) make_sets(month.abb[1:4], 3) make_pairs(month.abb[1:4]) make_pairs(month.abb[1:4], strata = c(1, 1, 2, 2)) make_pairs_wf_source(month.abb[1:4], data_source = c(1, 1, 2, 2))
Create epid and pid objects with index of matching records.
make_episodes( x_pos, y_pos, x_val, date, case_nm, wind_id, wind_nm, from_last, data_source, data_links, iteration, options, episode_unit ) make_pids( x_pos, y_pos, x_val, link_id, pid_cri, data_source, data_links, iteration )make_episodes( x_pos, y_pos, x_val, date, case_nm, wind_id, wind_nm, from_last, data_source, data_links, iteration, options, episode_unit ) make_pids( x_pos, y_pos, x_val, link_id, pid_cri, data_source, data_links, iteration )
x_pos |
|
y_pos |
|
x_val |
|
date |
|
case_nm |
|
wind_id |
|
wind_nm |
|
from_last |
|
data_source |
|
data_links |
|
iteration |
The iteration when a record was matched to its group ( |
options |
|
episode_unit |
|
link_id |
|
pid_cri |
Match stage of the step-wise linkage. |
Consolidate two group identifiers.
merge_ids(...) ## Default S3 method: merge_ids(id1, id2, tie_sort = NULL, expand = TRUE, shrink = FALSE, ...) ## S3 method for class 'pid' merge_ids(id1, id2, tie_sort = NULL, expand = TRUE, shrink = FALSE, ...) ## S3 method for class 'epid' merge_ids(id1, id2, tie_sort = NULL, expand = TRUE, shrink = FALSE, ...) ## S3 method for class 'pane' merge_ids(id1, id2, tie_sort = NULL, expand = TRUE, shrink = FALSE, ...)merge_ids(...) ## Default S3 method: merge_ids(id1, id2, tie_sort = NULL, expand = TRUE, shrink = FALSE, ...) ## S3 method for class 'pid' merge_ids(id1, id2, tie_sort = NULL, expand = TRUE, shrink = FALSE, ...) ## S3 method for class 'epid' merge_ids(id1, id2, tie_sort = NULL, expand = TRUE, shrink = FALSE, ...) ## S3 method for class 'pane' merge_ids(id1, id2, tie_sort = NULL, expand = TRUE, shrink = FALSE, ...)
... |
Other arguments |
id1 |
|
id2 |
|
tie_sort |
|
expand |
|
shrink |
|
Groups in id1 are expanded or shrunk by groups in id2.
A unique group with only one record is considered a non-matching record.
Note that the expand and shrink features are not interchangeable.
The outcome when shrink is TRUE is not the same as when expand is FALSE.
See Examples.
id1 <- rep(1, 5) id2 <- c(2, 2, 3, 3, 3) merge_ids(id1, id2, shrink = TRUE) id1 <- c(rep(1, 3), 6, 7) id2 <- c(2,2,3,3,3) merge_ids(id1, id2, shrink = TRUE) merge_ids(id1, id2, expand = FALSE) id1 <- rep(1, 5) id2 <- c(1:3, 4, 4) merge_ids(id1, id2, shrink = TRUE) merge_ids(id1, id2, expand= FALSE) data(missing_staff_id) dfr <- missing_staff_id id1 <- links(dfr[[5]]) id2 <- links(dfr[[6]]) merge_ids(id1, id2)id1 <- rep(1, 5) id2 <- c(2, 2, 3, 3, 3) merge_ids(id1, id2, shrink = TRUE) id1 <- c(rep(1, 3), 6, 7) id2 <- c(2,2,3,3,3) merge_ids(id1, id2, shrink = TRUE) merge_ids(id1, id2, expand = FALSE) id1 <- rep(1, 5) id2 <- c(1:3, 4, 4) merge_ids(id1, id2, shrink = TRUE) merge_ids(id1, id2, expand= FALSE) data(missing_staff_id) dfr <- missing_staff_id id1 <- links(dfr[[5]]) id2 <- links(dfr[[6]]) merge_ids(id1, id2)
number_lineA range of numeric values.
number_line(l, r, id = NULL, gid = NULL) as.number_line(x) is.number_line(x) left_point(x) left_point(x) <- value right_point(x) right_point(x) <- value start_point(x) start_point(x) <- value end_point(x) end_point(x) <- value number_line_width(x) reverse_number_line(x, direction = "both") shift_number_line(x, by = 1) expand_number_line(x, by = 1, point = "both") invert_number_line(x, point = "both") number_line_sequence( x, by = NULL, length.out = 1, fill = TRUE, simplify = FALSE )number_line(l, r, id = NULL, gid = NULL) as.number_line(x) is.number_line(x) left_point(x) left_point(x) <- value right_point(x) right_point(x) <- value start_point(x) start_point(x) <- value end_point(x) end_point(x) <- value number_line_width(x) reverse_number_line(x, direction = "both") shift_number_line(x, by = 1) expand_number_line(x, by = 1, point = "both") invert_number_line(x, point = "both") number_line_sequence( x, by = NULL, length.out = 1, fill = TRUE, simplify = FALSE )
l |
|
r |
|
id |
|
gid |
|
x |
|
value |
[ |
direction |
|
by |
|
point |
|
length.out |
|
fill |
|
simplify |
|
A number_line object represents a range of numbers.
It is made up of a start and end point as the lower and upper ends of the range respectively.
The location of the start point - left or right,
determines whether it is an "increasing" or "decreasing" number_line.
This is the direction of the number_line.
reverse_number_line() - reverse the direction of a number_line.
A reversed number_line has its left and right points swapped.
The direction argument specifies which type of number_line will be reversed.
number_line with non-finite start or end points (i.e. NA, NaN and Inf) can't be reversed.
shift_number_line() - Shift a number_line towards the positive or negative end of the number line.
expand_number_line() - Increase or decrease the width of a number_line.
invert_number_line() - Change the left or right points from a negative to positive value or vice versa.
number_line_sequence() - Split a number_line into equal parts (length.out) or by a fixed recurring width (by).
number_line
overlaps; set_operations; episodes; links
number_line(-100, 100) # Also compatible with other numeric based object classes number_line(as.POSIXct("2019-05-15 13:15:07", tz = "UTC"), as.POSIXct("2019-05-15 15:17:10", tz = "UTC")) # Coerce compatible object classes to `number_line` objects as.number_line(5.1); as.number_line(as.Date("2019-10-21")) # A test for number_line objects a <- number_line(as.Date("2019-04-25"), as.Date("2019-01-01")) is.number_line(a) # Structure of a number_line object left_point(a); right_point(a); start_point(a); end_point(a) # Reverse number_line objects reverse_number_line(number_line(as.Date("2019-04-25"), as.Date("2019-01-01"))) reverse_number_line(number_line(200, -100), "increasing") reverse_number_line(number_line(200, -100), "decreasing") c <- number_line(5, 6) # Shift number_line objects towards the positive end of the number line shift_number_line(x = c(c, c), by = c(2, 3)) # Shift number_line objects towards the negative end of the number line shift_number_line(x = c(c, c), by = c(-2, -3)) # Change the duration, width or length of a number_line object d <- c(number_line(3, 6), number_line(6, 3)) expand_number_line(d, 2) expand_number_line(d, -2) expand_number_line(d, c(2,-1)) expand_number_line(d, 2, "start") expand_number_line(d, 2, "end") # Invert `number_line` objects e <- c(number_line(3, 6), number_line(-3, -6), number_line(-3, 6)) e invert_number_line(e) invert_number_line(e, "start") invert_number_line(e, "end") # Split number line objects x <- number_line(Sys.Date() - 5, Sys.Date()) x number_line_sequence(x, by = 2) number_line_sequence(x, by = 4) number_line_sequence(x, by = 4, fill = FALSE) number_line_sequence(x, length.out = 2)number_line(-100, 100) # Also compatible with other numeric based object classes number_line(as.POSIXct("2019-05-15 13:15:07", tz = "UTC"), as.POSIXct("2019-05-15 15:17:10", tz = "UTC")) # Coerce compatible object classes to `number_line` objects as.number_line(5.1); as.number_line(as.Date("2019-10-21")) # A test for number_line 
objects a <- number_line(as.Date("2019-04-25"), as.Date("2019-01-01")) is.number_line(a) # Structure of a number_line object left_point(a); right_point(a); start_point(a); end_point(a) # Reverse number_line objects reverse_number_line(number_line(as.Date("2019-04-25"), as.Date("2019-01-01"))) reverse_number_line(number_line(200, -100), "increasing") reverse_number_line(number_line(200, -100), "decreasing") c <- number_line(5, 6) # Shift number_line objects towards the positive end of the number line shift_number_line(x = c(c, c), by = c(2, 3)) # Shift number_line objects towards the negative end of the number line shift_number_line(x = c(c, c), by = c(-2, -3)) # Change the duration, width or length of a number_line object d <- c(number_line(3, 6), number_line(6, 3)) expand_number_line(d, 2) expand_number_line(d, -2) expand_number_line(d, c(2,-1)) expand_number_line(d, 2, "start") expand_number_line(d, 2, "end") # Invert `number_line` objects e <- c(number_line(3, 6), number_line(-3, -6), number_line(-3, 6)) e invert_number_line(e) invert_number_line(e, "start") invert_number_line(e, "end") # Split number line objects x <- number_line(Sys.Date() - 5, Sys.Date()) x number_line_sequence(x, by = 2) number_line_sequence(x, by = 4) number_line_sequence(x, by = 4, fill = FALSE) number_line_sequence(x, length.out = 2)
number_line objectS4 objects representing a range of numeric values.
## S4 method for signature 'number_line' show(object) ## S4 method for signature 'number_line' rep(x, ...) ## S4 method for signature 'number_line' x[i, j, ..., drop = TRUE] ## S4 method for signature 'number_line' x[[i, j, ..., exact = TRUE]] ## S4 replacement method for signature 'number_line' x[i, j, ...] <- value ## S4 replacement method for signature 'number_line' x[[i, j, ...]] <- value ## S4 method for signature 'number_line' x$name ## S4 replacement method for signature 'number_line' x$name <- value ## S4 method for signature 'number_line' c(x, ...) ## S3 method for class 'number_line' unique(x, ...) ## S3 method for class 'number_line' seq(x, precision = NULL, fill = FALSE, ...) ## S3 method for class 'number_line' sort(x, decreasing = FALSE, ...) ## S3 method for class 'number_line' format(x, ...) ## S3 method for class 'number_line' as.list(x, ...) ## S3 method for class 'number_line' as.data.frame(x, ...)## S4 method for signature 'number_line' show(object) ## S4 method for signature 'number_line' rep(x, ...) ## S4 method for signature 'number_line' x[i, j, ..., drop = TRUE] ## S4 method for signature 'number_line' x[[i, j, ..., exact = TRUE]] ## S4 replacement method for signature 'number_line' x[i, j, ...] <- value ## S4 replacement method for signature 'number_line' x[[i, j, ...]] <- value ## S4 method for signature 'number_line' x$name ## S4 replacement method for signature 'number_line' x$name <- value ## S4 method for signature 'number_line' c(x, ...) ## S3 method for class 'number_line' unique(x, ...) ## S3 method for class 'number_line' seq(x, precision = NULL, fill = FALSE, ...) ## S3 method for class 'number_line' sort(x, decreasing = FALSE, ...) ## S3 method for class 'number_line' format(x, ...) ## S3 method for class 'number_line' as.list(x, ...) ## S3 method for class 'number_line' as.data.frame(x, ...)
object |
object |
x |
x |
... |
... |
i |
i |
j |
j |
drop |
drop |
exact |
exact |
value |
value |
name |
slot name |
precision |
Round precision |
fill |
|
decreasing |
If |
startFirst value in the range.
idUnique element id. Optional.
gidUnique group id. Optional.
.DataLength, duration or width of the range.
Identify overlapping number_line objects
overlaps(x, y, methods = 8) overlap(x, y) none(x, y) exact(x, y) across(x, y) x_across_y(x, y) y_across_x(x, y) chain(x, y) x_chain_y(x, y) y_chain_x(x, y) aligns_start(x, y) x_aligns_start_y(x, y) y_aligns_start_x(x, y) aligns_end(x, y) x_aligns_end_y(x, y) y_aligns_end_x(x, y) inbetween(x, y) x_inbetween_y(x, y) y_inbetween_x(x, y) overlap_method(x, y) include_overlap_method(methods) exclude_overlap_method(methods) overlap_method_codes(methods) overlap_method_names(methods)overlaps(x, y, methods = 8) overlap(x, y) none(x, y) exact(x, y) across(x, y) x_across_y(x, y) y_across_x(x, y) chain(x, y) x_chain_y(x, y) y_chain_x(x, y) aligns_start(x, y) x_aligns_start_y(x, y) y_aligns_start_x(x, y) aligns_end(x, y) x_aligns_end_y(x, y) y_aligns_end_x(x, y) inbetween(x, y) x_inbetween_y(x, y) y_inbetween_x(x, y) overlap_method(x, y) include_overlap_method(methods) exclude_overlap_method(methods) overlap_method_codes(methods) overlap_method_names(methods)
x |
|
y |
|
methods |
|
There are 6 mutually exclusive types of overlap;
exact() - identical start_point and end_point.
inbetween() - Both start_point and end_point of one number_line object are within the start_point and end_point of another.
across() - Only the start_point or end_point of one number_line object is in between the start_point and end_point of another.
chain() - end_point of one number_line object is identical to the start_point of another.
aligns_start() - identical start_point only.
aligns_end() - identical end_point only.
Except exact(), each type of overlap has two variations;
x_`method`_y() - number_line-x starts before number_line-y.
y_`method`_x() - number_line-y starts before number_line-x.
There are two mutually inclusive types of overlap;
overlap() - a convenient option to select "ANY" and "ALL" type of overlap.
none() - a convenient option to select "NO" type of overlap.
Selecting multiple types of overlap;
overlaps() - select specific type(s) of overlap.
overlap_method() - return the type of overlap for a pair of number_line objects.
overlap_method_codes() - return the corresponding overlap method code for a specific type(s) of overlap.
overlap_method_names() - return the corresponding type(s) of overlap for a specific overlap code.
include_overlap_method() - return a character(1) value for specified type(s) of overlap.
exclude_overlap_method() - return a character(1) value for all type(s) of overlap except those specified.
logical; character
a <- number_line(-100, 100) g <- number_line(100, 100) overlaps(a, g) # It's neither an "exact" or "chain"-overlap overlaps(a, g, methods = "exact|chain") # It's an "aligns_end"-overlap overlap_method(a, g) overlaps(a, g, methods = "exact|chain|x_aligns_end_y") # Corresponding overlap code overlap_method_codes("exact|chain|x_aligns_end_y") include_overlap_method(c("exact", "chain", "x_aligns_end_y")) # Corresponding overlap name overlap_method_names(overlap_method_codes("exact|chain|x_aligns_end_y")) # Every other type overlap exclude_overlap_method(c("exact", "chain", "x_aligns_end_y")) overlap_method_names(exclude_overlap_method(c("exact", "chain", "x_aligns_end_y"))) # All the above is based on tests for each specific type of overlap as seen below none(a, g) exact(a, g) across(a, g) x_across_y(a, g) y_across_x(a, g) chain(a, g) x_chain_y(a, g) y_chain_x(a, g) inbetween(a, g) x_inbetween_y(a, g) y_inbetween_x(a, g) aligns_start(a, g) x_aligns_start_y(a, g) y_aligns_start_x(a, g) aligns_end(a, g) x_aligns_end_y(a, g) y_aligns_end_x(a, g)a <- number_line(-100, 100) g <- number_line(100, 100) overlaps(a, g) # It's neither an "exact" or "chain"-overlap overlaps(a, g, methods = "exact|chain") # It's an "aligns_end"-overlap overlap_method(a, g) overlaps(a, g, methods = "exact|chain|x_aligns_end_y") # Corresponding overlap code overlap_method_codes("exact|chain|x_aligns_end_y") include_overlap_method(c("exact", "chain", "x_aligns_end_y")) # Corresponding overlap name overlap_method_names(overlap_method_codes("exact|chain|x_aligns_end_y")) # Every other type overlap exclude_overlap_method(c("exact", "chain", "x_aligns_end_y")) overlap_method_names(exclude_overlap_method(c("exact", "chain", "x_aligns_end_y"))) # All the above is based on tests for each specific type of overlap as seen below none(a, g) exact(a, g) across(a, g) x_across_y(a, g) y_across_x(a, g) chain(a, g) x_chain_y(a, g) y_chain_x(a, g) inbetween(a, g) x_inbetween_y(a, g) y_inbetween_x(a, g) 
aligns_start(a, g) x_aligns_start_y(a, g) y_aligns_start_x(a, g) aligns_end(a, g) x_aligns_end_y(a, g) y_aligns_end_x(a, g)
pane objectS4 objects storing the result of partitions.
is.pane(x) as.pane(x) ## S3 method for class 'pane' format(x, ...) ## S3 method for class 'pane' unique(x, ...) ## S3 method for class 'pane' summary(object, ...) ## S3 method for class 'pane_summary' print(x, ...) ## S3 method for class 'pane' as.data.frame(x, ..., decode = TRUE) ## S3 method for class 'pane' as.list(x, ..., decode = TRUE) ## S4 method for signature 'pane' show(object) ## S4 method for signature 'pane' rep(x, ...) ## S4 method for signature 'pane' x[i, j, ..., drop = TRUE] ## S4 method for signature 'pane' x[[i, j, ..., exact = TRUE]] ## S4 method for signature 'pane' c(x, ...)is.pane(x) as.pane(x) ## S3 method for class 'pane' format(x, ...) ## S3 method for class 'pane' unique(x, ...) ## S3 method for class 'pane' summary(object, ...) ## S3 method for class 'pane_summary' print(x, ...) ## S3 method for class 'pane' as.data.frame(x, ..., decode = TRUE) ## S3 method for class 'pane' as.list(x, ..., decode = TRUE) ## S4 method for signature 'pane' show(object) ## S4 method for signature 'pane' rep(x, ...) ## S4 method for signature 'pane' x[i, j, ..., drop = TRUE] ## S4 method for signature 'pane' x[[i, j, ..., exact = TRUE]] ## S4 method for signature 'pane' c(x, ...)
x |
x |
... |
... |
object |
object |
decode |
If |
i |
i |
j |
j |
drop |
drop |
exact |
exact |
snUnique record identifier.
.DataUnique pane identifier.
case_nmRecord type in regards to index assignment.
window_listA list of considered windows for each pane.
dist_pane_indexThe difference between each event and its index event.
pane_datasetData sources in each pane.
pane_intervalThe start and end dates of each pane. A number_line object.
pane_lengthThe duration or length of (pane_interval).
pane_totalThe number of records in each pane.
optionsSome options passed to the instance of partitions.
window_matchedA list of matched windows for each pane.
# A test for pane objects pn <- partitions(date = 1, by = 1) is.pane(pn); is.pane(2)# A test for pane objects pn <- partitions(date = 1, by = 1) is.pane(pn); is.pane(2)
Distribute events into groups defined by time or numerical intervals. Each set of linked records is assigned a unique identifier with relevant group-level data.
partitions( date, window = NULL, windows_total = 1, separate = FALSE, sn = NULL, strata = NULL, data_links = "ANY", custom_sort = NULL, group_stats = FALSE, data_source = NULL, by = NULL, length.out = NULL, fill = TRUE, display = "none", precision = 1 )partitions( date, window = NULL, windows_total = 1, separate = FALSE, sn = NULL, strata = NULL, data_links = "ANY", custom_sort = NULL, group_stats = FALSE, data_source = NULL, by = NULL, length.out = NULL, fill = TRUE, display = "none", precision = 1 )
date |
|
window |
|
windows_total |
|
separate |
|
sn |
|
strata |
|
data_links |
|
custom_sort |
|
group_stats |
|
data_source |
|
by |
|
length.out |
|
fill |
|
display |
|
precision |
Round precision |
Each assigned group is referred to as a pane. A pane consists of events within a specific time or numerical interval (window).
Each window must cover a separate interval. Overlapping windows are merged before events are distributed into panes.
Events that occur over two windows are assigned to the last one listed.
Alternatively, you can create windows by splitting a period into equal parts (length.out), or into a sequence of intervals with fixed widths (by).
By default, the earliest event is taken as the "Index" event of the pane.
An alternative can be chosen with custom_sort.
Note that this is simply a convenience option because it has no bearing on how groups are assigned.
partitions() will categorise records into 3 types;
"Index" - Index event/record of the pane.
"Duplicate_I" - Duplicate of the "Index" record.
"Skipped" - Records that are not assigned to a pane.
Every element in data_links must be named "l" (links) or "g" (groups).
Unnamed elements of data_links will be assumed to be "l".
If named "l", only groups with records from every listed data_source will be retained.
If named "g", only groups with records from any listed data_source will be retained.
NA values in strata exclude records from the partitioning process.
See vignette("episodes") for more information.
pane; number_line_sequence; episodes; links; overlaps; number_line; schema
events <- c(30, 2, 11, 10, 100) windows <- number_line(c(1, 9, 25), c(3, 12, 35)) events partitions(date = events, length.out = 3, separate = TRUE) partitions(date = events, by = 10, separate = TRUE) partitions(date = events, window = windows, separate = TRUE) partitions(date = events, window = windows, separate = FALSE) partitions(date = events, window = windows, separate = FALSE, windows_total = 4)events <- c(30, 2, 11, 10, 100) windows <- number_line(c(1, 9, 25), c(3, 12, 35)) events partitions(date = events, length.out = 3, separate = TRUE) partitions(date = events, by = 10, separate = TRUE) partitions(date = events, window = windows, separate = TRUE) partitions(date = events, window = windows, separate = FALSE) partitions(date = events, window = windows, separate = FALSE, windows_total = 4)
pid objectsS4 objects storing the result of links.
is.pid(x) as.pid(x, ...) ## S3 method for class 'pid' format(x, ...) ## S3 method for class 'pid' unique(x, ...) ## S3 method for class 'pid' summary(object, ...) ## S3 method for class 'pid_summary' print(x, ...) ## S3 method for class 'pid' as.data.frame(x, ..., decode = TRUE) ## S3 method for class 'pid' as.list(x, ..., decode = TRUE) ## S4 method for signature 'pid' show(object) ## S4 method for signature 'pid' rep(x, ...) ## S4 method for signature 'pid' x[i, j, ..., drop = TRUE] ## S4 method for signature 'pid' x[[i, j, ..., exact = TRUE]] ## S4 method for signature 'pid' c(x, ...)is.pid(x) as.pid(x, ...) ## S3 method for class 'pid' format(x, ...) ## S3 method for class 'pid' unique(x, ...) ## S3 method for class 'pid' summary(object, ...) ## S3 method for class 'pid_summary' print(x, ...) ## S3 method for class 'pid' as.data.frame(x, ..., decode = TRUE) ## S3 method for class 'pid' as.list(x, ..., decode = TRUE) ## S4 method for signature 'pid' show(object) ## S4 method for signature 'pid' rep(x, ...) ## S4 method for signature 'pid' x[i, j, ..., drop = TRUE] ## S4 method for signature 'pid' x[[i, j, ..., exact = TRUE]] ## S4 method for signature 'pid' c(x, ...)
x |
x |
... |
... |
object |
object |
decode |
If |
i |
i |
j |
j |
drop |
drop |
exact |
exact |
snUnique record identifier.
.DataUnique group identifier.
link_idUnique reference ID for each match.
pid_criMatch stage of the step-wise linkage.
pid_datasetData sources in each group.
pid_totalThe number of records in each group.
iterationThe iteration when a record was matched to its group (.Data).
# A test for pid objects pd <- links(criteria = 1) is.pid(pd); is.pid(2)# A test for pid objects pd <- links(criteria = 1) is.pid(pd); is.pid(2)
diyar
A collection of predefined logical tests used with sub_criteria objects
exact_match(x, y) range_match(x, y, range = 10) prob_link( x, y, cmp_func, attr_threshold, score_threshold, probabilistic, return_weights = FALSE ) true(x, y) false(x, y)exact_match(x, y) range_match(x, y, range = 10) prob_link( x, y, cmp_func, attr_threshold, score_threshold, probabilistic, return_weights = FALSE ) true(x, y) false(x, y)
x |
Attribute(s) to be compared against. |
y |
Attribute(s) to be compared by. |
range |
Difference between |
cmp_func |
Logical tests such as string comparators. See |
attr_threshold |
Matching set of weight thresholds for each result of |
score_threshold |
Score threshold determining matched or linked records. See |
probabilistic |
If |
return_weights |
If |
exact_match() - test that x == y
range_match() - test that x ≤ y ≤ (x + range)
prob_link() - Test that a record-pair relates to the same entity based on the Fellegi and Sunter (1969) model for deciding if two records belong to the same entity.
In summary, record-pairs are created and categorised as matches and non-matches (attr_threshold) with user-defined functions (cmp_func).
If probabilistic is TRUE, two probabilities (m and u) are used to calculate weights for matches and non-matches.
The m-probability is the probability that matched records are actually from the same entity i.e. a true match,
while u-probability is the probability that matched records are not from the same entity i.e. a false match.
Record-pairs whose total scores are above a certain threshold (score_threshold) are assumed to belong to the same entity.
Agreement (match) and disagreement (non-match) scores are calculated as described by Asher et al. (2020).
For each record pair, an agreement score for attribute $i$ is calculated as; $\log_{2}(m_{i} / u_{i})$
For each record pair, a disagreement score for attribute $i$ is calculated as; $\log_{2}((1 - m_{i}) / (1 - u_{i}))$
where $m_{i}$ and $u_{i}$ are the m and u-probabilities for each value of attribute $i$.
Note that each probability is calculated as a combined probability for the record pair.
For example, if the values of the record-pair have u-probabilities of 0.1 and 0.2 respectively,
then the u-probability for the pair will be 0.02.
Missing data (NA) are considered non-matches and assigned a u-probability of 0.
`exact_match` exact_match(x = 1, y = 1) exact_match(x = 1, y = 2) `range_match` range_match(x = 10, y = 16, range = 6) range_match(x = 16, y = 10, range = 6)`exact_match` exact_match(x = 1, y = 1) exact_match(x = 1, y = 2) `range_match` range_match(x = 10, y = 16, range = 6) range_match(x = 16, y = 10, range = 6)
sub_criteria objects.Modify the attributes of a sub_criteria object.
reframe(x, ...) ## S3 method for class 'sub_criteria' reframe(x, func = identity, ...) unpack_sub_criteria(x, part = "attribute") flatten_list(x, depth = 1)reframe(x, ...) ## S3 method for class 'sub_criteria' reframe(x, func = identity, ...) unpack_sub_criteria(x, part = "attribute") flatten_list(x, depth = 1)
x |
|
... |
Arguments passed to methods. |
func |
|
sub_criteria; eval_sub_criteria; attr_eval
s_cri <- sub_criteria(month.abb, month.name) reframe(s_cri, func = function(x) x[12]) reframe(s_cri, func = function(x) x[12:1]) reframe(s_cri, func = function(x) attrs(x[1:6], x[7:12])) x <- sub_criteria(rep(1, 5), rep(5 * 10, 5), operator = 'and') x <- sub_criteria(x, c(1,9,1,11,5), operator = 'or') x format(x, show_levels = TRUE) unpack_sub_criteria(x) flatten_list(unpack_sub_criteria(x), depth = 0) lapply(flatten_list(unpack_sub_criteria(x), depth = 0), max)s_cri <- sub_criteria(month.abb, month.name) reframe(s_cri, func = function(x) x[12]) reframe(s_cri, func = function(x) x[12:1]) reframe(s_cri, func = function(x) attrs(x[1:6], x[7:12])) x <- sub_criteria(rep(1, 5), rep(5 * 10, 5), operator = 'and') x <- sub_criteria(x, c(1,9,1,11,5), operator = 'or') x format(x, show_levels = TRUE) unpack_sub_criteria(x) flatten_list(unpack_sub_criteria(x), depth = 0) lapply(flatten_list(unpack_sub_criteria(x), depth = 0), max)
Create schema diagrams for number_line, epid, pid and pane objects.
schema(x, ...) ## S3 method for class 'number_line' schema( x, show_labels = c("date", "case_overlap_methods"), custom_sort = NULL, ... ) ## S3 method for class 'epid' schema( x, title = NULL, show_labels = c("length_arrow"), show_skipped = TRUE, show_non_finite = FALSE, theme = "dark", seed = NULL, custom_label = NULL, ... ) ## S3 method for class 'pane' schema( x, title = NULL, show_labels = c("window_label"), theme = "dark", seed = NULL, custom_label = NULL, ... ) ## S3 method for class 'pid' schema( x, title = NULL, show_labels = TRUE, theme = "dark", orientation = "by_pid", seed = NULL, custom_label = NULL, ... )schema(x, ...) ## S3 method for class 'number_line' schema( x, show_labels = c("date", "case_overlap_methods"), custom_sort = NULL, ... ) ## S3 method for class 'epid' schema( x, title = NULL, show_labels = c("length_arrow"), show_skipped = TRUE, show_non_finite = FALSE, theme = "dark", seed = NULL, custom_label = NULL, ... ) ## S3 method for class 'pane' schema( x, title = NULL, show_labels = c("window_label"), theme = "dark", seed = NULL, custom_label = NULL, ... ) ## S3 method for class 'pid' schema( x, title = NULL, show_labels = TRUE, theme = "dark", orientation = "by_pid", seed = NULL, custom_label = NULL, ... )
x |
|
... |
Other arguments. |
show_labels |
|
custom_sort |
|
title |
|
show_skipped |
|
show_non_finite |
|
theme |
|
seed |
|
custom_label |
|
orientation |
|
A visual aid to describe the data linkage (links), episode tracking (episodes) or partitioning process (partitions).
show_labels options (multi-select)
schema.epid - TRUE, FALSE, "sn", "epid", "date", "case_nm", "wind_nm", "length", "length_arrow", "case_overlap_methods" or "recurrence_overlap_methods"
schema.pane - TRUE, FALSE, "sn", "pane", "date", "case_nm" or "window_label"
schema.pid - TRUE, FALSE, "sn" or "pid"
ggplot objects
schema(number_line(c(1, 2), c(2, 1))) schema(episodes(1:10, 2)) schema(partitions(1:10, by = 2, separate = TRUE)) schema(links(list(c(1, 1, NA, NA), c(NA, 1, 1, NA))))schema(number_line(c(1, 2), c(2, 1))) schema(episodes(1:10, 2)) schema(partitions(1:10, by = 2, separate = TRUE)) schema(links(list(c(1, 1, NA, NA), c(NA, 1, 1, NA))))
Perform set operations on a pair of [number_line]s.
union_number_lines(x, y) intersect_number_lines(x, y) subtract_number_lines(x, y)union_number_lines(x, y) intersect_number_lines(x, y) subtract_number_lines(x, y)
x |
|
y |
union_number_lines() - Combines the range of x and that of y
intersect_number_lines() - Subset of x that overlaps with y and vice versa
subtract_number_lines() - Subset of x that does not overlap with y and vice versa.
The direction of the returned [number_line] will be that of the widest one (x or y).
If x and y have the same length, it'll be an "increasing" direction.
If x and y do not overlap, NA ("NA ?? NA") is returned.
[number_line]; list
nl_1 <- c(number_line(1, 5), number_line(1, 5), number_line(5, 9)) nl_2 <- c(number_line(1, 2), number_line(2, 3), number_line(0, 6)) # Union nl_1; nl_2; union_number_lines(nl_1, nl_2) nl_3 <- number_line(as.Date(c("01/01/2020", "03/01/2020","09/01/2020"), "%d/%m/%Y"), as.Date(c("09/01/2020", "09/01/2020","25/12/2020"), "%d/%m/%Y")) nl_4 <- number_line(as.Date(c("04/01/2020","01/01/2020","01/01/2020"), "%d/%m/%Y"), as.Date(c("05/01/2020","05/01/2020","03/01/2020"), "%d/%m/%Y")) # Intersect nl_3; nl_4; intersect_number_lines(nl_3, nl_4) # Subtract nl_3; nl_4; subtract_number_lines(nl_3, nl_4)nl_1 <- c(number_line(1, 5), number_line(1, 5), number_line(5, 9)) nl_2 <- c(number_line(1, 2), number_line(2, 3), number_line(0, 6)) # Union nl_1; nl_2; union_number_lines(nl_1, nl_2) nl_3 <- number_line(as.Date(c("01/01/2020", "03/01/2020","09/01/2020"), "%d/%m/%Y"), as.Date(c("09/01/2020", "09/01/2020","25/12/2020"), "%d/%m/%Y")) nl_4 <- number_line(as.Date(c("04/01/2020","01/01/2020","01/01/2020"), "%d/%m/%Y"), as.Date(c("05/01/2020","05/01/2020","03/01/2020"), "%d/%m/%Y")) # Intersect nl_3; nl_4; intersect_number_lines(nl_3, nl_4) # Subtract nl_3; nl_4; subtract_number_lines(nl_3, nl_4)
Datasets in the diyar package
data(staff_records) data(missing_staff_id) data(infections) data(infections_2) data(infections_3) data(infections_4) data(hospital_admissions) data(patient_list) data(patient_list_2) data(hourly_data) data(Opes) data(episode_units) data(overlap_methods) data(patient_records)data(staff_records) data(missing_staff_id) data(infections) data(infections_2) data(infections_3) data(infections_4) data(hospital_admissions) data(patient_list) data(patient_list_2) data(hourly_data) data(Opes) data(episode_units) data(overlap_methods) data(patient_records)
data.frame
data.frame
data.frame
data.frame
data.frame
data.frame
data.frame
data.frame
An object of class data.frame with 5 rows and 4 columns.
data.frame
data.frame
list
list
data.frame
staff_records - Staff record with some missing data
missing_staff_id - Staff records with missing staff identifiers
infections, infections_2, infections_3 and infections_4 - Reports of bacterial infections
hospital_admissions - Hospital admissions and discharges
patient_list & patient_list_2 - Patient list with some missing data
hourly_data - Hourly data
Opes - List of individuals with the same name
episode_units - Duration in seconds for each 'episode_units'
overlap_methods - Permutations of number_line overlap methods
data(staff_records) data(missing_staff_id) data(infections) data(infections_2) data(infections_3) data(infections_4) data(hospital_admissions) data(patient_list) data(patient_list_2) data(hourly_data) data(Opes) data(episode_units) data(overlap_methods) data(patient_records)data(staff_records) data(missing_staff_id) data(infections) data(infections_2) data(infections_3) data(infections_4) data(hospital_admissions) data(patient_list) data(patient_list_2) data(hourly_data) data(Opes) data(episode_units) data(overlap_methods) data(patient_records)
Match criteria for record linkage with links and episodes
sub_criteria( ..., match_funcs = c(exact = exact_match), equal_funcs = c(exact = exact_match), operator = "or" ) attrs(..., .obj = NULL) eval_sub_criteria(x, ...) ## S3 method for class 'sub_criteria' print(x, ...) ## S3 method for class 'sub_criteria' format(x, show_levels = FALSE, ...) ## S3 method for class 'sub_criteria' eval_sub_criteria( x, x_pos = seq_len(max(attr_eval(x))), y_pos = rep(1L, length(x_pos)), check_duplicates = TRUE, depth = 0, ... )sub_criteria( ..., match_funcs = c(exact = exact_match), equal_funcs = c(exact = exact_match), operator = "or" ) attrs(..., .obj = NULL) eval_sub_criteria(x, ...) ## S3 method for class 'sub_criteria' print(x, ...) ## S3 method for class 'sub_criteria' format(x, show_levels = FALSE, ...) ## S3 method for class 'sub_criteria' eval_sub_criteria( x, x_pos = seq_len(max(attr_eval(x))), y_pos = rep(1L, length(x_pos)), check_duplicates = TRUE, depth = 0, ... )
... |
Arguments passed to methods for |
match_funcs |
|
equal_funcs |
|
operator |
|
.obj |
|
x |
|
show_levels |
|
x_pos |
|
y_pos |
|
check_duplicates |
|
depth |
|
sub_criteria() - Create a match criteria as a sub_criteria object.
A sub_criteria object contains attributes to be compared,
logical tests for the comparisons (see predefined_tests for examples) and
another set of logical tests to determine identical records.
attrs() - Create a d_attribute object - a collection of atomic objects that can be passed to sub_criteria() as a single attribute.
eval_sub_criteria() - Evaluates a sub_criteria object.
At each iteration of links or episodes, record-pairs are created from each attribute of a sub_criteria object.
eval_sub_criteria() evaluates each record-pair using the match_funcs and equal_funcs functions of a sub_criteria object.
See predefined_tests for examples of match_funcs and equal_funcs.
User-defined functions are also permitted as match_funcs and equal_funcs.
Such functions must meet three requirements:
It must be able to compare the attributes.
It must have two arguments named `x` and `y`, where `y` is the value for one observation being compared against all other observations (`x`).
It must return a logical object i.e. TRUE or FALSE.
attrs() is useful when the match criteria require an interaction between multiple attributes. For example, attribute 1 + attribute 2 > attribute 3.
Every attribute, including those in attrs(), must have the same length or a length of 1.
predefined_tests; links; episodes; eval_sub_criteria
# Attributes attr_1 <- c(30, 28, 40, 25, 25, 29, 27) attr_2 <- c("M", "F", "U", "M", "F", "U", "M") # A match criteria ## Example 1 - A maximum difference of 10 in attribute 1 s_cri1 <- sub_criteria(attr_1, match_funcs = range_match) s_cri1 # Evaluate the match criteria ## Compare the first element of 'attr_1' against all other elements eval_sub_criteria(s_cri1) ## Compare the second element of 'attr_1' against all other elements x_pos_val <- seq_len(max(attr_eval(s_cri1))) eval_sub_criteria(s_cri1, x_pos = x_pos_val, y_pos = rep(2, length(x_pos_val))) ## Example 2 - `s_cri1` AND an exact match on attribute 2 s_cri2 <- sub_criteria( s_cri1, sub_criteria(attr_2, match_funcs = exact_match), operator = "and") s_cri2 ## Example 3 - `s_cri1` OR an exact match on attribute 2 s_cri3 <- sub_criteria( s_cri1, sub_criteria(attr_2, match_funcs = exact_match), operator = "or") s_cri3 # Evaluate the match criteria eval_sub_criteria(s_cri2) eval_sub_criteria(s_cri3) # Alternatively, using `attr()` AND_func <- function(x, y) range_match(x$a1, y$a1) & x$a2 == y$a2 OR_func <- function(x, y) range_match(x$a1, y$a1) | x$a2 == y$a2 ## Create a match criteria s_cri2b <- sub_criteria(attrs(.obj = list(a1 = attr_1, a2 = attr_2)), match_funcs = AND_func) s_cri3b <- sub_criteria(attrs(.obj = list(a1 = attr_1, a2 = attr_2)), match_funcs = OR_func) # Evaluate the match criteria eval_sub_criteria(s_cri2b) eval_sub_criteria(s_cri3b)# Attributes attr_1 <- c(30, 28, 40, 25, 25, 29, 27) attr_2 <- c("M", "F", "U", "M", "F", "U", "M") # A match criteria ## Example 1 - A maximum difference of 10 in attribute 1 s_cri1 <- sub_criteria(attr_1, match_funcs = range_match) s_cri1 # Evaluate the match criteria ## Compare the first element of 'attr_1' against all other elements eval_sub_criteria(s_cri1) ## Compare the second element of 'attr_1' against all other elements x_pos_val <- seq_len(max(attr_eval(s_cri1))) eval_sub_criteria(s_cri1, x_pos = x_pos_val, y_pos = rep(2, length(x_pos_val))) ## Example 2 
- `s_cri1` AND an exact match on attribute 2 s_cri2 <- sub_criteria( s_cri1, sub_criteria(attr_2, match_funcs = exact_match), operator = "and") s_cri2 ## Example 3 - `s_cri1` OR an exact match on attribute 2 s_cri3 <- sub_criteria( s_cri1, sub_criteria(attr_2, match_funcs = exact_match), operator = "or") s_cri3 # Evaluate the match criteria eval_sub_criteria(s_cri2) eval_sub_criteria(s_cri3) # Alternatively, using `attr()` AND_func <- function(x, y) range_match(x$a1, y$a1) & x$a2 == y$a2 OR_func <- function(x, y) range_match(x$a1, y$a1) | x$a2 == y$a2 ## Create a match criteria s_cri2b <- sub_criteria(attrs(.obj = list(a1 = attr_1, a2 = attr_2)), match_funcs = AND_func) s_cri3b <- sub_criteria(attrs(.obj = list(a1 = attr_1, a2 = attr_2)), match_funcs = OR_func) # Evaluate the match criteria eval_sub_criteria(s_cri2b) eval_sub_criteria(s_cri3b)
Convert windows to and from case_lengths and recurrence_lengths.
epid_windows(date, lengths, episode_unit = "days") epid_lengths(date, windows, episode_unit = "days") index_window(date, from_last = FALSE)epid_windows(date, lengths, episode_unit = "days") epid_lengths(date, windows, episode_unit = "days") index_window(date, from_last = FALSE)
date |
As used in |
lengths |
The duration ( |
episode_unit |
Time unit of |
windows |
The range ( |
from_last |
As used in |
epid_windows - returns the corresponding window for a given date, and case_length or recurrence_length.
epid_lengths - returns the corresponding case_length or recurrence_length for a given date and window.
index_window - returns the corresponding case_length or recurrence_length for the date only.
index_window(date = x) is a convenience function for epid_lengths(date = x, windows = x).
# Which `window` will a given `length` cover? date <- Sys.Date() epid_windows(date, 10) epid_windows(date, number_line(5, 10)) epid_windows(date, number_line(-5, 10)) epid_windows(date, -5) # Which `length` is required to cover a given `window`? date <- number_line(Sys.Date(), Sys.Date() + 20) epid_lengths(date, Sys.Date() + 30) epid_lengths(date, number_line(Sys.Date() + 25, Sys.Date() + 30)) epid_lengths(date, number_line(Sys.Date() - 10, Sys.Date() + 30)) epid_lengths(date, Sys.Date() - 10) # Which `length` is required to cover the `date`? index_window(20) index_window(number_line(15, 20))# Which `window` will a given `length` cover? date <- Sys.Date() epid_windows(date, 10) epid_windows(date, number_line(5, 10)) epid_windows(date, number_line(-5, 10)) epid_windows(date, -5) # Which `length` is required to cover a given `window`? date <- number_line(Sys.Date(), Sys.Date() + 20) epid_lengths(date, Sys.Date() + 30) epid_lengths(date, number_line(Sys.Date() + 25, Sys.Date() + 30)) epid_lengths(date, number_line(Sys.Date() - 10, Sys.Date() + 30)) epid_lengths(date, Sys.Date() - 10) # Which `length` is required to cover the `date`? index_window(20) index_window(number_line(15, 20))