Online training class for Clinical R programming batch starts on Monday, 02Feb2026.
Click here for details.
library(tidyverse)
# Adverse-event test data, one row per reported event.
# Subject 1001 reports "Headache" twice with different start dates, so
# the subjid/term key repeats while the full rows are all distinct.
ae <- tribble(
  ~subjid, ~term,      ~stdtc,
  "1001",  "Headache", "2010-01-05",
  "1001",  "Headache", "2010-01-08",
  "1001",  "Nausea",   "2010-01-08",
  "1002",  "Headache", "2010-02-15"
)

# Same records as `ae`, plus an exact copy of the last row, so this
# table contains one full-row duplicate (subject 1002, "Headache").
ae1 <- tribble(
  ~subjid, ~term,      ~stdtc,
  "1001",  "Headache", "2010-01-05",
  "1001",  "Headache", "2010-01-08",
  "1001",  "Nausea",   "2010-01-08",
  "1002",  "Headache", "2010-02-15",
  "1002",  "Headache", "2010-02-15"
)
#---------------------------------------
# Remove duplicates based on key variables (nodupkey)
#---------------------------------------
# Keep one record per subjid/term combination. The data is sorted by
# stdtc within each key first, so the record kept by slice(1) is the one
# with the earliest start date (stdtc holds yyyy-mm-dd strings, which
# sort chronologically as text).
# ungroup() drops the grouping afterwards so that later verbs applied to
# `nodupkey` do not silently operate per subjid/term group.
nodupkey <- ae %>%
  arrange(subjid, term, stdtc) %>%
  group_by(subjid, term) %>%
  slice(1) %>%
  ungroup()
#---------------------------------------
# Remove duplicate records (noduprec / nodup)
#---------------------------------------
# distinct() with no column arguments compares entire rows, so only
# rows that are exact copies of an earlier row are dropped.
noduprec <- ae1 %>%
  distinct()

# The `ae` test data rebuilt as a plain data.frame (no tidyverse
# functions). Note that subjid is numeric in this version.
ae <- data.frame(
  subjid = c(1001, 1001, 1001, 1002),
  term = c("Headache", "Headache", "Nausea", "Headache"),
  stdtc = c("2010-01-05", "2010-01-08", "2010-01-08", "2010-02-15"),
  stringsAsFactors = FALSE
)
# Five adverse-event rows as a plain data.frame. The last two rows are
# identical in every column, giving one full-row duplicate.
# stringsAsFactors = FALSE keeps term and stdtc as character columns.
ae1 <- data.frame(
  subjid = c(1001, 1001, 1001, 1002, 1002),
  term = c("Headache", "Headache", "Nausea", "Headache", "Headache"),
  stdtc = c(
    "2010-01-05", "2010-01-08", "2010-01-08", "2010-02-15", "2010-02-15"
  ),
  stringsAsFactors = FALSE
)
# Remove duplicates based on key variables (nodupkey)
# Step 1: sort so that, within each subjid/term key, the row with the
# earliest stdtc comes first.
nodupkey <- ae[order(ae$subjid, ae$term, ae$stdtc), ]
# Step 2: duplicated() is applied to the key columns only; negating it
# keeps the first row of each subjid/term combination.
nodupkey <- nodupkey[!duplicated(nodupkey[c("subjid", "term")]), ]

# Remove duplicate records (noduprec/nodup)
# duplicated() on the whole data frame compares every column, so only
# exact copies of an earlier row are removed.
noduprec <- ae1[!duplicated(ae1), ]

The nodupkey pattern is used when uniqueness is defined by a subset of variables rather than the entire row. In clinical data, this is very common when we say “one record per subject per term”, or “one record per subject per visit”.
The data is first sorted using order() so that, within each key combination, records are arranged in a meaningful sequence. Here, sorting by stdtc ensures that earlier dates appear first.
The duplicated() function is then applied only to the key variables (subjid, term). This tells R to look for repeated key combinations, not full-row duplicates.
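By default, duplicated() returns FALSE for the first occurrence of each key combination and TRUE for every later one, so !duplicated() keeps the first occurrence and drops the rest. Because we sorted beforehand, “first” now has a clear definition: the record with the earliest stdtc.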
The noduprec pattern is used when we want to remove rows that are exact copies across all variables, not just selected keys.
Here, duplicated(ae1) checks whether an entire row has appeared before, comparing every column.
No sorting is required because order does not matter when removing exact duplicate rows.
The expression !duplicated(ae1) keeps the first occurrence of each unique row and removes all later identical copies.