library(dplyr)

# Sample purchase records. All columns are created as character vectors here.
# The result stays grouped by Buyer and id and carries one extra column, `diffs`.
df <- data.frame(
  id = c("9", "9", "9", "5", "5", "4", "4", "4", "4", "4", "20", "20"),
  Date = c(
    "11/29/2018", "11/29/2018", "11/29/2018", "2/13/2019", "2/13/2019",
    "6/15/2018", "6/20/2018", "8/17/2018", "8/20/2018", "8/23/2018",
    "12/25/2018", "12/25/2018"
  ),
  Buyer = c(
    "John", "John", "John", "Maria", "Maria",
    "Sandy", "Sandy", "Sandy", "Sandy", "Sandy", "Paul", "Paul"
  ),
  Amount = c(
    "959", "1158", "596", "922", "922",
    "1849", "4193", "4256", "65", "100", "313", "99"
  ),
  stringsAsFactors = FALSE
) %>%
  group_by(Buyer, id) %>%
  # diffs: days between a row's Date and the previous row's Date within the
  # same Buyer/id group, in the row order given above; NA on each group's
  # first row because there is no previous row to compare with.
  mutate(diffs = c(NA, diff(as.Date(Date, format = "%m/%d/%Y"))))
将 Date 列从字符型转换为日期型,并将 Amount 列从字符型转换为数值型:
# Parse the month/day/four-digit-year strings into Date objects. "%Y" reads the
# full four-digit year; "%y" would consume only the first two digits ("20") and
# silently turn every date into a year-2020 date.
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
# Convert Amount to numeric so the values can be added together.
df$Amount <- as.numeric(df$Amount)
现在,按 id 对数据集分组,按日期排序,并在每个 id 内创建一个排名(例如,Sandy 在 5 个不同的日期购物,她的记录将依次获得 1 到 5 的排名)。然后定义一个名为 ConsecutiveSum 的新变量,它把每一行的 Amount 与其前一行的 Amount 相加(lag 返回前一行的值)。如果前一行的值不存在(即组内第一行),ifelse 语句会将 ConsecutiveSum 设为 0。最后一步就是应用您的筛选条件:
# Find pairs of consecutive rows under the same id that are at most 5 days
# apart (per the existing `diffs` column) and whose Amounts add up to at least
# 5000. Both rows of every qualifying pair are kept.
# Expects `df` to already contain `diffs` and a numeric `Amount`.
df %>%
  group_by(id) %>%
  # arrange() ignores the grouping and sorts the whole table by Date; the
  # grouped lag()/lead() calls below then walk each id's rows in that order.
  arrange(Date) %>%
  mutate(
    rank = dense_rank(Date),
    # Amount plus the previous row's Amount within the id; 0 when there is no
    # previous value (the first row of each id).
    ConsecutiveSum = ifelse(is.na(lag(Amount)), 0, Amount + lag(Amount))
  ) %>%
  filter(
    # Second row of a qualifying pair: close to, and large together with,
    # the row before it.
    (diffs <= 5 & ConsecutiveSum >= 5000) |
      # First row of a qualifying pair: the NEXT row satisfies the same two
      # conditions. On the last row of an id lead() is NA, and filter() drops
      # rows whose condition evaluates to NA.
      (lead(diffs) <= 5 & lead(ConsecutiveSum) >= 5000)
  ) %>%
  ungroup()
# id Date Buyer Amount diffs rank ConsecutiveSum
# <chr> <chr> <chr> <dbl> <dbl> <int> <dbl>
# 1 4 6/15/2018 Sandy 1849 NA 1 0
# 2 4 6/20/2018 Sandy 4193 5 2 6042