Reducing some cumbersome code for simplicity

108 views Asked by At

I have a sample of my data as follows (real data is nearly .5 million women of childbearing age): In this dataset, I have a parity of women that consists of all children a woman has). This row continues until 30 children (ageownchild_pernum1: ageownchild_pernum30). For example, a woman has 2 children, This woman has 30 rows (filled with the age of the children) but only the first and second rows are filled with the age of the children the woman has, and other rows are filled with NA. Here I bring just two of the rows and omit others for simplicity.

library("tidyverse")
DataSet1<-
tibble(
id = c(1,2,3,4,5,6,7,8,9,10),
ageownchild_pernum1 = c(18,24,13,16,9,NA,17,13,32,7 ),
ageownchild_pernum2=  c(16,NA,9 ,10,7,NA,13,11,20,5 ),
AGE=  c(38,52 ,41 ,43 ,38 ,36 ,40 ,36 ,56,31),
F_curve_notch_15= c(25.14,30.28,43.33,43.33,25.14,25.14,25.14,25.14,67   ,33.77),
F_curve_notch_15.25= c(34.01,40.33,51.74,51.74,34.01,34.01,34.01,34.01,73.85,41.91),
f_curve_tilde_15= c(25.14,30.28,43.33,43.33,25.14,25.14,25.14,25.14,67   ,33.77),
f_curve_tilde_15.25= c(25.14,30.28,43.33,43.33,25.14,25.14,25.14,25.14,67   ,33.77),
)

F_curve_notch and f_curve_tilde are for ages (15 to 49,.25).

Now, I want to do this bulky procedure on my data that may reach more than a thousand lines of code.

DataSet1$low_notch      <-ifelse((DataSet1$ageownchild_pernum1>=0),DataSet1$AGE - 
DataSet1$ageownchild_pernum1 - 0.75,0)
DataSet1$high_notch     <-ifelse((DataSet1$ageownchild_pernum1>=0),DataSet1$AGE - 
DataSet1$ageownchild_pernum1 + 0.75,0)
DataSet1$low_low_notch  <-ifelse((DataSet1$ageownchild_pernum1>=0),DataSet1$AGE - 
DataSet1$ageownchild_pernum1 - 1.25,0)
DataSet1$high_high_notch<-ifelse((DataSet1$ageownchild_pernum1>=0),DataSet1$AGE - 
DataSet1$ageownchild_pernum1 + 1.25,0)
DataSet1$low_low_notch  <-ifelse ((DataSet1$low_low_notch>=20)   & (DataSet1$low_low_notch<35)  
,DataSet1$low_low_notch+0.25,DataSet1$low_low_notch)
DataSet1$high_high_notch<-ifelse ((DataSet1$high_high_notch>=20) & 
(DataSet1$high_high_notch<35), DataSet1$high_high_notch+0.25, DataSet1$high_high_notch)

notch <- function(a, b,c,d){
ifelse((a<= 15)&(b>=15)&(c!= 0),0.01*d,c)
}
DataSet1$f_curve_notched_15<-mapply('notch',DataSet1$low_low_notch, DataSet1$high_high_notch, 
DataSet1$f_curve_notched_15,DataSet1$f_curve_tilde_15, DataSet1$f_curve_notched_15)

This procedure, should continue for all ageownchild_pernum(1:30) and f_curve_notched(15 to 49, .25). I really appreciate any help you can provide.

2

There are 2 answers

2
asd-tm On

Is it what you are expecting?

left_join(
  DataSet1 %>% 
    select(id, AGE, starts_with("ageownchild_pernum")) %>% 
    pivot_longer(-c(id,AGE)) %>% 
    mutate(
      low_notch = ifelse((value>=0),AGE - 
                           value - 0.75,0),
      high_notch = ifelse((value>=0),AGE - 
                            value + 0.75,0),
      low_low_notch = ifelse((value>=0),AGE - 
                               value - 1.25,0),
      high_high_notch = ifelse((value>=0),AGE - 
                                 value + 1.25,0),
      low_low_notch = ifelse ((low_low_notch>=20)   & (low_low_notch<35)  
                              ,low_low_notch+0.25,low_low_notch),
      high_high_notch = ifelse ((high_high_notch>=20) & 
                                  (high_high_notch<35), high_high_notch+0.25, high_high_notch)
    ),
  
  full_join(
    DataSet1 %>% 
      select(id, starts_with("F_curve_notch_")) %>% 
      pivot_longer(-id, values_to = "F_curve_notch_") %>% 
      mutate(name = str_remove(name, "F_curve_notch_")),
    
    DataSet1 %>% 
      select(id, starts_with("f_curve_tilde_")) %>% 
      pivot_longer(-id, values_to = "f_curve_tilde_")%>% 
      mutate(name = str_remove(name, "f_curve_tilde_")),
    by = c("id", "name")
  ) %>% 
    rename(curve_id = name),
  by = "id",
  relationship = "many-to-many" # consider other by specifications if necessary
) %>% 
  mutate(notch_result = notch(low_low_notch, high_high_notch, F_curve_notch_, f_curve_tilde_))

And the last columnn contains the notch() results:

# A tibble: 40 × 12
      id   AGE name                value low_notch high_notch low_low_notch high_high_notch curve_id F_curve_notch_ f_curve_tilde_ notch_result
   <dbl> <dbl> <chr>               <dbl>     <dbl>      <dbl>         <dbl>           <dbl> <chr>             <dbl>          <dbl>        <dbl>
 1     1    38 ageownchild_pernum1    18      19.2       20.8          18.8            21.5 15                 25.1           25.1         25.1
 2     1    38 ageownchild_pernum1    18      19.2       20.8          18.8            21.5 15.25              34.0           25.1         34.0
 3     1    38 ageownchild_pernum2    16      21.2       22.8          21              23.5 15                 25.1           25.1         25.1
 4     1    38 ageownchild_pernum2    16      21.2       22.8          21              23.5 15.25              34.0           25.1         34.0
 5     2    52 ageownchild_pernum1    24      27.2       28.8          27              29.5 15                 30.3           30.3         30.3
 6     2    52 ageownchild_pernum1    24      27.2       28.8          27              29.5 15.25              40.3           30.3         40.3
 7     2    52 ageownchild_pernum2    NA      NA         NA            NA              NA   15                 30.3           30.3         NA  
 8     2    52 ageownchild_pernum2    NA      NA         NA            NA              NA   15.25              40.3           30.3         NA  
 9     3    41 ageownchild_pernum1    13      27.2       28.8          27              29.5 15                 43.3           43.3         43.3
10     3    41 ageownchild_pernum1    13      27.2       28.8          27              29.5 15.25              51.7           43.3         51.7
# ℹ 30 more rows
# ℹ Use `print(n = ...)` to see more rows

APPENDED

If you need a wider data shape you can append the above listed code with:

%>%  
  transmute(id, AGE, f_curve_ = paste0(name, "_", "f_curve_", curve_id), notch_result) %>% 
  pivot_wider(id_cols = c("id", "AGE"), names_from = f_curve_, values_from = notch_result, names_sort = T)

You will get something like this:

# A tibble: 10 × 6
      id   AGE ageownchild_pernum1_f_curve_15 ageownchild_pernum1_f_curve_15.25 ageownchild_pernum2_f_curve_15 ageownchild_pernum2_f_curve_15.25
   <dbl> <dbl>                          <dbl>                             <dbl>                          <dbl>                             <dbl>
 1     1    38                           25.1                              34.0                           25.1                              34.0
 2     2    52                           30.3                              40.3                           NA                                NA  
 3     3    41                           43.3                              51.7                           43.3                              51.7
 4     4    43                           43.3                              51.7                           43.3                              51.7
 5     5    38                           25.1                              34.0                           25.1                              34.0
 6     6    36                           NA                                NA                             NA                                NA  
 7     7    40                           25.1                              34.0                           25.1                              34.0
 8     8    36                           25.1                              34.0                           25.1                              34.0
 9     9    56                           67                                73.8                           67                                73.8
10    10    31                           33.8                              41.9                           33.8                              41.9
0
Andy Baxter On

It might be possible using several pivots to have all your f_curve ages and all your child columns compared to each other in one run:

library(tidyverse)

DataSet1<-
  tibble(
    id = c(1,2,3,4,5,6,7,8,9,10),
    ageownchild_pernum1 = c(18,24,13,16,9,NA,17,13,32,7 ),
    ageownchild_pernum2=  c(16,NA,9 ,10,7,NA,13,11,20,5 ),
    AGE=  c(38,52 ,41 ,43 ,38 ,36 ,40 ,36 ,56,31),
    f_curve_notch_15= c(25.14,30.28,43.33,43.33,25.14,25.14,25.14,25.14,67   ,33.77),
    f_curve_notch_15.25= c(34.01,40.33,51.74,51.74,34.01,34.01,34.01,34.01,73.85,41.91),
    f_curve_tilde_15= c(25.14,30.28,43.33,43.33,25.14,25.14,25.14,25.14,67   ,33.77),
    f_curve_tilde_15.25= c(25.14,30.28,43.33,43.33,25.14,25.14,25.14,25.14,67   ,33.77),
  )

notch <- function(a, b,c,d){
  ifelse((a<= 15)&(b>=15)&(c!= 0),0.01*d,c)
}



dat_out <- DataSet1 |>
  pivot_longer(
    starts_with("f_curve"),
    names_to = c("marker", "age_cat"),
    names_pattern = c("f_curve_(.*)_(.*)")
  ) |> 
  pivot_wider(names_from = marker, values_from = value) |> 
  pivot_longer(
    starts_with("ageownchild"),
    names_to = "child_n",
    values_to = "child_age",
    names_prefix = "ageownchild_pernum"
  ) |> 
  filter(!is.na(child_age)) |> 
  mutate(
    low_notch = child_age - 0.75,
    high_notch = child_age + 0.75,
    low_low_notch = child_age - 1.25,
    high_high_notch = child_age + 1.25,
    low_low_notch = if_else(low_low_notch>=20 & low_low_notch<35, low_low_notch+0.25, low_low_notch),
    high_high_notch = if_else(high_high_notch>=20 & high_high_notch<35, high_high_notch+0.25, high_high_notch),
    f_curve_notch = notch(low_low_notch, high_high_notch, notch, tilde)
  )

dat_out
#> # A tibble: 34 × 12
#>       id   AGE age_cat notch tilde child_n child_age low_notch high_notch
#>    <dbl> <dbl> <chr>   <dbl> <dbl> <chr>       <dbl>     <dbl>      <dbl>
#>  1     1    38 15       25.1  25.1 1              18     17.2       18.8 
#>  2     1    38 15       25.1  25.1 2              16     15.2       16.8 
#>  3     1    38 15.25    34.0  25.1 1              18     17.2       18.8 
#>  4     1    38 15.25    34.0  25.1 2              16     15.2       16.8 
#>  5     2    52 15       30.3  30.3 1              24     23.2       24.8 
#>  6     2    52 15.25    40.3  30.3 1              24     23.2       24.8 
#>  7     3    41 15       43.3  43.3 1              13     12.2       13.8 
#>  8     3    41 15       43.3  43.3 2               9      8.25       9.75
#>  9     3    41 15.25    51.7  43.3 1              13     12.2       13.8 
#> 10     3    41 15.25    51.7  43.3 2               9      8.25       9.75
#> # ℹ 24 more rows
#> # ℹ 3 more variables: low_low_notch <dbl>, high_high_notch <dbl>,
#> #   f_curve_notch <dbl>

This will mean each woman will have a row for every combination of child and f_curve age bracket. These can be pivoted back to wider dataset to have one column per woman if needs be:

dat_out |> 
  pivot_wider(
    names_from = c(child_n, age_cat),
    values_from = f_curve_notch,
    names_prefix = "f_curve_notch_"
  )
#> # A tibble: 34 × 13
#>       id   AGE notch tilde child_age low_notch high_notch low_low_notch
#>    <dbl> <dbl> <dbl> <dbl>     <dbl>     <dbl>      <dbl>         <dbl>
#>  1     1    38  25.1  25.1        18     17.2       18.8          16.8 
#>  2     1    38  25.1  25.1        16     15.2       16.8          14.8 
#>  3     1    38  34.0  25.1        18     17.2       18.8          16.8 
#>  4     1    38  34.0  25.1        16     15.2       16.8          14.8 
#>  5     2    52  30.3  30.3        24     23.2       24.8          23   
#>  6     2    52  40.3  30.3        24     23.2       24.8          23   
#>  7     3    41  43.3  43.3        13     12.2       13.8          11.8 
#>  8     3    41  43.3  43.3         9      8.25       9.75          7.75
#>  9     3    41  51.7  43.3        13     12.2       13.8          11.8 
#> 10     3    41  51.7  43.3         9      8.25       9.75          7.75
#> # ℹ 24 more rows
#> # ℹ 5 more variables: high_high_notch <dbl>, f_curve_notch_1_15 <dbl>,
#> #   f_curve_notch_2_15 <dbl>, f_curve_notch_1_15.25 <dbl>,
#> #   f_curve_notch_2_15.25 <dbl>