I am trying to load a vcf file (created using GATK using data.table::fread, and then parse the results of the FORMAT and sample fields, into different columns. The column names are given by the FORMAT field, with each element separated by a :. and the values for each of these columns is given by the sample field, with each corresponding value in the same order as the names in the FORMAT field, and equally separated by :. The issue I'm facing is that, for each VCF file, I have different values for the FORMAT field, at different rows.
For example, I extracted one row for each unique value found in the FORMAT field:
structure(list(
`#CHROM` = c("rs11218343_chr11_121564878_SORL1_C/T","rs16941239_chr16_86420604_FOXF1_A/T",
"rs7384878_chr7_100334426_SPDYE3_C/T", "rs3851179_chr11_86157598_EED_T/C",
"rs10868366_chr9_86085145_GOLM1_G/T", "rs11771145_chr7_143413669_EPHA1_A/G",
"rs6966331_chr7_37844191_EPDR1_T/C"),
POS = c(722L, 1468L, 1367L, 1000L, 246L, 261L, 1384L),
ID = c(".", ".", ".", ".", ".", ".", "."),
REF = c("AGG", "G", "C", "T", "C", "T", "G"),
ALT = c(".", "*,GA", "T", "C", ".", "C", "."),
QUAL = c("29.91", ".", "40.64", "32690.06", ".", ".", "0.64"),
FILTER = c("LowQual", ".", ".", ".", ".", ".", "LowQual"),
INFO = c("DP=5;MLEAC=.;MLEAF=.;MQ=26.79", ".",
"AC=1;AF=0.500;AN=2;BaseQRankSum=1.29;DP=20;ExcessHet=0.0000;FS=2.963;MLEAC=1;MLEAF=0.500;MQ=43.87;MQRankSum=-1.058e+00;QD=2.90;ReadPosRankSum=0.697;SOR=1.802",
"AC=2;AF=1.00;AN=2;DP=1097;ExcessHet=0.0000;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=31.02;SOR=0.776",
".", ".", "BaseQRankSum=-9.670e-01;DP=3;ExcessHet=0.00;MLEAC=.;MLEAF=.;MQ=60.00;MQRankSum=0.00;ReadPosRankSum=0.967"),
FORMAT = c("GT", "GT:AD", "GT:AD:DP:GQ:PGT:PID:PL:PS", "GT:AD:DP:GQ:PL",
"GT:AD:DP:RGQ", "GT:AD:PGT:PID:PS", "GT:DP:RGQ"),
F1601764_S460_L002 = c("0/0", "./.:0,0,0", "0|1:12,2:14:48:0|1:1344_G_A:48,0,498:1344",
"1/1:0,1054:1054:99:32704,3160,0", "0/0:0:0:0", ".|.:0,0:0|1:251_T_C:251",
"0/0:3:16")),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -7L))
#> #CHROM POS ID REF ALT QUAL FILTER
#> 1 rs11218343_chr11_121564878_SORL1_C/T 722 . AGG . 29.91 LowQual
#> 2 rs16941239_chr16_86420604_FOXF1_A/T 1468 . G *,GA . .
#> 3 rs7384878_chr7_100334426_SPDYE3_C/T 1367 . C T 40.64 .
#> 4 rs3851179_chr11_86157598_EED_T/C 1000 . T C 32690.06 .
#> 5 rs10868366_chr9_86085145_GOLM1_G/T 246 . C . . .
#> 6 rs11771145_chr7_143413669_EPHA1_A/G 261 . T C . .
#> 7 rs6966331_chr7_37844191_EPDR1_T/C 1384 . G . 0.64 LowQual
#> INFO
#> 1 DP=5;MLEAC=.;MLEAF=.;MQ=26.79
#> 2 .
#> 3 AC=1;AF=0.500;AN=2;BaseQRankSum=1.29;DP=20;ExcessHet=0.0000;FS=2.963;MLEAC=1;MLEAF=0.500;MQ=43.87;MQRankSum=-1.058e+00;QD=2.90;ReadPosRankSum=0.697;SOR=1.802
#> 4 AC=2;AF=1.00;AN=2;DP=1097;ExcessHet=0.0000;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=31.02;SOR=0.776
#> 5 .
#> 6 .
#> 7 BaseQRankSum=-9.670e-01;DP=3;ExcessHet=0.00;MLEAC=.;MLEAF=.;MQ=60.00;MQRankSum=0.00;ReadPosRankSum=0.967
#> FORMAT F1601764_S460_L002
#> 1 GT 0/0
#> 2 GT:AD ./.:0,0,0
#> 3 GT:AD:DP:GQ:PGT:PID:PL:PS 0|1:12,2:14:48:0|1:1344_G_A:48,0,498:1344
#> 4 GT:AD:DP:GQ:PL 1/1:0,1054:1054:99:32704,3160,0
#> 5 GT:AD:DP:RGQ 0/0:0:0:0
#> 6 GT:AD:PGT:PID:PS .|.:0,0:0|1:251_T_C:251
#> 7 GT:DP:RGQ 0/0:3:16
Created on 2023-10-19 with reprex v2.0.2
What I would like to do, is to find a way to go over every row and separate the values in F1601764_S460_L002 to the corresponding columns given by the elements in the FORMAT field (separated by :).
I started by doing this with data.table::fread and tidyr::separate_wider_delim, thinking that all FORMAT field were: GT:AD:DP:RGQ or GT:AD:DP:GQ:PL. But this, obviously, yield problems.
I also tried the vcfR package, but I feel it's too slow for what I need to do.
What I would like to achieve, in the end, is using a tdyr approach to achieve something like this:
structure(list(`#CHROM` = c("rs6966331_chr7_37844191_EPDR1_T/C",
"rs7384878_chr7_100334426_SPDYE3_C/T", "rs11771145_chr7_143413669_EPHA1_A/G",
"rs10868366_chr9_86085145_GOLM1_G/T", "rs3851179_chr11_86157598_EED_T/C",
"rs11218343_chr11_121564878_SORL1_C/T", "rs16941239_chr16_86420604_FOXF1_A/T"
), POS = c(1384L, 1367L, 261L, 246L, 1000L, 722L, 1468L), AD = c(NA,
"12,2", "0,0", "0", "0,1054", NA, "0,0,0"), DP = c(3L, 14L, NA,
0L, 1054L, NA, NA), GQ = c(NA, 48L, NA, NA, 99L, NA, NA), GT = c("0/0",
"0|1", NA, "0/0", "1/1", "0/0", NA), PGT = c(NA, "0|1", "0|1",
NA, NA, NA, NA), PID = c(NA, "1344_G_A", "251_T_C", NA, NA, NA,
NA), PL = c(NA, "48,0,498", NA, NA, "32704,3160,0", NA, NA),
PS = c(NA, 1344L, 251L, NA, NA, NA, NA), RGQ = c(16L, NA,
NA, 0L, NA, NA, NA), SB = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), GT_alleles = c("G/G", "C|T", ".", "C/C",
"C/C", "AGG/AGG", ".")), row.names = c(NA, -7L), class = c("tbl_df",
"tbl", "data.frame"))
#> #CHROM POS AD DP GQ GT PGT PID
#> 1 rs6966331_chr7_37844191_EPDR1_T/C 1384 <NA> 3 NA 0/0 <NA> <NA>
#> 2 rs7384878_chr7_100334426_SPDYE3_C/T 1367 12,2 14 48 0|1 0|1 1344_G_A
#> 3 rs11771145_chr7_143413669_EPHA1_A/G 261 0,0 NA NA <NA> 0|1 251_T_C
#> 4 rs10868366_chr9_86085145_GOLM1_G/T 246 0 0 NA 0/0 <NA> <NA>
#> 5 rs3851179_chr11_86157598_EED_T/C 1000 0,1054 1054 99 1/1 <NA> <NA>
#> 6 rs11218343_chr11_121564878_SORL1_C/T 722 <NA> NA NA 0/0 <NA> <NA>
#> 7 rs16941239_chr16_86420604_FOXF1_A/T 1468 0,0,0 NA NA <NA> <NA> <NA>
#> PL PS RGQ SB GT_alleles
#> 1 <NA> NA 16 <NA> G/G
#> 2 48,0,498 1344 NA <NA> C|T
#> 3 <NA> 251 NA <NA> .
#> 4 <NA> NA 0 <NA> C/C
#> 5 32704,3160,0 NA NA <NA> C/C
#> 6 <NA> NA NA <NA> AGG/AGG
#> 7 <NA> NA NA <NA> .
Created on 2023-10-19 with reprex v2.0.2
I imagine this isn't an easy task, but if someone has some ideas, or already came across a similar issue... your help would be invaluable.
Do you mean something like this?
Edit: we don't technically need an unnest/pivot operation here, with larger data they might be a little costly. Try this brute-force that only adds columns:
You can look at the value after
bind_rows()to see just the new columns being added.You suggested in a comment that you always have the same columns, I don't think that necessarily helps us here: we might use regex to find each of
"GT","DP", etc inFORMAT, but since we don't have all of them every row, we don't know which position they are in ahead of time, making extraction and pairing with theF16*field values a bit onerous. I think using the samestrsplitapproach as the first, but manually creating the columns, is a better plan for larger data.