I am trying to load a vcf file (created using GATK
using data.table::fread
, and then parse the results of the FORMAT
and sample
fields, into different columns. The column names are given by the FORMAT
field, with each element separated by a :
. and the values for each of these columns is given by the sample
field, with each corresponding value in the same order as the names in the FORMAT
field, and equally separated by :
. The issue I'm facing is that, for each VCF file, I have different values for the FORMAT field, at different rows.
For example, I extracted one row for each unique value found in the FORMAT
field:
structure(list(
`#CHROM` = c("rs11218343_chr11_121564878_SORL1_C/T","rs16941239_chr16_86420604_FOXF1_A/T",
"rs7384878_chr7_100334426_SPDYE3_C/T", "rs3851179_chr11_86157598_EED_T/C",
"rs10868366_chr9_86085145_GOLM1_G/T", "rs11771145_chr7_143413669_EPHA1_A/G",
"rs6966331_chr7_37844191_EPDR1_T/C"),
POS = c(722L, 1468L, 1367L, 1000L, 246L, 261L, 1384L),
ID = c(".", ".", ".", ".", ".", ".", "."),
REF = c("AGG", "G", "C", "T", "C", "T", "G"),
ALT = c(".", "*,GA", "T", "C", ".", "C", "."),
QUAL = c("29.91", ".", "40.64", "32690.06", ".", ".", "0.64"),
FILTER = c("LowQual", ".", ".", ".", ".", ".", "LowQual"),
INFO = c("DP=5;MLEAC=.;MLEAF=.;MQ=26.79", ".",
"AC=1;AF=0.500;AN=2;BaseQRankSum=1.29;DP=20;ExcessHet=0.0000;FS=2.963;MLEAC=1;MLEAF=0.500;MQ=43.87;MQRankSum=-1.058e+00;QD=2.90;ReadPosRankSum=0.697;SOR=1.802",
"AC=2;AF=1.00;AN=2;DP=1097;ExcessHet=0.0000;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=31.02;SOR=0.776",
".", ".", "BaseQRankSum=-9.670e-01;DP=3;ExcessHet=0.00;MLEAC=.;MLEAF=.;MQ=60.00;MQRankSum=0.00;ReadPosRankSum=0.967"),
FORMAT = c("GT", "GT:AD", "GT:AD:DP:GQ:PGT:PID:PL:PS", "GT:AD:DP:GQ:PL",
"GT:AD:DP:RGQ", "GT:AD:PGT:PID:PS", "GT:DP:RGQ"),
F1601764_S460_L002 = c("0/0", "./.:0,0,0", "0|1:12,2:14:48:0|1:1344_G_A:48,0,498:1344",
"1/1:0,1054:1054:99:32704,3160,0", "0/0:0:0:0", ".|.:0,0:0|1:251_T_C:251",
"0/0:3:16")),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -7L))
#> #CHROM POS ID REF ALT QUAL FILTER
#> 1 rs11218343_chr11_121564878_SORL1_C/T 722 . AGG . 29.91 LowQual
#> 2 rs16941239_chr16_86420604_FOXF1_A/T 1468 . G *,GA . .
#> 3 rs7384878_chr7_100334426_SPDYE3_C/T 1367 . C T 40.64 .
#> 4 rs3851179_chr11_86157598_EED_T/C 1000 . T C 32690.06 .
#> 5 rs10868366_chr9_86085145_GOLM1_G/T 246 . C . . .
#> 6 rs11771145_chr7_143413669_EPHA1_A/G 261 . T C . .
#> 7 rs6966331_chr7_37844191_EPDR1_T/C 1384 . G . 0.64 LowQual
#> INFO
#> 1 DP=5;MLEAC=.;MLEAF=.;MQ=26.79
#> 2 .
#> 3 AC=1;AF=0.500;AN=2;BaseQRankSum=1.29;DP=20;ExcessHet=0.0000;FS=2.963;MLEAC=1;MLEAF=0.500;MQ=43.87;MQRankSum=-1.058e+00;QD=2.90;ReadPosRankSum=0.697;SOR=1.802
#> 4 AC=2;AF=1.00;AN=2;DP=1097;ExcessHet=0.0000;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=31.02;SOR=0.776
#> 5 .
#> 6 .
#> 7 BaseQRankSum=-9.670e-01;DP=3;ExcessHet=0.00;MLEAC=.;MLEAF=.;MQ=60.00;MQRankSum=0.00;ReadPosRankSum=0.967
#> FORMAT F1601764_S460_L002
#> 1 GT 0/0
#> 2 GT:AD ./.:0,0,0
#> 3 GT:AD:DP:GQ:PGT:PID:PL:PS 0|1:12,2:14:48:0|1:1344_G_A:48,0,498:1344
#> 4 GT:AD:DP:GQ:PL 1/1:0,1054:1054:99:32704,3160,0
#> 5 GT:AD:DP:RGQ 0/0:0:0:0
#> 6 GT:AD:PGT:PID:PS .|.:0,0:0|1:251_T_C:251
#> 7 GT:DP:RGQ 0/0:3:16
Created on 2023-10-19 with reprex v2.0.2
What I would like to do, is to find a way to go over every row and separate the values in F1601764_S460_L002
to the corresponding columns given by the elements in the FORMAT
field (separated by :
).
I started by doing this with data.table::fread
and tidyr::separate_wider_delim
, thinking that all FORMAT
field were: GT:AD:DP:RGQ
or GT:AD:DP:GQ:PL
. But this, obviously, yield problems.
I also tried the vcfR
package, but I feel it's too slow for what I need to do.
What I would like to achieve, in the end, is using a tdyr
approach to achieve something like this:
structure(list(`#CHROM` = c("rs6966331_chr7_37844191_EPDR1_T/C",
"rs7384878_chr7_100334426_SPDYE3_C/T", "rs11771145_chr7_143413669_EPHA1_A/G",
"rs10868366_chr9_86085145_GOLM1_G/T", "rs3851179_chr11_86157598_EED_T/C",
"rs11218343_chr11_121564878_SORL1_C/T", "rs16941239_chr16_86420604_FOXF1_A/T"
), POS = c(1384L, 1367L, 261L, 246L, 1000L, 722L, 1468L), AD = c(NA,
"12,2", "0,0", "0", "0,1054", NA, "0,0,0"), DP = c(3L, 14L, NA,
0L, 1054L, NA, NA), GQ = c(NA, 48L, NA, NA, 99L, NA, NA), GT = c("0/0",
"0|1", NA, "0/0", "1/1", "0/0", NA), PGT = c(NA, "0|1", "0|1",
NA, NA, NA, NA), PID = c(NA, "1344_G_A", "251_T_C", NA, NA, NA,
NA), PL = c(NA, "48,0,498", NA, NA, "32704,3160,0", NA, NA),
PS = c(NA, 1344L, 251L, NA, NA, NA, NA), RGQ = c(16L, NA,
NA, 0L, NA, NA, NA), SB = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), GT_alleles = c("G/G", "C|T", ".", "C/C",
"C/C", "AGG/AGG", ".")), row.names = c(NA, -7L), class = c("tbl_df",
"tbl", "data.frame"))
#> #CHROM POS AD DP GQ GT PGT PID
#> 1 rs6966331_chr7_37844191_EPDR1_T/C 1384 <NA> 3 NA 0/0 <NA> <NA>
#> 2 rs7384878_chr7_100334426_SPDYE3_C/T 1367 12,2 14 48 0|1 0|1 1344_G_A
#> 3 rs11771145_chr7_143413669_EPHA1_A/G 261 0,0 NA NA <NA> 0|1 251_T_C
#> 4 rs10868366_chr9_86085145_GOLM1_G/T 246 0 0 NA 0/0 <NA> <NA>
#> 5 rs3851179_chr11_86157598_EED_T/C 1000 0,1054 1054 99 1/1 <NA> <NA>
#> 6 rs11218343_chr11_121564878_SORL1_C/T 722 <NA> NA NA 0/0 <NA> <NA>
#> 7 rs16941239_chr16_86420604_FOXF1_A/T 1468 0,0,0 NA NA <NA> <NA> <NA>
#> PL PS RGQ SB GT_alleles
#> 1 <NA> NA 16 <NA> G/G
#> 2 48,0,498 1344 NA <NA> C|T
#> 3 <NA> 251 NA <NA> .
#> 4 <NA> NA 0 <NA> C/C
#> 5 32704,3160,0 NA NA <NA> C/C
#> 6 <NA> NA NA <NA> AGG/AGG
#> 7 <NA> NA NA <NA> .
Created on 2023-10-19 with reprex v2.0.2
I imagine this isn't an easy task, but if someone has some ideas, or already came across a similar issue... your help would be invaluable.
Do you mean something like this?
Edit: we don't technically need an unnest/pivot operation here, with larger data they might be a little costly. Try this brute-force that only adds columns:
You can look at the value after
bind_rows()
to see just the new columns being added.You suggested in a comment that you always have the same columns, I don't think that necessarily helps us here: we might use regex to find each of
"GT"
,"DP"
, etc inFORMAT
, but since we don't have all of them every row, we don't know which position they are in ahead of time, making extraction and pairing with theF16*
field values a bit onerous. I think using the samestrsplit
approach as the first, but manually creating the columns, is a better plan for larger data.