使用dada2、decipher、phangorn进行微生物组分析

如何解决使用dada2、decipher、phangorn进行微生物组分析

我正在研究通过从土壤中提取的 Minion/纳米孔方法生成的测序数据集。我的数据是单次读取的，采用 fastaq 格式，我目前只处理从条码 1 收集的 29 个样本集。

我正在尝试通过“微生物组数据分析工作流程：从原始读数到社区分析”来分析我的数据。由 Callahan et.al (https://bioconductor.org/help/course-materials/2017/BioC2017/Day1/Workshops/Microbiome/MicrobiomeWorkflowII.html) 所描述，除了我使用此处描述的单读取方法的 DADA2 部分：https://benjjneb.github.io/dada2/bigdata.html

这是我的代码（有些部分是概括/简化+注释）：

miseq_path <- "data path" 

filtpath <- file.path(miseq_path,"filtered") # Filtered files go into the filtered/ subdirectory


fns <- list.files(miseq_path,pattern="some pattern") # 
filtfns <- paste0(fns,"_filt.fastq.gz") # to get the "gz" exctension
filtfns

# Filtering
out <- filterandTrim(file.path(miseq_path,fns),file.path(filtpath,filtfns),truncLen=240,maxEE=100,truncQ=1,rm.phix=TRUE,compress=TRUE,verbose=TRUE,multithread=TRUE)

print(out) # Only included some of the samples 

> print(out)
                                          reads.in reads.out
FAO01497_pass_barcode01_8b1d707b_0.fastq      4000      1302
FAO01497_pass_barcode01_8b1d707b_1.fastq      4000      1318
FAO01497_pass_barcode01_8b1d707b_10.fastq     4000      1254
FAO01497_pass_barcode01_8b1d707b_11.fastq     4000      1238
FAO01497_pass_barcode01_8b1d707b_12.fastq     4000      1120
FAO01497_pass_barcode01_8b1d707b_13.fastq     4000      1185
FAO01497_pass_barcode01_8b1d707b_14.fastq     4000      1156
FAO01497_pass_barcode01_8b1d707b_15.fastq     4000      1117

我有点困惑，通过过滤的样本太少，但我对参数进行了很多尝试，这是我可以从过滤中获得的最多样本。

library(dada2); 
# File parsing
filtpath <- "path to filtered" # 
filts <- list.files(filtpath,pattern="_filt.fastq.gz",full.names=TRUE) 
sample.names <- sapply(strsplit(basename(filts),"_"),`[`,5 ) # just to change sample names
sample.names

sample.names
 [1] "0.fastq"  "1.fastq"  "10.fastq" "11.fastq" "12.fastq" "13.fastq" "14.fastq" "15.fastq" "16.fastq" "17.fastq"
[11] "18.fastq" "19.fastq" "2.fastq"  "20.fastq" "21.fastq" "22.fastq" "23.fastq" "24.fastq" "25.fastq" "26.fastq"
[21] "27.fastq" "28.fastq" "3.fastq"  "4.fastq"  "5.fastq"  "6.fastq"  "7.fastq"  "8.fastq"  "9.fastq" 


names(filts) <- sample.names
names(filts)
# Learn error rates
set.seed(100)
err <- learnErrors(filts,nbases = 1e8,multithread=TRUE,randomize=TRUE)

> err <- learnErrors(filts,randomize=TRUE)
8365680 total bases in 34857 reads from 29 samples will be used for learning the error rates.



 # Infer sequence variants
> dds <- vector("list",length(sample.names))
> names(dds) <- sample.names
> for(sam in sample.names) {
+   cat("Processing:",sam,"\n")
+   derep <- derepFastq(filts[[sam]])
+   dds[[sam]] <- dada(derep,err=err,multithread=TRUE)
+ }
Processing: 0.fastq 
Sample 1 - 1302 reads in 1302 unique sequences.
Processing: 1.fastq 
Sample 1 - 1318 reads in 1318 unique sequences.
Processing: 10.fastq 
Sample 1 - 1254 reads in 1254 unique sequences.
Processing: 11.fastq 
Sample 1 - 1238 reads in 1238 unique sequences.
Processing: 12.fastq 
Sample 1 - 1120 reads in 1120 unique sequences.
Processing: 13.fastq 
Sample 1 - 1185 reads in 1185 unique sequences.
Processing: 14.fastq 
Sample 1 - 1156 reads in 1156 unique sequences.
Processing: 15.fastq 
Sample 1 - 1117 reads in 1117 unique sequences.
Processing: 16.fastq 
Sample 1 - 1120 reads in 1120 unique sequences.
Processing: 17.fastq 
Sample 1 - 1114 reads in 1114 unique sequences.
Processing: 18.fastq 
Sample 1 - 1134 reads in 1134 unique sequences.
Processing: 19.fastq 
Sample 1 - 1208 reads in 1208 unique sequences.
Processing: 2.fastq 
Sample 1 - 1294 reads in 1294 unique sequences.
Processing: 20.fastq 
Sample 1 - 1216 reads in 1216 unique sequences.
Processing: 21.fastq 
Sample 1 - 1193 reads in 1193 unique sequences.
Processing: 22.fastq 
Sample 1 - 1208 reads in 1208 unique sequences.
Processing: 23.fastq 
Sample 1 - 1194 reads in 1194 unique sequences.
Processing: 24.fastq 
Sample 1 - 1222 reads in 1222 unique sequences.
Processing: 25.fastq 
Sample 1 - 1206 reads in 1206 unique sequences.
Processing: 26.fastq 
Sample 1 - 1199 reads in 1199 unique sequences.
Processing: 27.fastq 
Sample 1 - 1202 reads in 1202 unique sequences.
Processing: 28.fastq 
Sample 1 - 982 reads in 982 unique sequences.
Processing: 3.fastq 
Sample 1 - 1226 reads in 1226 unique sequences.
Processing: 4.fastq 
Sample 1 - 1255 reads in 1255 unique sequences.
Processing: 5.fastq 
Sample 1 - 1287 reads in 1287 unique sequences.
Processing: 6.fastq 
Sample 1 - 1270 reads in 1270 unique sequences.
Processing: 7.fastq 
Sample 1 - 1214 reads in 1214 unique sequences.
Processing: 8.fastq 
Sample 1 - 1233 reads in 1233 unique sequences.
Processing: 9.fastq 
Sample 1 - 1190 reads in 1190 unique sequences.

好的，读取样本。

# Construct sequence table and write to disk
seqtab <- makeSequenceTable(dds)
saveRDS(seqtab,"miseq_path") # this table look completely empty [see picture][1]



tax <- assignTaxonomy(seqtab,"path/silva_nr_v132_train_set.fa.gz",multithread=TRUE)

saveRDS(seqtab,"miseq_path") 
saveRDS(tax,"miseq_path") 

> taxTab <- assignTaxonomy(seqtab,refFasta = fastaRef,multithread=TRUE)
> unname(head(taxTab))
     [,1]       [,2]             [,3]                  [,4]              [,5] [,6]
[1,] "Bacteria" "Proteobacteria" "Betaproteobacteria"  "Burkholderiales" NA   NA  
[2,] "Bacteria" "Proteobacteria" "Alphaproteobacteria" NA                NA   NA  
[3,] "Bacteria" NA               NA                    NA                NA   NA  
[4,] "Bacteria" "Firmicutes"     "Clostridia"          NA                NA   NA  
[5,] "Bacteria" "Actinobacteria" NA                    NA                NA   NA  
[6,] "Bacteria" NA

为什么我这里的物种这么少？？

 
#Construct phylogenetic tree

seqs <- getSequences(seqtab)
names(seqs) <- seqs # This propagates to the tip labels of the tree
alignment <- AlignSeqs(DNAStringSet(seqs),anchor=NA,verbose=FALSE)
require(phangorn)
phangalign <- phyDat(as(alignment,"matrix"),type="DNA")
dm <- dist.ml(phangalign)
treeNJ <- NJ(dm) # Note,tip order != sequence order
fit = pml(treeNJ,data=phangalign)
fitGTR <- update(fit,k=4,inv=0.2)
fitGTR <- optim.pml(fitGTR,model="GTR",optInv=TRUE,optGamma=TRUE,rearrangement = "stochastic",control = pml.control(trace = 0))
detach("package:phangorn",unload=TRUE)

require(phyloseq)
> samdf <- read.csv("https://raw.githubusercontent.com/spholmes/F1000_workflow/master/data/MIMARKS_Data_combined.csv",header=TRUE)
> samdf$SampleID <- paste0(gsub("00","",samdf$host_subject_id),"D",samdf$age-21)
> samdf <- samdf[!duplicated(samdf$SampleID),] # Remove dupicate entries for reverse reads
> rownames(seqtab) <- gsub("124","125",rownames(seqtab)) # Fix discrepancy
> all(rownames(seqtab) %in% samdf$SampleID) # TRUE
[1] FALSE

为什么？？？？？？

> rownames(samdf) <- samdf$SampleID
> keep.cols <- c("collection_date","biome","target_gene","target_subfragment",+                "host_common_name","host_subject_id","age","sex","body_product","tot_mass",+                "diet","family_relationship","genotype","SampleID") 
> samdf <- samdf[rownames(seqtab),keep.cols]
> ps <- phyloseq(otu_table(seqtab,taxa_are_rows=FALSE),+                sample_data(samdf),+                tax_table(taxTab),phy_tree(fitGTR$tree))

Error in validobject(.Object) : invalid class “phyloseq” object: 
 Component sample names do not match.
 Try sample_names()

  [1]: https://i.stack.imgur.com/41baN.png

我认为可能错误在于样本名称的数量与表中的名称不匹配。我很抱歉我的代码不整洁，我对 R 很陌生。如果有人对如何解决这个问题有任何建议，我将不胜感激。如果有关于我的代码的更多信息应该提交，请告诉我。

最好，

使用dada2、decipher、phangorn进行微生物组分析

如何解决使用dada2、decipher、phangorn进行微生物组分析

相关推荐