如何解决使用dada2、decipher、phangorn进行微生物组分析
我正在研究通过从土壤中提取的 Minion/纳米孔方法生成的测序数据集。我的数据是单次读取的,采用 fastaq 格式,我目前只处理从条码 1 收集的 29 个样本集。
我正在尝试通过“微生物组数据分析工作流程:从原始读数到社区分析”来分析我的数据。由 Callahan et.al (https://bioconductor.org/help/course-materials/2017/BioC2017/Day1/Workshops/Microbiome/MicrobiomeWorkflowII.html) 所描述,除了我使用此处描述的单读取方法的 DADA2 部分:https://benjjneb.github.io/dada2/bigdata.html
这是我的代码(有些部分是概括/简化+注释):
miseq_path <- "data path"
filtpath <- file.path(miseq_path,"filtered") # Filtered files go into the filtered/ subdirectory
fns <- list.files(miseq_path,pattern="some pattern") #
filtfns <- paste0(fns,"_filt.fastq.gz") # to get the "gz" exctension
filtfns
# Filtering
out <- filterandTrim(file.path(miseq_path,fns),file.path(filtpath,filtfns),truncLen=240,maxEE=100,truncQ=1,rm.phix=TRUE,compress=TRUE,verbose=TRUE,multithread=TRUE)
print(out) # Only included some of the samples
> print(out)
reads.in reads.out
FAO01497_pass_barcode01_8b1d707b_0.fastq 4000 1302
FAO01497_pass_barcode01_8b1d707b_1.fastq 4000 1318
FAO01497_pass_barcode01_8b1d707b_10.fastq 4000 1254
FAO01497_pass_barcode01_8b1d707b_11.fastq 4000 1238
FAO01497_pass_barcode01_8b1d707b_12.fastq 4000 1120
FAO01497_pass_barcode01_8b1d707b_13.fastq 4000 1185
FAO01497_pass_barcode01_8b1d707b_14.fastq 4000 1156
FAO01497_pass_barcode01_8b1d707b_15.fastq 4000 1117
我有点困惑,通过过滤的样本太少,但我对参数进行了很多尝试,这是我可以从过滤中获得的最多样本。
library(dada2);
# File parsing
filtpath <- "path to filtered" #
filts <- list.files(filtpath,pattern="_filt.fastq.gz",full.names=TRUE)
sample.names <- sapply(strsplit(basename(filts),"_"),`[`,5 ) # just to change sample names
sample.names
sample.names
[1] "0.fastq" "1.fastq" "10.fastq" "11.fastq" "12.fastq" "13.fastq" "14.fastq" "15.fastq" "16.fastq" "17.fastq"
[11] "18.fastq" "19.fastq" "2.fastq" "20.fastq" "21.fastq" "22.fastq" "23.fastq" "24.fastq" "25.fastq" "26.fastq"
[21] "27.fastq" "28.fastq" "3.fastq" "4.fastq" "5.fastq" "6.fastq" "7.fastq" "8.fastq" "9.fastq"
names(filts) <- sample.names
names(filts)
# Learn error rates
set.seed(100)
err <- learnErrors(filts,nbases = 1e8,multithread=TRUE,randomize=TRUE)
> err <- learnErrors(filts,randomize=TRUE)
8365680 total bases in 34857 reads from 29 samples will be used for learning the error rates.
# Infer sequence variants
> dds <- vector("list",length(sample.names))
> names(dds) <- sample.names
> for(sam in sample.names) {
+ cat("Processing:",sam,"\n")
+ derep <- derepFastq(filts[[sam]])
+ dds[[sam]] <- dada(derep,err=err,multithread=TRUE)
+ }
Processing: 0.fastq
Sample 1 - 1302 reads in 1302 unique sequences.
Processing: 1.fastq
Sample 1 - 1318 reads in 1318 unique sequences.
Processing: 10.fastq
Sample 1 - 1254 reads in 1254 unique sequences.
Processing: 11.fastq
Sample 1 - 1238 reads in 1238 unique sequences.
Processing: 12.fastq
Sample 1 - 1120 reads in 1120 unique sequences.
Processing: 13.fastq
Sample 1 - 1185 reads in 1185 unique sequences.
Processing: 14.fastq
Sample 1 - 1156 reads in 1156 unique sequences.
Processing: 15.fastq
Sample 1 - 1117 reads in 1117 unique sequences.
Processing: 16.fastq
Sample 1 - 1120 reads in 1120 unique sequences.
Processing: 17.fastq
Sample 1 - 1114 reads in 1114 unique sequences.
Processing: 18.fastq
Sample 1 - 1134 reads in 1134 unique sequences.
Processing: 19.fastq
Sample 1 - 1208 reads in 1208 unique sequences.
Processing: 2.fastq
Sample 1 - 1294 reads in 1294 unique sequences.
Processing: 20.fastq
Sample 1 - 1216 reads in 1216 unique sequences.
Processing: 21.fastq
Sample 1 - 1193 reads in 1193 unique sequences.
Processing: 22.fastq
Sample 1 - 1208 reads in 1208 unique sequences.
Processing: 23.fastq
Sample 1 - 1194 reads in 1194 unique sequences.
Processing: 24.fastq
Sample 1 - 1222 reads in 1222 unique sequences.
Processing: 25.fastq
Sample 1 - 1206 reads in 1206 unique sequences.
Processing: 26.fastq
Sample 1 - 1199 reads in 1199 unique sequences.
Processing: 27.fastq
Sample 1 - 1202 reads in 1202 unique sequences.
Processing: 28.fastq
Sample 1 - 982 reads in 982 unique sequences.
Processing: 3.fastq
Sample 1 - 1226 reads in 1226 unique sequences.
Processing: 4.fastq
Sample 1 - 1255 reads in 1255 unique sequences.
Processing: 5.fastq
Sample 1 - 1287 reads in 1287 unique sequences.
Processing: 6.fastq
Sample 1 - 1270 reads in 1270 unique sequences.
Processing: 7.fastq
Sample 1 - 1214 reads in 1214 unique sequences.
Processing: 8.fastq
Sample 1 - 1233 reads in 1233 unique sequences.
Processing: 9.fastq
Sample 1 - 1190 reads in 1190 unique sequences.
好的,读取样本。
# Construct sequence table and write to disk
seqtab <- makeSequenceTable(dds)
saveRDS(seqtab,"miseq_path") # this table look completely empty [see picture][1]
tax <- assignTaxonomy(seqtab,"path/silva_nr_v132_train_set.fa.gz",multithread=TRUE)
saveRDS(seqtab,"miseq_path")
saveRDS(tax,"miseq_path")
> taxTab <- assignTaxonomy(seqtab,refFasta = fastaRef,multithread=TRUE)
> unname(head(taxTab))
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "Bacteria" "Proteobacteria" "Betaproteobacteria" "Burkholderiales" NA NA
[2,] "Bacteria" "Proteobacteria" "Alphaproteobacteria" NA NA NA
[3,] "Bacteria" NA NA NA NA NA
[4,] "Bacteria" "Firmicutes" "Clostridia" NA NA NA
[5,] "Bacteria" "Actinobacteria" NA NA NA NA
[6,] "Bacteria" NA
为什么我这里的物种这么少??
#Construct phylogenetic tree
seqs <- getSequences(seqtab)
names(seqs) <- seqs # This propagates to the tip labels of the tree
alignment <- AlignSeqs(DNAStringSet(seqs),anchor=NA,verbose=FALSE)
require(phangorn)
phangalign <- phyDat(as(alignment,"matrix"),type="DNA")
dm <- dist.ml(phangalign)
treeNJ <- NJ(dm) # Note,tip order != sequence order
fit = pml(treeNJ,data=phangalign)
fitGTR <- update(fit,k=4,inv=0.2)
fitGTR <- optim.pml(fitGTR,model="GTR",optInv=TRUE,optGamma=TRUE,rearrangement = "stochastic",control = pml.control(trace = 0))
detach("package:phangorn",unload=TRUE)
require(phyloseq)
> samdf <- read.csv("https://raw.githubusercontent.com/spholmes/F1000_workflow/master/data/MIMARKS_Data_combined.csv",header=TRUE)
> samdf$SampleID <- paste0(gsub("00","",samdf$host_subject_id),"D",samdf$age-21)
> samdf <- samdf[!duplicated(samdf$SampleID),] # Remove dupicate entries for reverse reads
> rownames(seqtab) <- gsub("124","125",rownames(seqtab)) # Fix discrepancy
> all(rownames(seqtab) %in% samdf$SampleID) # TRUE
[1] FALSE
为什么??????
> rownames(samdf) <- samdf$SampleID
> keep.cols <- c("collection_date","biome","target_gene","target_subfragment",+ "host_common_name","host_subject_id","age","sex","body_product","tot_mass",+ "diet","family_relationship","genotype","SampleID")
> samdf <- samdf[rownames(seqtab),keep.cols]
> ps <- phyloseq(otu_table(seqtab,taxa_are_rows=FALSE),+ sample_data(samdf),+ tax_table(taxTab),phy_tree(fitGTR$tree))
Error in validobject(.Object) : invalid class “phyloseq” object:
Component sample names do not match.
Try sample_names()
[1]: https://i.stack.imgur.com/41baN.png
我认为可能错误在于样本名称的数量与表中的名称不匹配。 我很抱歉我的代码不整洁,我对 R 很陌生。如果有人对如何解决这个问题有任何建议,我将不胜感激。如果有关于我的代码的更多信息应该提交,请告诉我。
最好,
S
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。