summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael R. Crusoe <michael.crusoe@gmail.com>2020-02-17 15:49:34 +0100
committerMichael R. Crusoe <michael.crusoe@gmail.com>2020-02-17 15:49:34 +0100
commit6631792112a86a455c6a2c176a95efe09dcd6111 (patch)
tree955e57decb355d6be36108ce003543bf50fba072
Import rsem_1.3.3+dfsg.orig.tar.xz
[dgit import orig rsem_1.3.3+dfsg.orig.tar.xz]
-rw-r--r--.gitignore37
-rw-r--r--AlignerRefSeqPolicy.h19
-rw-r--r--BamConverter.h305
-rw-r--r--BamWriter.h148
-rw-r--r--Buffer.h83
-rw-r--r--COPYING674
-rw-r--r--EBSeq/Makefile16
-rw-r--r--EBSeq/calcClusteringInfo.cpp144
-rwxr-xr-xEBSeq/install24
-rwxr-xr-xEBSeq/rsem-for-ebseq-find-DE73
-rwxr-xr-xEBSeq/rsem-for-ebseq-generate-ngvector-from-clustering-info18
-rw-r--r--EM.cpp675
-rw-r--r--GTFItem.h184
-rw-r--r--Gibbs.cpp530
-rw-r--r--GroupInfo.h55
-rw-r--r--HitContainer.h118
-rw-r--r--HitWrapper.h35
-rw-r--r--LenDist.h296
-rw-r--r--Makefile179
-rw-r--r--Model.h7
-rw-r--r--ModelParams.h39
-rw-r--r--NoiseProfile.h159
-rw-r--r--NoiseQProfile.h181
-rw-r--r--Orientation.h42
-rw-r--r--PairedEndHit.h36
-rw-r--r--PairedEndModel.h461
-rw-r--r--PairedEndQModel.h481
-rw-r--r--PairedEndRead.h67
-rw-r--r--PairedEndReadQ.h67
-rw-r--r--PolyARules.h61
-rw-r--r--Profile.h220
-rw-r--r--QProfile.h208
-rw-r--r--QualDist.h151
-rw-r--r--README.md703
-rw-r--r--RSPD.h206
-rw-r--r--Read.h23
-rw-r--r--ReadIndex.h59
-rw-r--r--ReadReader.h118
-rw-r--r--RefSeq.h140
-rw-r--r--RefSeqPolicy.h22
-rw-r--r--Refs.h159
-rw-r--r--SamHeader.cpp111
-rw-r--r--SamHeader.hpp71
-rw-r--r--SamParser.h268
-rw-r--r--SingleHit.h56
-rw-r--r--SingleModel.h526
-rw-r--r--SingleQModel.h546
-rw-r--r--SingleRead.h92
-rw-r--r--SingleReadQ.h97
-rw-r--r--Transcript.h169
-rw-r--r--Transcripts.h145
-rw-r--r--WHAT_IS_NEW327
-rw-r--r--WriteResults.h637
-rw-r--r--bam2readdepth.cpp27
-rw-r--r--bam2wig.cpp27
-rw-r--r--bc_aux.h120
-rw-r--r--buildReadIndex.cpp86
-rw-r--r--calcCI.cpp581
-rw-r--r--cnt_file_description.txt15
-rwxr-xr-xconvert-sam-for-rsem111
-rwxr-xr-xextract-transcript-to-gene-map-from-trinity34
-rw-r--r--extractRef.cpp376
-rw-r--r--getUnique.cpp83
-rw-r--r--model_file_description.txt74
-rw-r--r--my_assert.h107
-rw-r--r--pRSEM/ChIPSeqExperiment.py257
-rw-r--r--pRSEM/ChIPSeqReplicate.py43
-rw-r--r--pRSEM/File.py47
-rw-r--r--pRSEM/Gene.py143
-rw-r--r--pRSEM/Makefile35
-rw-r--r--pRSEM/Param.py178
-rw-r--r--pRSEM/Prsem.py253
-rw-r--r--pRSEM/Transcript.py189
-rw-r--r--pRSEM/Util.py173
-rw-r--r--pRSEM/filterSam2Bed.c52
-rw-r--r--pRSEM/idrCode/README.txt146
-rw-r--r--pRSEM/idrCode/batch-consistency-analysis.r164
-rw-r--r--pRSEM/idrCode/batch-consistency-plot-merged2.r213
-rw-r--r--pRSEM/idrCode/batch-consistency-plot.r67
-rw-r--r--pRSEM/idrCode/functions-all-clayton-12-13.r3182
-rw-r--r--pRSEM/idrCode/idrOverlap2npk.sh37
-rw-r--r--pRSEM/idrCode/peakCallingPipelineForIdr.txt11
-rw-r--r--pRSEM/idrCode/submit.idrmerge.lsf.sh72
-rw-r--r--pRSEM/idrCode/submit.idrpair.lsf.sh90
-rw-r--r--pRSEM/installRLib.R71
-rw-r--r--pRSEM/phantompeakqualtools/README.txt203
-rw-r--r--pRSEM/phantompeakqualtools/peakCallingPipelineForIdr.txt1
-rw-r--r--pRSEM/phantompeakqualtools/run_spp.R885
-rw-r--r--pRSEM/phantompeakqualtools/run_spp_nodups.R886
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/DESCRIPTION12
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/NAMESPACE3
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/R/zroutines.R2501
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/configure3856
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/configure.ac7
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/add.broad.peak.regions.Rd27
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/find.binding.positions.Rd128
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.binding.characteristics.Rd55
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.broad.enrichment.clusters.Rd27
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.conservative.fold.enrichment.profile.Rd59
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.mser.Rd46
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.mser.interpolation.Rd56
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.smoothed.enrichment.mle.Rd35
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.smoothed.tag.density.Rd45
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/output.binding.results.Rd24
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bam.tags.Rd24
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bin.maqmap.tags.Rd23
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bowtie.tags.Rd23
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.eland.tags.Rd30
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.maqmap.tags.Rd23
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.meland.tags.Rd29
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/remove.local.tag.anomalies.Rd46
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/select.informative.tags.Rd29
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/spp-package.Rd144
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/write.broadpeak.info.Rd16
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/write.narrowpeak.binding.Rd21
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/writewig.Rd31
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BGZF.cpp398
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BGZF.h322
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAlignment.cpp696
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAlignment.h203
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAux.h227
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamIndex.cpp230
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamIndex.h145
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamMultiReader.cpp450
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamMultiReader.h136
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader.cpp66
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader.h130
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader_p.cpp729
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader_p.h137
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamStandardIndex_p.cpp910
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamStandardIndex_p.h213
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamToolsIndex_p.cpp577
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamToolsIndex_p.h192
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter.cpp47
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter.h50
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter_p.cpp379
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter_p.h63
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/Makevars.in4
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/api_global.h22
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bamread.cpp222
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bamtools_global.h32
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bed2vector.cpp2628
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/cdensum.c144
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/const.h18
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqmap.c164
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqmap.h70
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqread.cpp207
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/pc.h20
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/peaks.cpp804
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/wdl.cpp657
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/DESCRIPTION12
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/NAMESPACE3
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/R/zroutines.R2501
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/configure3856
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/configure.ac7
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/add.broad.peak.regions.Rd27
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/find.binding.positions.Rd130
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.binding.characteristics.Rd58
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.broad.enrichment.clusters.Rd27
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.conservative.fold.enrichment.profile.Rd59
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.mser.Rd46
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.mser.interpolation.Rd56
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.smoothed.enrichment.mle.Rd35
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.smoothed.tag.density.Rd45
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/output.binding.results.Rd26
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bam.tags.Rd24
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bin.maqmap.tags.Rd23
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bowtie.tags.Rd23
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.eland.tags.Rd30
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.maqmap.tags.Rd23
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.meland.tags.Rd29
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/remove.local.tag.anomalies.Rd46
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/select.informative.tags.Rd29
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/spp-package.Rd144
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/write.broadpeak.info.Rd16
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/write.narrowpeak.binding.Rd21
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/writewig.Rd31
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BGZF.cpp398
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BGZF.h322
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAlignment.cpp696
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAlignment.h203
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAux.h227
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamIndex.cpp230
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamIndex.h145
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamMultiReader.cpp450
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamMultiReader.h136
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader.cpp66
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader.h130
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader_p.cpp729
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader_p.h137
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamStandardIndex_p.cpp910
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamStandardIndex_p.h213
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamToolsIndex_p.cpp577
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamToolsIndex_p.h192
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter.cpp47
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter.h50
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter_p.cpp379
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter_p.h63
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/Makevars.in4
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/api_global.h22
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bamread.cpp224
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bamtools_global.h32
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bed2vector.cpp2630
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/cdensum.c144
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/const.h18
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqmap.c164
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqmap.h70
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqread.cpp209
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/pc.h20
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/peaks.cpp808
-rwxr-xr-xpRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/wdl.cpp660
-rw-r--r--pRSEM/process-chipseq.R61
-rw-r--r--pRSEM/process-rnaseq.R982
-rwxr-xr-xpRSEM/prsem-calculate-expression74
-rwxr-xr-xpRSEM/prsem-prepare-reference70
-rwxr-xr-xpRSEM/prsem-testing-procedure104
-rw-r--r--parseIt.cpp230
-rw-r--r--preRef.cpp90
-rwxr-xr-xrsem-calculate-expression1651
-rwxr-xr-xrsem-control-fdr117
-rwxr-xr-xrsem-gen-transcript-plots173
-rwxr-xr-xrsem-generate-data-matrix77
-rwxr-xr-xrsem-generate-ngvector104
-rwxr-xr-xrsem-gff3-to-gtf309
-rwxr-xr-xrsem-plot-model169
-rwxr-xr-xrsem-plot-transcript-wiggles172
-rwxr-xr-xrsem-prepare-reference508
-rwxr-xr-xrsem-refseq-extract-primary-assembly18
-rwxr-xr-xrsem-run-ebseq128
-rwxr-xr-xrsem-run-prsem-testing-procedure324
-rw-r--r--rsem_perl_utils.pm109
-rw-r--r--samValidator.cpp193
-rw-r--r--sam_utils.h210
-rw-r--r--sampling.h67
-rw-r--r--scanForPairedEndReads.cpp137
-rw-r--r--simul.h45
-rw-r--r--simulation.cpp225
-rw-r--r--synthesisRef.cpp227
-rw-r--r--tbam2gbam.cpp36
-rw-r--r--utils.h166
-rw-r--r--wiggle.cpp139
-rw-r--r--wiggle.h49
242 files changed, 63515 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..85f72e6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,37 @@
+*.o
+*.a
+*~
+rsem-bam2readdepth
+rsem-bam2wig
+rsem-build-read-index
+rsem-calculate-credibility-intervals
+rsem-extract-reference-transcripts
+rsem-get-unique
+rsem-parse-alignments
+rsem-preref
+rsem-run-em
+rsem-run-gibbs
+rsem-simulate-reads
+rsem-synthesis-reference-transcripts
+rsem-tbam2gbam
+rsem-sam-validator
+rsem-scan-for-paired-end-reads
+EBSeq/rsem-for-ebseq-calculate-clustering-info
+pRSEM/*.pyc
+pRSEM/RLib/
+pRSEM/bigWigSummary
+pRSEM/filterSam2Bed
+pRSEM/phantompeakqualtools/spp_1.10.1_on_R*/config.status
+pRSEM/phantompeakqualtools/spp_1.10.1_on_R*/src/Makevars
+pRSEM/phantompeakqualtools/spp_1.10.1_on_R*/src/config.h
+samtools-*/samtools
+samtools-*/version.h
+samtools-*/test
+samtools-*/misc
+samtools-*/config.h
+samtools-*/config.mk
+samtools-*/config.status
+samtools-*/htslib-*/config.h
+samtools-*/htslib-*/config.mk
+samtools-*/htslib-*/config.status
+samtools-*/htslib-*/version.h
diff --git a/AlignerRefSeqPolicy.h b/AlignerRefSeqPolicy.h
new file mode 100644
index 0000000..a5d8e4b
--- /dev/null
+++ b/AlignerRefSeqPolicy.h
@@ -0,0 +1,19 @@
+#ifndef ALIGNERREFSEQPOLICY
+#define ALIGNERREFSEQPOLICY
+
+#include<string>
+
+#include "RefSeqPolicy.h"
+
+// RefSeqPolicy implementation whose convert() rewrites every 'N' as 'G'.
+class AlignerRefSeqPolicy : public RefSeqPolicy {
+ public :
+  // Returns a copy of rawseq in which each upper-case 'N' has been replaced
+  // by 'G'; every other character is passed through untouched.
+  std::string convert(const std::string& rawseq) {
+    std::string converted(rawseq);
+    const int len = (int)rawseq.size();
+    int pos = 0;
+    while (pos < len) {
+      char& base = converted[pos];
+      if (base == 'N') base = 'G';
+      ++pos;
+    }
+    return converted;
+  }
+};
+
+#endif
diff --git a/BamConverter.h b/BamConverter.h
new file mode 100644
index 0000000..f95385b
--- /dev/null
+++ b/BamConverter.h
@@ -0,0 +1,305 @@
+#ifndef BAMCONVERTER_H_
+#define BAMCONVERTER_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cassert>
+#include<string>
+#include<map>
+
+#include <stdint.h>
+#include "htslib/sam.h"
+#include "sam_utils.h"
+#include "SamHeader.hpp"
+
+#include "utils.h"
+#include "my_assert.h"
+#include "bc_aux.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+
+// Reads alignments from one SAM/BAM file, rewrites each mapped record through
+// convert() and writes the records to a second BAM file (see process()).
+class BamConverter {
+public:
+ // Opens inpF for reading and outF for writing ("wb"). The output header is
+ // built from the input header text after SamHeader::replaceSQ(chr_list) and
+ // SamHeader::insertPG("rsem-tbam2gbam", command). nThreads > 1 requests
+ // htslib threads on the output file.
+ BamConverter(const char* inpF, const char* outF, const char* chr_list, Transcripts& transcripts, int nThreads, const std::string& command);
+ // Destroys both headers and closes both files.
+ ~BamConverter();
+
+ // Processes every record of the input file and writes the results to the output file.
+ void process();
+private:
+ samFile *in, *out; // input / output file handles
+ bam_hdr_t *in_header, *out_header; // header read from `in` / header written to `out`
+ Transcripts& transcripts; // held by reference; not owned by this class
+
+ std::map<std::string, int> refmap; // out_header target name -> target index
+ std::map<std::string, int>::iterator iter; // scratch iterator used for refmap lookups in convert()
+
+ CollapseMap collapseMap; // holds the converted records of the current read name until writeCollapsedLines()
+
+ void convert(bam1_t*, const Transcript&); // rewrite one record using the given transcript
+
+ void writeCollapsedLines(); // write out and destroy every record handed back by collapseMap
+ void flipSeq(uint8_t*, int); // reverse and complement a 4-bit packed sequence in place
+ void flipQual(uint8_t*, int); // reverse a quality array in place
+ void modifyTags(bam1_t*, const Transcript&); // modify MD tag and XS tag if needed
+};
+
+// Prepares the conversion: opens the input file and reads its header, derives
+// the output header from it, indexes the output targets by name, then opens
+// the output BAM file and writes the header.
+//   inpF, outF : paths of the input and output alignment files
+//   chr_list   : forwarded to SamHeader::replaceSQ
+//   transcripts: must report getType() == 0 (see the assertion message below)
+//   nThreads   : when > 1, passed to hts_set_threads for the output file
+//   command    : recorded through SamHeader::insertPG
+// NOTE(review): open/read failures are only checked with assert(), so they go
+// unchecked in builds that define NDEBUG.
+BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_list, Transcripts& transcripts, int nThreads, const std::string& command)
+ : transcripts(transcripts)
+{
+ general_assert(transcripts.getType() == 0, "Genome information is not provided! RSEM cannot convert the transcript bam file!");
+
+ in = sam_open(inpF, "r");
+ assert(in != 0);
+ in_header = sam_hdr_read(in);
+ assert(in_header != 0);
+
+ // Let `transcripts` relate the target list of the input header to its own entries.
+ transcripts.buildMappings(in_header->n_targets, in_header->target_name);
+
+ // Build the output header from the input header text.
+ SamHeader hdr(in_header->text);
+ hdr.replaceSQ(chr_list);
+ hdr.insertPG("rsem-tbam2gbam", command);
+ // hdr.addComment("This BAM file is processed by rsem-tbam2gam to convert from transcript coordinates into genomic coordinates.");
+ out_header = hdr.create_header();
+
+ // Name -> index lookup over the output targets; used by convert() to set tid.
+ refmap.clear();
+ for (int i = 0; i < out_header->n_targets; ++i) {
+ refmap[out_header->target_name[i]] = i;
+ }
+
+ out = sam_open(outF, "wb");
+ assert(out != 0);
+ sam_hdr_write(out, out_header);
+
+ if (nThreads > 1) general_assert(hts_set_threads(out, nThreads) == 0, "Fail to create threads for writing the BAM file!");
+}
+
+// Releases what the constructor acquired: destroys the input and output
+// headers and closes the input and output files.
+BamConverter::~BamConverter() {
+ bam_hdr_destroy(in_header);
+ sam_close(in);
+ bam_hdr_destroy(out_header);
+ sam_close(out);
+}
+
+// Streams the input file record by record (two records at a time for paired
+// reads), converts mapped alignments with convert() and collects them in
+// collapseMap until the read name changes, at which point the collected
+// alignments are written by writeCollapsedLines(). Unmapped reads are written
+// straight through. Prints one '.' to stdout per 1,000,000 records.
+void BamConverter::process() {
+  bam1_t *b, *b2;
+  std::string cqname, qname; // cqname: name of the read currently being collected
+  bool isPaired = false;
+
+  HIT_INT_TYPE cnt = 0;
+
+  cqname = "";
+  b = bam_init1(); b2 = bam_init1();
+
+  while (sam_read1(in, in_header, b) >= 0) {
+    ++cnt;
+    isPaired = bam_is_paired(b);
+    if (isPaired) {
+      // The mate must be read unconditionally. Doing the read inside assert()
+      // would drop it entirely when NDEBUG is defined, leaving b2 stale.
+      const bool mateOk = (sam_read1(in, in_header, b2) >= 0) && bam_is_paired(b2);
+      general_assert(mateOk, "Fail to read the mate of paired-end read " + bam_get_canonical_name(b) + "!");
+      // Keep read 1 in b and read 2 in b2 regardless of file order.
+      if (!bam_is_read1(b)) { bam1_t *tmp = b; b = b2; b2 = tmp; }
+      assert(bam_is_read1(b) && bam_is_read2(b2));
+      general_assert((bam_is_mapped(b) && bam_is_mapped(b2)) || (bam_is_unmapped(b) && bam_is_unmapped(b2)),
+                     "Detected partial alignments for read " + bam_get_canonical_name(b) + ", which RSEM currently does not support!");
+      ++cnt;
+    }
+
+    if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
+
+    qname = bam_get_canonical_name(b);
+    if (bam_is_mapped(b)) {
+      // for collapsing
+      if (isPaired) general_assert(b->core.tid == b2->core.tid, qname + "'s two mates are aligned to two different transcripts!");
+
+      const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
+
+      convert(b, transcript);
+      if (isPaired) {
+        convert(b2, transcript);
+        // Mate positions must reflect the converted coordinates.
+        b->core.mpos = b2->core.pos;
+        b2->core.mpos = b->core.pos;
+      }
+
+      // A new read name starts: flush what was collected for the previous one.
+      if (cqname != qname) {
+        writeCollapsedLines();
+        cqname = qname;
+        collapseMap.init(isPaired);
+      }
+
+      // Weight of this alignment: the ZW tag if present, otherwise 1.0.
+      uint8_t *p = bam_aux_get(b, "ZW");
+      float prb = (p != NULL? bam_aux2f(p) : 1.0);
+      collapseMap.insert(b, b2, prb);
+    }
+    else {
+      assert(cqname != qname);
+
+      writeCollapsedLines();
+      cqname = qname;
+      collapseMap.init(isPaired);
+
+      // Unmapped reads are copied to the output unchanged.
+      sam_write1(out, out_header, b);
+      if (isPaired) sam_write1(out, out_header, b2);
+    }
+  }
+
+  // Flush the alignments of the last read.
+  writeCollapsedLines();
+
+  bam_destroy1(b);
+  bam_destroy1(b2);
+
+  // Terminate the progress-dot line if at least one dot was printed.
+  if (cnt >= 1000000) printf("\n");
+}
+
+// Rewrites record b in place using `transcript`: sets the target id from
+// refmap, adjusts strand-dependent fields, replaces position and CIGAR with
+// the values produced by tr2chr(), recomputes the bin and fixes up tags.
+// Aborts through general_assert when the record has no sequence (l_qseq <= 0).
+void BamConverter::convert(bam1_t* b, const Transcript& transcript) {
+ int pos = b->core.pos;
+ int readlen = b->core.l_qseq;
+
+ general_assert(readlen > 0, "One alignment line has SEQ field as *. RSEM does not support this currently!");
+
+ // Look up the output target index by the transcript's sequence name.
+ iter = refmap.find(transcript.getSeqName());
+ assert(iter != refmap.end());
+ b->core.tid = iter->second;
+ if (bam_is_paired(b)) { b->core.mtid = b->core.tid; }
+ b->core.qual = 255; // set to not available temporarily
+
+ // Transcript on the '-' strand: toggle the strand flags (own and mate's),
+ // negate the template length and flip sequence and qualities.
+ if (transcript.getStrand() == '-') {
+ b->core.flag ^= BAM_FREVERSE;
+ if (bam_is_paired(b)) {
+ b->core.flag ^= BAM_FMREVERSE;
+ b->core.isize = -b->core.isize;
+ }
+ flipSeq(bam_get_seq(b), readlen);
+ flipQual(bam_get_qual(b), readlen);
+ }
+
+ std::vector<uint32_t> data;
+ data.clear();
+
+ // tr2chr receives the span pos + 1 .. pos + readlen and fills core_pos,
+ // core_n_cigar and data (the new CIGAR words).
+ // NOTE(review): presumably a 1-based inclusive transcript interval - confirm against tr2chr.
+ int core_pos, core_n_cigar;
+ tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data);
+ assert(core_pos >= 0);
+
+ // Resize the record for the new CIGAR (4 bytes per operation): rest_len is
+ // everything stored after the old CIGAR; it is moved to sit after the new one.
+ int rest_len = b->l_data - b->core.l_qname - b->core.n_cigar * 4;
+ b->l_data = b->core.l_qname + core_n_cigar * 4 + rest_len;
+ expand_data_size(b);
+ uint8_t* pt = b->data + b->core.l_qname;
+ memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len);
+ for (int i = 0; i < core_n_cigar; ++i) { memmove(pt, &data[i], 4); pt += 4; }
+
+ b->core.pos = core_pos;
+ b->core.n_cigar = core_n_cigar;
+ // The bin depends on position and end, so recompute it after both changed.
+ b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
+
+ modifyTags(b, transcript); // check if need to add XS tag, if need, add it
+}
+
+// Writes every alignment (pair) that collapseMap hands back through next(),
+// then destroys the records. When the first record carries a ZW tag, the tag
+// value is overwritten with prb and MAPQ is set from bam_prb_to_mapq(prb).
+// Does nothing when collapseMap.empty(isPaired) reports true.
+inline void BamConverter::writeCollapsedLines() {
+ bam1_t *tmp_b = NULL,*tmp_b2 = NULL;
+ float prb;
+ bool isPaired; // passed to collapseMap.empty(); NOTE(review): presumably filled in by that call - confirm in bc_aux.h
+ uint8_t *p;
+
+ if (!collapseMap.empty(isPaired)) {
+ while (collapseMap.next(tmp_b, tmp_b2, prb)) {
+ p = bam_aux_get(tmp_b, "ZW");
+ if (p != NULL) {
+ // Skip the one-byte type code and overwrite the float payload in place.
+ memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+ tmp_b->core.qual = bam_prb_to_mapq(prb);
+ }
+ // otherwise, just use the MAPQ score of the original alignment
+
+ sam_write1(out, out_header, tmp_b);
+ if (isPaired) {
+ // NOTE(review): assumes the mate also carries ZW whenever the first record does.
+ if (p != NULL) memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+ tmp_b2->core.qual = tmp_b->core.qual;
+ sam_write1(out, out_header, tmp_b2);
+ }
+ bam_destroy1(tmp_b);
+ if (isPaired) bam_destroy1(tmp_b2);
+ }
+ }
+}
+
+// Reverses and complements, in place, a sequence of readlen bases stored as
+// 4-bit codes, two per byte with the earlier base in the high nibble.
+// Codes 1<->8 and 2<->4 are exchanged, 15 maps to itself; any other code
+// trips the assertion.
+inline void BamConverter::flipSeq(uint8_t* s, int readlen) {
+  std::vector<uint8_t> packed;
+  uint8_t byte = 0; // output byte under construction
+  uint8_t comp = 0; // complemented code of the current base
+
+  for (int outIdx = 0; outIdx < readlen; ++outIdx) {
+    // Output position outIdx takes the base at the mirrored input position.
+    const int srcIdx = readlen - outIdx - 1;
+    switch (bam_seqi(s, srcIdx)) {
+      case 1: comp = 8; break;
+      case 2: comp = 4; break;
+      case 4: comp = 2; break;
+      case 8: comp = 1; break;
+      case 15: comp = 15; break;
+      default: assert(false);
+    }
+
+    const bool lowNibble = (outIdx % 2 == 1);
+    if (lowNibble) {
+      byte |= comp;
+      packed.push_back(byte);
+      byte = 0;
+    } else {
+      byte |= comp << 4;
+    }
+  }
+  // An odd length leaves a half-filled last byte that still has to be stored.
+  if (readlen % 2 == 1) { packed.push_back(byte); }
+
+  const int nBytes = (int)packed.size();
+  for (int k = 0; k < nBytes; ++k) s[k] = packed[k];
+}
+
+// Reverses the first readlen entries of the quality array q in place by
+// swapping entries from both ends towards the middle.
+inline void BamConverter::flipQual(uint8_t* q, int readlen) {
+  for (int lo = 0, hi = readlen - 1; lo < hi; ++lo, --hi) {
+    const uint8_t held = q[lo];
+    q[lo] = q[hi];
+    q[hi] = held;
+  }
+}
+
+// Adjusts the auxiliary tags of b after conversion.
+//  * '-' strand transcript with a 'Z'-typed MD tag: the MD string is reversed
+//    in place. Runs of digits keep their digit order but move to the mirrored
+//    position; runs of non-digits are written in reverse order with A/C/G/T
+//    passed through getOpp(), and a leading '^' stays at the front of its run.
+//  * Any existing XS tag is removed; if the CIGAR contains a BAM_CREF_SKIP
+//    operation, an XS:A tag holding the transcript strand is appended.
+inline void BamConverter::modifyTags(bam1_t* b, const Transcript& transcript) {
+ char strand = transcript.getStrand();
+ uint8_t *s = NULL;
+
+ if (strand == '-') {
+ s = bam_aux_get(b, "MD");
+ if ((s != NULL) && (*(s) == 'Z') && (bam_aux2Z(s) != NULL)) {
+ char *mis = bam_aux2Z(s);
+ int len = strlen(mis);
+ // Scratch copy of the reversed string; holds exactly len chars, no terminator.
+ char *tmp = new char[len];
+ // cur_type: -1 = no run yet, 0 = non-digit run, 1 = digit run; fr = start index of the current run.
+ int cur_type = -1, fr = -1, type, base;
+ for (int i = 0; i < len; i++) {
+ type = (mis[i] >= '0' && mis[i] <= '9');
+ if (cur_type != type) {
+ // The run [fr, i) just ended: place it at its mirrored location [len - i, len - fr).
+ switch(cur_type) {
+ case 0:
+ base = len - 1;
+ if (mis[fr] == '^') { tmp[len - i] = mis[fr]; ++fr; ++base; }
+ for (int j = fr; j < i; j++) tmp[base - j] = ((mis[j] == 'A' || mis[j] == 'C' || mis[j] == 'G' || mis[j] == 'T') ? getOpp(mis[j]) : mis[j]);
+ break;
+ case 1:
+ base = len - i - fr;
+ for (int j = fr; j < i; j++) tmp[base + j] = mis[j];
+ break;
+ }
+ cur_type = type;
+ fr = i;
+ }
+ }
+ // Last run [fr, len): it lands at the very beginning of the reversed string.
+ switch(cur_type) {
+ case 0:
+ base = len - 1;
+ if (mis[fr] == '^') { tmp[0] = mis[fr]; ++fr; ++base; }
+ for (int j = fr; j < len; j++) tmp[base - j] = ((mis[j] == 'A' || mis[j] == 'C' || mis[j] == 'G' || mis[j] == 'T') ? getOpp(mis[j]) : mis[j]);
+ break;
+ case 1:
+ for (int j = fr; j < len; j++) tmp[j - fr] = mis[j];
+ break;
+ }
+ // Same length as before, so the tag payload is overwritten in place and
+ // the existing terminator of mis is left untouched.
+ strncpy(mis, tmp, len);
+ delete[] tmp;
+ }
+ }
+
+ // append XS:A field if necessary
+ s = bam_aux_get(b, "XS");
+ if (s != NULL) bam_aux_del(b, s);
+ bool hasN = false;
+ uint32_t* p = bam_get_cigar(b);
+ for (int i = 0; i < (int)b->core.n_cigar; i++)
+ if ((*(p + i) & BAM_CIGAR_MASK) == BAM_CREF_SKIP) { hasN = true; break; }
+ if (hasN) bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand);
+}
+
+#endif /* BAMCONVERTER_H_ */
diff --git a/BamWriter.h b/BamWriter.h
new file mode 100644
index 0000000..4ade913
--- /dev/null
+++ b/BamWriter.h
@@ -0,0 +1,148 @@
+#ifndef BAMWRITER_H_
+#define BAMWRITER_H_
+
+#include<cmath>
+#include<cstdio>
+#include<cstring>
+#include<cassert>
+#include<string>
+#include<sstream>
+#include<iostream>
+
+#include <stdint.h>
+#include "htslib/sam.h"
+#include "sam_utils.h"
+#include "SamHeader.hpp"
+
+#include "utils.h"
+#include "my_assert.h"
+
+#include "SingleHit.h"
+#include "PairedEndHit.h"
+
+#include "HitWrapper.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+
+// Copies an alignment file to a BAM file while attaching to each mapped
+// alignment the weight supplied by a HitWrapper (see work()).
+class BamWriter {
+public:
+ // Opens inpF for reading and outF for writing ("wb") and writes the output
+ // header. aux may be NULL. nThreads > 1 requests htslib threads on the output file.
+ BamWriter(const char* inpF, const char* aux, const char* outF, Transcripts& transcripts, int nThreads);
+ // Destroys both headers and closes both files.
+ ~BamWriter();
+
+ // Single-end input: one record is read per iteration.
+ void work(HitWrapper<SingleHit> wrapper);
+ // Paired-end input: two records are read per iteration.
+ void work(HitWrapper<PairedEndHit> wrapper);
+private:
+ samFile *in, *out; // input / output file handles
+ bam_hdr_t *in_header, *out_header; // header read from `in` / header written to `out`
+ Transcripts& transcripts; // held by reference; not owned by this class
+
+ // Sets MAPQ of b from bam_prb_to_mapq(prb) and stores prb, narrowed to
+ // float, in the ZW tag: overwritten in place when the tag already exists,
+ // appended otherwise.
+ void set_alignment_weight(bam1_t *b, double prb) {
+ b->core.qual = bam_prb_to_mapq(prb);
+ float val = (float)prb;
+ uint8_t *p = bam_aux_get(b, "ZW");
+ if (p != NULL) {
+ // p points at the type byte; the float payload starts one byte later.
+ memcpy(p + 1, (uint8_t*)&(val), bam_aux_type2size('f'));
+ } else {
+ bam_aux_append(b, "ZW", 'f', bam_aux_type2size('f'), (uint8_t*)&val);
+ }
+ }
+};
+
+// Opens inpF for reading and outF for BAM writing, and writes the output
+// header, which is the input header text plus an "RSEM" @PG entry.
+//   aux        : may be NULL; when non-NULL it is registered with htslib via
+//                hts_set_fai_filename() before the header is read
+//   transcripts: receives the input target list through buildMappings()
+//   nThreads   : when > 1, passed to hts_set_threads for the output file
+BamWriter::BamWriter(const char* inpF, const char* aux, const char* outF, Transcripts& transcripts, int nThreads) : transcripts(transcripts) {
+  in = sam_open(inpF, "r");
+  assert(in != 0);
+
+  // Only forward aux when the caller actually supplied one; a NULL aux means
+  // there is nothing to register.
+  if (aux != NULL) hts_set_fai_filename(in, aux);
+  in_header = sam_hdr_read(in);
+  assert(in_header != 0);
+
+  //build mappings from external sid to internal sid
+  transcripts.buildMappings(in_header->n_targets, in_header->target_name);
+
+  //generate output's header
+  SamHeader hdr(in_header->text);
+  hdr.insertPG("RSEM");
+  out_header = hdr.create_header();
+
+  out = sam_open(outF, "wb"); // If CRAM format is desired, use "wc"
+  assert(out != 0);
+  sam_hdr_write(out, out_header);
+
+  if (nThreads > 1) general_assert(hts_set_threads(out, nThreads) == 0, "Fail to create threads for writing the BAM file!");
+}
+
+// Releases what the constructor acquired: destroys the input and output
+// headers and closes the input and output files.
+BamWriter::~BamWriter() {
+ bam_hdr_destroy(in_header);
+ sam_close(in);
+ bam_hdr_destroy(out_header);
+ sam_close(out);
+}
+
+// Single-end variant. Copies every record of the input file to the output
+// file. For each mapped record the next hit from `wrapper` provides the weight
+// stored through set_alignment_weight(); unmapped records are copied as they
+// are. Progress is reported every 1,000,000 records when `verbose` is set.
+void BamWriter::work(HitWrapper<SingleHit> wrapper) {
+  HIT_INT_TYPE nLines = 0;
+  bam1_t *rec = bam_init1();
+
+  for (;;) {
+    if (sam_read1(in, in_header, rec) < 0) break;
+
+    ++nLines;
+    if (verbose && nLines % 1000000 == 0) { std::cout<< nLines<< " alignment lines are loaded!"<< std::endl; }
+
+    if (bam_is_mapped(rec)) {
+      SingleHit *curHit = wrapper.getNextHit();
+      assert(curHit != NULL);
+
+      // The hit and the record must refer to the same transcript.
+      assert(transcripts.getInternalSid(rec->core.tid + 1) == curHit->getSid());
+      set_alignment_weight(rec, curHit->getConPrb());
+    }
+
+    sam_write1(out, out_header, rec);
+  }
+
+  // Every hit must have been consumed by now.
+  assert(wrapper.getNextHit() == NULL);
+
+  bam_destroy1(rec);
+  if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; }
+}
+
+// Paired-end variant. Reads the input two records at a time and copies both to
+// the output. When both mates are mapped, the next hit from `wrapper` provides
+// the weight stored on each of them through set_alignment_weight(). Progress
+// is reported when the running record count hits a multiple of 1,000,000 and
+// `verbose` is set.
+void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
+  HIT_INT_TYPE nLines = 0;
+  bam1_t *mate1 = bam_init1();
+  bam1_t *mate2 = bam_init1();
+
+  for (;;) {
+    // Stop as soon as either read fails; the second read is not attempted
+    // when the first one already failed.
+    if (sam_read1(in, in_header, mate1) < 0) break;
+    if (sam_read1(in, in_header, mate2) < 0) break;
+
+    nLines += 2;
+    if (verbose && nLines % 1000000 == 0) { std::cout<< nLines<< " alignment lines are loaded!"<< std::endl; }
+
+    // Keep read 1 in mate1 regardless of the order in the file.
+    if (!bam_is_read1(mate1)) {
+      bam1_t *swapTmp = mate1;
+      mate1 = mate2;
+      mate2 = swapTmp;
+    }
+
+    const bool bothMapped = bam_is_mapped(mate1) && bam_is_mapped(mate2);
+    if (bothMapped) {
+      PairedEndHit *curHit = wrapper.getNextHit();
+      assert(curHit != NULL);
+
+      // Both mates must refer to the transcript of the hit.
+      assert(transcripts.getInternalSid(mate1->core.tid + 1) == curHit->getSid());
+      assert(transcripts.getInternalSid(mate2->core.tid + 1) == curHit->getSid());
+
+      set_alignment_weight(mate1, curHit->getConPrb());
+      set_alignment_weight(mate2, curHit->getConPrb());
+    }
+
+    sam_write1(out, out_header, mate1);
+    sam_write1(out, out_header, mate2);
+  }
+
+  // Every hit must have been consumed by now.
+  assert(wrapper.getNextHit() == NULL);
+
+  bam_destroy1(mate1);
+  bam_destroy1(mate2);
+
+  if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; }
+}
+
+#endif /* BAMWRITER_H_ */
diff --git a/Buffer.h b/Buffer.h
new file mode 100644
index 0000000..3e45094
--- /dev/null
+++ b/Buffer.h
@@ -0,0 +1,83 @@
+#ifndef BUFFER_H_
+#define BUFFER_H_
+
+#include<cstdio>
+#include<fstream>
+#include<pthread.h>
+
+#include "my_assert.h"
+
+typedef unsigned long long bufsize_type;
+const int FLOATSIZE = sizeof(float);
+
+// Collects fixed-length float vectors ("samples") in a bounded in-memory
+// buffer and spills them to a binary temporary file. In the file the data is
+// laid out component-major: vlen consecutive rows of nSamples floats each,
+// row i holding component i of every sample. write() is guarded by a mutex.
+class Buffer {
+public:
+ // in_mem_arr must be allocated memory before the Buffer is constructed
+ //   nMB        : buffer budget in MiB
+ //   nSamples   : total number of samples; also the row length in the temp file
+ //   vlen       : number of floats per sample
+ //   in_mem_arr : receives one scalar per sample (see write()); not owned
+ //   tmpF       : path of the temporary file, opened in binary mode
+ Buffer(int nMB, int nSamples, int vlen, float* in_mem_arr, const char* tmpF) {
+ cpos = 0;
+ // Number of whole samples that fit in the budget, capped at nSamples ...
+ size = bufsize_type(nMB) * 1024 * 1024 / FLOATSIZE / vlen;
+ if (size > (bufsize_type)nSamples) size = nSamples;
+ general_assert(size > 0, "Memory allocated for credibility intervals is not enough!");
+ // ... converted to a number of floats.
+ size *= vlen;
+
+ buffer = new float[size];
+ ftmpOut.open(tmpF, std::ios::binary);
+ pthread_mutex_init(&lock, NULL);
+
+ fr = to = 0;
+ this->nSamples = nSamples;
+ this->vlen = vlen;
+ this->in_mem_arr = in_mem_arr;
+ }
+
+ // Flushes any samples still buffered, then releases the buffer, the mutex
+ // and the file.
+ ~Buffer() {
+ if (fr < to) flushToTempFile();
+
+ delete[] buffer;
+ pthread_mutex_destroy(&lock);
+ ftmpOut.close();
+ }
+
+ // Records one sample under the lock: value goes to in_mem_arr[to] and the
+ // vlen floats at vec are appended to the buffer, which is flushed first if
+ // fewer than vlen slots remain.
+ void write(float value, float *vec) {
+ pthread_assert(pthread_mutex_lock(&lock), "pthread_mutex_lock", "Error occurred while acquiring the lock!");
+ if (size - cpos < bufsize_type(vlen)) flushToTempFile();
+ in_mem_arr[to] = value;
+ memcpy(buffer + cpos, vec, FLOATSIZE * vlen);
+ cpos += vlen;
+ ++to;
+ pthread_assert(pthread_mutex_unlock(&lock), "pthread_mutex_unlock", "Error occurred while releasing the lock!");
+ }
+
+private:
+ bufsize_type size, cpos; // cpos : current position
+
+ float *buffer; // owned; `size` floats
+ float *in_mem_arr; // caller-provided array, one entry per sample
+ std::ofstream ftmpOut;
+ pthread_mutex_t lock;
+
+ int fr, to; // each flush, sample fr .. to - 1
+ int nSamples, vlen; // vlen : vector length
+
+ // Writes buffered samples fr .. to - 1 into the temp file and empties the
+ // buffer. For every component i the file position advances past the first
+ // fr entries of row i, the i-th component of each buffered sample is
+ // written, and the remaining nSamples - to entries of the row are skipped.
+ void flushToTempFile() {
+ std::streampos gap1 = std::streampos(fr) * FLOATSIZE;
+ std::streampos gap2 = std::streampos(nSamples - to) * FLOATSIZE;
+ float *p = NULL;
+
+ ftmpOut.seekp(0, std::ios::beg);
+ for (int i = 0; i < vlen; i++) {
+ p = buffer + i;
+ ftmpOut.seekp(gap1, std::ios::cur);
+ for (int j = fr; j < to; j++) {
+ ftmpOut.write((char*)p, FLOATSIZE);
+ // Samples are stored back to back, so the same component of the next sample is vlen floats further.
+ p += vlen;
+ }
+ ftmpOut.seekp(gap2, std::ios::cur);
+ }
+
+ cpos = 0;
+ fr = to;
+ }
+};
+
+#endif /* BUFFER_H_ */
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/EBSeq/Makefile b/EBSeq/Makefile
new file mode 100644
index 0000000..ee358a9
--- /dev/null
+++ b/EBSeq/Makefile
@@ -0,0 +1,16 @@
+# Builds the EBSeq helpers: the EBSeq R package (through ./install) and the
+# k-mer clustering-info calculator.
+CXX = g++
+PROGRAMS = EBSeq rsem-for-ebseq-calculate-clustering-info
+DEPENDENCIES = blockmodeling gplots gtools gdata caTools bitops KernSmooth
+
+.PHONY : all EBSeq clean
+
+all : $(PROGRAMS)
+
+# Declared phony above, so ./install is invoked on every build.
+EBSeq :
+ ./install
+
+rsem-for-ebseq-calculate-clustering-info : calcClusteringInfo.cpp
+ $(CXX) -O3 -Wall $< -o $@
+
+# Removes the built programs, the listed R package directories and editor backups.
+clean :
+ $(RM) -r $(PROGRAMS) $(DEPENDENCIES) BiocInstaller *~
diff --git a/EBSeq/calcClusteringInfo.cpp b/EBSeq/calcClusteringInfo.cpp
new file mode 100644
index 0000000..9caefbc
--- /dev/null
+++ b/EBSeq/calcClusteringInfo.cpp
@@ -0,0 +1,144 @@
+#include<cstdio>
+#include<cctype>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<fstream>
+#include<iomanip>
+#include<string>
+#include<vector>
+#include<algorithm>
+using namespace std;
+
+int M; // number of stored reference sequences (names.size() - 1, set by loadRef)
+int k; // k-mer size
+vector<string> names; // FASTA header text per sequence; slot 0 is an empty placeholder
+vector<string> seqs; // normalized sequence per entry, parallel to names; slot 0 is an empty placeholder
+vector<int> effL; // per sequence: number of k-mer start positions (length - k + 1, floored at 0)
+
+// A k-mer candidate: the k characters of seqs[tid] that start at offset pos.
+// tid starts from 1
+struct ReadType {
+  int tid, pos;
+
+  ReadType(int tid, int pos) : tid(tid), pos(pos) {}
+
+  // Orders candidates by k-mer text; equal k-mers are ordered by tid only.
+  bool operator< (const ReadType& o) const {
+    const string& lhs = seqs[tid];
+    const string& rhs = seqs[o.tid];
+    int i = 0;
+    // skip the common prefix of the two k-mers
+    while (i < k && lhs[pos + i] == rhs[o.pos + i]) ++i;
+    if (i < k) return lhs[pos + i] < rhs[o.pos + i];
+    return tid < o.tid;
+  }
+
+  // True when both candidates spell the same k-mer.
+  bool seq_equal(const ReadType& o) const {
+    const string& lhs = seqs[tid];
+    const string& rhs = seqs[o.tid];
+    int i = 0;
+    while (i < k) {
+      if (lhs[pos + i] != rhs[o.pos + i]) return false;
+      ++i;
+    }
+    return true;
+  }
+};
+
+vector<ReadType> cands; // every (sequence, offset) k-mer start position; filled and sorted in main
+vector<double> clusteringInfo; // per-sequence result indexed by tid; -1 when the sequence has no k-mer
+
+// Returns a normalized copy of rawseq: every character is upper-cased and
+// anything other than A, C, G or T becomes 'N'. rawseq itself is not modified.
+string convert(const string& rawseq) {
+  string seq = rawseq;
+  for (size_t i = 0; i < seq.size(); i++) {
+    // toupper() is only defined for EOF and values representable as unsigned
+    // char; passing a negative plain char (non-ASCII byte) is undefined behavior.
+    seq[i] = (char)toupper((unsigned char)seq[i]);
+    if (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G' && seq[i] != 'T') seq[i] = 'N';
+  }
+  return seq;
+}
+
+// Reads the FASTA file inpF into the global names/seqs vectors (entries start at
+// index 1; slot 0 is an empty placeholder) and sets M to the number of entries
+// kept. An entry without any sequence text is reported and left out.
+void loadRef(char* inpF) {
+  ifstream fin(inpF);
+  string header, body, buffer;
+
+  assert(fin.is_open());
+
+  names.assign(1, "");
+  seqs.assign(1, "");
+
+  getline(fin, buffer);
+  while (fin && buffer[0] == '>') {
+    header = buffer.substr(1);
+    body.clear();
+    // collect sequence lines until the next header line or the end of the file
+    for (;;) {
+      if (!getline(fin, buffer)) break;
+      if (buffer[0] == '>') break;
+      body += buffer;
+    }
+    if (!body.empty()) {
+      names.push_back(header);
+      seqs.push_back(convert(body));
+    }
+    else {
+      printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", header.c_str());
+    }
+  }
+
+  fin.close();
+
+  M = names.size() - 1;
+
+  printf("The reference is loaded.\n");
+}
+
+// Usage: rsem-for-ebseq-calculate-clustering-info k input_reference_fasta_file output_file
+//
+// Enumerates every k-mer start position of every reference sequence, sorts the
+// positions by k-mer text, and counts for each sequence the positions whose
+// k-mer also occurs in at least one other sequence. That count divided by the
+// sequence's number of k-mer positions is written as "name<TAB>value", one line
+// per sequence; sequences shorter than k get -1.
+int main(int argc, char* argv[]) {
+  if (argc != 4) {
+    printf("Usage: rsem-for-ebseq-calculate-clustering-info k input_reference_fasta_file output_file\n");
+    exit(-1);
+  }
+
+  k = atoi(argv[1]);
+  // atoi() yields 0 for non-numeric text; a non-positive k makes every
+  // position a zero-length "k-mer" and the output meaningless, so stop here.
+  if (k <= 0) {
+    fprintf(stderr, "Error: k must be a positive integer, but '%s' is given!\n", argv[1]);
+    exit(-1);
+  }
+  loadRef(argv[2]);
+
+  cands.clear();
+  effL.assign(M + 1, 0);
+  for (int i = 1; i <= M; i++) {
+    effL[i] = int(seqs[i].length()) - k + 1;
+    if (effL[i] <= 0) effL[i] = 0; // effL should be non-negative
+    for (int j = 0; j < effL[i]; j++)
+      cands.push_back(ReadType(i, j));
+  }
+  printf("All possible %d mers are generated.\n", k);
+
+  sort(cands.begin(), cands.end());
+  printf("All %d mers are sorted.\n", k);
+
+  size_t p = 0;
+  clusteringInfo.assign(M + 1, 0.0);
+
+  // [p, i) is a maximal run of identical k-mers; inside it, [q, j) is a maximal
+  // run belonging to one sequence (ties in the sort are ordered by tid).
+  for (size_t i = 1; i <= cands.size(); i++)
+    if (i == cands.size() || !cands[p].seq_equal(cands[i])) {
+      size_t denominator = i - p;
+      size_t q = p;
+      for (size_t j = p + 1; j <= i; j++)
+        if (j == i || cands[q].tid != cands[j].tid) {
+          size_t numerator = j - q;
+          //double prob = numerator * 1.0 / denominator;
+          //clusteringInfo[cands[q].tid] += (double)numerator * prob * (1.0 - prob);
+          // numerator < denominator <=> the k-mer also appears in another sequence
+          if (numerator < denominator) clusteringInfo[cands[q].tid] += numerator;
+          q = j;
+        }
+      p = i;
+    }
+
+  for (int i = 1; i <= M; i++)
+    if (effL[i] == 0) clusteringInfo[i] = -1.0;
+    else clusteringInfo[i] /= effL[i];
+
+  printf("Clustering information is calculated.\n");
+
+  ofstream fout(argv[3]);
+  if (!fout.is_open()) {
+    fprintf(stderr, "Error: Cannot open %s for writing!\n", argv[3]);
+    exit(-1);
+  }
+  for (int i = 1; i <= M; i++) fout<<names[i]<<"\t"<<setprecision(6)<<clusteringInfo[i]<<'\n';
+  fout.close();
+
+  return 0;
+}
diff --git a/EBSeq/install b/EBSeq/install
new file mode 100755
index 0000000..ee2eab6
--- /dev/null
+++ b/EBSeq/install
@@ -0,0 +1,24 @@
+#!/usr/bin/env Rscript
+
+.libPaths(c(".", .libPaths())) # search the current directory first, so a locally installed EBSeq is found
+result <- suppressWarnings(tryCatch({ # attempt 1: an EBSeq that can already be loaded
+ library("EBSeq")
+ cat("EBSeq v", as.character(packageVersion("EBSeq")), " already exists.\n", sep = "")
+ }, error = function(err) {
+ tryCatch({ # attempt 2: install from Bioconductor into the current directory
+ source("http://www.bioconductor.org/biocLite.R") # NOTE(review): biocLite has been superseded by BiocManager - confirm this URL still works
+ try(biocLite("BiocUpgrade")) # wrapped in try(), so a failed upgrade does not stop the install
+ biocLite("EBSeq", lib = ".")
+ library("EBSeq")
+ cat("EBSeq v", as.character(packageVersion("EBSeq")), " is successfully installed from Bioconductor.\n", sep = "")
+ }, error = function(err) {
+ tryCatch({ # attempt 3: install the source tarballs expected in the current directory
+ cat("Failed to install EBSeq from Bioconductor! Try to install EBSeq v1.2.0 locally instead.\n")
+ install.packages(c("blockmodeling_0.1.8.tar.gz", "gtools_3.5.0.tar.gz", "gdata_2.17.0.tar.gz", "bitops_1.0-6.tar.gz",
+ "caTools_1.17.1.tar.gz", "KernSmooth_2.23-15.tar.gz", "gplots_2.17.0.tar.gz", "EBSeq_1.2.0.tar.gz"),
+ lib = ".", repos = NULL, type = "source") # repos = NULL: the names above are local files
+ library("EBSeq")
+ cat("EBSeq v1.2.0 is successfully installed locally.\n")
+ }, error = function(err) { cat("Failed to install EBSeq v1.2.0 locally!\n") }) # only reports the failure; no error is raised
+ })
+ }))
diff --git a/EBSeq/rsem-for-ebseq-find-DE b/EBSeq/rsem-for-ebseq-find-DE
new file mode 100755
index 0000000..62aad0d
--- /dev/null
+++ b/EBSeq/rsem-for-ebseq-find-DE
@@ -0,0 +1,73 @@
+#!/usr/bin/env Rscript
+
+argv <- commandArgs(TRUE)
+if (length(argv) < 6) { # four fixed arguments plus at least two replicate counts
+ cat("Usage: rsem-for-ebseq-find-DE path ngvector_file data_matrix_file output_file number_of_replicate_for_condition_1 number_of_replicate_for_condition_2 ...\n")
+ q(status = 1)
+}
+
+path <- argv[1] # extra library directory appended to .libPaths() below
+ngvector_file <- argv[2] # "#" means no Ng vector is supplied
+data_matrix_file <- argv[3]
+output_file <- argv[4]
+norm_out_file <- paste0(output_file, ".normalized_data_matrix")
+
+nc <- length(argv) - 4; # number of conditions = number of replicate counts given
+num_reps <- as.numeric(argv[5:(5+nc-1)])
+
+.libPaths(c(.libPaths(), path))
+library(EBSeq)
+
+DataMat <- data.matrix(read.table(data_matrix_file))
+n <- dim(DataMat)[2] # number of columns in the data matrix
+if (sum(num_reps) != n) stop("Total number of replicates given does not match the number of columns from the data matrix!")
+
+conditions <- as.factor(rep(paste("C", 1:nc, sep=""), times = num_reps)) # labels C1, C2, ..., each repeated once per replicate
+Sizes <- MedianNorm(DataMat)
+NormMat <- GetNormalizedMat(DataMat, Sizes)
+ngvector <- NULL
+if (ngvector_file != "#") {
+ ngvector <- as.vector(data.matrix(read.table(ngvector_file)))
+ stopifnot(!is.null(ngvector))
+}
+
+if (nc == 2) { # two conditions: EBTest
+ EBOut <- NULL
+ EBOut <- EBTest(Data = DataMat, NgVector = ngvector, Conditions = conditions, sizeFactors = Sizes, maxround = 5)
+ stopifnot(!is.null(EBOut))
+
+ PP <- as.data.frame(GetPPMat(EBOut))
+ fc_res <- PostFC(EBOut)
+
+ results <- cbind(PP, fc_res$PostFC, fc_res$RealFC,unlist(EBOut$C1Mean)[rownames(PP)], unlist(EBOut$C2Mean)[rownames(PP)])
+ colnames(results) <- c("PPEE", "PPDE", "PostFC", "RealFC","C1Mean","C2Mean")
+ results <- results[order(results[,"PPDE"], decreasing = TRUE),] # highest PPDE first
+ write.table(results, file = output_file, sep = "\t")
+
+} else { # more than two conditions: EBMultiTest over all patterns
+ patterns <- GetPatterns(conditions)
+ eename <- rownames(patterns)[which(rowSums(patterns) == nc)] # the pattern whose row sums to nc
+ stopifnot(length(eename) == 1)
+
+ MultiOut <- NULL
+ MultiOut <- EBMultiTest(Data = DataMat, NgVector = ngvector, Conditions = conditions, AllParti = patterns, sizeFactors = Sizes, maxround = 5)
+ stopifnot(!is.null(MultiOut))
+
+ MultiPP <- GetMultiPP(MultiOut)
+
+ PP <- as.data.frame(MultiPP$PP)
+ pos <- which(names(PP) == eename)
+ probs <- rowSums(PP[,-pos]) # PPDE = sum over every pattern column except eename
+
+ results <- cbind(PP, MultiPP$MAP[rownames(PP)], probs)
+ colnames(results) <- c(colnames(PP), "MAP", "PPDE")
+ ord <- order(results[,"PPDE"], decreasing = TRUE)
+ results <- results[ord,]
+ write.table(results, file = output_file, sep = "\t")
+
+ write.table(MultiPP$Patterns, file = paste(output_file, ".pattern", sep = ""), sep = "\t")
+
+ MultiFC <- GetMultiFC(MultiOut)
+ write.table(MultiFC$CondMeans[ord,], file = paste(output_file, ".condmeans", sep = ""), sep = "\t") # rows follow the same order as the results table
+}
+write.table(NormMat, file = norm_out_file, sep = "\t") # written in both branches
diff --git a/EBSeq/rsem-for-ebseq-generate-ngvector-from-clustering-info b/EBSeq/rsem-for-ebseq-generate-ngvector-from-clustering-info
new file mode 100755
index 0000000..ccc52f7
--- /dev/null
+++ b/EBSeq/rsem-for-ebseq-generate-ngvector-from-clustering-info
@@ -0,0 +1,18 @@
+#!/usr/bin/env Rscript
+
+# Usage: rsem-for-ebseq-generate-ngvector-from-clustering-info input_file output_file
+#
+# Reads the tab-separated clustering-info table (name, score), splits the rows
+# with a non-negative score into 3 groups by k-means, and writes one group label
+# per input row. Labels grow with the cluster center (1 = smallest center,
+# 3 = largest); rows with a negative score are put into group 3.
+argv <- commandArgs(TRUE)
+if (length(argv) != 2) {
+ cat("Usage: rsem-for-ebseq-generate-ngvector-from-clustering-info input_file output_file\n")
+ q(status = 1)
+}
+
+data <- read.table(file = argv[1], stringsAsFactors = FALSE, sep = "\t")
+idx <- data[,2] >= 0
+kmr <- kmeans(data[idx, 2], 3)
+# rank(), not order(): label[c] must be the position of cluster c's center among
+# the sorted centers. order() returns the inverse permutation, which differs from
+# the rank whenever k-means numbers the centers in a 3-cycle (e.g. 0.5, 0.1, 0.3).
+center_rank <- rank(kmr$centers, ties.method = "first")
+
+ngvec <- rep(0, length(idx))
+ngvec[idx] <- center_rank[kmr$cluster]
+ngvec[!idx] <- 3
+
+write.table(ngvec, file = argv[2], row.names = FALSE, col.names = FALSE)
diff --git a/EM.cpp b/EM.cpp
new file mode 100644
index 0000000..5a5b89e
--- /dev/null
+++ b/EM.cpp
@@ -0,0 +1,675 @@
+#include<ctime>
+#include<cmath>
+#include<cstdio>
+#include<cstdlib>
+#include<cstring>
+#include<cassert>
+#include<string>
+#include<vector>
+#include<algorithm>
+#include<fstream>
+#include<iostream>
+#include<pthread.h>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "sampling.h"
+
+#include "Read.h"
+#include "SingleRead.h"
+#include "SingleReadQ.h"
+#include "PairedEndRead.h"
+#include "PairedEndReadQ.h"
+
+#include "SingleHit.h"
+#include "PairedEndHit.h"
+
+#include "Model.h"
+#include "SingleModel.h"
+#include "SingleQModel.h"
+#include "PairedEndModel.h"
+#include "PairedEndQModel.h"
+
+#include "Transcript.h"
+#include "Transcripts.h"
+
+#include "Refs.h"
+#include "GroupInfo.h"
+#include "HitContainer.h"
+#include "ReadIndex.h"
+#include "ReadReader.h"
+
+#include "ModelParams.h"
+
+#include "HitWrapper.h"
+#include "BamWriter.h"
+
+#include "WriteResults.h"
+
+using namespace std;
+
+bool verbose = true; // cleared by the -q option; gates the progress output
+
+const double STOP_CRITERIA = 0.001; // relative change of a theta entry at or above which it counts as not yet converged
+const int MAX_ROUND = 10000; // hard cap on the number of EM rounds
+const int MIN_ROUND = 20; // EM always runs at least this many rounds
+
+struct Params { // per-thread argument bundle passed to E_STEP / calcConProbs through pthread_create
+ void *model; // the master model, shared by all threads
+ void *reader, *hitv, *ncpv, *mhp, *countv; // this thread's ReadReader, HitContainer, noise conditional probabilities, model helper and count vector
+};
+
+int read_type; // 0-3, selects the Read/Hit/Model instantiation of EM() in main()
+int m, M; // m genes, M isoforms
+READ_INT_TYPE N0, N1, N2, N_tot; // read counts loaded from the .cnt file; N1 is the number of alignable reads
+int nThreads; // number of worker threads (-p option, default 1)
+
+
+bool genBamF; // If user wants to generate bam file, true; otherwise, false.
+bool bamSampling; // true if sampling from read posterior distribution when bam file is generated
+bool updateModel, calcExpectedWeights; // switches read by E_STEP: feed the model helpers / store posterior weights back into ncpv and the hits
+bool genGibbsOut; // generate file for Gibbs sampler
+
+char refName[STRLEN], outName[STRLEN]; // argv[1] and argv[3]
+char imdName[STRLEN], statName[STRLEN]; // argv[4] and argv[5]
+char refF[STRLEN], cntF[STRLEN], tiF[STRLEN]; // <refName>.seq, <statName>.cnt, <refName>.ti
+char mparamsF[STRLEN]; // <imdName>.mparams
+char modelF[STRLEN], thetaF[STRLEN]; // <statName>.model, <statName>.theta
+
+char inpSamF[STRLEN], outBamF[STRLEN], *aux; // -b input file, <outName>.transcript.bam, optional fai file (NULL when not given)
+
+char out_for_gibbs_F[STRLEN]; // <imdName>.ofg
+
+vector<double> theta, eel; // eel : expected effective length
+
+double *probv, **countvs; // probv: copy of theta read by the E-step threads; countvs[t]: counts accumulated by thread t
+
+Refs refs; // loaded from <refName>.seq in main()
+Transcripts transcripts; // loaded from <refName>.ti in main()
+
+ModelParams mparams; // filled in main() from the read counts and <imdName>.mparams
+
+bool hasSeed; // true when --seed is given
+seedType seed; // value parsed from --seed
+
+bool appendNames; // --append-names option, forwarded to writeResultsEM
+
+template<class ReadType, class HitType, class ModelType>
+void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, double **&ncpvs, ModelType **&mhps) { // allocates all per-thread state and splits the alignments of <imdName>.dat among nThreads workers
+ READ_INT_TYPE nReads;
+ HIT_INT_TYPE nHits;
+ int rt; // read type
+
+ READ_INT_TYPE nrLeft, curnr; // nrLeft : number of reads left, curnr: current number of reads
+ HIT_INT_TYPE nhT; // nhT : hit threshold per thread
+ char datF[STRLEN];
+
+ int s; // number of read files; set by genReadFileNames
+ char readFs[2][STRLEN];
+ ReadIndex *indices[2]; // only the first s entries are initialized below
+ ifstream fin;
+
+ readers = new ReadReader<ReadType>*[nThreads];
+ genReadFileNames(imdName, 1, read_type, s, readFs);
+ for (int i = 0; i < s; i++) {
+ indices[i] = new ReadIndex(readFs[i]);
+ }
+ for (int i = 0; i < nThreads; i++) {
+ readers[i] = new ReadReader<ReadType>(s, readFs, refs.hasPolyA(), mparams.seedLen); // allow calculation of calc_lq() function
+ readers[i]->setIndices(indices); // every reader is handed the same index array
+ }
+
+ hitvs = new HitContainer<HitType>*[nThreads];
+ for (int i = 0; i < nThreads; i++) {
+ hitvs[i] = new HitContainer<HitType>();
+ }
+
+ sprintf(datF, "%s.dat", imdName);
+ fin.open(datF);
+ general_assert(fin.is_open(), "Cannot open " + cstrtos(datF) + "! It may not exist.");
+ fin>>nReads>>nHits>>rt; // header of the .dat file
+ general_assert(nReads == N1, "Number of alignable reads does not match!");
+ general_assert(rt == read_type, "Data file (.dat) does not have the right read type!");
+
+
+ //A just so so strategy for paralleling
+ nhT = nHits / nThreads; // target number of hits per thread
+ nrLeft = N1;
+ curnr = 0;
+
+ ncpvs = new double*[nThreads];
+ for (int i = 0; i < nThreads; i++) {
+ HIT_INT_TYPE ntLeft = nThreads - i - 1; // # of threads left
+
+ general_assert(readers[i]->locate(curnr), "Read indices files do not match!"); // position reader i at the first read of its share
+
+ while (nrLeft > ntLeft && (i == nThreads - 1 || hitvs[i]->getNHits() < nhT)) { // leave at least one read per remaining thread; the last thread takes everything that is left
+ general_assert(hitvs[i]->read(fin), "Cannot read alignments from .dat file!");
+
+ --nrLeft;
+ if (verbose && nrLeft > 0 && nrLeft % 1000000 == 0) { cout<< "DAT "<< nrLeft << " reads left"<< endl; }
+ }
+ ncpvs[i] = new double[hitvs[i]->getN()]; // one noise conditional probability per read of this thread
+ memset(ncpvs[i], 0, sizeof(double) * hitvs[i]->getN());
+ curnr += hitvs[i]->getN();
+
+ if (verbose) { cout<<"Thread "<< i<< " : N = "<< hitvs[i]->getN()<< ", NHit = "<< hitvs[i]->getNHits()<< endl; }
+ }
+
+ fin.close();
+
+ mhps = new ModelType*[nThreads];
+ for (int i = 0; i < nThreads; i++) {
+ mhps[i] = new ModelType(mparams, false); // just model helper
+ }
+
+ probv = new double[M + 1]; // global; released in release()
+ countvs = new double*[nThreads]; // global; released in release()
+ for (int i = 0; i < nThreads; i++) {
+ countvs[i] = new double[M + 1];
+ }
+
+
+ if (verbose) { printf("EM_init finished!\n"); }
+}
+
+template<class ReadType, class HitType, class ModelType>
+void* E_STEP(void* arg) { // thread entry point: one E-step over this thread's reads; arg is a Params*, always returns NULL
+ Params *params = (Params*)arg;
+ ModelType *model = (ModelType*)(params->model);
+ ReadReader<ReadType> *reader = (ReadReader<ReadType>*)(params->reader);
+ HitContainer<HitType> *hitv = (HitContainer<HitType>*)(params->hitv);
+ double *ncpv = (double*)(params->ncpv);
+ ModelType *mhp = (ModelType*)(params->mhp);
+ double *countv = (double*)(params->countv);
+
+ bool needCalcConPrb = model->getNeedCalcConPrb(); // true when the stored conditional probabilities must be recomputed from the model
+
+ ReadType read;
+
+ READ_INT_TYPE N = hitv->getN();
+ double sum;
+ vector<double> fracs; //to remove this, do calculation twice
+ HIT_INT_TYPE fr, to, id;
+
+ if (needCalcConPrb || updateModel) { reader->reset(); } // the reads are only re-read when they are actually needed
+ if (updateModel) { mhp->init(); }
+
+ memset(countv, 0, sizeof(double) * (M + 1));
+ for (READ_INT_TYPE i = 0; i < N; i++) {
+ if (needCalcConPrb || updateModel) {
+ general_assert(reader->next(read), "Can not load a read!");
+ }
+
+ fr = hitv->getSAt(i); // hits of read i occupy [fr, to)
+ to = hitv->getSAt(i + 1);
+ fracs.resize(to - fr + 1); // slot 0 is the noise component, slot j - fr + 1 is hit j
+
+ sum = 0.0;
+
+ if (needCalcConPrb) { ncpv[i] = model->getNoiseConPrb(read); }
+ fracs[0] = probv[0] * ncpv[i];
+ if (fracs[0] < EPSILON) fracs[0] = 0.0; // values below EPSILON are treated as zero
+ sum += fracs[0];
+ for (HIT_INT_TYPE j = fr; j < to; j++) {
+ HitType &hit = hitv->getHitAt(j);
+ if (needCalcConPrb) { hit.setConPrb(model->getConPrb(read, hit)); }
+ id = j - fr + 1;
+ fracs[id] = probv[hit.getSid()] * hit.getConPrb();
+ if (fracs[id] < EPSILON) fracs[id] = 0.0;
+ sum += fracs[id];
+ }
+
+ if (sum >= EPSILON) { // normalize into posterior fractions and accumulate them
+ fracs[0] /= sum;
+ countv[0] += fracs[0];
+ if (updateModel) { mhp->updateNoise(read, fracs[0]); }
+ if (calcExpectedWeights) { ncpv[i] = fracs[0]; } // overwrites the conditional probability with the posterior weight
+ for (HIT_INT_TYPE j = fr; j < to; j++) {
+ HitType &hit = hitv->getHitAt(j);
+ id = j - fr + 1;
+ fracs[id] /= sum;
+ countv[hit.getSid()] += fracs[id];
+ if (updateModel) { mhp->update(read, hit, fracs[id]); }
+ if (calcExpectedWeights) { hit.setConPrb(fracs[id]); }
+ }
+ }
+ else if (calcExpectedWeights) { // no usable probability mass: every stored weight becomes zero
+ ncpv[i] = 0.0;
+ for (HIT_INT_TYPE j = fr; j < to; j++) {
+ HitType &hit = hitv->getHitAt(j);
+ hit.setConPrb(0.0);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+// Thread entry point: recomputes, with the current model, the stored
+// conditional probabilities of this thread's reads - ncpv[i] for the noise
+// component and ConPrb for every hit. arg is a Params*; always returns NULL.
+template<class ReadType, class HitType, class ModelType>
+void* calcConProbs(void* arg) {
+  Params *bundle = (Params*)arg;
+  ModelType *model = (ModelType*)(bundle->model);
+  ReadReader<ReadType> *reader = (ReadReader<ReadType>*)(bundle->reader);
+  HitContainer<HitType> *hitv = (HitContainer<HitType>*)(bundle->hitv);
+  double *ncpv = (double*)(bundle->ncpv);
+
+  ReadType read;
+  const READ_INT_TYPE nReads = hitv->getN();
+
+  assert(model->getNeedCalcConPrb());
+  reader->reset();
+
+  READ_INT_TYPE r = 0;
+  while (r < nReads) {
+    general_assert(reader->next(read), "Can not load a read!");
+
+    // hits of read r occupy [first, last)
+    const HIT_INT_TYPE first = hitv->getSAt(r);
+    const HIT_INT_TYPE last = hitv->getSAt(r + 1);
+
+    ncpv[r] = model->getNoiseConPrb(read);
+    for (HIT_INT_TYPE h = first; h < last; h++) {
+      HitType &hit = hitv->getHitAt(h);
+      hit.setConPrb(model->getConPrb(read, hit));
+    }
+    ++r;
+  }
+
+  return NULL;
+}
+
+// Writes the learned model to <statName>.model and the EM result files through
+// writeResultsEM. counts holds the M + 1 expected counts to report; it was
+// previously ignored in favour of the global countvs[0].
+template<class ModelType>
+void writeResults(ModelType& model, double* counts) {
+  sprintf(modelF, "%s.model", statName);
+  model.write(modelF);
+  writeResultsEM(M, refName, imdName, transcripts, theta, eel, counts, appendNames);
+}
+
+// Frees what init() allocated: the global probv/countvs arrays and the
+// per-thread readers, hit containers, noise probability arrays and model helpers.
+template<class ReadType, class HitType, class ModelType>
+void release(ReadReader<ReadType> **readers, HitContainer<HitType> **hitvs, double **ncpvs, ModelType **mhps) {
+  delete[] probv;
+
+  int t = 0;
+  while (t < nThreads) {
+    delete[] countvs[t];
+    ++t;
+  }
+  delete[] countvs;
+
+  for (t = 0; t < nThreads; ++t) {
+    delete readers[t];
+    delete hitvs[t];
+    delete[] ncpvs[t];
+    delete mhps[t];
+  }
+
+  delete[] readers;
+  delete[] hitvs;
+  delete[] ncpvs;
+  delete[] mhps;
+}
+
+// Tells whether the model parameters are re-estimated in EM round ROUND:
+// only in the first 10 rounds.
+inline bool doesUpdateModel(int ROUND) {
+  // return ROUND <= 20 || ROUND % 100 == 0;
+  const bool withinFirstTen = !(ROUND > 10);
+  return withinFirstTen;
+}
+
+//Including initialize, algorithm and results saving
+//
+// Runs the whole EM pipeline for one read type:
+//  1. init() loads the alignments and splits them among nThreads workers;
+//  2. E/M rounds repeat until no theta entry changes by STOP_CRITERIA or more
+//     (at least MIN_ROUND and at most MAX_ROUND rounds);
+//  3. optionally writes <imdName>.ofg for the Gibbs sampler;
+//  4. one more E-step stores the posterior weights in ncpvs and the hits;
+//  5. writes <statName>.theta, the model and result files and, on request,
+//     <outName>.transcript.bam.
+template<class ReadType, class HitType, class ModelType>
+void EM() {
+  FILE *fo;
+
+  int ROUND;
+  double sum;
+
+  double bChange = 0.0, change = 0.0; // bChange : biggest change
+  int totNum = 0; // number of theta entries that have not converged yet
+
+  ModelType model(mparams); //master model
+  ReadReader<ReadType> **readers;
+  HitContainer<HitType> **hitvs;
+  double **ncpvs;
+  ModelType **mhps; //model helpers
+
+  // std::vector instead of variable-length arrays: VLAs are a compiler
+  // extension in C++ and put nThreads-sized storage on the stack.
+  vector<Params> fparams(nThreads);
+  vector<pthread_t> threads(nThreads);
+  pthread_attr_t attr;
+  int rc;
+
+
+  //initialize boolean variables
+  updateModel = calcExpectedWeights = false;
+
+  theta.clear();
+  theta.resize(M + 1, 0.0);
+  init<ReadType, HitType, ModelType>(readers, hitvs, ncpvs, mhps);
+
+  //set initial parameters
+  assert(N_tot > N2);
+  theta[0] = max(N0 * 1.0 / (N_tot - N2), 1e-8);
+  double val = (1.0 - theta[0]) / M;
+  for (int i = 1; i <= M; i++) theta[i] = val;
+
+  model.estimateFromReads(imdName);
+
+  for (int i = 0; i < nThreads; i++) {
+    fparams[i].model = (void*)(&model);
+
+    fparams[i].reader = (void*)readers[i];
+    fparams[i].hitv = (void*)hitvs[i];
+    fparams[i].ncpv = (void*)ncpvs[i];
+    fparams[i].mhp = (void*)mhps[i];
+    fparams[i].countv = (void*)countvs[i];
+  }
+
+  /* set thread attribute to be joinable */
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+  ROUND = 0;
+  do {
+    ++ROUND;
+
+    updateModel = doesUpdateModel(ROUND);
+
+    // the worker threads read probv, so theta can be rewritten in the M step
+    for (int i = 0; i <= M; i++) probv[i] = theta[i];
+
+    //E step
+    for (int i = 0; i < nThreads; i++) {
+      rc = pthread_create(&threads[i], &attr, E_STEP<ReadType, HitType, ModelType>, (void*)(&fparams[i]));
+      pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) at ROUND " + itos(ROUND) + "!");
+    }
+
+    for (int i = 0; i < nThreads; i++) {
+      rc = pthread_join(threads[i], NULL);
+      pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) at ROUND " + itos(ROUND) + "!");
+    }
+
+    model.setNeedCalcConPrb(false);
+
+    // fold the per-thread counts into countvs[0]
+    for (int i = 1; i < nThreads; i++) {
+      for (int j = 0; j <= M; j++) {
+        countvs[0][j] += countvs[i][j];
+      }
+    }
+
+    //add N0 noise reads
+    countvs[0][0] += N0;
+
+    //M step;
+    sum = 0.0;
+    for (int i = 0; i <= M; i++) sum += countvs[0][i];
+    assert(sum >= EPSILON);
+    for (int i = 0; i <= M; i++) theta[i] = countvs[0][i] / sum;
+
+    if (updateModel) {
+      model.init();
+      for (int i = 0; i < nThreads; i++) { model.collect(*mhps[i]); }
+      model.finish();
+    }
+
+    // Relative error
+    bChange = 0.0; totNum = 0;
+    for (int i = 0; i <= M; i++)
+      if (probv[i] >= 1e-7) {
+        change = fabs(theta[i] - probv[i]) / probv[i];
+        if (change >= STOP_CRITERIA) ++totNum;
+        if (bChange < change) bChange = change;
+      }
+
+    if (verbose) { cout<< "ROUND = "<< ROUND<< ", SUM = "<< setprecision(15)<< sum<< ", bChange = " << setprecision(6)<< bChange<< ", totNum = " << totNum<< endl; }
+  } while (ROUND < MIN_ROUND || (totNum > 0 && ROUND < MAX_ROUND));
+//  } while (ROUND < 1);
+
+  if (totNum > 0) fprintf(stderr, "Warning: RSEM reaches %d iterations before meeting the convergence criteria.\n", MAX_ROUND);
+
+  //generate output file used by Gibbs sampler
+  if (genGibbsOut) {
+    if (model.getNeedCalcConPrb()) {
+      for (int i = 0; i < nThreads; i++) {
+        rc = pthread_create(&threads[i], &attr, calcConProbs<ReadType, HitType, ModelType>, (void*)(&fparams[i]));
+        pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) when generating files for Gibbs sampler!");
+      }
+      for (int i = 0; i < nThreads; i++) {
+        rc = pthread_join(threads[i], NULL);
+        pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) when generating files for Gibbs sampler!");
+      }
+    }
+    model.setNeedCalcConPrb(false);
+
+    sprintf(out_for_gibbs_F, "%s.ofg", imdName);
+    ofstream fout(out_for_gibbs_F);
+    general_assert(fout.is_open(), "Cannot open " + cstrtos(out_for_gibbs_F) + " for writing!");
+    fout<< M<< " "<< N0<< endl;
+    for (int i = 0; i < nThreads; i++) {
+      READ_INT_TYPE numN = hitvs[i]->getN();
+      for (READ_INT_TYPE j = 0; j < numN; j++) {
+        HIT_INT_TYPE fr = hitvs[i]->getSAt(j);
+        HIT_INT_TYPE to = hitvs[i]->getSAt(j + 1);
+        // entries written for this read; named apart from the convergence
+        // counter totNum, which the original shadowed here
+        HIT_INT_TYPE nWritten = 0;
+
+        if (ncpvs[i][j] >= EPSILON) { ++nWritten; fout<< "0 "<< setprecision(15)<< ncpvs[i][j]<< " "; }
+        for (HIT_INT_TYPE k = fr; k < to; k++) {
+          HitType &hit = hitvs[i]->getHitAt(k);
+          if (hit.getConPrb() >= EPSILON) {
+            ++nWritten;
+            fout<< hit.getSid()<< " "<< setprecision(15)<< hit.getConPrb()<< " ";
+          }
+        }
+
+        if (nWritten > 0) { fout<< endl; }
+      }
+    }
+    fout.close();
+  }
+
+  //calculate expected weights and counts using learned parameters
+  //just use the raw theta learned from the data, do not correct for eel or mw
+  updateModel = false; calcExpectedWeights = true;
+  for (int i = 0; i <= M; i++) probv[i] = theta[i];
+  for (int i = 0; i < nThreads; i++) {
+    rc = pthread_create(&threads[i], &attr, E_STEP<ReadType, HitType, ModelType>, (void*)(&fparams[i]));
+    pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) when calculating expected weights!");
+  }
+  for (int i = 0; i < nThreads; i++) {
+    rc = pthread_join(threads[i], NULL);
+    pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) when calculating expected weights!");
+  }
+  model.setNeedCalcConPrb(false);
+  for (int i = 1; i < nThreads; i++) {
+    for (int j = 0; j <= M; j++) {
+      countvs[0][j] += countvs[i][j];
+    }
+  }
+  countvs[0][0] += N0;
+
+  /* destroy attribute */
+  pthread_attr_destroy(&attr);
+
+
+  sprintf(thetaF, "%s.theta", statName);
+  fo = fopen(thetaF, "w");
+  // fprintf on a NULL stream would crash; fail with a readable message instead
+  general_assert(fo != NULL, "Cannot open " + cstrtos(thetaF) + " for writing!");
+  fprintf(fo, "%d\n", M + 1);
+
+  // output theta'
+  for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
+  fprintf(fo, "%.15g\n", theta[M]);
+
+  //calculate expected effective lengths for each isoform
+  calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
+  polishTheta(M, theta, eel, model.getMW());
+
+  // output theta
+  for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
+  fprintf(fo, "%.15g\n", theta[M]);
+
+  fclose(fo);
+
+  writeResults<ModelType>(model, countvs[0]);
+
+  if (genBamF) {
+    sprintf(outBamF, "%s.transcript.bam", outName);
+
+    if (bamSampling) {
+      READ_INT_TYPE local_N;
+      HIT_INT_TYPE fr, to, len, id;
+      vector<double> arr;
+      engine_type engine(hasSeed ? seed : time(NULL));
+      uniform_01_dist uniform_01;
+      uniform_01_generator rg(engine, uniform_01);
+
+      if (verbose) cout<< "Begin to sample reads from their posteriors."<< endl;
+      for (int i = 0; i < nThreads; i++) {
+        local_N = hitvs[i]->getN();
+        for (READ_INT_TYPE j = 0; j < local_N; j++) {
+          fr = hitvs[i]->getSAt(j);
+          to = hitvs[i]->getSAt(j + 1);
+          len = to - fr + 1;
+          // arr holds cumulative weights: slot 0 is noise, slot k - fr + 1 is hit k
+          arr.assign(len, 0);
+          arr[0] = ncpvs[i][j];
+          for (HIT_INT_TYPE k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb();
+          id = (arr[len - 1] < EPSILON ? -1 : sample(rg, arr, len)); // if all entries in arr are 0, let id be -1
+          // the sampled hit gets weight 1, every other hit of the read gets 0
+          for (HIT_INT_TYPE k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0);
+        }
+      }
+
+      if (verbose) cout<< "Sampling is finished."<< endl;
+    }
+
+    BamWriter writer(inpSamF, aux, outBamF, transcripts, nThreads);
+    HitWrapper<HitType> wrapper(nThreads, hitvs);
+    writer.work(wrapper);
+  }
+
+  release<ReadType, HitType, ModelType>(readers, hitvs, ncpvs, mhps);
+}
+
+// Entry point of rsem-run-em.
+//
+// Positional arguments: refName read_type sampleName imdName statName, followed
+// by the options listed in the usage text. Loads the references, transcripts
+// and read counts, then either writes empty results (no alignable reads) or
+// runs EM() for the requested read type.
+int main(int argc, char* argv[]) {
+  ifstream fin;
+
+  if (argc < 6) {
+    printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpF has_fai? [fai_file]] [-q] [--gibbs-out] [--sampling] [--seed seed] [--append-names]\n\n");
+    printf("  refName: reference name\n");
+    printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
+    printf("  sampleName: sample's name, including the path\n");
+    printf("  imdName: name prefix of the intermediate files (.dat, .mparams, ...)\n");
+    printf("  statName: name prefix of the statistics files (.cnt, .theta, .model)\n");
+    printf("  -p: number of threads which user wants to use. (default: 1)\n");
+    printf("  -b: produce bam format output file. (default: off)\n");
+    printf("  -q: set it quiet\n");
+    printf("  --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n");
+    printf("  --sampling: sample each read from its posterior distribution when BAM file is generated. (default: off)\n");
+    printf("  --seed uint32: the seed used for the BAM sampling. (default: off)\n");
+    printf("  --append-names: append transcript_name/gene_name when available. (default: off)\n");
+    printf("// model parameters should be in imdName.mparams.\n");
+    exit(-1);
+  }
+
+  time_t a = time(NULL);
+
+  strcpy(refName, argv[1]);
+  read_type = atoi(argv[2]);
+  strcpy(outName, argv[3]);
+  strcpy(imdName, argv[4]);
+  strcpy(statName, argv[5]);
+
+  nThreads = 1;
+
+  genBamF = false;
+  bamSampling = false;
+  genGibbsOut = false;
+  aux = NULL;
+  hasSeed = false;
+  appendNames = false;
+
+  // Every option that takes a value first checks that the value exists:
+  // argv[argc] is NULL and anything beyond it is out of bounds.
+  for (int i = 6; i < argc; i++) {
+    if (!strcmp(argv[i], "-p")) {
+      general_assert(i + 1 < argc, "Option -p requires the number of threads!");
+      nThreads = atoi(argv[i + 1]);
+    }
+    if (!strcmp(argv[i], "-b")) {
+      general_assert(i + 2 < argc, "Option -b requires samInpF and has_fai!");
+      genBamF = true;
+      strcpy(inpSamF, argv[i + 1]);
+      if (atoi(argv[i + 2]) == 1) {
+        general_assert(i + 3 < argc, "Option -b requires fai_file when has_fai is 1!");
+        aux = argv[i + 3];
+      }
+    }
+    if (!strcmp(argv[i], "-q")) { verbose = false; }
+    if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
+    if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
+    if (!strcmp(argv[i], "--seed")) {
+      general_assert(i + 1 < argc, "Option --seed requires a seed value!");
+      hasSeed = true;
+      int len = strlen(argv[i + 1]);
+      seed = 0;
+      for (int k = 0; k < len; k++) {
+        char digit = argv[i + 1][k];
+        // anything but a decimal digit would silently yield a garbage seed
+        general_assert(digit >= '0' && digit <= '9', "The seed should be a non-negative integer!");
+        seed = seed * 10 + (digit - '0');
+      }
+    }
+    if (!strcmp(argv[i], "--append-names")) appendNames = true;
+  }
+
+  general_assert(nThreads > 0, "Number of threads should be bigger than 0!");
+
+  //basic info loading
+  sprintf(refF, "%s.seq", refName);
+  refs.loadRefs(refF);
+  M = refs.getM();
+
+  sprintf(tiF, "%s.ti", refName);
+  transcripts.readFrom(tiF);
+
+  sprintf(cntF, "%s.cnt", statName);
+  fin.open(cntF);
+
+  general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! It may not exist.");
+
+  fin>>N0>>N1>>N2>>N_tot;
+  fin.close();
+
+  if (N1 == 0) {
+    // Nothing to estimate: write empty .theta/.model files, all-zero results
+    // and, if a BAM file was requested, a plain copy of the input.
+    printf("Warning: There are no alignable reads!\n");
+    theta.resize(M + 1, 0.0);
+    FILE *fo = NULL;
+    sprintf(thetaF, "%s.theta", statName);
+    fo = fopen(thetaF, "w");
+    general_assert(fo != NULL, "Cannot open " + cstrtos(thetaF) + " for writing!");
+    fclose(fo);
+    sprintf(modelF, "%s.model", statName);
+    fo = fopen(modelF, "w");
+    general_assert(fo != NULL, "Cannot open " + cstrtos(modelF) + " for writing!");
+    fclose(fo);
+    eel.resize(M + 1, 0.0);
+    for (int i = 1; i <= M; ++i) eel[i] = transcripts.getTranscriptAt(i).getLength();
+    double *countv = new double[M + 1];
+    memset(countv, 0, sizeof(double) * (M + 1));
+    writeResultsEM(M, refName, imdName, transcripts, theta, eel, countv, appendNames);
+    if (genBamF) {
+      sprintf(outBamF, "%s.transcript.bam", outName);
+      // std::string grows as needed; the former char[1005] buffer could not
+      // hold two paths of up to STRLEN characters each
+      string command = string("cp ") + inpSamF + " " + outBamF;
+      printf("%s\n", command.c_str());
+      if (system(command.c_str()) != 0) {
+        fprintf(stderr, "Warning: Command \"%s\" failed!\n", command.c_str());
+      }
+    }
+    delete[] countv;
+  }
+  else {
+    if ((READ_INT_TYPE)nThreads > N1) nThreads = N1;
+
+    //set model parameters
+    mparams.M = M;
+    mparams.N[0] = N0; mparams.N[1] = N1; mparams.N[2] = N2;
+    mparams.refs = &refs;
+
+    sprintf(mparamsF, "%s.mparams", imdName);
+    fin.open(mparamsF);
+
+    general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "It may not exist.");
+
+    fin>> mparams.minL>> mparams.maxL>> mparams.probF;
+    int val; // 0 or 1 , for estRSPD
+    fin>>val;
+    mparams.estRSPD = (val != 0);
+    fin>> mparams.B>> mparams.mate_minL>> mparams.mate_maxL>> mparams.mean>> mparams.sd;
+    fin>> mparams.seedLen;
+    fin.close();
+
+    //run EM
+    switch(read_type) {
+    case 0 : EM<SingleRead, SingleHit, SingleModel>(); break;
+    case 1 : EM<SingleReadQ, SingleHit, SingleQModel>(); break;
+    case 2 : EM<PairedEndRead, PairedEndHit, PairedEndModel>(); break;
+    case 3 : EM<PairedEndReadQ, PairedEndHit, PairedEndQModel>(); break;
+    default : fprintf(stderr, "Unknown Read Type!\n"); exit(-1);
+    }
+  }
+
+  time_t b = time(NULL);
+
+  printTimeUsed(a, b, "EM.cpp");
+
+  return 0;
+}
diff --git a/GTFItem.h b/GTFItem.h
new file mode 100644
index 0000000..f7c794c
--- /dev/null
+++ b/GTFItem.h
@@ -0,0 +1,184 @@
+/* Copyright (c) 2015
+ Bo Li (University of California, Berkeley)
+ bli25@berkeley.edu
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ USA
+*/
+
+#ifndef GTFITEM_H_
+#define GTFITEM_H_
+
+#include<cstdio>
+#include<cctype>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<sstream>
+
+// One line of a GTF file: the eight fixed tab-separated columns plus the raw
+// attribute column ("left"), from which parseAttributes() extracts gene_id,
+// transcript_id, gene_name and transcript_name.
+class GTFItem {
+ public:
+
+  GTFItem() {
+    seqname = source = feature = "";
+    score = "";
+    start = end = 0;
+    strand = 0; //strand is a char variable
+    frame = "";
+    gene_id = transcript_id = "";
+    gene_name = transcript_name = "";
+    left = "";
+  }
+
+  // Orders items by gene_id, then transcript_id, then start coordinate.
+  bool operator<(const GTFItem& o) const {
+    if (gene_id != o.gene_id) return gene_id < o.gene_id;
+    if (transcript_id != o.transcript_id) return transcript_id < o.transcript_id;
+    return start < o.start;
+  }
+
+  // Splits one GTF line into its columns. The attribute column is stored
+  // unparsed; call parseAttributes() afterwards to decode it. Terminates the
+  // program (via gtf_assert) when the strand column is not '+' or '-'.
+  void parse(const std::string& line) {
+    std::istringstream strin(line);
+    std::string tmp;
+
+    getline(strin, seqname, '\t');
+    getline(strin, source, '\t');
+    getline(strin, feature, '\t');
+    getline(strin, tmp, '\t');
+    start = atoi(tmp.c_str());
+    getline(strin, tmp, '\t');
+    end = atoi(tmp.c_str());
+    getline(strin, score, '\t');
+    getline(strin, tmp, '\t');
+    gtf_assert((tmp.length() == 1 && (tmp[0] == '+' || tmp[0] == '-')), line, "Strand is neither '+' nor '-'!");
+    strand = tmp[0];
+    getline(strin, frame, '\t');
+
+    getline(strin, left); // assign attributes and possible comments into "left"
+  }
+
+  // Decodes the attribute column of an exon line. gene_id and transcript_id
+  // are mandatory, must be quoted, non-empty and appear once; gene_name and
+  // transcript_name are optional and only the first well-formed occurrence is
+  // kept. line is only used for error messages. Terminates the program (via
+  // gtf_assert) on malformed input.
+  void parseAttributes(const std::string& line) {
+    assert(feature == "exon");
+    gene_id = transcript_id = "";
+    gene_name = transcript_name = "";
+
+    int nleft = 4; // attributes of interest still to be found; ends the scan early
+    int pos, lpos = 0, rpos, left_len = left.length();
+    std::string identifier;
+
+    while (nleft > 0 && get_an_attribute(lpos, rpos, left_len)) {
+      // locate identifier
+      pos = lpos;
+      while (pos < rpos && !is_space(left[pos])) ++pos;
+      gtf_assert(is_space(left[pos]), line, "Cannot locate the identifier from attribute " + left.substr(lpos, rpos + 1 - lpos) + "!");
+      identifier = left.substr(lpos, pos - lpos);
+
+      // prepare for the next attribute
+      lpos = rpos + 1;
+
+      // locate value
+      // afterwards pos/rpos sit on the opening/closing double quote, or are
+      // collapsed so that rpos - pos <= 1 when the value is not properly quoted
+      ++pos;
+      while (pos < rpos && is_space(left[pos])) ++pos;
+      if (left[pos] != '"') pos = rpos;
+      --rpos;
+      while (rpos > pos && is_space(left[rpos])) --rpos;
+      if (rpos > pos && left[rpos] != '"') rpos = pos;
+
+      // test if the identifier is interested
+      if (identifier == "gene_id") {
+        gtf_assert(gene_id == "", line, "gene_id appear more than once!");
+        gtf_assert(rpos - pos > 1 , line, "Attribute " + identifier + "'s value should be surrounded by double quotes and cannot be empty!");
+        gene_id = left.substr(pos + 1, rpos - pos - 1);
+        --nleft;
+      }
+      else if (identifier == "transcript_id") {
+        gtf_assert(transcript_id == "", line, "transcript_id appear more than once!");
+        gtf_assert(rpos - pos > 1 , line, "Attribute " + identifier + "'s value should be surrounded by double quotes and cannot be empty!");
+        transcript_id = left.substr(pos + 1, rpos - pos - 1);
+        --nleft;
+      }
+      else if (identifier == "gene_name" && gene_name == "" && rpos - pos > 1) {
+        gene_name = left.substr(pos + 1, rpos - pos - 1);
+        --nleft;
+      }
+      else if (identifier == "transcript_name" && transcript_name == "" && rpos - pos > 1) {
+        transcript_name = left.substr(pos + 1, rpos - pos - 1);
+        --nleft;
+      }
+    }
+
+    gtf_assert(gene_id != "", line, "Cannot find gene_id!");
+    gtf_assert(transcript_id != "", line, "Cannot find transcript_id!");
+  }
+
+  const std::string& getSeqName() const { return seqname; }
+  const std::string& getSource() const { return source; }
+  const std::string& getFeature() const { return feature; } // by reference, like the other getters
+  int getStart() const { return start; }
+  int getEnd() const { return end; }
+  char getStrand() const { return strand; }
+  const std::string& getScore() const { return score; } // float, integer or "." ; let downstream programs parse it
+  const std::string& getFrame() const { return frame; } // 0, 1, 2, or "."; let downstream programs parse it
+  const std::string& getGeneID() const { return gene_id; }
+  const std::string& getTranscriptID() const { return transcript_id; }
+  const std::string& getGeneName() const { return gene_name; }
+  const std::string& getTranscriptName() const { return transcript_name; }
+  const std::string& getLeft() const { return left; } // raw attribute column; callable on const items
+
+  void setGeneID(const std::string& gene_id) {
+    this->gene_id = gene_id;
+  }
+
+  // Rebuilds the tab-separated GTF line from the stored columns; the attribute
+  // column is emitted exactly as it was read.
+  std::string toString() const {
+    std::ostringstream strout("");
+    strout<< seqname<< '\t'<< source<< '\t'<< feature<< '\t'<< start<< '\t'<< end<< '\t'<< score<< '\t'<< strand<< '\t'<< frame<< '\t'<< left;
+    return strout.str();
+  }
+
+ private:
+  std::string seqname, source, feature;
+  std::string score;
+  int start, end;
+  char strand;
+  std::string frame;
+  std::string gene_id, transcript_id;
+  std::string gene_name, transcript_name;
+  std::string left;
+
+  // isspace() is only defined for EOF and unsigned char values, so a plain
+  // char (possibly negative for non-ASCII bytes) has to be cast first.
+  static bool is_space(char c) {
+    return isspace((unsigned char)c) != 0;
+  }
+
+  // Finds the next attribute in "left" starting at lpos: skips leading white
+  // space, then advances rpos to the first ';' outside double quotes. Returns
+  // false when no terminating ';' is left.
+  bool get_an_attribute(int& lpos, int& rpos, int left_len) const {
+    bool in_quote;
+
+    while (lpos < left_len && is_space(left[lpos])) ++lpos; // remove leading white spaces
+
+    rpos = lpos; in_quote = false;
+    while (rpos < left_len && (left[rpos] != ';' || in_quote)) {
+      if (left[rpos] == '"') in_quote ^= true;
+      ++rpos;
+    }
+
+    return rpos < left_len;
+  }
+
+  // Prints a diagnostic naming the offending line and exits with status -1
+  // when expr is false.
+  void gtf_assert(bool expr, const std::string& line, const std::string& msg) const {
+    if (!expr) {
+      fprintf(stderr, "The GTF file might be corrupted!\n");
+      fprintf(stderr, "Stop at line : %s\n", line.c_str());
+      fprintf(stderr, "Error Message: %s\n", msg.c_str());
+      exit(-1);
+    }
+  }
+};
+
+#endif
diff --git a/Gibbs.cpp b/Gibbs.cpp
new file mode 100644
index 0000000..80f1ead
--- /dev/null
+++ b/Gibbs.cpp
@@ -0,0 +1,530 @@
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<fstream>
+#include<sstream>
+#include<vector>
+#include<pthread.h>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "sampling.h"
+
+#include "Model.h"
+#include "SingleModel.h"
+#include "SingleQModel.h"
+#include "PairedEndModel.h"
+#include "PairedEndQModel.h"
+
+#include "Refs.h"
+
+#include "GroupInfo.h"
+#include "WriteResults.h"
+
+using namespace std;
+
+bool verbose = true; // gates the progress messages printed in this file; main() resets it to !quiet
+
+// Working set handed to one sampler thread (one element of paramsArray per thread).
+struct Params {
+  int no;               // thread number, counted from 0
+  int nsamples;         // how many count vectors this thread has to record
+  FILE *fo;             // stream receiving this thread's recorded count vectors
+  engine_type *engine;  // random engine used by this thread
+  double *pme_c;        // per-thread sums for the posterior mean on counts
+  double *pve_c;        // per-thread sums of squares for the posterior variance on counts
+  double *pme_tpm;      // per-thread sums of TPM values
+  double *pme_fpkm;     // per-thread sums of FPKM values
+
+  double *pve_c_genes;  // per-thread sums of squared gene-level counts
+  double *pve_c_trans;  // same at transcript level; NULL unless allele-specific
+};
+
+// One alignment candidate of a read: a transcript id and its conditional probability.
+struct Item {
+  int sid;
+  double conprb;
+
+  Item(int sid, double conprb) : sid(sid), conprb(conprb) {}
+};
+
+int nThreads; // number of sampler threads; set from -p in main() (default 1)
+
+int model_type; // integer read from the start of the .model file; selects the model class in main()
+int M; // refs.getM(); per-transcript arrays in this file hold M + 1 entries
+READ_INT_TYPE N0, N1; // N0: read from the .ofg header; N1: number of hit lines loaded (s.size() - 1)
+HIT_INT_TYPE nHits; // hits.size() after load_data()
+double totc; // denominator used when counts are turned into theta
+int BURNIN, NSAMPLES, GAP; // argv[4..6]: burn-in rounds, total recorded samples, rounds between recorded samples
+char refName[STRLEN], imdName[STRLEN], statName[STRLEN]; // argv[1..3]
+char thetaF[STRLEN], ofgF[STRLEN], refF[STRLEN], modelF[STRLEN];
+char cvsF[STRLEN]; // "<imdName>.countvectors", prefix of the per-thread output file names
+
+Refs refs;
+
+vector<HIT_INT_TYPE> s; // s[i] .. s[i + 1] delimits the entries of read i inside hits
+vector<Item> hits;
+
+vector<double> eel; // filled by calcExpectedEffectiveLengths()
+double *mw; // private copy of model.getMW(), M + 1 entries
+
+vector<int> init_counts; // -1 for transcript ids listed in the .omit file, 0 otherwise
+double pseudoC; // --pseudo-count value (default 1.0); used when no prior file is given
+
+vector<double> pme_c, pve_c; //global posterior mean and variance vectors on counts
+vector<double> pme_tpm, pme_fpkm;
+
+bool quiet; // set by -q
+
+Params *paramsArray; // one Params per thread, allocated in init()
+pthread_t *threads;
+pthread_attr_t attr;
+int rc; // return code of the most recent pthread call in main()
+
+bool hasSeed; // true when --seed was given
+seedType seed;
+
+int m; // gi.getm()
+char groupF[STRLEN];
+GroupInfo gi;
+
+bool alleleS; // result of isAlleleSpecific()
+int m_trans; // ta.getm() when alleleS, otherwise 0
+GroupInfo gt, ta;
+vector<double> pve_c_genes, pve_c_trans;
+
+// pliu
+// whether a prior file was given, its file name, and a vector holding the prior parameters
+bool has_prior;
+char fprior[STRLEN];
+vector<double> pseudo_counts;
+//////
+
+// Load the reference sequences (<refName>.seq) and the per-read hit list
+// (<imdName>.ofg) into the globals refs, M, N0, N1, s, hits and nHits.
+// statName is accepted but not used here.
+void load_data(char* refName, char* statName, char* imdName) {
+  // reference sequences
+  sprintf(refF, "%s.seq", refName);
+  refs.loadRefs(refF, 1);
+  M = refs.getM();
+
+  // hit list; its header carries M and N0
+  sprintf(ofgF, "%s.ofg", imdName);
+  ifstream ofgIn(ofgF);
+  general_assert(ofgIn.is_open(), "Cannot open " + cstrtos(ofgF) + "!");
+
+  int fileM;
+  ofgIn >> fileM >> N0;
+  general_assert(fileM == M, "M in " + cstrtos(ofgF) + " is not consistent with " + cstrtos(refF) + "!");
+
+  string row;
+  getline(ofgIn, row); // drop the remainder of the header line
+
+  s.clear();
+  hits.clear();
+  s.push_back(0);
+  // every following line holds the (sid, conprb) pairs of one read
+  while (getline(ofgIn, row)) {
+    istringstream fields(row);
+    int sid;
+    double conprb;
+
+    while (fields >> sid >> conprb) hits.push_back(Item(sid, conprb));
+    s.push_back(hits.size());
+  }
+  ofgIn.close();
+
+  N1 = s.size() - 1;
+  nHits = hits.size();
+
+  if (verbose) { printf("Loading data is finished!\n"); }
+}
+
+// Load <refName>.grp into gi and record the number of groups in m, then ask
+// isAlleleSpecific() whether the reference is allele-specific (which also
+// receives gt and ta) and set m_trans accordingly.
+void load_group_info(char* refName) {
+  sprintf(groupF, "%s.grp", refName);
+  gi.load(groupF);
+  m = gi.getm();
+
+  alleleS = isAlleleSpecific(refName, &gt, &ta);
+  if (alleleS) m_trans = ta.getm();
+  else m_trans = 0;
+
+  if (verbose) { printf("Loading group information is finished!\n"); }
+}
+
+// Load <imdName>.omit and initialize the init count vector.
+// Every transcript id listed in the file gets init_counts[id] = -1 (omitted);
+// all others start at 0. totc is set to
+// (number of non-omitted entries among 0..M) * pseudoC + N0 + N1.
+// Aborts through general_assert if the file cannot be opened or contains an
+// id outside [0, M].
+void load_omit_info(const char* imdName) {
+  char omitF[STRLEN];
+  int tid;
+
+  sprintf(omitF, "%s.omit", imdName);
+  FILE *fi = fopen(omitF, "r");
+  // without this check fscanf() below would dereference a NULL stream
+  general_assert(fi != NULL, "Cannot open " + cstrtos(omitF) + "!");
+
+  init_counts.assign(M + 1, 0);
+  totc = M + 1;
+  while (fscanf(fi, "%d", &tid) == 1) {
+    // tid indexes init_counts directly, so reject ids that would write out of bounds
+    general_assert(tid >= 0 && tid <= M, "Transcript id in " + cstrtos(omitF) + " is out of range!");
+    init_counts[tid] = -1;
+    --totc;
+  }
+  fclose(fi);
+  totc = totc * pseudoC + N0 + N1;
+}
+
+// pliu
+// load isoform's prior information and re-calculate totc
+// The file must provide one prior per line for transcripts 1..M, in order; text
+// following the number on a line is ignored. Priors of to-be-omitted transcripts
+// are read but not stored. Aborts through general_assert if the file cannot be
+// opened or a line is missing or does not start with a number.
+void load_prior_info(const char* fprior){
+  pseudo_counts.assign(M+1, 0.0);
+
+  ifstream fin(fprior);
+  general_assert(fin.is_open(), "Cannot open " + string(fprior) + "!");
+
+  string line;
+  for(int i=1; i<=M; ++i){
+    double prior = 0.0;
+    // a short file or an unparsable line would otherwise leave `prior` undefined
+    const bool parsed = getline(fin, line) && sscanf(line.c_str(), "%lf%*s", &prior) == 1;
+    general_assert(parsed, "Cannot read the prior of every isoform from " + string(fprior) + "!");
+    if ( init_counts[i] == 0 ){ // not to-be-omitted
+      pseudo_counts[i] = prior;
+    }
+  }
+  fin.close();
+
+  // re-calculate 'totc' by considering prior parameters
+  totc = 1;
+  for ( int i=1; i<=M; ++i ) {
+    if ( init_counts[i] == 0 ) { // not to-be-omitted
+      totc += pseudo_counts[i];
+    }
+  }
+  totc += N0 + N1;
+}
+//////
+
+// Read the model stored in modelF, fill eel via calcExpectedEffectiveLengths()
+// and copy the model's MW array (M + 1 doubles) into the global mw.
+template<class ModelType>
+void init_model_related(char* modelF) {
+  ModelType fitted;
+  fitted.read(modelF);
+
+  calcExpectedEffectiveLengths<ModelType>(M, refs, fitted, eel);
+
+  // the copy is required: the model object goes out of scope when this function returns
+  const size_t nbytes = sizeof(double) * (M + 1);
+  memcpy(mw, fitted.getMW(), nbytes);
+}
+
+// Allocate an array of n doubles with every element set to zero bytes.
+// The caller releases it with delete[].
+static double* gibbs_new_zeroed(int n) {
+  double *arr = new double[n];
+  memset(arr, 0, sizeof(double) * n);
+  return arr;
+}
+
+// assign threads
+// Splits NSAMPLES across nThreads (the first NSAMPLES % nThreads threads take one
+// extra sample), opens one output file "<imdName>.countvectors<i>" per thread,
+// gives each thread its own random engine and zeroed accumulators, and prepares
+// the joinable thread attribute. Aborts through general_assert if an output
+// file cannot be created.
+void init() {
+  char outF[STRLEN];
+
+  const int quotient = NSAMPLES / nThreads;
+  const int left = NSAMPLES % nThreads;
+
+  sprintf(cvsF, "%s.countvectors", imdName);
+  paramsArray = new Params[nThreads];
+  threads = new pthread_t[nThreads];
+
+  hasSeed ? engineFactory::init(seed) : engineFactory::init();
+  for (int i = 0; i < nThreads; i++) {
+    Params &p = paramsArray[i];
+    p.no = i;
+
+    p.nsamples = quotient;
+    if (i < left) p.nsamples++;
+
+    sprintf(outF, "%s%d", cvsF, i);
+    p.fo = fopen(outF, "w");
+    // the sampler writes to this stream unconditionally, so fail early if it is unusable
+    general_assert(p.fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+    p.engine = engineFactory::new_engine();
+    p.pme_c = gibbs_new_zeroed(M + 1);
+    p.pve_c = gibbs_new_zeroed(M + 1);
+    p.pme_tpm = gibbs_new_zeroed(M + 1);
+    p.pme_fpkm = gibbs_new_zeroed(M + 1);
+
+    p.pve_c_genes = gibbs_new_zeroed(m);
+
+    p.pve_c_trans = NULL;
+    if (alleleS) p.pve_c_trans = gibbs_new_zeroed(m_trans);
+  }
+  engineFactory::finish();
+
+  /* set thread attribute to be joinable */
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+  if (verbose) { printf("Initialization finished!\n"); }
+}
+
+
+// Write counts[0..M] to fo as one line: values separated by single spaces,
+// terminated by a newline.
+void writeCountVector(FILE* fo, vector<int>& counts) {
+  int idx = 0;
+  while (idx < M) {
+    fprintf(fo, "%d ", counts[idx]);
+    ++idx;
+  }
+  fprintf(fo, "%d\n", counts[M]);
+}
+
+
+void* Gibbs(void* arg) { // thread entry point: runs one sampler chain on the Params passed through arg; always returns NULL
+ int CHAINLEN;
+ HIT_INT_TYPE len, fr, to;
+ Params *params = (Params*)arg;
+
+ vector<double> theta, tpm, fpkm;
+ vector<int> z, counts(init_counts); // z[i]: transcript id currently assigned to read i; counts starts as a copy of init_counts
+ vector<double> arr; // cumulative weights of the current read's hits
+
+ uniform_01_generator rg(*params->engine, uniform_01_dist());
+
+ // generate initial state
+ theta.assign(M + 1, 0.0);
+ z.assign(N1, 0);
+ counts[0] += N0; // the N0 reads are all credited to index 0
+
+ for (READ_INT_TYPE i = 0; i < N1; i++) {
+ fr = s[i]; to = s[i + 1];
+ len = to - fr;
+ arr.assign(len, 0);
+ for (HIT_INT_TYPE j = fr; j < to; j++) {
+ arr[j - fr] = hits[j].conprb; // initial draw uses the conditional probabilities alone
+ if (j > fr) arr[j - fr] += arr[j - fr - 1]; // cumulative
+ }
+ z[i] = hits[fr + sample(rg, arr, len)].sid;
+ ++counts[z[i]];
+ }
+
+ // Gibbs sampling
+ CHAINLEN = 1 + (params->nsamples - 1) * GAP; // rounds after burn-in needed to record nsamples vectors, one every GAP rounds
+ for (int ROUND = 1; ROUND <= BURNIN + CHAINLEN; ROUND++) {
+
+ for (READ_INT_TYPE i = 0; i < N1; i++) {
+ --counts[z[i]]; // take read i out of the counts before redrawing its assignment
+ fr = s[i]; to = s[i + 1]; len = to - fr;
+ arr.assign(len, 0);
+ for (HIT_INT_TYPE j = fr; j < to; j++) {
+ if ( has_prior ) {
+ arr[j - fr] = (counts[hits[j].sid] + pseudo_counts[hits[j].sid]) * hits[j].conprb;
+ } else {
+ arr[j - fr] = (counts[hits[j].sid] + pseudoC) * hits[j].conprb;
+ }
+ if (j > fr) arr[j - fr] += arr[j - fr - 1]; //cumulative
+ }
+ z[i] = hits[fr + sample(rg, arr, len)].sid;
+ ++counts[z[i]];
+ }
+
+ if (ROUND > BURNIN) {
+ if ((ROUND - BURNIN - 1) % GAP == 0) { // record the first post-burn-in round and every GAP-th round after it
+ writeCountVector(params->fo, counts);
+ for (int i = 0; i <= M; i++) {
+ if ( has_prior ) {
+ theta[i] = (counts[i] < 0 ? 0.0 : (counts[i] + pseudo_counts[i]) / totc); // negative count marks an omitted transcript
+ } else {
+ theta[i] = (counts[i] < 0 ? 0.0 : (counts[i] + pseudoC) / totc); // negative count marks an omitted transcript
+ }
+ }
+ polishTheta(M, theta, eel, mw);
+ calcExpressionValues(M, theta, eel, tpm, fpkm);
+ for (int i = 0; i <= M; i++) { // accumulate sums and sums of squares; release() turns them into means and variances
+ params->pme_c[i] += counts[i];
+ params->pve_c[i] += double(counts[i]) * counts[i];
+ params->pme_tpm[i] += tpm[i];
+ params->pme_fpkm[i] += fpkm[i];
+ }
+
+ for (int i = 0; i < m; i++) { // squared total count of each group in gi
+ int b = gi.spAt(i), e = gi.spAt(i + 1);
+ double count = 0.0;
+ for (int j = b; j < e; j++) count += counts[j];
+ params->pve_c_genes[i] += count * count;
+ }
+
+ if (alleleS)
+ for (int i = 0; i < m_trans; i++) { // squared total count of each group in ta
+ int b = ta.spAt(i), e = ta.spAt(i + 1);
+ double count = 0.0;
+ for (int j = b; j < e; j++) count += counts[j];
+ params->pve_c_trans[i] += count * count;
+ }
+ }
+ }
+
+ if (verbose && ROUND % 100 == 0) { printf("Thread %d, ROUND %d is finished!\n", params->no, ROUND); }
+ }
+
+ return NULL;
+}
+
+// Sample variance over NSAMPLES draws computed from a sum of squares and the
+// sample mean: (sumSquares - NSAMPLES * mean^2) / (NSAMPLES - 1), with negative
+// results replaced by zero.
+static double gibbs_variance_from_sums(double sumSquares, double mean) {
+  double var = (sumSquares - double(NSAMPLES) * mean * mean) / double(NSAMPLES - 1);
+  if (var < 0.0) var = 0.0;
+  return var;
+}
+
+// Tear down the per-thread state created by init() and reduce the per-thread
+// accumulators into the global posterior means (pme_*) and variances (pve_*).
+void release() {
+  /* destroy attribute */
+  pthread_attr_destroy(&attr);
+  delete[] threads;
+
+  pme_c.assign(M + 1, 0);
+  pve_c.assign(M + 1, 0);
+  pme_tpm.assign(M + 1, 0);
+  pme_fpkm.assign(M + 1, 0);
+
+  pve_c_genes.assign(m, 0);
+  pve_c_trans.clear();
+  if (alleleS) pve_c_trans.assign(m_trans, 0);
+
+  // fold every thread's sums into the globals, then free that thread's resources
+  for (int t = 0; t < nThreads; t++) {
+    Params &p = paramsArray[t];
+
+    fclose(p.fo);
+    delete p.engine;
+
+    for (int k = 0; k <= M; k++) {
+      pme_c[k] += p.pme_c[k];
+      pve_c[k] += p.pve_c[k];
+      pme_tpm[k] += p.pme_tpm[k];
+      pme_fpkm[k] += p.pme_fpkm[k];
+    }
+
+    for (int g = 0; g < m; g++) pve_c_genes[g] += p.pve_c_genes[g];
+
+    if (alleleS) {
+      for (int g = 0; g < m_trans; g++) pve_c_trans[g] += p.pve_c_trans[g];
+    }
+
+    delete[] p.pme_c;
+    delete[] p.pve_c;
+    delete[] p.pme_tpm;
+    delete[] p.pme_fpkm;
+
+    delete[] p.pve_c_genes;
+    if (alleleS) delete[] p.pve_c_trans;
+  }
+  delete[] paramsArray;
+
+  // sums -> means, sums of squares -> variances
+  for (int k = 0; k <= M; k++) {
+    pme_c[k] /= NSAMPLES;
+    pve_c[k] = gibbs_variance_from_sums(pve_c[k], pme_c[k]);
+    pme_tpm[k] /= NSAMPLES;
+    pme_fpkm[k] /= NSAMPLES;
+  }
+
+  for (int g = 0; g < m; g++) {
+    const int first = gi.spAt(g), last = gi.spAt(g + 1);
+    double groupMean = 0.0;
+    for (int k = first; k < last; k++) groupMean += pme_c[k];
+    pve_c_genes[g] = gibbs_variance_from_sums(pve_c_genes[g], groupMean);
+  }
+
+  if (alleleS) {
+    for (int g = 0; g < m_trans; g++) {
+      const int first = ta.spAt(g), last = ta.spAt(g + 1);
+      double groupMean = 0.0;
+      for (int k = first; k < last; k++) groupMean += pme_c[k];
+      pve_c_trans[g] = gibbs_variance_from_sums(pve_c_trans[g], groupMean);
+    }
+  }
+}
+
+// Value that follows the option at argv[i]; aborts through general_assert when
+// the option is the last command-line argument.
+static const char* gibbs_option_value(int argc, char* argv[], int i) {
+  general_assert(i + 1 < argc, "Option " + cstrtos(argv[i]) + " requires a value!");
+  return argv[i + 1];
+}
+
+// Entry point of rsem-run-gibbs: parse the command line, load the inputs, run
+// nThreads sampler threads, reduce their results and write them out.
+// Returns 0 on success; prints the usage text and exits with -1 when fewer than
+// six positional arguments are given.
+int main(int argc, char* argv[]) {
+  if (argc < 7) {
+    // pliu
+    // add an option --prior to take priors
+    printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--seed seed] [--pseudo-count pseudo_count] [--prior file] [-q]\n");
+    printf("\n");
+    printf("Format of the prior file:\n");
+    printf("- One isoform's prior per line\n");
+    printf("- Priors must be in the same order as in the .ti file\n");
+    printf("- Priors for those to-be-omitted isoforms must be included as well\n");
+    printf("- Comments can be added after prior separated by space(s)\n");
+    exit(-1);
+  }
+
+  strcpy(refName, argv[1]);
+  strcpy(imdName, argv[2]);
+  strcpy(statName, argv[3]);
+
+  BURNIN = atoi(argv[4]);
+  NSAMPLES = atoi(argv[5]);
+  GAP = atoi(argv[6]);
+
+  nThreads = 1;
+  hasSeed = false;
+  pseudoC = 1.0;
+  quiet = false;
+
+  // pliu
+  has_prior = false;
+  //////
+
+  for (int i = 7; i < argc; i++) {
+    if (!strcmp(argv[i], "-p")) nThreads = atoi(gibbs_option_value(argc, argv, i));
+    if (!strcmp(argv[i], "--seed")) {
+      const char *digits = gibbs_option_value(argc, argv, i);
+      hasSeed = true;
+      int len = strlen(digits);
+      seed = 0;
+      for (int k = 0; k < len; k++) seed = seed * 10 + (digits[k] - '0');
+    }
+    if (!strcmp(argv[i], "--pseudo-count")) pseudoC = atof(gibbs_option_value(argc, argv, i));
+    if (!strcmp(argv[i], "-q")) quiet = true;
+
+    // pliu
+    if ( ! strcmp(argv[i], "--prior") ) {
+      has_prior = true;
+      strcpy(fprior, gibbs_option_value(argc, argv, i));
+    }
+    //////
+  }
+  verbose = !quiet;
+
+  // These checks guard user input, so they must not disappear when NDEBUG is defined
+  general_assert(NSAMPLES > 1, "NSAMPLES must be greater than 1, otherwise the posterior variance cannot be calculated!");
+  general_assert(GAP > 0, "GAP must be a positive integer!"); // GAP is used as a modulus in the sampler
+  general_assert(nThreads > 0, "Number of threads must be a positive integer!"); // init() divides by nThreads
+
+  if (nThreads > NSAMPLES) {
+    nThreads = NSAMPLES;
+    fprintf(stderr, "Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads);
+  }
+
+  load_data(refName, statName, imdName);
+  load_group_info(refName);
+  load_omit_info(imdName);
+
+  // pliu
+  // have to do it after load_data() in order to use 'M'
+  // the variable 'totc' will be re-calculated by including the prior info
+  if ( has_prior ) {
+    load_prior_info(fprior);
+  }
+  //////
+
+  sprintf(modelF, "%s.model", statName);
+  FILE *fi = fopen(modelF, "r");
+  general_assert(fi != NULL, "Cannot open " + cstrtos(modelF) + "!");
+  // keep the read outside of assert(): an assert body is not evaluated under NDEBUG
+  const int nread = fscanf(fi, "%d", &model_type);
+  fclose(fi);
+  general_assert(nread == 1, "Cannot read the model type from " + cstrtos(modelF) + "!");
+
+  mw = new double[M + 1]; // make an extra copy
+
+  switch(model_type) {
+  case 0 : init_model_related<SingleModel>(modelF); break;
+  case 1 : init_model_related<SingleQModel>(modelF); break;
+  case 2 : init_model_related<PairedEndModel>(modelF); break;
+  case 3 : init_model_related<PairedEndQModel>(modelF); break;
+  default : general_assert(false, "Unknown model type in " + cstrtos(modelF) + "!"); // mw and eel would stay uninitialized
+  }
+
+  if (verbose) printf("Gibbs started!\n");
+
+  init();
+  for (int i = 0; i < nThreads; i++) {
+    rc = pthread_create(&threads[i], &attr, Gibbs, (void*)(&paramsArray[i]));
+    pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0)!");
+  }
+  for (int i = 0; i < nThreads; i++) {
+    rc = pthread_join(threads[i], NULL);
+    pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0)!");
+  }
+  release();
+
+  if (verbose) printf("Gibbs finished!\n");
+
+  writeResultsGibbs(M, m, m_trans, gi, gt, ta, alleleS, imdName, pme_c, pme_fpkm, pme_tpm, pve_c, pve_c_genes, pve_c_trans);
+
+  delete[] mw; // delete the copy; it was allocated with new[]
+
+  return 0;
+}
diff --git a/GroupInfo.h b/GroupInfo.h
new file mode 100644
index 0000000..2508244
--- /dev/null
+++ b/GroupInfo.h
@@ -0,0 +1,55 @@
+#ifndef GROUPINFO_H_
+#define GROUPINFO_H_
+
+#include<cstdio>
+#include<cassert>
+#include<vector>
+
+// Maps each group (gene) to the contiguous range of sequence ids it owns and
+// each sequence id back to its group. Both tables live in std::vector, so the
+// object can be copied, assigned and reloaded without leaks or double frees.
+class GroupInfo {
+public:
+  GroupInfo() : m(0) {}
+
+  // Read the start positions from the given file; replaces any previous content.
+  void load(const char*);
+
+  // Number of groups.
+  int getm() const { return m; }
+
+  // Group that sequence id `sid` belongs to.
+  int gidAt(int sid) const {
+    assert(sid > 0 && sid < starts.back());
+    return gids[sid];
+  }
+
+  // sp : start position
+  // gid == m yields the end position of the last group.
+  int spAt(int gid) const {
+    assert(gid >= 0 && gid <= m);
+    return starts[gid];
+  }
+
+private:
+  int m; // m genes
+  std::vector<int> starts; // genes' start positions
+  std::vector<int> gids; // hash
+};
+
+// The file holds whitespace-separated integers: the start position of every
+// group followed by one final end position. Exits with -1 when the file cannot
+// be opened or holds no integer at all. Defined inline because this header may
+// be included from several translation units.
+inline void GroupInfo::load(const char* groupF) {
+  FILE *fi = fopen(groupF, "r");
+  int pos;
+
+  if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", groupF); exit(-1); }
+
+  starts.clear();
+  while(fscanf(fi, "%d", &pos) == 1) {
+    starts.push_back(pos);
+  }
+  fclose(fi);
+
+  // starts.back() on an empty vector is undefined behaviour
+  if (starts.empty()) { fprintf(stderr, "%s does not contain any start position!\n", groupF); exit(-1); }
+
+  m = starts.size() - 1;
+  gids.assign(starts.back() > 0 ? starts.back() : 0, 0);
+  for (int i = 0; i < m; i++) {
+    for (int j = starts[i]; j < starts[i + 1]; j++) {
+      gids[j] = i;
+    }
+  }
+}
+
+#endif /* GROUPINFO_H_ */
diff --git a/HitContainer.h b/HitContainer.h
new file mode 100644
index 0000000..5dfbf7e
--- /dev/null
+++ b/HitContainer.h
@@ -0,0 +1,118 @@
+#ifndef HITCONTAINER_H_
+#define HITCONTAINER_H_
+
+#include<cassert>
+#include<iostream>
+#include<vector>
+#include<algorithm>
+
+#include "utils.h"
+#include "GroupInfo.h"
+
+// Stores the hits of many reads back to back. s holds the boundaries:
+// the hits of read i occupy positions s[i] .. s[i + 1] - 1 of the hit vector.
+template<class HitType>
+class HitContainer {
+public:
+  HitContainer() {
+    clear();
+  }
+
+  // Drop every read and hit; s is left holding the single boundary 0.
+  void clear() {
+    n = 0;
+    nhits = 0;
+    hits.clear();
+    s.clear();
+
+    s.push_back(0);
+  }
+
+  bool read(std::istream&); // each time a read
+  void write(std::ostream&); // write all reads' hit out
+
+  // Append one hit to the read currently being assembled.
+  void push_back(const HitType& hit) {
+    hits.push_back(hit);
+    ++nhits;
+  }
+
+  //update read information vector etc
+  // Closes the current read, provided hits were added since the last boundary.
+  void updateRI() {
+    if (nhits <= s.back()) return; //Do not change if last read does not have hits
+    s.push_back(nhits);
+    ++n;
+  }
+
+  READ_INT_TYPE getN() { return n; }
+
+  HIT_INT_TYPE getNHits() { return nhits; }
+
+  READ_INT_TYPE calcNumGeneMultiReads(const GroupInfo&);
+  READ_INT_TYPE calcNumIsoformMultiReads();
+
+  // Boundary at position pos; pos == n is the end of the last read.
+  HIT_INT_TYPE getSAt(READ_INT_TYPE pos) {
+    assert(pos >= 0 && pos <= n);
+    return s[pos];
+  }
+
+  // Hit stored at position pos.
+  HitType& getHitAt(HIT_INT_TYPE pos) {
+    assert(pos >= 0 && pos < nhits);
+    return hits[pos];
+  }
+
+private:
+  READ_INT_TYPE n; // n reads in total
+  HIT_INT_TYPE nhits; // # of hits
+  std::vector<HIT_INT_TYPE> s;
+  std::vector<HitType> hits;
+};
+
+//Each time only read one read's hits. If you want to start over, must call clear() first!
+// Input format: the number of hits followed by that many hit records.
+// Returns false when the count or any hit cannot be read; in that case the
+// container is left exactly as it was before the call.
+template<class HitType>
+bool HitContainer<HitType>::read(std::istream& in) {
+  HIT_INT_TYPE tot;
+
+  if (!(in>>tot)) return false;
+  assert(tot > 0);
+  for (HIT_INT_TYPE i = 0; i < tot; i++) {
+    HitType hit;
+    if (!hit.read(in)) {
+      // roll back the hits of this incomplete read so hits.size() stays equal to nhits
+      hits.erase(hits.begin() + nhits, hits.end());
+      return false;
+    }
+    hits.push_back(hit);
+  }
+
+  nhits = nhits + tot;
+  ++n;
+  s.push_back(nhits);
+
+  return true;
+}
+
+// Emit every read on its own line: the number of hits followed by the hits
+// themselves (each hit formats itself through its write()).
+template<class HitType>
+void HitContainer<HitType>::write(std::ostream& out) {
+  for (READ_INT_TYPE r = 0; r < n; r++) {
+    const HIT_INT_TYPE first = s[r];
+    const HIT_INT_TYPE last = s[r + 1];
+
+    out << last - first;
+    for (HIT_INT_TYPE h = first; h < last; h++) hits[h].write(out);
+    out << std::endl;
+  }
+}
+
+// Count the reads whose hits fall into more than one group of `gi`.
+template<class HitType>
+READ_INT_TYPE HitContainer<HitType>::calcNumGeneMultiReads(const GroupInfo& gi) {
+  READ_INT_TYPE res = 0;
+  std::vector<int> geneIds; // reused across reads; indexed from 0 for every read
+
+  for (READ_INT_TYPE i = 0; i < n; i++) {
+    geneIds.clear();
+    // The buffer must be filled relative to the read's first hit. Indexing it with
+    // the global hit position writes past its end for every read but the first.
+    for (HIT_INT_TYPE j = s[i]; j < s[i + 1]; j++) geneIds.push_back(gi.gidAt(hits[j].getSid()));
+
+    std::sort(geneIds.begin(), geneIds.end());
+    if (std::unique(geneIds.begin(), geneIds.end()) - geneIds.begin() > 1) ++res;
+  }
+
+  return res;
+}
+
+// Count the reads that own more than one hit.
+template<class HitType>
+READ_INT_TYPE HitContainer<HitType>::calcNumIsoformMultiReads() {
+  READ_INT_TYPE multi = 0;
+  for (READ_INT_TYPE r = 1; r <= n; r++) {
+    if (s[r] - s[r - 1] > 1) ++multi;
+  }
+  return multi;
+}
+
+#endif /* HITCONTAINER_H_ */
diff --git a/HitWrapper.h b/HitWrapper.h
new file mode 100644
index 0000000..b2fbbd8
--- /dev/null
+++ b/HitWrapper.h
@@ -0,0 +1,35 @@
+#ifndef HITWRAPPER_H_
+#define HITWRAPPER_H_
+
+#include "utils.h"
+#include "HitContainer.h"
+
+// Presents the hits of several HitContainers as one sequence, container after
+// container in array order. Containers without any hit are skipped.
+
+template<class HitType>
+class HitWrapper {
+public:
+  // hitvs: array of nThreads container pointers; the wrapper does not own them
+  HitWrapper(int nThreads, HitContainer<HitType> **hitvs) {
+    this->nThreads = nThreads;
+    this->hitvs = hitvs;
+    i = 0; j = 0;
+  }
+
+  // Next hit of the sequence, or NULL once every container is exhausted.
+  HitType* getNextHit() {
+    // move past exhausted containers; this also covers containers that are empty,
+    // for which getHitAt(0) would trip its assertion
+    while (i < nThreads && j >= hitvs[i]->getNHits()) { ++i; j = 0; }
+    if (i >= nThreads) return NULL;
+
+    HitType *res = &(hitvs[i]->getHitAt(j));
+    ++j;
+
+    return res;
+  }
+
+private:
+  int i, nThreads; // i: container currently being read
+  HIT_INT_TYPE j; // position of the next hit inside container i
+  HitContainer<HitType> **hitvs;
+};
+
+#endif /* HITWRAPPER_H_ */
diff --git a/LenDist.h b/LenDist.h
new file mode 100644
index 0000000..90d47d7
--- /dev/null
+++ b/LenDist.h
@@ -0,0 +1,296 @@
+#ifndef LENDIST_H_
+#define LENDIST_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<algorithm>
+
+#include "boost/math/distributions/normal.hpp"
+
+#include "utils.h"
+#include "simul.h"
+
+// Discrete length distribution over the integer lengths in (lb, ub].
+// pdf[k] and cdf[k] (k = 1..span) belong to length lb + k; index 0 is a zero sentinel.
+// The class owns both arrays, so it provides a copy constructor, a copy
+// assignment operator and a destructor.
+class LenDist {
+public:
+  // Starts as the uniform distribution over [minL, maxL].
+  LenDist(int minL = 1, int maxL = 1000) {
+    lb = minL - 1;
+    ub = maxL;
+    span = ub - lb;
+    assert(span > 0);
+
+    pdf = new double[span + 1];
+    cdf = new double[span + 1];
+
+    //set initial parameters
+    pdf[0] = cdf[0] = 0.0;
+    for (int i = 1; i <= span; i++) {
+      pdf[i] = 1.0 / span;
+      cdf[i] = i * 1.0 / span;
+    }
+  }
+
+  // Deep copy. Without it the implicit copy would share pdf/cdf with the source
+  // and both destructors would free the same arrays.
+  LenDist(const LenDist& o) : lb(o.lb), ub(o.ub), span(o.span) {
+    pdf = new double[span + 1];
+    cdf = new double[span + 1];
+    memcpy(pdf, o.pdf, sizeof(double) * (span + 1));
+    memcpy(cdf, o.cdf, sizeof(double) * (span + 1));
+  }
+
+  ~LenDist() {
+    delete[] pdf;
+    delete[] cdf;
+  }
+
+  LenDist& operator=(const LenDist&);
+
+  void setAsNormal(double, double, int, int);
+
+  void init();
+
+  //the corresponding lb and ub are the original one
+  // Adds frac to the (unnormalised) mass of length len.
+  void update(int len, double frac) {
+    assert(len > lb && len <= ub);
+    pdf[len - lb] += frac;
+  }
+
+  void finish();
+
+  int getMinL() const { return lb + 1; }
+  int getMaxL() const { return ub; }
+
+  double getProb(int len) const {
+    assert(len > lb && len <= ub);
+    return pdf[len - lb];
+  }
+
+  //len : mate/fragment length
+  //refL : reference sequence length, in fact, this is totLen for global length distribution
+  // Probability of len after renormalising over the lengths <= min(ub, refL);
+  // 0 when len or refL lies outside the supported range.
+  double getAdjustedProb(int len, int refL) const {
+    if (len <= lb || len > ub || refL <= lb) return 0.0;
+    double denom = cdf[std::min(ub, refL) - lb];
+    assert(denom >= EPSILON);
+    return pdf[len - lb] / denom;
+  }
+
+  //len : length threshold, any length <= len should be calculated
+  //refL : reference sequence length
+  double getAdjustedCumulativeProb(int len, int refL) const {
+    assert(len > lb && len <= ub && refL > lb);
+    double denom = cdf[std::min(ub, refL) - lb];
+    assert(denom >= EPSILON);
+    return cdf[len - lb] / denom;
+  }
+
+  //for multi-thread usage
+  void collect(const LenDist&);
+
+  void read(FILE*);
+  void write(FILE*);
+
+  void copyTo(double*&, double*&, int&, int&, int&) const;
+
+  int simulate(simul*, int);
+
+ private:
+  int lb, ub, span; // (lb, ub]
+  double *pdf, *cdf;
+
+  void trim();
+};
+
+// Deep-copy assignment; the arrays are reallocated only when the span differs.
+LenDist& LenDist::operator=(const LenDist& rv) {
+  if (this != &rv) {
+    if (span != rv.span) {
+      delete[] pdf;
+      delete[] cdf;
+      pdf = new double[rv.span + 1];
+      cdf = new double[rv.span + 1];
+    }
+
+    lb = rv.lb;
+    ub = rv.ub;
+    span = rv.span;
+
+    const size_t nbytes = sizeof(double) * (span + 1);
+    memcpy(pdf, rv.pdf, nbytes);
+    memcpy(cdf, rv.cdf, nbytes);
+  }
+
+  return *this;
+}
+
+//Please pass an integer mean; a non-integer mean is rounded to the nearest integer when meanL is computed.
+//minL: new minimum length, maxL: new maximum length
+void LenDist::setAsNormal(double mean, double sd, int minL, int maxL) {
+ int meanL = int(mean + .5); // assume meanL is a integer; if not, round to nearest number.
+ delete[] pdf;
+ delete[] cdf;
+
+ if (sd < EPSILON) { // (near-)zero sd: all mass goes to the single length meanL
+ if (meanL < minL || meanL > maxL) {
+ fprintf(stderr, "Length distribution's probability mass is not within the possible range! MeanL = %d, MinL = %d, MaxL = %d\n", meanL, minL, maxL);
+ exit(-1);
+ }
+ span = 1;
+ lb = meanL - 1; ub = meanL;
+ pdf = new double[span + 1];
+ cdf = new double[span + 1];
+ pdf[0] = cdf[0] = 0.0;
+ pdf[1] = cdf[1] = 1.0;
+
+ return;
+ }
+
+
+ boost::math::normal norm(mean, sd);
+
+ if (maxL - minL + 1 > RANGE) { // requested window is wider than RANGE: shrink it to RANGE lengths
+ if (meanL <= minL) maxL = minL + RANGE - 1;
+ else if (meanL >= maxL) minL = maxL - RANGE + 1;
+ else {
+ double lg = mean - (minL - 0.5); // room left of the mean
+ double rg = (maxL + 0.5) - mean; // room right of the mean
+ double half = RANGE / 2.0;
+
+ if (lg < half) { assert(rg > half); maxL = minL + RANGE - 1; }
+ else if (rg < half) { assert(lg > half); minL = maxL - RANGE + 1; }
+ else { minL = int(mean - half + 1.0); maxL = int(mean + half); }
+ }
+ }
+
+ assert(maxL - minL + 1 <= RANGE);
+
+ lb = minL - 1;
+ ub = maxL;
+ span = ub - lb;
+ assert(span > 0);
+
+ pdf = new double[span + 1];
+ cdf = new double[span + 1];
+
+ pdf[0] = cdf[0] = 0.0;
+
+ double old_val, val, sum;
+
+ sum = 0.0;
+ old_val = boost::math::cdf(norm, minL - 0.5);
+ for (int i = 1; i <= span; i++) { // mass of length lb + i = normal mass on [lb + i - 0.5, lb + i + 0.5]
+ val = boost::math::cdf(norm, lb + i + 0.5);
+ pdf[i] = val - old_val;
+ sum += pdf[i];
+ old_val = val;
+ }
+ assert(sum >= EPSILON);
+ for (int i = 1; i <= span; i++) { // renormalise over the window and accumulate the cdf
+ pdf[i] /= sum;
+ cdf[i] = cdf[i - 1] + pdf[i];
+ }
+
+ trim();
+}
+
+// Reset every pdf and cdf entry (indices 0..span) to zero.
+void LenDist::init() {
+  const int count = span + 1;
+  std::fill(pdf, pdf + count, 0.0);
+  std::fill(cdf, cdf + count, 0.0);
+}
+
+// Normalise the accumulated masses into a pdf, rebuild the cdf and trim the
+// range. Exits with -1 when the total mass is not above EPSILON.
+void LenDist::finish() {
+  double total = 0.0;
+  for (int k = 1; k <= span; k++) total += pdf[k];
+
+  if (total <= EPSILON) {
+    fprintf(stderr, "No valid read to estimate the length distribution!\n");
+    exit(-1);
+  }
+
+  for (int k = 1; k <= span; k++) {
+    pdf[k] /= total;
+    cdf[k] = cdf[k - 1] + pdf[k];
+  }
+
+  trim();
+}
+
+
+// Add the pdf masses of `o` to this object. When the ranges differ, this
+// object first adopts o's range with zeroed arrays.
+void LenDist::collect(const LenDist& o) {
+  const bool sameRange = (lb == o.lb && ub == o.ub);
+  if (!sameRange) {
+    delete[] pdf;
+    delete[] cdf;
+
+    lb = o.lb;
+    ub = o.ub;
+    span = o.span;
+
+    const size_t nbytes = sizeof(double) * (span + 1);
+    pdf = new double[span + 1];
+    cdf = new double[span + 1];
+    memset(pdf, 0, nbytes);
+    memset(cdf, 0, nbytes);
+  }
+
+  for (int k = 1; k <= span; k++) pdf[k] += o.pdf[k];
+}
+
+// Load the distribution from fi: a header "lb ub span" followed by span pdf
+// values; the cdf is rebuilt from the pdf and the range is trimmed afterwards.
+// Exits with -1 when the header or a pdf value cannot be read. The reads are
+// performed outside of assert() so they still happen when NDEBUG is defined.
+void LenDist::read(FILE *fi) {
+  int newlb, newub, newspan;
+
+  if (fscanf(fi, "%d %d %d", &newlb, &newub, &newspan) != 3 || newspan <= 0) {
+    fprintf(stderr, "Cannot read the header of the length distribution!\n");
+    exit(-1);
+  }
+
+  //release default space first
+  delete[] pdf;
+  delete[] cdf;
+
+  lb = newlb; ub = newub; span = newspan;
+  pdf = new double[span + 1];
+  cdf = new double[span + 1];
+  pdf[0] = cdf[0] = 0.0;
+  for (int i = 1; i <= span; i++) {
+    if (fscanf(fi, "%lf", &pdf[i]) != 1) {
+      fprintf(stderr, "Cannot read the probabilities of the length distribution!\n");
+      exit(-1);
+    }
+    cdf[i] = cdf[i - 1] + pdf[i];
+  }
+
+  trim();
+}
+
+// Write the header "lb ub span" and then the span pdf values on one line,
+// separated by single spaces.
+void LenDist::write(FILE *fo) {
+  fprintf(fo, "%d %d %d\n", lb, ub, span);
+
+  int k = 1;
+  while (k < span) {
+    fprintf(fo, "%.10g ", pdf[k]);
+    ++k;
+  }
+  fprintf(fo, "%.10g\n", pdf[span]);
+}
+
+// Export the state: the bounds and span are copied into the int references and
+// the pdf/cdf references receive freshly allocated arrays of span + 1 doubles.
+void LenDist::copyTo(double*& pdf, double*& cdf, int& lb, int& ub, int& span) const {
+  lb = this->lb;
+  ub = this->ub;
+  span = this->span;
+
+  const size_t nbytes = sizeof(double) * (span + 1);
+
+  pdf = new double[span + 1];
+  cdf = new double[span + 1];
+  memcpy(pdf, this->pdf, nbytes);
+  memcpy(cdf, this->cdf, nbytes);
+}
+
+//refL = -1 means that this length is generated for noise isoform
+// Draw a length through the sampler, restricted to lengths <= min(ub, refL).
+// Returns -1 when refL is not above lb or the restricted range carries no mass.
+int LenDist::simulate(simul* sampler, int refL) {
+  if (refL == -1) refL = ub;
+  if (refL <= lb) return -1;
+
+  const int dlen = std::min(ub, refL) - lb;
+  if (cdf[dlen] <= 0.0) return -1;
+
+  return lb + 1 + sampler->sample(cdf + 1, dlen);
+}
+
+// Shrink the range so that it neither starts nor ends with entries whose pdf is
+// below EPSILON. pdf and cdf values of the kept lengths are carried over unchanged.
+void LenDist::trim() {
+  // number of leading entries to drop
+  int head = 0;
+  while (head + 1 <= span && pdf[head + 1] < EPSILON) ++head;
+
+  // index of the last entry to keep
+  int tail = span;
+  while (tail > head && pdf[tail] < EPSILON) --tail;
+
+  assert(head < tail);
+  if (head == 0 && tail == span) return; // nothing to trim
+
+  span = tail - head;
+
+  const size_t nbytes = sizeof(double) * (span + 1);
+  double *newpdf = new double[span + 1];
+  memset(newpdf, 0, nbytes);
+  double *newcdf = new double[span + 1];
+  memset(newcdf, 0, nbytes);
+
+  for (int k = 1; k <= span; k++) {
+    newpdf[k] = pdf[k + head];
+    newcdf[k] = cdf[k + head];
+  }
+
+  delete[] pdf;
+  delete[] cdf;
+  pdf = newpdf;
+  cdf = newcdf;
+
+  lb += head;
+  ub = lb + span;
+}
+
+#endif /* LENDIST_H_ */
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f9514e2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,179 @@
+SAMTOOLS = samtools-1.3
+HTSLIB = htslib-1.3
+
+ifneq ($(cygwin), true)
+ SAMTOOLS_MAKEFILE = Makefile
+else
+ SAMTOOLS_MAKEFILE = Makefile.cygwin
+endif
+
+# Directory that contains the boost/ headers; overridable, defaults to the copy in this directory
+BOOST = .
+
+# Compilation variables (include path: this directory, the Boost location and the htslib directory under $(SAMTOOLS))
+CXX = g++
+CXXFLAGS = -std=gnu++98 -Wall -I. -I$(BOOST) -I$(SAMTOOLS)/$(HTSLIB)
+CPPFLAGS =
+
+LDFLAGS =
+LDLIBS =
+
+# Installation variables (programs and scripts are installed into $(DESTDIR)$(bindir))
+INSTALL = install
+INSTALL_PROGRAM = $(INSTALL) -p
+INSTALL_DATA = $(INSTALL) -p -m 644
+INSTALL_DIR = $(INSTALL) -d
+STRIP ?=strip
+
+prefix ?= /usr/local
+exec_prefix = $(prefix)
+bindir = $(exec_prefix)/bin
+
+# Auxiliary variables for compilation (htslib header and static library under $(SAMTOOLS))
+SAMHEADERS = $(SAMTOOLS)/$(HTSLIB)/htslib/sam.h
+SAMLIBS = $(SAMTOOLS)/$(HTSLIB)/libhts.a
+CONFIGURE = ./configure
+
+OBJS1 = parseIt.o
+OBJS2 = extractRef.o synthesisRef.o preRef.o buildReadIndex.o wiggle.o tbam2gbam.o bam2wig.o bam2readdepth.o getUnique.o samValidator.o scanForPairedEndReads.o SamHeader.o
+OBJS3 = EM.o Gibbs.o calcCI.o simulation.o
+
+PROGS1 = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-build-read-index rsem-simulate-reads
+PROGS2 = rsem-parse-alignments rsem-run-em rsem-tbam2gbam rsem-bam2wig rsem-bam2readdepth rsem-get-unique rsem-sam-validator rsem-scan-for-paired-end-reads
+PROGS3 = rsem-run-gibbs rsem-calculate-credibility-intervals
+
+PROGRAMS = $(PROGS1) $(PROGS2) $(PROGS3)
+
+# Auxiliary variables for installation (scripts copied by the install target)
+SCRIPTS = rsem-prepare-reference rsem-calculate-expression rsem-refseq-extract-primary-assembly rsem-gff3-to-gtf rsem-plot-model \
+ rsem-plot-transcript-wiggles rsem-gen-transcript-plots rsem-generate-data-matrix \
+ extract-transcript-to-gene-map-from-trinity convert-sam-for-rsem
+
+
+
+
+.PHONY : all ebseq pRSEM clean
+
+all : $(PROGRAMS) $(SAMTOOLS)/samtools
+
+$(SAMTOOLS)/samtools :
+ cd $(SAMTOOLS) && $(CONFIGURE) --without-curses && $(MAKE) -f $(SAMTOOLS_MAKEFILE) samtools
+
+$(SAMLIBS) : $(SAMTOOLS)/samtools
+
+
+# Compile objects: the three groups differ only in optimisation flags (-O2, -O3, -O3 -ffast-math); each object's source comes from its dependency line below
+$(OBJS1) :
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -O2 -c -o $@ $<
+
+$(OBJS2) :
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -O3 -c -o $@ $<
+
+$(OBJS3) :
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -O3 -ffast-math -c -o $@ $<
+
+
+# Generate executables: PROGS2 and PROGS3 are linked with -pthread, PROGS2 additionally with -lz
+$(PROGS1) :
+ $(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+
+$(PROGS2) :
+ $(CXX) $(LDFLAGS) -pthread -o $@ $^ $(LDLIBS) -lz
+
+$(PROGS3) :
+ $(CXX) $(LDFLAGS) -pthread -o $@ $^ $(LDLIBS)
+
+# Dependencies for executables
+rsem-extract-reference-transcripts : extractRef.o
+rsem-synthesis-reference-transcripts : synthesisRef.o
+rsem-preref : preRef.o
+rsem-build-read-index : buildReadIndex.o
+rsem-simulate-reads : simulation.o
+
+rsem-parse-alignments : parseIt.o $(SAMLIBS)
+rsem-run-em : EM.o SamHeader.o $(SAMLIBS)
+rsem-tbam2gbam : tbam2gbam.o SamHeader.o $(SAMLIBS)
+rsem-bam2wig : bam2wig.o wiggle.o $(SAMLIBS)
+rsem-bam2readdepth : bam2readdepth.o wiggle.o $(SAMLIBS)
+rsem-get-unique : getUnique.o $(SAMLIBS)
+rsem-sam-validator : samValidator.o $(SAMLIBS)
+rsem-scan-for-paired-end-reads : scanForPairedEndReads.o $(SAMLIBS)
+
+rsem-run-gibbs : Gibbs.o
+rsem-calculate-credibility-intervals : calcCI.o
+
+# Dependencies for objects
+parseIt.o : parseIt.cpp $(SAMHEADERS) sam_utils.h utils.h my_assert.h GroupInfo.h Transcripts.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h
+
+extractRef.o : extractRef.cpp utils.h my_assert.h GTFItem.h Transcript.h Transcripts.h
+synthesisRef.o : synthesisRef.cpp utils.h my_assert.h Transcript.h Transcripts.h
+preRef.o : preRef.cpp utils.h RefSeq.h Refs.h PolyARules.h RefSeqPolicy.h AlignerRefSeqPolicy.h
+buildReadIndex.o : buildReadIndex.cpp utils.h
+wiggle.o: wiggle.cpp $(SAMHEADERS) sam_utils.h utils.h my_assert.h wiggle.h
+tbam2gbam.o : tbam2gbam.cpp $(SAMHEADERS) utils.h Transcripts.h Transcript.h BamConverter.h sam_utils.h SamHeader.hpp my_assert.h bc_aux.h
+bam2wig.o : bam2wig.cpp utils.h my_assert.h wiggle.h
+bam2readdepth.o : bam2readdepth.cpp utils.h my_assert.h wiggle.h
+getUnique.o : getUnique.cpp $(SAMHEADERS) sam_utils.h utils.h
+samValidator.o : samValidator.cpp $(SAMHEADERS) sam_utils.h utils.h my_assert.h
+scanForPairedEndReads.o : scanForPairedEndReads.cpp $(SAMHEADERS) sam_utils.h utils.h my_assert.h
+SamHeader.o : SamHeader.cpp $(SAMHEADERS) SamHeader.hpp
+
+EM.o : EM.cpp $(SAMHEADERS) utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h simul.h sam_utils.h SamHeader.hpp sampling.h $(BOOST)/boost/random.hpp WriteResults.h
+Gibbs.o : Gibbs.cpp utils.h my_assert.h $(BOOST)/boost/random.hpp sampling.h simul.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h ModelParams.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h
+calcCI.o : calcCI.cpp utils.h my_assert.h $(BOOST)/boost/random.hpp sampling.h simul.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h ModelParams.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Buffer.h
+simulation.o : simulation.cpp utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h $(BOOST)/boost/random.hpp WriteResults.h
+
+# Dependencies for header files
+Transcript.h : utils.h
+Transcripts.h : utils.h my_assert.h Transcript.h
+AlignerRefSeqPolicy.h : RefSeqPolicy.h
+RefSeq.h : utils.h
+Refs.h : utils.h RefSeq.h RefSeqPolicy.h PolyARules.h
+SingleRead.h : Read.h
+SingleReadQ.h : Read.h
+PairedEndRead.h : Read.h SingleRead.h
+PairedEndReadQ.h : Read.h SingleReadQ.h
+PairedEndHit.h : SingleHit.h
+HitContainer.h : GroupInfo.h
+sam_utils.h : $(SAMHEADERS) Transcript.h Transcripts.h
+SamParser.h : $(SAMHEADERS) sam_utils.h utils.h my_assert.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcripts.h
+simul.h : $(BOOST)/boost/random.hpp
+ReadReader.h : SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h ReadIndex.h
+SingleModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h SingleHit.h ReadReader.h simul.h
+SingleQModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h SingleHit.h ReadReader.h simul.h
+PairedEndModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h PairedEndRead.h PairedEndHit.h ReadReader.h simul.h
+PairedEndQModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h PairedEndReadQ.h PairedEndHit.h ReadReader.h simul.h
+HitWrapper.h : HitContainer.h
+BamWriter.h : $(SAMHEADERS) sam_utils.h SamHeader.hpp utils.h my_assert.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h
+sampling.h : $(BOOST)/boost/random.hpp
+WriteResults.h : utils.h my_assert.h GroupInfo.h Transcript.h Transcripts.h RefSeq.h Refs.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h
+bc_aux.h : $(SAMHEADERS)
+BamConverter.h : $(SAMHEADERS) sam_utils.h SamHeader.hpp utils.h my_assert.h bc_aux.h Transcript.h Transcripts.h
+Buffer.h : my_assert.h
+SamHeader.hpp : $(SAMHEADERS)
+
+# Compile EBSeq (delegates to the Makefile in EBSeq/)
+ebseq :
+ cd EBSeq && $(MAKE) all
+
+# Compile pRSEM (delegates to the Makefile in pRSEM/)
+pRSEM :
+ cd pRSEM && $(MAKE) all
+
+
+# Install RSEM: programs (stripped), samtools (stripped), scripts and rsem_perl_utils.pm go under $(DESTDIR)$(bindir)
+install : $(PROGRAMS) $(SCRIPTS) $(SAMTOOLS)/samtools rsem_perl_utils.pm
+ $(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(bindir)/$(SAMTOOLS)
+ $(foreach prog,$(PROGRAMS),$(INSTALL_PROGRAM) $(prog) $(DESTDIR)$(bindir)/$(prog) ; $(STRIP) $(DESTDIR)$(bindir)/$(prog) ;)
+ $(INSTALL_PROGRAM) $(SAMTOOLS)/samtools $(DESTDIR)$(bindir)/$(SAMTOOLS)/samtools
+ $(STRIP) $(DESTDIR)$(bindir)/$(SAMTOOLS)/samtools
+ $(foreach script,$(SCRIPTS),$(INSTALL_PROGRAM) $(script) $(DESTDIR)$(bindir)/$(script) ;)
+ $(INSTALL_DATA) rsem_perl_utils.pm $(DESTDIR)$(bindir)/rsem_perl_utils.pm
+
+# Clean: removes objects, backup files and the built programs here, then cleans the samtools, EBSeq and pRSEM directories
+clean :
+ rm -f *.o *~ $(PROGRAMS)
+ cd $(SAMTOOLS) && $(MAKE) clean-all
+ cd EBSeq && $(MAKE) clean
+ cd pRSEM && $(MAKE) clean
diff --git a/Model.h b/Model.h
new file mode 100644
index 0000000..08ee70f
--- /dev/null
+++ b/Model.h
@@ -0,0 +1,7 @@
+#ifndef MODEL_H_
+#define MODEL_H_
+
+// Empty class: it declares no data members and no member functions.
+class Model {
+
+};
+#endif /* MODEL_H_ */
diff --git a/ModelParams.h b/ModelParams.h
new file mode 100644
index 0000000..069db2a
--- /dev/null
+++ b/ModelParams.h
@@ -0,0 +1,39 @@
+#ifndef MODELPARAMS_H_
+#define MODELPARAMS_H_
+
+#include<cstdio>
+#include<cstring>
+
+#include "utils.h"
+#include "Refs.h"
+
+// Plain bundle of settings that the model constructors read
+// (M, N, refs, estRSPD, seedLen, the length bounds, B and probF).
+// The default constructor gives every field a value.
+struct ModelParams {
+  int M;
+  READ_INT_TYPE N[3];
+  int minL, maxL;
+  bool estRSPD; // true: the user wants the RSPD estimated; false: use a uniform distribution
+  int B; // number of bins in RSPD
+  int mate_minL, mate_maxL;
+  double probF; // probability of forward strand
+  double mean, sd;
+  Refs *refs;
+
+  int seedLen;
+
+  // Default parameters. Members are listed in declaration order.
+  ModelParams()
+    : M(0),
+      minL(1), maxL(1000),
+      estRSPD(false),
+      B(20), // default number of bins, used when estRSPD is true
+      mate_minL(1), mate_maxL(1000),
+      probF(0.5),
+      mean(-1), sd(0),
+      refs(NULL),
+      seedLen(0)
+  {
+    memset(N, 0, sizeof(N));
+  }
+};
+#endif /* MODELPARAMS_H_ */
diff --git a/NoiseProfile.h b/NoiseProfile.h
new file mode 100644
index 0000000..5662e0b
--- /dev/null
+++ b/NoiseProfile.h
@@ -0,0 +1,159 @@
+#ifndef NOISEPROFILE_H_
+#define NOISEPROFILE_H_
+
+#include<cmath>
+#include<cstdio>
+#include<cstring>
+#include<string>
+#include<cassert>
+
+#include "utils.h"
+#include "RefSeq.h"
+#include "simul.h"
+
+// Table of NCODES probabilities, one per base code returned by
+// get_base_id(), together with the counts used to estimate them.
+//   c[]  : counts accumulated by updateC() ("counts in N0")
+//   p[]  : fractional counts while update() is running, probabilities
+//          after finish() / calcInitParams() / read()
+//   logp : sum over codes of c[i] * log(p[i]), refreshed by finish() and
+//          calcInitParams()
+//   pc   : cumulative copy of p[], alive only between startSimulation()
+//          and finishSimulation()
+class NoiseProfile {
+public:
+  NoiseProfile() {
+    logp = 0.0;
+    memset(c, 0, sizeof(c));
+    memset(p, 0, sizeof(p));
+    pc = NULL; // nothing allocated until startSimulation()
+  }
+
+  // Copies logp, c and p; the simulation buffer pc is not copied.
+  NoiseProfile& operator=(const NoiseProfile&);
+
+  void init();
+  void updateC(const std::string&);
+  void update(const std::string&, double frac);
+  void finish();
+  void calcInitParams();
+
+  double getProb(const std::string&);
+  double getLogP() { return logp; }
+
+  void collect(const NoiseProfile&);
+
+  void read(FILE*);
+  void write(FILE*);
+
+  void startSimulation();
+  std::string simulate(simul*, int);
+  void finishSimulation();
+
+private:
+  static const int NCODES = 5;
+
+  double logp;
+  double c[NCODES]; // counts in N0;
+  double p[NCODES];
+
+  double *pc; // for simulation
+};
+
+NoiseProfile& NoiseProfile::operator=(const NoiseProfile& rv) {
+  if (this == &rv) return *this;
+  logp = rv.logp;
+  memcpy(c, rv.c, sizeof(rv.c));
+  memcpy(p, rv.p, sizeof(rv.p));
+  return *this;
+}
+
+// Clears p[] so that update() can accumulate into it; c[] is kept.
+void NoiseProfile::init() {
+  memset(p, 0, sizeof(p));
+}
+
+// Adds one to c[] for every character of readseq.
+void NoiseProfile::updateC(const std::string& readseq) {
+  int len = readseq.size();
+  for (int i = 0; i < len; i++) {
+    ++c[get_base_id(readseq[i])];
+  }
+}
+
+// Adds frac to p[] for every character of readseq.
+void NoiseProfile::update(const std::string& readseq, double frac) {
+  int len = readseq.size();
+  for (int i = 0; i < len; i++) {
+    p[get_base_id(readseq[i])] += frac;
+  }
+}
+
+// Normalizes p[] + c[] into probabilities and recomputes logp.
+// When the total is <= EPSILON, p[] is left untouched and logp is 0.
+void NoiseProfile::finish() {
+  double sum;
+
+  logp = 0.0;
+  sum = 0.0;
+  for (int i = 0; i < NCODES; i++) sum += (p[i] + c[i]);
+  if (sum <= EPSILON) return;
+  for (int i = 0; i < NCODES; i++) {
+    p[i] = (p[i] + c[i]) / sum;
+    if (c[i] > 0.0) { logp += c[i] * log(p[i]); }
+  }
+}
+
+// Sets p[] from c[] with a pseudo count of 1.0 per code, so that no
+// probability is zero, and recomputes logp.
+void NoiseProfile::calcInitParams() {
+  double sum;
+
+  logp = 0.0;
+  sum = 0.0;
+  for (int i = 0; i < NCODES; i++) sum += (1.0 + c[i]);
+  for (int i = 0; i < NCODES; i++) {
+    p[i] = (1.0 + c[i]) / sum;
+    if (c[i] > 0.0) { logp += c[i] * log(p[i]); }
+  }
+}
+
+// Returns the product of p[] over the characters of readseq
+// (1.0 for an empty string).
+double NoiseProfile::getProb(const std::string& readseq) {
+  double prob = 1.0;
+  int len = readseq.size();
+
+  for (int i = 0; i < len; i++) {
+    prob *= p[get_base_id(readseq[i])];
+  }
+
+  return prob;
+}
+
+// Adds o.p[] into p[]; counts and logp are not merged.
+void NoiseProfile::collect(const NoiseProfile& o) {
+  for (int i = 0; i < NCODES; i++)
+    p[i] += o.p[i];
+}
+
+// Reads NCODES followed by NCODES probabilities from fi and zeroes c[].
+// The fscanf calls are made outside assert() so that the data is still
+// read when the code is compiled with NDEBUG.
+void NoiseProfile::read(FILE *fi) {
+  int tmp_ncodes;
+  int n_read;
+
+  memset(c, 0, sizeof(c));
+  n_read = fscanf(fi, "%d", &tmp_ncodes);
+  assert(n_read == 1);
+  assert(tmp_ncodes == NCODES);
+  for (int i = 0; i < NCODES; i++) {
+    n_read = fscanf(fi, "%lf", &p[i]);
+    assert(n_read == 1);
+  }
+  (void)n_read; // only inspected by assert(); silences NDEBUG warnings
+}
+
+// Writes NCODES on one line and the probabilities, space separated, on
+// the next.
+void NoiseProfile::write(FILE *fo) {
+  fprintf(fo, "%d\n", NCODES);
+  for (int i = 0; i < NCODES - 1; i++) {
+    fprintf(fo, "%.10g ", p[i]);
+  }
+  fprintf(fo, "%.10g\n", p[NCODES - 1]);
+}
+
+// Allocates pc and fills it with the running sum of p[].
+void NoiseProfile::startSimulation() {
+  pc = new double[NCODES];
+
+  for (int i = 0; i < NCODES; i++) {
+    pc[i] = p[i];
+    if (i > 0) pc[i] += pc[i - 1];
+  }
+}
+
+// Draws len codes from pc through sampler and returns them as a string.
+// Requires a preceding startSimulation().
+std::string NoiseProfile::simulate(simul* sampler, int len) {
+  std::string readseq = "";
+
+  for (int i = 0; i < len; i++) {
+    readseq.push_back(getCharacter(sampler->sample(pc, NCODES)));
+  }
+  return readseq;
+}
+
+// Releases pc and resets it, so a repeated call is harmless.
+void NoiseProfile::finishSimulation() {
+  delete[] pc;
+  pc = NULL;
+}
+
+#endif /* NOISEPROFILE_H_ */
diff --git a/NoiseQProfile.h b/NoiseQProfile.h
new file mode 100644
index 0000000..63c5897
--- /dev/null
+++ b/NoiseQProfile.h
@@ -0,0 +1,181 @@
+#ifndef NOISEQPROFILE_H_
+#define NOISEQPROFILE_H_
+
+#include<cmath>
+#include<cstdio>
+#include<cstring>
+#include<string>
+#include<cassert>
+
+#include "utils.h"
+#include "RefSeq.h"
+#include "simul.h"
+
+// Quality-aware counterpart of a base-probability table: one row of
+// NCODES probabilities per quality value (SIZE rows).
+//   c[q][code] : counts accumulated by updateC() ("counts in N0")
+//   p[q][code] : fractional counts while update() is running,
+//                p(code | q) after finish() / calcInitParams() / read()
+//   logp       : sum of c * log(p), refreshed by finish() and
+//                calcInitParams()
+//   pc         : per-row cumulative copy of p, alive only between
+//                startSimulation() and finishSimulation()
+// Every method taking (readseq, qual) indexes qual at each position of
+// readseq, so qual must be at least as long as readseq.
+class NoiseQProfile {
+public:
+  NoiseQProfile() {
+    logp = 0.0;
+    memset(c, 0, sizeof(c));
+    memset(p, 0, sizeof(p));
+    pc = NULL; // nothing allocated until startSimulation()
+  }
+
+  // Copies logp, c and p; the simulation buffer pc is not copied.
+  NoiseQProfile& operator=(const NoiseQProfile&);
+
+  void init();
+  void updateC(const std::string&, const std::string&);
+  void update(const std::string&, const std::string&, double frac);
+  void finish();
+  void calcInitParams();
+
+  double getProb(const std::string&, const std::string&);
+  double getLogP() { return logp; }
+
+  void collect(const NoiseQProfile&);
+
+  void read(FILE*);
+  void write(FILE*);
+
+  void startSimulation();
+  std::string simulate(simul*, int, const std::string&);
+  void finishSimulation();
+
+private:
+  static const int NCODES = 5; // number of possible codes
+  static const int SIZE = 100;
+
+  double logp; //log prob;
+  double c[SIZE][NCODES]; //counts in N0;
+  double p[SIZE][NCODES]; //p[q][c] = p(c|q)
+
+  // Maps a quality character (ASCII 33..126) to a row index 0..93.
+  int c2q(char c) { assert(c >= 33 && c <= 126); return c - 33; }
+
+  double (*pc)[NCODES]; // for simulation
+};
+
+NoiseQProfile& NoiseQProfile::operator=(const NoiseQProfile& rv) {
+  if (this == &rv) return *this;
+  logp = rv.logp;
+  memcpy(c, rv.c, sizeof(rv.c));
+  memcpy(p, rv.p, sizeof(rv.p));
+  return *this;
+}
+
+// Clears p so that update() can accumulate into it; c is kept.
+void NoiseQProfile::init() {
+  memset(p, 0, sizeof(p));
+}
+
+// Adds one to c[quality][base] for every position of readseq.
+void NoiseQProfile::updateC(const std::string& readseq, const std::string& qual) {
+  int len = readseq.size();
+  for (int i = 0; i < len; i++) {
+    ++c[c2q(qual[i])][get_base_id(readseq[i])];
+  }
+}
+
+// Adds frac to p[quality][base] for every position of readseq.
+void NoiseQProfile::update(const std::string& readseq, const std::string& qual, double frac) {
+  int len = readseq.size();
+  for (int i = 0; i < len; i++) {
+    p[c2q(qual[i])][get_base_id(readseq[i])] += frac;
+  }
+}
+
+// Normalizes each row of p + c into probabilities and recomputes logp.
+// Rows whose total is <= 0.0 are skipped and keep their current values.
+void NoiseQProfile::finish() {
+  double sum;
+
+  //If N0 is 0, p(c|q) = 0 for all c, q
+  logp = 0.0;
+  for (int i = 0; i < SIZE; i++) {
+    sum = 0.0;
+    for (int j = 0; j < NCODES; j++) sum += (p[i][j] + c[i][j]);
+    if (sum <= 0.0) continue;
+    //if (isZero(sum)) continue;
+    for (int j = 0; j < NCODES; j++) {
+      p[i][j] = (p[i][j] + c[i][j]) / sum;
+      if (c[i][j] > 0.0) { logp += c[i][j] * log(p[i][j]); }
+    }
+  }
+}
+
+//make init parameters not zero
+void NoiseQProfile::calcInitParams() {
+  double sum;
+
+  logp = 0.0;
+  for (int i = 0; i < SIZE; i++) {
+    sum = 0.0;
+    for (int j = 0; j < NCODES; j++) sum += (1.0 + c[i][j]); // 1.0 pseudo count
+    for (int j = 0; j < NCODES; j++) {
+      p[i][j] = (c[i][j] + 1.0) / sum;
+      if (c[i][j] > 0.0) { logp += c[i][j] * log(p[i][j]); }
+    }
+  }
+}
+
+// Returns the product of p[quality][base] over the positions of readseq
+// (1.0 for an empty string).
+double NoiseQProfile::getProb(const std::string& readseq, const std::string& qual) {
+  double prob = 1.0;
+  int len = readseq.size();
+
+  for (int i = 0; i < len; i++) {
+    prob *= p[c2q(qual[i])][get_base_id(readseq[i])];
+  }
+
+  return prob;
+}
+
+// Adds o.p into p element by element; counts and logp are not merged.
+void NoiseQProfile::collect(const NoiseQProfile& o) {
+  for (int i = 0; i < SIZE; i++) {
+    for (int j = 0; j < NCODES; j++)
+      p[i][j] += o.p[i][j];
+  }
+}
+
+//If read from file, assume do not need to estimate from data
+// The fscanf calls are made outside assert() so that the data is still
+// read when the code is compiled with NDEBUG.
+void NoiseQProfile::read(FILE *fi) {
+  int tmp_size, tmp_ncodes;
+  int n_read;
+
+  memset(c, 0, sizeof(c));
+
+  n_read = fscanf(fi, "%d %d", &tmp_size, &tmp_ncodes);
+  assert(n_read == 2);
+  assert(tmp_size == SIZE && tmp_ncodes == NCODES);
+  for (int i = 0; i < SIZE; i++) {
+    for (int j = 0; j < NCODES; j++) {
+      n_read = fscanf(fi, "%lf", &p[i][j]);
+      assert(n_read == 1);
+    }
+  }
+  (void)n_read; // only inspected by assert(); silences NDEBUG warnings
+}
+
+// Writes "SIZE NCODES" followed by one space-separated line per row.
+void NoiseQProfile::write(FILE *fo) {
+  fprintf(fo, "%d %d\n", SIZE, NCODES);
+  for (int i = 0; i < SIZE; i++) {
+    for (int j = 0; j < NCODES - 1; j++) { fprintf(fo, "%.10g ", p[i][j]); }
+    fprintf(fo, "%.10g\n", p[i][NCODES - 1]);
+  }
+}
+
+// Allocates pc and fills every row with the running sum of p's row.
+// A row whose total is zero gets a fixed table instead: 0.25 steps over
+// the first four codes and no mass on the fifth.
+void NoiseQProfile::startSimulation() {
+  pc = new double[SIZE][NCODES];
+
+  for (int i = 0; i < SIZE; i++) {
+    for (int j = 0; j < NCODES; j++) {
+      pc[i][j] = p[i][j];
+      if (j > 0) pc[i][j] += pc[i][j - 1];
+    }
+    if (isZero(pc[i][NCODES - 1])) {
+      assert(NCODES == 5);
+      pc[i][0] = 0.25; pc[i][1] = 0.5; pc[i][2] = 0.75; pc[i][3] = 1.0; pc[i][4] = 1.0;
+    }
+  }
+}
+
+// Draws len codes through sampler, using the pc row selected by the
+// quality character at each position. Requires a preceding
+// startSimulation().
+std::string NoiseQProfile::simulate(simul* sampler, int len, const std::string& qual) {
+  std::string readseq = "";
+
+  for (int i = 0; i < len; i++) {
+    readseq.push_back(getCharacter(sampler->sample(pc[c2q(qual[i])], NCODES)));
+  }
+  return readseq;
+}
+
+// Releases pc and resets it, so a repeated call is harmless.
+void NoiseQProfile::finishSimulation() {
+  delete[] pc;
+  pc = NULL;
+}
+
+#endif /* NOISEQPROFILE_H_ */
diff --git a/Orientation.h b/Orientation.h
new file mode 100644
index 0000000..6f31dac
--- /dev/null
+++ b/Orientation.h
@@ -0,0 +1,42 @@
+#ifndef ORIENTATION_H_
+#define ORIENTATION_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cassert>
+
+#include "simul.h"
+
+// Two-entry strand distribution: prob[0] for direction 0 ('+') and
+// prob[1] = 1 - prob[0] for direction 1 ('-').
+class Orientation {
+public:
+  // probF is the probability of direction 0.
+  Orientation(double probF = 0.5) {
+    prob[0] = probF;
+    prob[1] = 1.0 - probF;
+  }
+
+  Orientation& operator= (const Orientation& rv) {
+    if (this == &rv) return *this;
+    memcpy(prob, rv.prob, sizeof(rv.prob));
+    return *this;
+  }
+
+  //dir : 0 + 1 -
+  double getProb(int dir) { return prob[dir]; }
+
+  // Reads prob[0] from fi and derives prob[1]. The fscanf call is made
+  // outside assert() so that it still runs when compiled with NDEBUG.
+  void read(FILE* fi) {
+    int n_read = fscanf(fi, "%lf", &prob[0]);
+    assert(n_read == 1);
+    (void)n_read; // only inspected by assert()
+    prob[1] = 1.0 - prob[0];
+  }
+
+  // Writes prob[0] on its own line.
+  void write(FILE* fo) {
+    fprintf(fo, "%.10g\n", prob[0]);
+  }
+
+
+  // Returns 0 when the sampler's random() draw is below prob[0], else 1.
+  int simulate(simul* sampler) { return (sampler->random() < prob[0] ? 0 : 1); }
+
+private:
+  double prob[2]; //0 + 1 -
+};
+
+#endif /* ORIENTATION_H_ */
diff --git a/PairedEndHit.h b/PairedEndHit.h
new file mode 100644
index 0000000..be3ea54
--- /dev/null
+++ b/PairedEndHit.h
@@ -0,0 +1,36 @@
+#ifndef PAIREDENDHIT_H_
+#define PAIREDENDHIT_H_
+
+#include<iostream>
+
+#include "SingleHit.h"
+
+// A SingleHit extended with the insert length of the aligned fragment.
+class PairedEndHit : public SingleHit {
+public:
+  PairedEndHit() : SingleHit(), insertL(0) {}
+
+  PairedEndHit(int sid, int pos, int insertL, double conprb = 0.0)
+    : SingleHit(sid, pos, conprb), insertL(insertL) {}
+
+  int getInsertL() const { return insertL; }
+
+  // Stream (de)serialization of sid, pos and insertL.
+  bool read(std::istream&);
+  void write(std::ostream&);
+
+private:
+  int insertL; // insert length
+};
+
+// Resets conprb to 0 and reads sid, pos and insertL from in.
+// Returns true when all three extractions succeeded. The stream state
+// is tested with fail() because a stream no longer converts implicitly
+// to bool in a return statement from C++11 on.
+bool PairedEndHit::read(std::istream& in) {
+  conprb = 0.0;
+  in >> sid >> pos >> insertL;
+  return !in.fail();
+}
+
+// Emits sid, pos and insertL, each preceded by a single space.
+void PairedEndHit::write(std::ostream& out) {
+  out << " " << sid;
+  out << " " << pos;
+  out << " " << insertL;
+}
+
+#endif /* PAIREDENDHIT_H_ */
diff --git a/PairedEndModel.h b/PairedEndModel.h
new file mode 100644
index 0000000..9712249
--- /dev/null
+++ b/PairedEndModel.h
@@ -0,0 +1,461 @@
+#ifndef PAIREDENDMODEL_H_
+#define PAIREDENDMODEL_H_
+
+#include<cmath>
+#include<cstdio>
+#include<cassert>
+#include<cstring>
+#include<string>
+#include<algorithm>
+#include<sstream>
+#include<iostream>
+#include<vector>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "Orientation.h"
+#include "LenDist.h"
+#include "RSPD.h"
+#include "Profile.h"
+#include "NoiseProfile.h"
+
+#include "ModelParams.h"
+#include "RefSeq.h"
+#include "Refs.h"
+#include "SingleRead.h"
+#include "PairedEndRead.h"
+#include "PairedEndHit.h"
+#include "ReadReader.h"
+
+#include "simul.h"
+
+// Model for paired-end reads without quality scores. It combines
+//   ori  : strand orientation,
+//   gld  : fragment (insert) length distribution,
+//   mld  : mate length distribution,
+//   rspd : read start position distribution,
+//   pro  : sequencing profile for aligned mates,
+//   npro : base profile for noise reads,
+// and mw, a per-reference weight computed by calcMW() that the
+// conditional probabilities are divided by.
+class PairedEndModel {
+public:
+  // Builds a model with default-constructed components. M is taken
+  // from refs when refs is non-NULL and is 0 otherwise.
+  PairedEndModel(Refs* refs = NULL) {
+    this->refs = refs;
+    M = (refs != NULL ? refs->getM() : 0);
+    memset(N, 0, sizeof(N));
+    estRSPD = false;
+    needCalcConPrb = true;
+
+    ori = new Orientation();
+    gld = new LenDist();
+    rspd = new RSPD(estRSPD);
+    pro = new Profile();
+    npro = new NoiseProfile();
+    mld = new LenDist();
+
+    mw = NULL;
+    seedLen = 0;
+
+    sampler = NULL;
+    theta_cdf = NULL;
+  }
+
+  //If it is not a master node, only init & update can be used!
+  // A non-master model has no mld, and no rspd unless estRSPD is set.
+  PairedEndModel(ModelParams& params, bool isMaster = true) {
+    M = params.M;
+    memcpy(N, params.N, sizeof(params.N));
+    refs = params.refs;
+    estRSPD = params.estRSPD;
+    seedLen = params.seedLen;
+    needCalcConPrb = true;
+
+    ori = NULL; gld = NULL; rspd = NULL; pro = NULL; npro = NULL; mld = NULL;
+    mw = NULL;
+    sampler = NULL;
+    theta_cdf = NULL;
+
+    if (isMaster) {
+      if (!estRSPD) rspd = new RSPD(estRSPD);
+      mld = new LenDist(params.mate_minL, params.mate_maxL);
+    }
+
+    ori = new Orientation(params.probF);
+    gld = new LenDist(params.minL, params.maxL);
+    if (estRSPD) rspd = new RSPD(estRSPD, params.B);
+    pro = new Profile(params.maxL);
+    npro = new NoiseProfile();
+  }
+
+  // Releases the owned components. refs is not owned and only cleared.
+  ~PairedEndModel() {
+    refs = NULL;
+    if (ori != NULL) delete ori;
+    if (gld != NULL) delete gld;
+    if (rspd != NULL) delete rspd;
+    if (pro != NULL) delete pro;
+    if (npro != NULL) delete npro;
+    if (mld != NULL) delete mld;
+    if (mw != NULL) delete[] mw; // allocated with new double[M + 1]
+  }
+
+  void estimateFromReads(const char*);
+
+  //if prob is too small, just make it 0
+  // Conditional probability of read given the alignment hit, divided by
+  // mw[sid]. Returns 0 for low-quality reads, for start positions that
+  // are at or past fullLen or masked, and when mw[sid] < EPSILON.
+  double getConPrb(const PairedEndRead& read, const PairedEndHit& hit) {
+    if (read.isLowQuality()) return 0.0;
+
+    double prob;
+    int sid = hit.getSid();
+    RefSeq &ref = refs->getRef(sid);
+    int dir = hit.getDir();
+    int pos = hit.getPos();
+    int fullLen = ref.getFullLen();
+    int totLen = ref.getTotLen();
+    int insertLen = hit.getInsertL();
+
+    int fpos = (dir == 0 ? pos : totLen - pos - insertLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
+    int effL = std::min(fullLen, totLen - insertLen + 1);
+
+    general_assert(fpos >= 0, "The alignment of fragment " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) +
+                   " from the forward direction, which should be a non-negative number! " +
+                   "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+    general_assert(fpos + insertLen <= totLen,"Fragment " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! "
+                   + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+    general_assert(insertLen <= totLen, "Fragment " + read.getName() + " has length " + itos(insertLen) + ", but it is aligned to transcript "
+                   + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the fragment's length!");
+
+
+    if (fpos >= fullLen || ref.getMask(fpos)) return 0.0; // For paired-end model, fpos is the seedPos
+
+    prob = ori->getProb(dir) * gld->getAdjustedProb(insertLen, totLen) *
+           rspd->getAdjustedProb(fpos, effL, fullLen);
+
+    const SingleRead& mate1 = read.getMate1();
+    prob *= mld->getAdjustedProb(mate1.getReadLength(), insertLen) *
+            pro->getProb(mate1.getReadSeq(), ref, pos, dir);
+
+    // The second mate sits at the other end of the fragment, on the
+    // opposite strand.
+    const SingleRead& mate2 = read.getMate2();
+    int m2pos = totLen - pos - insertLen;
+    int m2dir = !dir;
+    prob *= mld->getAdjustedProb(mate2.getReadLength(), insertLen) *
+            pro->getProb(mate2.getReadSeq(), ref, m2pos, m2dir);
+
+    if (prob < EPSILON) { prob = 0.0; }
+
+    prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
+
+    return prob;
+  }
+
+  // Probability of read under the noise profile, divided by mw[0].
+  // Returns 0 for low-quality reads.
+  double getNoiseConPrb(const PairedEndRead& read) {
+    if (read.isLowQuality()) return 0.0;
+    double prob;
+    const SingleRead& mate1 = read.getMate1();
+    const SingleRead& mate2 = read.getMate2();
+
+    prob = mld->getProb(mate1.getReadLength()) * npro->getProb(mate1.getReadSeq());
+    prob *= mld->getProb(mate2.getReadLength()) * npro->getProb(mate2.getReadSeq());
+
+    if (prob < EPSILON) { prob = 0.0; }
+
+    prob = (mw[0] < EPSILON ? 0.0: prob / mw[0]);
+
+    return prob;
+  }
+
+  double getLogP() { return npro->getLogP(); }
+
+  void init();
+
+  // Adds the fractional weight frac of this (read, hit) pair to gld,
+  // to rspd (only when estRSPD) and to pro for both mates. Low-quality
+  // reads and weights below EPSILON are ignored.
+  void update(const PairedEndRead& read, const PairedEndHit& hit, double frac) {
+    if (read.isLowQuality() || frac < EPSILON) return;
+
+    RefSeq& ref = refs->getRef(hit.getSid());
+    const SingleRead& mate1 = read.getMate1();
+    const SingleRead& mate2 = read.getMate2();
+
+    gld->update(hit.getInsertL(), frac);
+    if (estRSPD) {
+      int fpos = (hit.getDir() == 0 ? hit.getPos() : ref.getTotLen() - hit.getPos() - hit.getInsertL());
+      rspd->update(fpos, ref.getFullLen(), frac);
+    }
+    pro->update(mate1.getReadSeq(), ref, hit.getPos(), hit.getDir(), frac);
+
+    int m2pos = ref.getTotLen() - hit.getPos() - hit.getInsertL();
+    int m2dir = !hit.getDir();
+    pro->update(mate2.getReadSeq(), ref, m2pos, m2dir, frac);
+  }
+
+  // Adds the fractional weight frac of both mates to the noise profile.
+  void updateNoise(const PairedEndRead& read, double frac) {
+    if (read.isLowQuality() || frac < EPSILON) return;
+
+    const SingleRead& mate1 = read.getMate1();
+    const SingleRead& mate2 = read.getMate2();
+
+    npro->update(mate1.getReadSeq(), frac);
+    npro->update(mate2.getReadSeq(), frac);
+  }
+
+  void finish();
+
+  void collect(const PairedEndModel&);
+
+  bool getNeedCalcConPrb() { return needCalcConPrb; }
+  void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
+
+  void read(const char*);
+  void write(const char*);
+
+  const LenDist& getGLD() { return *gld; }
+
+  void startSimulation(simul*, const std::vector<double>&);
+  bool simulate(READ_INT_TYPE, PairedEndRead&, int&);
+  void finishSimulation();
+
+  //Use it after function 'read' or 'estimateFromReads'
+  const double* getMW() {
+    assert(mw != NULL);
+    return mw;
+  }
+
+  int getModelType() const { return model_type; }
+
+private:
+  static const int model_type = 2;
+  static const int read_type = 2;
+
+  int M;
+  READ_INT_TYPE N[3];
+  Refs *refs;
+  int seedLen;
+
+  bool estRSPD;
+  bool needCalcConPrb; //true need, false does not need
+
+  Orientation *ori;
+  LenDist *gld, *mld; //mld1 mate_length_dist
+  RSPD *rspd;
+  Profile *pro;
+  NoiseProfile *npro;
+
+  simul *sampler; // for simulation
+  double *theta_cdf; // for simulation
+
+  double *mw; // for masking
+
+  void calcMW();
+};
+
+// Scans the read files derived from readFN for every i in 0..2 with
+// N[i] > 0 and estimates the mate length distribution from all usable
+// reads; reads from file set 0 additionally feed the noise profile
+// counts. Afterwards the initial noise parameters and mw are computed.
+// Low-quality reads with a mate shorter than seedLen are reported on
+// stderr, at most MAX_WARNS times.
+void PairedEndModel::estimateFromReads(const char* readFN) {
+  int s;
+  char readFs[2][STRLEN];
+  PairedEndRead read;
+
+  int n_warns = 0;
+
+  mld->init();
+  for (int i = 0; i < 3; i++)
+    if (N[i] > 0) {
+      genReadFileNames(readFN, i, read_type, s, readFs);
+      ReadReader<PairedEndRead> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
+
+      READ_INT_TYPE cnt = 0;
+      while (reader.next(read)) {
+        // Bind by reference: the mates are only inspected, so there is
+        // no need to copy two reads per iteration.
+        const SingleRead& mate1 = read.getMate1();
+        const SingleRead& mate2 = read.getMate2();
+
+        if (!read.isLowQuality()) {
+          mld->update(mate1.getReadLength(), 1.0);
+          mld->update(mate2.getReadLength(), 1.0);
+
+          if (i == 0) {
+            npro->updateC(mate1.getReadSeq());
+            npro->updateC(mate2.getReadSeq());
+          }
+        }
+        else if (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen) {
+          if (++n_warns <= MAX_WARNS)
+            fprintf(stderr, "Warning: Read %s is ignored due to at least one of the mates' length < seed length (= %d)!\n", read.getName().c_str(), seedLen);
+        }
+
+        ++cnt;
+        if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
+      }
+
+      if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; }
+    }
+
+  if (n_warns > 0) fprintf(stderr, "Warning: There are %d reads ignored in total.\n", n_warns);
+
+  mld->finish();
+  npro->calcInitParams();
+
+  delete[] mw; // drop any table from an earlier call or read(); NULL is fine
+  mw = new double[M + 1];
+  calcMW();
+}
+
+// Calls init() on the components that update()/updateNoise() accumulate
+// into; rspd takes part only when it is being estimated.
+void PairedEndModel::init() {
+  gld->init();
+  if (estRSPD) {
+    rspd->init();
+  }
+  pro->init();
+  npro->init();
+}
+
+// Calls finish() on the accumulating components, flags the conditional
+// probabilities as needing recalculation and recomputes mw.
+void PairedEndModel::finish() {
+  gld->finish();
+  if (estRSPD) {
+    rspd->finish();
+  }
+  pro->finish();
+  npro->finish();
+
+  needCalcConPrb = true;
+  calcMW();
+}
+
+// Merges what model o has accumulated into this model, component by
+// component; rspd takes part only when it is being estimated.
+void PairedEndModel::collect(const PairedEndModel& o) {
+  gld->collect(*o.gld);
+  if (estRSPD) {
+    rspd->collect(*o.rspd);
+  }
+  pro->collect(*o.pro);
+  npro->collect(*o.npro);
+}
+
+//Only master node can call
+//Only master node can call
+// Loads the model from inpF: the model type tag, then every component
+// in the order write() emits them, then an optional block holding M and
+// the M + 1 entries of mw. The mw block is used only when its M matches
+// this model's M (or this model's M is still 0).
+// All fscanf calls are made outside assert() so that they still run
+// when compiled with NDEBUG.
+void PairedEndModel::read(const char* inpF) {
+  int val;
+  int n_read;
+  FILE *fi = fopen(inpF, "r");
+
+  general_assert(fi != NULL, "Cannot open " + cstrtos(inpF) + "! It may not exist.");
+
+  n_read = fscanf(fi, "%d", &val);
+  assert(n_read == 1);
+  assert(val == model_type);
+
+  ori->read(fi);
+  gld->read(fi);
+  mld->read(fi);
+  rspd->read(fi);
+  pro->read(fi);
+  npro->read(fi);
+
+  if (fscanf(fi, "%d", &val) == 1) {
+    if (M == 0) M = val;
+    if (M == val) {
+      delete[] mw; // drop a previously loaded/estimated table; NULL is fine
+      mw = new double[M + 1];
+      for (int i = 0; i <= M; i++) {
+        n_read = fscanf(fi, "%lf", &mw[i]);
+        assert(n_read == 1);
+      }
+    }
+  }
+  (void)n_read; // only inspected by assert(); silences NDEBUG warnings
+
+  fclose(fi);
+}
+
+//Only master node can call. Only be called at EM.cpp
+//Only master node can call. Only be called at EM.cpp
+// Writes the model to outF: the model type tag, every component
+// separated by blank lines and, when mw is available, M followed by the
+// M + 1 entries of mw. Fails through general_assert when outF cannot be
+// opened, instead of passing a NULL stream to fprintf.
+void PairedEndModel::write(const char* outF) {
+  FILE *fo = fopen(outF, "w");
+
+  general_assert(fo != NULL, "Cannot write to " + cstrtos(outF) + "!");
+
+  fprintf(fo, "%d\n", model_type);
+  fprintf(fo, "\n");
+
+  ori->write(fo); fprintf(fo, "\n");
+  gld->write(fo); fprintf(fo, "\n");
+  mld->write(fo); fprintf(fo, "\n");
+  rspd->write(fo); fprintf(fo, "\n");
+  pro->write(fo); fprintf(fo, "\n");
+  npro->write(fo);
+
+  if (mw != NULL) {
+    fprintf(fo, "\n%d\n", M);
+    for (int i = 0; i < M; i++) {
+      fprintf(fo, "%.15g ", mw[i]);
+    }
+    fprintf(fo, "%.15g\n", mw[M]);
+  }
+
+  fclose(fo);
+}
+
+// Stores the sampler, builds the cumulative table of theta[0..M] used
+// to draw reference ids, and puts rspd, pro and npro into simulation
+// mode.
+void PairedEndModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
+  this->sampler = sampler;
+
+  theta_cdf = new double[M + 1];
+  theta_cdf[0] = theta[0];
+  for (int k = 1; k <= M; k++) {
+    theta_cdf[k] = theta[k] + theta_cdf[k - 1];
+  }
+
+  rspd->startSimulation(M, refs);
+  pro->startSimulation();
+  npro->startSimulation();
+}
+
+// Simulates one read pair and stores it in read; sid receives the drawn
+// reference id (0 selects the noise branch). Returns false, leaving
+// read untouched, when the drawn fragment length or start position is
+// negative. The read name is "rid_dir_sid_pos_insertL" with "/1" and
+// "/2" appended for the two mates.
+// The order of the sampler calls determines the random stream consumed.
+bool PairedEndModel::simulate(READ_INT_TYPE rid, PairedEndRead& read, int& sid) {
+ int dir, pos;
+ int insertL, mateL1, mateL2;
+ std::string name;
+ std::string readseq1, readseq2;
+ std::ostringstream strout;
+
+ sid = sampler->sample(theta_cdf, M + 1);
+
+ if (sid == 0) {
+ // Noise pair: both mates come from the noise profile.
+ dir = pos = insertL = 0;
+ mateL1 = mld->simulate(sampler, -1);
+ readseq1 = npro->simulate(sampler, mateL1);
+
+ mateL2 = mld->simulate(sampler, -1);
+ readseq2 = npro->simulate(sampler, mateL2);
+ }
+ else {
+ RefSeq &ref = refs->getRef(sid);
+ dir = ori->simulate(sampler);
+ insertL = gld->simulate(sampler, ref.getTotLen());
+ if (insertL < 0) return false;
+ int effL = std::min(ref.getFullLen(), ref.getTotLen() - insertL + 1);
+ pos = rspd->simulate(sampler, sid, effL);
+ if (pos < 0) return false;
+ // For direction 1 the start is converted to the opposite-strand coordinate.
+ if (dir > 0) pos = ref.getTotLen() - pos - insertL;
+
+ mateL1 = mld->simulate(sampler, insertL);
+ readseq1 = pro->simulate(sampler, mateL1, pos, dir, ref);
+
+ // Second mate: other end of the fragment, opposite direction.
+ int m2pos = ref.getTotLen() - pos - insertL;
+ int m2dir = !dir;
+
+ mateL2 = mld->simulate(sampler, insertL);
+ readseq2 = pro->simulate(sampler, mateL2, m2pos, m2dir, ref);
+ }
+
+ strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos<<"_"<<insertL;
+ name = strout.str();
+
+ read = PairedEndRead(SingleRead(name + "/1", readseq1), SingleRead(name + "/2", readseq2));
+
+ return true;
+}
+
+// Frees the cumulative theta table and ends simulation mode on rspd,
+// pro and npro.
+void PairedEndModel::finishSimulation() {
+  delete[] theta_cdf;
+
+  rspd->finishSimulation();
+  pro->finishSimulation();
+  npro->finishSimulation();
+}
+
+// Fills mw[0..M]. mw[0] is fixed at 1.0. For every reference i >= 1 the
+// function sums, over each masked start position and each fragment
+// length that fits from that position, the product of the fragment
+// length probability and the start position probability; mw[i] is one
+// minus that sum, clamped to 0 when it falls below 1e-8.
+// mw must already point to M + 1 doubles.
+void PairedEndModel::calcMW() {
+ assert(mld->getMinL() >= seedLen);
+
+ memset(mw, 0, sizeof(double) * (M + 1));
+ mw[0] = 1.0;
+
+ for (int i = 1; i <= M; i++) {
+ RefSeq& ref = refs->getRef(i);
+ int totLen = ref.getTotLen();
+ int fullLen = ref.getFullLen();
+ // One past the last start position at which the shortest fragment still fits.
+ int end = std::min(fullLen, totLen - gld->getMinL() + 1);
+ double value = 0.0;
+ int minL, maxL;
+ int effL, pfpos;
+
+ //seedPos is fpos here
+ for (int seedPos = 0; seedPos < end; seedPos++)
+ if (ref.getMask(seedPos)) {
+ minL = gld->getMinL();
+ maxL = std::min(gld->getMaxL(), totLen - seedPos);
+ pfpos = seedPos;
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
+ }
+ }
+
+ mw[i] = 1.0 - value;
+
+ if (mw[i] < 1e-8) {
+ //fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+ mw[i] = 0.0;
+ }
+ }
+}
+
+#endif /* PAIREDENDMODEL_H_ */
diff --git a/PairedEndQModel.h b/PairedEndQModel.h
new file mode 100644
index 0000000..8237056
--- /dev/null
+++ b/PairedEndQModel.h
@@ -0,0 +1,481 @@
+#ifndef PAIREDENDQMODEL_H_
+#define PAIREDENDQMODEL_H_
+
+#include<cmath>
+#include<cstdio>
+#include<cassert>
+#include<cstring>
+#include<string>
+#include<algorithm>
+#include<sstream>
+#include<iostream>
+#include<vector>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "Orientation.h"
+#include "LenDist.h"
+#include "RSPD.h"
+#include "QualDist.h"
+#include "QProfile.h"
+#include "NoiseQProfile.h"
+
+#include "ModelParams.h"
+#include "RefSeq.h"
+#include "Refs.h"
+#include "SingleReadQ.h"
+#include "PairedEndReadQ.h"
+#include "PairedEndHit.h"
+#include "ReadReader.h"
+
+#include "simul.h"
+
+// Model for paired-end reads with quality scores. It combines
+//   ori   : strand orientation,
+//   gld   : fragment (insert) length distribution,
+//   mld   : mate length distribution,
+//   rspd  : read start position distribution,
+//   qd    : quality score distribution,
+//   qpro  : quality-aware sequencing profile for aligned mates,
+//   nqpro : quality-aware base profile for noise reads,
+// and mw, a per-reference weight computed by calcMW() that the
+// conditional probabilities are divided by.
+class PairedEndQModel {
+public:
+  // Builds a model with default-constructed components. M is taken
+  // from refs when refs is non-NULL and is 0 otherwise.
+  PairedEndQModel(Refs* refs = NULL) {
+    this->refs = refs;
+    M = (refs != NULL ? refs->getM() : 0);
+    memset(N, 0, sizeof(N));
+    estRSPD = false;
+    needCalcConPrb = true;
+
+    ori = new Orientation();
+    gld = new LenDist();
+    rspd = new RSPD(estRSPD);
+    qd = new QualDist();
+    qpro = new QProfile();
+    nqpro = new NoiseQProfile();
+    mld = new LenDist();
+
+    mw = NULL;
+    seedLen = 0;
+
+    sampler = NULL;
+    theta_cdf = NULL;
+  }
+
+  //If it is not a master node, only init & update can be used!
+  // A non-master model has no qd and no mld, and no rspd unless estRSPD
+  // is set.
+  PairedEndQModel(ModelParams& params, bool isMaster = true) {
+    M = params.M;
+    memcpy(N, params.N, sizeof(params.N));
+    refs = params.refs;
+    estRSPD = params.estRSPD;
+    seedLen = params.seedLen;
+    needCalcConPrb = true;
+
+    ori = NULL; gld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL; mld = NULL;
+    mw = NULL;
+    sampler = NULL;
+    theta_cdf = NULL;
+
+    if (isMaster) {
+      if (!estRSPD) rspd = new RSPD(estRSPD);
+      qd = new QualDist();
+      mld = new LenDist(params.mate_minL, params.mate_maxL);
+    }
+
+    ori = new Orientation(params.probF);
+    gld = new LenDist(params.minL, params.maxL);
+    if (estRSPD) rspd = new RSPD(estRSPD, params.B);
+    qpro = new QProfile();
+    nqpro = new NoiseQProfile();
+  }
+
+  // Releases the owned components. refs is not owned and only cleared.
+  ~PairedEndQModel() {
+    refs = NULL;
+    if (ori != NULL) delete ori;
+    if (gld != NULL) delete gld;
+    if (rspd != NULL) delete rspd;
+    if (qd != NULL) delete qd;
+    if (qpro != NULL) delete qpro;
+    if (nqpro != NULL) delete nqpro;
+    if (mld != NULL) delete mld;
+    if (mw != NULL) delete[] mw; // allocated with new double[M + 1]
+  }
+
+  void estimateFromReads(const char*);
+
+  //if prob is too small, just make it 0
+  // Conditional probability of read given the alignment hit, divided by
+  // mw[sid]. Returns 0 for low-quality reads, for start positions that
+  // are at or past fullLen or masked, and when mw[sid] < EPSILON.
+  double getConPrb(const PairedEndReadQ& read, const PairedEndHit& hit) {
+    if (read.isLowQuality()) return 0.0;
+
+    double prob;
+    int sid = hit.getSid();
+    RefSeq &ref = refs->getRef(sid);
+    int dir = hit.getDir();
+    int pos = hit.getPos();
+    int fullLen = ref.getFullLen();
+    int totLen = ref.getTotLen();
+    int insertLen = hit.getInsertL();
+
+    int fpos = (dir == 0 ? pos : totLen - pos - insertLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
+    int effL = std::min(fullLen, totLen - insertLen + 1);
+
+    general_assert(fpos >= 0, "The alignment of fragment " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) +
+                   " from the forward direction, which should be a non-negative number! " +
+                   "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+    general_assert(fpos + insertLen <= totLen,"Fragment " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! "
+                   + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+    general_assert(insertLen <= totLen, "Fragment " + read.getName() + " has length " + itos(insertLen) + ", but it is aligned to transcript "
+                   + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the fragment's length!");
+
+    if (fpos >= fullLen || ref.getMask(fpos)) return 0.0; // For paired-end model, fpos is the seedPos
+
+    prob = ori->getProb(dir) * gld->getAdjustedProb(insertLen, totLen) *
+           rspd->getAdjustedProb(fpos, effL, fullLen);
+
+    const SingleReadQ& mate1 = read.getMate1();
+    prob *= mld->getAdjustedProb(mate1.getReadLength(), insertLen) *
+            qpro->getProb(mate1.getReadSeq(), mate1.getQScore(), ref, pos, dir);
+
+    // The second mate sits at the other end of the fragment, on the
+    // opposite strand.
+    const SingleReadQ& mate2 = read.getMate2();
+    int m2pos = totLen - pos - insertLen;
+    int m2dir = !dir;
+
+    prob *= mld->getAdjustedProb(mate2.getReadLength(), hit.getInsertL()) *
+            qpro->getProb(mate2.getReadSeq(), mate2.getQScore(), ref, m2pos, m2dir);
+
+    if (prob < EPSILON) { prob = 0.0; }
+
+    prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
+
+    return prob;
+  }
+
+  // Probability of read under the noise profile, divided by mw[0].
+  // Returns 0 for low-quality reads.
+  double getNoiseConPrb(const PairedEndReadQ& read) {
+    if (read.isLowQuality()) return 0.0;
+
+    double prob;
+    const SingleReadQ& mate1 = read.getMate1();
+    const SingleReadQ& mate2 = read.getMate2();
+
+    prob = mld->getProb(mate1.getReadLength()) * nqpro->getProb(mate1.getReadSeq(), mate1.getQScore());
+    prob *= mld->getProb(mate2.getReadLength()) * nqpro->getProb(mate2.getReadSeq(), mate2.getQScore());
+
+    if (prob < EPSILON) { prob = 0.0; }
+
+    prob = (mw[0] < EPSILON ? 0.0: prob / mw[0]);
+
+    return prob;
+  }
+
+  double getLogP() { return nqpro->getLogP(); }
+
+  void init();
+
+  // Adds the fractional weight frac of this (read, hit) pair to gld,
+  // to rspd (only when estRSPD) and to qpro for both mates. Low-quality
+  // reads and weights below EPSILON are ignored.
+  void update(const PairedEndReadQ& read, const PairedEndHit& hit, double frac) {
+    if (read.isLowQuality() || frac < EPSILON) return;
+
+    RefSeq& ref = refs->getRef(hit.getSid());
+    const SingleReadQ& mate1 = read.getMate1();
+    const SingleReadQ& mate2 = read.getMate2();
+
+    gld->update(hit.getInsertL(), frac);
+    if (estRSPD) {
+      int fpos = (hit.getDir() == 0 ? hit.getPos() : ref.getTotLen() - hit.getPos() - hit.getInsertL());
+      rspd->update(fpos, ref.getFullLen(), frac);
+    }
+    qpro->update(mate1.getReadSeq(), mate1.getQScore(), ref, hit.getPos(), hit.getDir(), frac);
+
+    int m2pos = ref.getTotLen() - hit.getPos() - hit.getInsertL();
+    int m2dir = !hit.getDir();
+    qpro->update(mate2.getReadSeq(), mate2.getQScore(), ref, m2pos, m2dir, frac);
+  }
+
+  // Adds the fractional weight frac of both mates to the noise profile.
+  void updateNoise(const PairedEndReadQ& read, double frac) {
+    if (read.isLowQuality() || frac < EPSILON) return;
+
+    const SingleReadQ& mate1 = read.getMate1();
+    const SingleReadQ& mate2 = read.getMate2();
+
+    nqpro->update(mate1.getReadSeq(), mate1.getQScore(), frac);
+    nqpro->update(mate2.getReadSeq(), mate2.getQScore(), frac);
+  }
+
+  void finish();
+
+  void collect(const PairedEndQModel&);
+
+  bool getNeedCalcConPrb() { return needCalcConPrb; }
+  void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
+
+  void read(const char*);
+  void write(const char*);
+
+  const LenDist& getGLD() { return *gld; }
+
+  void startSimulation(simul*, const std::vector<double>&);
+  bool simulate(READ_INT_TYPE, PairedEndReadQ&, int&);
+  void finishSimulation();
+
+  //Use it after function 'read' or 'estimateFromReads'
+  const double* getMW() {
+    assert(mw != NULL);
+    return mw;
+  }
+
+  int getModelType() const { return model_type; }
+
+private:
+  static const int model_type = 3;
+  static const int read_type = 3;
+
+  int M;
+  READ_INT_TYPE N[3];
+  Refs *refs;
+  int seedLen;
+
+  bool estRSPD;
+  bool needCalcConPrb; //true need, false does not need
+
+  Orientation *ori;
+  LenDist *gld, *mld; //mld1 mate_length_dist
+  RSPD *rspd;
+  QualDist *qd;
+  QProfile *qpro;
+  NoiseQProfile *nqpro;
+
+  simul *sampler; // for simulation
+  double *theta_cdf; // for simulation
+
+  double *mw; // for masking
+
+  void calcMW();
+};
+
+// Scans the read files derived from readFN for every i in 0..2 with
+// N[i] > 0 and estimates the mate length and quality score
+// distributions from all usable reads; reads from file set 0
+// additionally feed the noise profile counts. Afterwards the initial
+// noise parameters and mw are computed.
+// Low-quality reads with a mate shorter than seedLen are counted in
+// n_warns; the first MAX_WARNS of them are reported individually on
+// stderr and the total is reported at the end.
+void PairedEndQModel::estimateFromReads(const char* readFN) {
+  int s;
+  char readFs[2][STRLEN];
+  PairedEndReadQ read;
+
+  int n_warns = 0;
+
+  mld->init();
+  for (int i = 0; i < 3; i++)
+    if (N[i] > 0) {
+      genReadFileNames(readFN, i, read_type, s, readFs);
+      ReadReader<PairedEndReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
+
+      READ_INT_TYPE cnt = 0;
+      while (reader.next(read)) {
+        // Bind by reference: the mates are only inspected, so there is
+        // no need to copy two reads per iteration.
+        const SingleReadQ& mate1 = read.getMate1();
+        const SingleReadQ& mate2 = read.getMate2();
+
+        if (!read.isLowQuality()) {
+          mld->update(mate1.getReadLength(), 1.0);
+          mld->update(mate2.getReadLength(), 1.0);
+
+          qd->update(mate1.getQScore());
+          qd->update(mate2.getQScore());
+
+          if (i == 0) {
+            nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
+            nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
+          }
+        }
+        else if (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen) {
+          // The counter must advance here; otherwise the cap never
+          // triggers and the summary below is never printed.
+          if (++n_warns <= MAX_WARNS)
+            fprintf(stderr, "Warning: Read %s is ignored due to at least one of the mates' length < seed length (= %d)!\n", read.getName().c_str(), seedLen);
+        }
+
+        ++cnt;
+        if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
+      }
+
+      if (verbose) { std::cout<<"estimateFromReads, N"<< i<<" finished."<< std::endl; }
+    }
+
+  if (n_warns > 0) fprintf(stderr, "Warning: There are %d reads ignored in total.\n", n_warns);
+
+  mld->finish();
+  qd->finish();
+  nqpro->calcInitParams();
+
+  delete[] mw; // drop any table from an earlier call or read(); NULL is fine
+  mw = new double[M + 1];
+  calcMW();
+}
+
+// Forward init() to the sub-models that are re-estimated: the fragment length
+// distribution, the RSPD (only when estRSPD is set) and both quality profiles.
+void PairedEndQModel::init() {
+    gld->init();
+    if (estRSPD) {
+        rspd->init();
+    }
+    qpro->init();
+    nqpro->init();
+}
+
+// Forward finish() to the re-estimated sub-models, flag the conditional
+// probabilities as stale (needCalcConPrb) and recompute the masking weights.
+void PairedEndQModel::finish() {
+    gld->finish();
+    if (estRSPD) {
+        rspd->finish();
+    }
+    qpro->finish();
+    nqpro->finish();
+
+    needCalcConPrb = true;
+    calcMW();
+}
+
+// Merge the statistics held by another model `o` into this one by forwarding
+// collect() to each re-estimated sub-model (RSPD only when estRSPD is set).
+void PairedEndQModel::collect(const PairedEndQModel& o) {
+    gld->collect(*o.gld);
+    if (estRSPD) {
+        rspd->collect(*o.rspd);
+    }
+    qpro->collect(*o.qpro);
+    nqpro->collect(*o.nqpro);
+}
+
+//Only master node can call
+// Load the model from the text file inpF: the model type tag, every sub-model
+// in the order they are written by write(), and optionally the masking
+// weights (an integer count followed by count + 1 doubles). The weights are
+// only loaded when the stored count matches M (M is adopted from the file
+// when it is still 0).
+void PairedEndQModel::read(const char* inpF) {
+    int val;
+    FILE *fi = fopen(inpF, "r");
+
+    general_assert(fi != NULL, "Cannot open " + cstrtos(inpF) + "! It may not exist.");
+
+    // Keep the fscanf calls outside assert(): when NDEBUG is defined the
+    // assert argument is not evaluated, which would skip the parsing entirely.
+    int nread = fscanf(fi, "%d", &val);
+    assert(nread == 1);
+    assert(val == model_type);
+
+    ori->read(fi);
+    gld->read(fi);
+    mld->read(fi);
+    rspd->read(fi);
+    qd->read(fi);
+    qpro->read(fi);
+    nqpro->read(fi);
+
+    if (fscanf(fi, "%d", &val) == 1) {
+        if (M == 0) M = val;
+        if (M == val) {
+            mw = new double[M + 1];
+            for (int i = 0; i <= M; i++) {
+                nread = fscanf(fi, "%lf", &mw[i]);
+                assert(nread == 1);
+            }
+        }
+    }
+    (void)nread; // only inspected by assert(); avoids an unused-variable warning under NDEBUG
+
+    fclose(fi);
+}
+
+//Only master node can call. Only be called at EM.cpp
+// Write the model to the text file outF: the model type tag, each sub-model
+// separated by blank lines, and, when the masking weights exist, M followed
+// by the M + 1 weights on one line.
+void PairedEndQModel::write(const char* outF) {
+    FILE *fo = fopen(outF, "w");
+
+    // fopen returns NULL on failure; passing that to fprintf is undefined behavior.
+    general_assert(fo != NULL, "Cannot write to " + cstrtos(outF) + "!");
+
+    fprintf(fo, "%d\n", model_type);
+    fprintf(fo, "\n");
+
+    ori->write(fo); fprintf(fo, "\n");
+    gld->write(fo); fprintf(fo, "\n");
+    mld->write(fo); fprintf(fo, "\n");
+    rspd->write(fo); fprintf(fo, "\n");
+    qd->write(fo); fprintf(fo, "\n");
+    qpro->write(fo); fprintf(fo, "\n");
+    nqpro->write(fo);
+
+    if (mw != NULL) {
+        fprintf(fo, "\n%d\n", M);
+        for (int i = 0; i < M; i++) {
+            fprintf(fo, "%.15g ", mw[i]);
+        }
+        fprintf(fo, "%.15g\n", mw[M]);
+    }
+
+    fclose(fo);
+}
+
+// Prepare for simulation: remember the sampler, build the cumulative sums of
+// theta[0..M] in a freshly allocated theta_cdf, and forward startSimulation()
+// to the RSPD, quality distribution and both quality profiles.
+void PairedEndQModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
+    this->sampler = sampler;
+
+    theta_cdf = new double[M + 1];
+    for (int idx = 0; idx <= M; ++idx) {
+        // entry idx holds theta[0] + ... + theta[idx]
+        theta_cdf[idx] = (idx == 0) ? theta[idx] : theta[idx] + theta_cdf[idx - 1];
+    }
+
+    rspd->startSimulation(M, refs);
+    qd->startSimulation();
+    qpro->startSimulation();
+    nqpro->startSimulation();
+}
+
+// Simulate one paired-end read with quality scores.
+//   rid  - read id, used as the first field of the generated read name
+//   read - output: the simulated pair (mates named "<name>/1" and "<name>/2")
+//   sid  - output: index sampled from theta_cdf (0 .. M)
+// Returns false when the fragment length or the start position sampler
+// returns a negative value; `read` is left untouched in that case.
+// The generated name is "<rid>_<dir>_<sid>_<pos>_<insertL>".
+bool PairedEndQModel::simulate(READ_INT_TYPE rid, PairedEndReadQ& read, int& sid) {
+ int dir, pos;
+ int insertL, mateL1, mateL2;
+ std::string name;
+ std::string qual1, qual2, readseq1, readseq2;
+ std::ostringstream strout;
+
+ sid = sampler->sample(theta_cdf, M + 1);
+
+ if (sid == 0) {
+ // sid 0: no reference sequence is used; both mates are drawn from the
+ // noise profile (nqpro) and dir/pos/insertL are reported as 0
+ dir = pos = insertL = 0;
+ mateL1 = mld->simulate(sampler, -1);
+ qual1 = qd->simulate(sampler, mateL1);
+ readseq1 = nqpro->simulate(sampler, mateL1, qual1);
+
+ mateL2 = mld->simulate(sampler, -1);
+ qual2 = qd->simulate(sampler, mateL2);
+ readseq2 = nqpro->simulate(sampler, mateL2, qual2);
+ }
+ else {
+ RefSeq &ref = refs->getRef(sid);
+ dir = ori->simulate(sampler);
+ insertL = gld->simulate(sampler, ref.getTotLen());
+ if (insertL < 0) return false;
+ int effL = std::min(ref.getFullLen(), ref.getTotLen() - insertL + 1);
+ pos = rspd->simulate(sampler, sid, effL);
+ if (pos < 0) return false;
+ // for dir > 0 the position is mirrored with respect to the fragment end
+ if (dir > 0) pos = ref.getTotLen() - pos - insertL;
+
+ mateL1 = mld->simulate(sampler, insertL);
+ qual1 = qd->simulate(sampler, mateL1);
+ readseq1 = qpro->simulate(sampler, mateL1, pos, dir, qual1, ref);
+
+ // mate 2 uses the mirrored position and the opposite direction flag
+ int m2pos = ref.getTotLen() - pos - insertL;
+ int m2dir = !dir;
+
+ mateL2 = mld->simulate(sampler, insertL);
+ qual2 = qd->simulate(sampler, mateL2);
+ readseq2 = qpro->simulate(sampler, mateL2, m2pos, m2dir, qual2, ref);
+ }
+
+ strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos<<"_"<<insertL;
+ name = strout.str();
+
+ read = PairedEndReadQ(SingleReadQ(name + "/1", readseq1, qual1), SingleReadQ(name + "/2", readseq2, qual2));
+
+ return true;
+}
+
+// Release the cumulative theta table built by startSimulation() and forward
+// finishSimulation() to the sub-models that took part in the simulation.
+void PairedEndQModel::finishSimulation() {
+    delete[] theta_cdf;
+    theta_cdf = NULL; // avoid a dangling pointer / double delete if called again
+
+    rspd->finishSimulation();
+    qd->finishSimulation();
+    qpro->finishSimulation();
+    nqpro->finishSimulation();
+}
+
+
+// Recompute the masking weights mw[0..M] in place (mw must already be allocated).
+// mw[0] is fixed to 1.0. For each reference i >= 1, the probability mass of all
+// (start position, fragment length) pairs whose start position is masked
+// (ref.getMask) is accumulated and mw[i] = 1 - that mass; values below 1e-8
+// are clamped to 0.
+void PairedEndQModel::calcMW() {
+ assert(mld->getMinL() >= seedLen);
+
+ memset(mw, 0, sizeof(double) * (M + 1));
+ mw[0] = 1.0;
+
+ for (int i = 1; i <= M; i++) {
+ RefSeq& ref = refs->getRef(i);
+ int totLen = ref.getTotLen();
+ int fullLen = ref.getFullLen();
+ // exclusive upper bound on start positions that can hold the shortest fragment
+ int end = std::min(fullLen, totLen - gld->getMinL() + 1);
+ double value = 0.0;
+ int minL, maxL;
+ int effL, pfpos;
+
+ //seedPos is fpos here
+ for (int seedPos = 0; seedPos < end; seedPos++)
+ if (ref.getMask(seedPos)) {
+ minL = gld->getMinL();
+ // the fragment may not run past the end of the reference
+ maxL = std::min(gld->getMaxL(), totLen - seedPos);
+ pfpos = seedPos;
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
+ }
+ }
+
+ mw[i] = 1.0 - value;
+
+ if (mw[i] < 1e-8) {
+ // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+ mw[i] = 0.0;
+ }
+ }
+}
+
+#endif /* PAIREDENDQMODEL_H_ */
diff --git a/PairedEndRead.h b/PairedEndRead.h
new file mode 100644
index 0000000..01d3d00
--- /dev/null
+++ b/PairedEndRead.h
@@ -0,0 +1,67 @@
+#ifndef PAIREDENDREAD
+#define PAIREDENDREAD
+
+#include<cassert>
+#include<iostream>
+#include<string>
+
+#include "Read.h"
+#include "SingleRead.h"
+
+// A paired-end read without quality scores: two SingleRead mates. The pair's
+// name is taken from mate 1.
+class PairedEndRead : public Read {
+public:
+    PairedEndRead() : mate1(), mate2() {}
+
+    // Build a pair from two existing mates.
+    PairedEndRead(const SingleRead& mate1, const SingleRead& mate2) {
+        this->mate1 = mate1;
+        this->mate2 = mate2;
+        this->name = mate1.getName();
+    }
+
+    // argc must be 2: argv[0] is the stream of mate 1, argv[1] the stream of mate 2.
+    bool read(int argc, std::istream* argv[], int flags = 7);
+    void write(int argc, std::ostream* argv[]);
+
+    const SingleRead& getMate1() const { return mate1; }
+    const SingleRead& getMate2() const { return mate2; }
+
+    // i == 1 selects mate 1; any other value selects mate 2.
+    const SingleRead& getMate(int i) const {
+        return (i == 1) ? mate1 : mate2;
+    }
+
+    void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false
+
+private:
+    SingleRead mate1, mate2;
+};
+
+// Read both mates, mate 1 from argv[0] and mate 2 from argv[1] (argc must be 2).
+// Returns true only when both mates were read; mate 2 is not attempted when
+// mate 1 fails. The pair's name is copied from mate 1 when bit 4 of `flags`
+// is set and is empty otherwise.
+bool PairedEndRead::read(int argc, std::istream* argv[], int flags) {
+    assert(argc == 2);
+
+    std::istream* firstIn[1] = { argv[0] };
+    std::istream* secondIn[1] = { argv[1] };
+
+    const bool ok = mate1.read(1, firstIn, flags) && mate2.read(1, secondIn, flags);
+
+    if (flags & 4) name = mate1.getName(); //May chop 1 char later if we want
+    else name = "";
+
+    return ok;
+}
+
+// Write mate 1 to argv[0] and mate 2 to argv[1]; argc must be 2.
+void PairedEndRead::write(int argc, std::ostream *argv[]) {
+    assert(argc == 2);
+
+    std::ostream* firstOut[1] = { argv[0] };
+    std::ostream* secondOut[1] = { argv[1] };
+
+    mate1.write(1, firstOut);
+    mate2.write(1, secondOut);
+}
+
+//calculate if this read is low quality
+// The pair is low quality when either mate is shorter than seedLen, or when
+// both mates are individually low quality.
+void PairedEndRead::calc_lq(bool hasPolyA, int seedLen) {
+    low_quality = false;
+    mate1.calc_lq(hasPolyA, seedLen);
+    mate2.calc_lq(hasPolyA, seedLen);
+
+    const bool tooShort = mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen;
+    low_quality = tooShort || (mate1.isLowQuality() && mate2.isLowQuality());
+}
+
+#endif
diff --git a/PairedEndReadQ.h b/PairedEndReadQ.h
new file mode 100644
index 0000000..7513820
--- /dev/null
+++ b/PairedEndReadQ.h
@@ -0,0 +1,67 @@
+#ifndef PAIREDENDREADQ_H_
+#define PAIREDENDREADQ_H_
+
+#include<cassert>
+#include<iostream>
+#include<string>
+
+#include "Read.h"
+#include "SingleReadQ.h"
+
+// A paired-end read with quality scores: two SingleReadQ mates. The pair's
+// name is taken from mate 1.
+class PairedEndReadQ : public Read {
+public:
+    PairedEndReadQ() : mate1(), mate2() {}
+
+    // Build a pair from two existing mates.
+    PairedEndReadQ(const SingleReadQ& mate1, const SingleReadQ& mate2) {
+        this->mate1 = mate1;
+        this->mate2 = mate2;
+        this->name = mate1.getName();
+    }
+
+    // argc must be 2: argv[0] is the stream of mate 1, argv[1] the stream of mate 2.
+    bool read(int argc, std::istream* argv[], int flags = 7);
+    void write(int argc, std::ostream* argv[]);
+
+    const SingleReadQ& getMate1() const { return mate1; }
+    const SingleReadQ& getMate2() const { return mate2; }
+
+    // i == 1 selects mate 1; any other value selects mate 2.
+    const SingleReadQ& getMate(int i) const {
+        return (i == 1) ? mate1 : mate2;
+    }
+
+    void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false
+
+private:
+    SingleReadQ mate1, mate2;
+};
+
+// Read both mates, mate 1 from argv[0] and mate 2 from argv[1] (argc must be 2).
+// Returns true only when both mates were read; mate 2 is not attempted when
+// mate 1 fails. The pair's name is copied from mate 1 when bit 4 of `flags`
+// is set and is empty otherwise.
+bool PairedEndReadQ::read(int argc, std::istream* argv[], int flags) {
+    assert(argc == 2);
+
+    std::istream* firstIn[1] = { argv[0] };
+    std::istream* secondIn[1] = { argv[1] };
+
+    const bool ok = mate1.read(1, firstIn, flags) && mate2.read(1, secondIn, flags);
+
+    if (flags & 4) name = mate1.getName(); //May chop 1 char later if we want
+    else name = "";
+
+    return ok;
+}
+
+// Write mate 1 to argv[0] and mate 2 to argv[1]; argc must be 2.
+void PairedEndReadQ::write(int argc, std::ostream* argv[]) {
+    assert(argc == 2);
+
+    std::ostream* firstOut[1] = { argv[0] };
+    std::ostream* secondOut[1] = { argv[1] };
+
+    mate1.write(1, firstOut);
+    mate2.write(1, secondOut);
+}
+
+//calculate if this read is low quality
+// The pair is low quality when either mate is shorter than seedLen, or when
+// both mates are individually low quality.
+void PairedEndReadQ::calc_lq(bool hasPolyA, int seedLen) {
+    low_quality = false;
+    mate1.calc_lq(hasPolyA, seedLen);
+    mate2.calc_lq(hasPolyA, seedLen);
+
+    const bool tooShort = mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen;
+    low_quality = tooShort || (mate1.isLowQuality() && mate2.isLowQuality());
+}
+
+#endif /* PAIREDENDREADQ_H_ */
diff --git a/PolyARules.h b/PolyARules.h
new file mode 100644
index 0000000..eb57f2c
--- /dev/null
+++ b/PolyARules.h
@@ -0,0 +1,61 @@
+#ifndef POLYARULES
+#define POLYARULES
+
+#include<cstdio>
+#include<cstdlib>
+#include<cassert>
+#include<set>
+#include<cstring>
+#include<fstream>
+
+/**
+Isoform id starts from 1 !
+*/
+
+// Decides how many poly(A) bases are appended to a transcript.
+// polyAChoice: 0 = pad every transcript with polyALen bases,
+//              1 = never pad,
+//              2 = pad every transcript except those listed in the exception file.
+class PolyARules {
+public:
+    PolyARules() : polyAChoice(0), polyALen(0) {}
+
+    //Assume parameters are valid here
+    // exceptionF is only opened when polyAChoice == 2; it holds whitespace
+    // separated transcript ids. The program exits if the file cannot be opened.
+    PolyARules(int polyAChoice, int polyALen, char* exceptionF) : polyAChoice(polyAChoice), polyALen(polyALen) {
+        if (polyAChoice == 2) {
+            std::string transcript_id;
+            std::ifstream fin(exceptionF);
+            if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", exceptionF); exit(-1); }
+
+            while (fin >> transcript_id) {
+                exceptionList.insert(transcript_id);
+            }
+        }
+    }
+
+    //get the length of padding poly As
+    int getLenAt(const std::string& transcript_id) const {
+        switch (polyAChoice) {
+        case 0 : return polyALen;
+        case 1 : return 0;
+        case 2 : return (exceptionList.find(transcript_id) == exceptionList.end() ? polyALen : 0);
+        default : assert(false);
+        }
+        // Only reachable with an invalid polyAChoice when assertions are compiled
+        // out; return a defined value instead of falling off the end (undefined behavior).
+        return 0;
+    }
+
+private:
+    int polyAChoice; // 0, pad; 1, do not pad; 2 pad all but those in exceptionList
+    int polyALen;
+    std::set<std::string> exceptionList; // exception list of transcript_ids
+};
+
+#endif
diff --git a/Profile.h b/Profile.h
new file mode 100644
index 0000000..ecf059c
--- /dev/null
+++ b/Profile.h
@@ -0,0 +1,220 @@
+#ifndef PROFILE_H_
+#define PROFILE_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cassert>
+
+#include "utils.h"
+#include "RefSeq.h"
+#include "simul.h"
+
+
+// Position-specific base profile: p[i][r][c] is indexed by read position i,
+// reference base code r and read base code c (NCODES codes each).
+// The object owns the heap array p, so copying must be deep.
+class Profile {
+public:
+    Profile(int = 1000);
+
+    // Deep copy. Without it the implicit copy constructor would share p
+    // between two objects and the destructor would delete it twice.
+    Profile(const Profile& rv)
+        : proLen(rv.proLen), size(rv.size), p(new double[rv.proLen][NCODES][NCODES]), pc(NULL) {
+        memcpy(p, rv.p, sizeof(double) * rv.size);
+    }
+
+    ~Profile() {
+        delete[] p;
+    }
+
+    Profile& operator=(const Profile&);
+
+    void init();
+    void update(const std::string&, const RefSeq&, int, int, double);
+    void finish();
+
+    double getProb(const std::string&, const RefSeq&, int, int);
+
+    void collect(const Profile&);
+
+    void read(FILE*);
+    void write(FILE*);
+
+    // startSimulation() allocates pc, finishSimulation() releases it.
+    void startSimulation();
+    std::string simulate(simul*, int, int, int, const RefSeq&);
+    void finishSimulation();
+
+private:
+    static const int NCODES = 5;
+
+    int proLen; // profile length
+    int size; // # of items in p;
+    double (*p)[NCODES][NCODES]; //profile matrices
+
+    double (*pc)[NCODES][NCODES]; // for simulation
+};
+
+// Allocate a profile for reads up to maxL bases and fill it with initial
+// parameters: for a non-N reference base the read base is N with probability
+// probN, the matching base gets portionC of the remaining mass and the other
+// three bases share the rest; for reference N the four bases are uniform.
+Profile::Profile(int maxL) {
+    proLen = maxL;
+    size = proLen * NCODES * NCODES;
+    p = new double[proLen][NCODES][NCODES];
+    pc = NULL; // no simulation table until startSimulation(); avoids an indeterminate pointer
+    memset(p, 0, sizeof(double) * size);
+
+    //set initial parameters
+    const int N = NCODES - 1;
+    const double probN = 1e-5, portionC = 0.99; //portionC, among ACGT, the portion of probability mass the correct base takes
+    // These values do not depend on the position or the base, so compute them once.
+    const double probC = portionC * (1.0 - probN);
+    const double probO = (1.0 - portionC) / (NCODES - 2) * (1.0 - probN);
+
+    for (int i = 0; i < proLen; i++) {
+        for (int j = 0; j < NCODES - 1; j++) {
+            p[i][j][N] = probN;
+            for (int k = 0; k < NCODES - 1; k++) {
+                p[i][j][k] = (j == k ? probC : probO);
+            }
+        }
+        p[i][N][N] = probN;
+        for (int k = 0; k < NCODES - 1; k++)
+            p[i][N][k] = (1.0 - probN) / (NCODES - 1);
+    }
+}
+
+// Deep-copy assignment. The matrix storage is reallocated only when the
+// profile lengths differ; the simulation table pc is not touched.
+Profile& Profile::operator=(const Profile& rv) {
+    if (this != &rv) {
+        if (proLen != rv.proLen) {
+            delete[] p;
+            proLen = rv.proLen;
+            size = rv.size;
+            p = new double[rv.proLen][NCODES][NCODES];
+        }
+        memcpy(p, rv.p, sizeof(double) * rv.size);
+    }
+    return *this;
+}
+
+// Zero every entry of the profile so that update() can accumulate counts.
+void Profile::init() {
+    memset(p, 0, size * sizeof(double));
+}
+
+// Add the weight `frac` to the cell (read position, reference base, read base)
+// for every base of readseq aligned at reference offset pos in direction dir.
+void Profile::update(const std::string& readseq, const RefSeq& refseq, int pos, int dir, double frac) {
+    const int len = readseq.size();
+    for (int offset = 0; offset < len; ++offset) {
+        double* row = p[offset][refseq.get_id(offset + pos, dir)];
+        row[get_base_id(readseq[offset])] += frac;
+    }
+}
+
+// Normalize each (position, reference base) row so it sums to 1. Rows whose
+// total is below EPSILON are set to all zeros instead.
+void Profile::finish() {
+    for (int i = 0; i < proLen; i++) {
+        for (int j = 0; j < NCODES; j++) {
+            double* row = p[i][j];
+
+            double total = 0.0;
+            for (int k = 0; k < NCODES; k++) total += row[k];
+
+            const bool empty = total < EPSILON;
+            for (int k = 0; k < NCODES; k++) {
+                if (empty) row[k] = 0.0;
+                else row[k] /= total;
+            }
+        }
+    }
+}
+
+// Product over all read positions of p[position][reference base][read base]
+// for readseq aligned at reference offset pos in direction dir.
+double Profile::getProb(const std::string& readseq, const RefSeq& refseq, int pos, int dir) {
+    const int len = readseq.size();
+    double prob = 1.0;
+
+    for (int offset = 0; offset < len; ++offset) {
+        const double* row = p[offset][refseq.get_id(offset + pos, dir)];
+        prob *= row[get_base_id(readseq[offset])];
+    }
+
+    return prob;
+}
+
+// Element-wise add the entries of `o` to this profile, over this profile's
+// proLen positions.
+void Profile::collect(const Profile& o) {
+    for (int i = 0; i < proLen; i++) {
+        for (int j = 0; j < NCODES; j++) {
+            double* dst = p[i][j];
+            const double* src = o.p[i][j];
+            for (int k = 0; k < NCODES; k++) dst[k] += src[k];
+        }
+    }
+}
+
+// Load the profile from fi: a header "<proLen> <NCODES>" followed by
+// proLen * NCODES * NCODES values. The storage is reallocated when the stored
+// length differs from the current one.
+void Profile::read(FILE *fi) {
+    int tmp_prolen, tmp_ncodes;
+
+    // Keep fscanf outside assert(): with NDEBUG the assert argument is not
+    // evaluated and nothing would be read.
+    int nread = fscanf(fi, "%d %d", &tmp_prolen, &tmp_ncodes);
+    assert(nread == 2);
+    assert(tmp_ncodes == NCODES);
+    if (tmp_prolen != proLen) {
+        delete[] p;
+        proLen = tmp_prolen;
+        size = proLen * NCODES * NCODES;
+        p = new double[proLen][NCODES][NCODES];
+        memset(p, 0, sizeof(double) * size);
+    }
+
+    for (int i = 0; i < proLen; i++)
+        for (int j = 0; j < NCODES; j++)
+            for (int k = 0; k < NCODES; k++) {
+                nread = fscanf(fi, "%lf", &p[i][j][k]);
+                assert(nread == 1);
+            }
+    (void)nread; // only inspected by assert()
+}
+
+// Write the profile to fo: a header "<proLen> <NCODES>", then one line of
+// NCODES values per (position, reference base) row, with a blank line between
+// consecutive positions.
+void Profile::write(FILE* fo) {
+    fprintf(fo, "%d %d\n", proLen, NCODES);
+    for (int i = 0; i < proLen; i++) {
+        for (int j = 0; j < NCODES; j++) {
+            const double* row = p[i][j];
+            for (int k = 0; k < NCODES; k++) {
+                if (k < NCODES - 1) fprintf(fo, "%.10g ", row[k]);
+                else fprintf(fo, "%.10g\n", row[k]);
+            }
+        }
+        if (i < proLen - 1) { fprintf(fo, "\n"); }
+    }
+}
+
+// Allocate pc and fill it with the cumulative sums (over the read base index k)
+// of every row of p, for use as sampling tables in simulate().
+// Rows whose total is zero would be unusable for sampling, so they are
+// replaced by values derived from the other rows at the same position.
+void Profile::startSimulation() {
+ pc = new double[proLen][NCODES][NCODES];
+ for (int i = 0; i < proLen; i++) {
+ // pc[i][j][k] = p[i][j][0] + ... + p[i][j][k]
+ for (int j = 0; j < NCODES; j++)
+ for (int k = 0; k < NCODES; k++) {
+ pc[i][j][k] = p[i][j][k];
+ if (k > 0) pc[i][j][k] += pc[i][j][k - 1];
+ }
+ //avoid sampling from 0.0!!!
+ // Over the first NCODES - 1 reference codes at this position:
+ // cp_sum = total mass, cp_d = mass on the diagonal (read code == reference code),
+ // cp_n = mass on the last read code
+ double cp_sum, cp_d, cp_n;
+ cp_sum = cp_d = cp_n = 0.0;
+
+ for (int j = 0; j < NCODES - 1; j++) {
+ cp_sum += pc[i][j][NCODES - 1];
+ cp_d += p[i][j][j];
+ cp_n += p[i][j][NCODES - 1];
+ }
+
+ // nothing observed at this position: leave all rows as they are
+ if (cp_sum == 0.0) continue;
+
+ double p_d, p_o, p_n;
+ p_d = cp_d / cp_sum;
+ p_n = cp_n / cp_sum;
+ p_o = (1.0 - p_d - p_n) / (NCODES - 2);
+ // rebuild only the rows (first NCODES - 1 reference codes) that have zero total mass
+ for (int j = 0; j < NCODES - 1; j++) {
+ if (pc[i][j][NCODES - 1] > 0.0) continue;
+ for (int k = 0; k < NCODES; k++) {
+ if (k == j) pc[i][j][k] = p_d;
+ else if (k == NCODES - 1) pc[i][j][k] = p_n;
+ else pc[i][j][k] = p_o;
+ if (k > 0) pc[i][j][k] += pc[i][j][k - 1];
+ }
+ }
+ // same for the row of the last reference code: p_o for each of the first
+ // NCODES - 1 read codes and p_n for the last one
+ if (pc[i][NCODES - 1][NCODES - 1] == 0.0) {
+ p_o = (1.0 - p_n) / (NCODES - 1);
+ for (int k = 0; k < NCODES; k++) {
+ pc[i][NCODES - 1][k] = (k < NCODES - 1 ? p_o : p_n);
+ if (k > 0) pc[i][NCODES - 1][k] += pc[i][NCODES - 1][k - 1];
+ }
+ }
+ }
+
+}
+
+// Sample a read sequence of `len` bases for a read placed at reference offset
+// pos in direction dir, drawing each base from the cumulative table pc built
+// by startSimulation().
+std::string Profile::simulate(simul* sampler, int len, int pos, int dir, const RefSeq& refseq) {
+    std::string readseq;
+
+    for (int offset = 0; offset < len; ++offset) {
+        double* cdf = pc[offset][refseq.get_id(offset + pos, dir)];
+        readseq.push_back(getCharacter(sampler->sample(cdf, NCODES)));
+    }
+
+    return readseq;
+}
+
+// Release the cumulative table allocated by startSimulation().
+void Profile::finishSimulation() {
+    delete[] pc;
+    pc = NULL; // avoid a dangling pointer / double delete if called again
+}
+
+#endif /* PROFILE_H_ */
diff --git a/QProfile.h b/QProfile.h
new file mode 100644
index 0000000..6646342
--- /dev/null
+++ b/QProfile.h
@@ -0,0 +1,208 @@
+#ifndef QPROFILE_H_
+#define QPROFILE_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cassert>
+
+#include "utils.h"
+#include "RefSeq.h"
+#include "simul.h"
+
+
+// Quality-score-dependent base profile: p[q][r][c] is indexed by quality
+// score q, reference base code r and read base code c.
+class QProfile {
+public:
+ QProfile();
+ QProfile& operator=(const QProfile&);
+
+ // init() zeroes p, update() accumulates weights, finish() normalizes rows.
+ void init();
+ void update(const std::string&, const std::string&, const RefSeq&, int, int, double);
+ void finish();
+
+ double getProb(const std::string&, const std::string&, const RefSeq&, int, int);
+
+ // element-wise add another profile's entries to this one
+ void collect(const QProfile&);
+
+ void read(FILE*);
+ void write(FILE*);
+
+ // startSimulation() allocates pc, finishSimulation() releases it.
+ void startSimulation();
+ std::string simulate(simul*, int, int, int, const std::string&, const RefSeq&);
+ void finishSimulation();
+
+private:
+ static const int NCODES = 5; // number of possible codes
+ static const int SIZE = 100; // first dimension of p (quality score index)
+
+ double p[SIZE][NCODES][NCODES]; // p[q][r][c] = p(c|r,q)
+
+ //make sure that quality score in [0, 93]
+ // converts a quality character (ASCII 33..126) to its score by subtracting 33
+ int c2q(char c) { assert(c >= 33 && c <= 126); return c - 33; }
+
+ double (*pc)[NCODES][NCODES]; // for simulation
+};
+
+QProfile::QProfile() {
+ memset(p, 0, sizeof(p));
+
+ //make initialized parameters
+ //ASSUME order of A, C, G, T, N
+ int N = NCODES - 1;
+ double probN = 1e-5;
+ double probC, probO; // current, other
+
+ for (int i = 0; i < SIZE; i++) {
+ for (int j = 0; j < NCODES - 1; j++) {
+ p[i][j][N] = probN;
+
+ probO = exp(-i / 10.0 * log(10.0));
+ probC = 1.0 - probO;
+ probO /= (NCODES - 2);
+
+ probC *= (1.0 - probN);
+ probO *= (1.0 - probN);
+
+ assert(probC >= 0.0 && probO >= 0.0);
+
+ for (int k = 0; k < NCODES - 1; k++) {
+ if (j == k) p[i][j][k] = probC;
+ else p[i][j][k] = probO;
+ }
+ }
+ p[i][N][N] = probN;
+ for (int k = 0; k < NCODES - 1; k++)
+ p[i][N][k] = (1.0 - probN) / (NCODES - 1);
+ }
+}
+
+// Copy the probability table from rv; the simulation table pc is not touched.
+QProfile& QProfile::operator=(const QProfile& rv) {
+    if (this != &rv) {
+        memcpy(p, rv.p, sizeof(rv.p));
+    }
+    return *this;
+}
+
+// Zero every entry of the profile so that update() can accumulate counts.
+void QProfile::init() {
+    memset(p, 0, sizeof p);
+}
+
+// Add the weight `frac` to the cell (quality score, reference base, read base)
+// for every base of readseq aligned at reference offset pos in direction dir.
+void QProfile::update(const std::string& readseq, const std::string& qual, const RefSeq& refseq, int pos, int dir, double frac) {
+    const int len = readseq.size();
+    for (int offset = 0; offset < len; ++offset) {
+        double* row = p[c2q(qual[offset])][refseq.get_id(offset + pos, dir)];
+        row[get_base_id(readseq[offset])] += frac;
+    }
+}
+
+// Normalize each (quality score, reference base) row so it sums to 1. Rows
+// whose total is below EPSILON are set to all zeros instead.
+void QProfile::finish() {
+    for (int i = 0; i < SIZE; i++) {
+        for (int j = 0; j < NCODES; j++) {
+            double* row = p[i][j];
+
+            double total = 0.0;
+            for (int k = 0; k < NCODES; k++) total += row[k];
+
+            const bool empty = total < EPSILON;
+            for (int k = 0; k < NCODES; k++) {
+                if (empty) row[k] = 0.0;
+                else row[k] /= total;
+            }
+        }
+    }
+}
+
+// Product over all read positions of p[quality score][reference base][read base]
+// for readseq aligned at reference offset pos in direction dir.
+double QProfile::getProb(const std::string& readseq, const std::string& qual, const RefSeq& refseq, int pos, int dir) {
+    const int len = readseq.size();
+    double prob = 1.0;
+
+    for (int offset = 0; offset < len; ++offset) {
+        const double* row = p[c2q(qual[offset])][refseq.get_id(offset + pos, dir)];
+        prob *= row[get_base_id(readseq[offset])];
+    }
+
+    return prob;
+}
+
+// Element-wise add the entries of `o` to this profile.
+void QProfile::collect(const QProfile& o) {
+    for (int i = 0; i < SIZE; i++) {
+        for (int j = 0; j < NCODES; j++) {
+            double* dst = p[i][j];
+            const double* src = o.p[i][j];
+            for (int k = 0; k < NCODES; k++) dst[k] += src[k];
+        }
+    }
+}
+
+// Load the profile from fi: a header "<SIZE> <NCODES>" that must match the
+// compiled-in dimensions, followed by SIZE * NCODES * NCODES values.
+void QProfile::read(FILE *fi) {
+    int tmp_size, tmp_ncodes;
+
+    // Keep fscanf outside assert(): with NDEBUG the assert argument is not
+    // evaluated and nothing would be read.
+    int nread = fscanf(fi, "%d %d", &tmp_size, &tmp_ncodes);
+    assert(nread == 2);
+    assert(tmp_size == SIZE && tmp_ncodes == NCODES);
+    for (int i = 0; i < SIZE; i++)
+        for (int j = 0; j < NCODES; j++)
+            for (int k = 0; k < NCODES; k++) {
+                nread = fscanf(fi, "%lf", &p[i][j][k]);
+                assert(nread == 1);
+            }
+    (void)nread; // only inspected by assert()
+}
+
+// Write the profile to fo: a header "<SIZE> <NCODES>", then one line of
+// NCODES values per (quality score, reference base) row, with a blank line
+// between consecutive quality scores.
+void QProfile::write(FILE *fo) {
+    fprintf(fo, "%d %d\n", SIZE, NCODES);
+    for (int i = 0; i < SIZE; i++) {
+        for (int j = 0; j < NCODES; j++) {
+            const double* row = p[i][j];
+            for (int k = 0; k < NCODES; k++) {
+                if (k < NCODES - 1) fprintf(fo, "%.10g ", row[k]);
+                else fprintf(fo, "%.10g\n", row[k]);
+            }
+        }
+        if (i < SIZE - 1) { fprintf(fo, "\n"); }
+    }
+}
+
+// Allocate pc and fill it with the cumulative sums (over the read base index k)
+// of every row of p, for use as sampling tables in simulate().
+// Rows whose total is zero would be unusable for sampling, so they are
+// replaced by values derived from the other rows of the same quality score.
+void QProfile::startSimulation() {
+ pc = new double[SIZE][NCODES][NCODES];
+ for (int i = 0; i < SIZE; i++) {
+ // pc[i][j][k] = p[i][j][0] + ... + p[i][j][k]
+ for (int j = 0; j < NCODES; j++)
+ for (int k = 0; k < NCODES; k++) {
+ pc[i][j][k] = p[i][j][k];
+ if (k > 0) pc[i][j][k] += pc[i][j][k - 1];
+ }
+
+ //avoid sampling from 0.0!!!
+ // Over the first NCODES - 1 reference codes for this quality score:
+ // cp_sum = total mass, cp_d = mass on the diagonal (read code == reference code),
+ // cp_n = mass on the last read code
+ double cp_sum, cp_d, cp_n;
+ cp_sum = cp_d = cp_n = 0.0;
+
+ for (int j = 0; j < NCODES - 1; j++) {
+ cp_sum += pc[i][j][NCODES - 1];
+ cp_d += p[i][j][j];
+ cp_n += p[i][j][NCODES - 1];
+ }
+
+ // nothing observed for this quality score: leave all rows as they are
+ if (cp_sum == 0.0) continue;
+
+ double p_d, p_o, p_n;
+ p_d = cp_d / cp_sum;
+ p_n = cp_n / cp_sum;
+ p_o = (1.0 - p_d - p_n) / (NCODES - 2);
+ // rebuild only the rows (first NCODES - 1 reference codes) that have zero total mass
+ for (int j = 0; j < NCODES - 1; j++) {
+ if (pc[i][j][NCODES - 1] > 0.0) continue;
+ for (int k = 0; k < NCODES; k++) {
+ if (k == j) pc[i][j][k] = p_d;
+ else if (k == NCODES - 1) pc[i][j][k] = p_n;
+ else pc[i][j][k] = p_o;
+ if (k > 0) pc[i][j][k] += pc[i][j][k - 1];
+ }
+ }
+ // same for the row of the last reference code: p_o for each of the first
+ // NCODES - 1 read codes and p_n for the last one
+ if (pc[i][NCODES - 1][NCODES - 1] == 0.0) {
+ p_o = (1.0 - p_n) / (NCODES - 1);
+ for (int k = 0; k < NCODES; k++) {
+ pc[i][NCODES - 1][k] = (k < NCODES - 1 ? p_o : p_n);
+ if (k > 0) pc[i][NCODES - 1][k] += pc[i][NCODES - 1][k - 1];
+ }
+ }
+ }
+}
+
+// Sample a read sequence of `len` bases for a read placed at reference offset
+// pos in direction dir, drawing each base from the cumulative table pc row
+// selected by the quality score at that position and the reference base.
+std::string QProfile::simulate(simul* sampler, int len, int pos, int dir, const std::string& qual, const RefSeq& refseq) {
+    std::string readseq;
+
+    for (int offset = 0; offset < len; ++offset) {
+        double* cdf = pc[c2q(qual[offset])][refseq.get_id(offset + pos, dir)];
+        readseq.push_back(getCharacter(sampler->sample(cdf, NCODES)));
+    }
+
+    return readseq;
+}
+
+// Release the cumulative table allocated by startSimulation().
+void QProfile::finishSimulation() {
+    delete[] pc;
+    pc = NULL; // avoid a dangling pointer / double delete if called again
+}
+
+#endif /* QPROFILE_H_ */
diff --git a/QualDist.h b/QualDist.h
new file mode 100644
index 0000000..5607124
--- /dev/null
+++ b/QualDist.h
@@ -0,0 +1,151 @@
+#ifndef QUALDIST_H_
+#define QUALDIST_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cassert>
+#include<string>
+
+#include "simul.h"
+
+//from 33 to 126 to encode 0 to 93
+// First-order Markov model over quality scores: p_init is the distribution of
+// the first score of a read and p_tran[a][b] the probability of score b
+// following score a.
+class QualDist {
+public:
+    QualDist() : qc_init(NULL), qc_trans(NULL) { // simulation tables exist only between start/finishSimulation()
+        memset(p_init, 0, sizeof(p_init));
+        memset(p_tran, 0, sizeof(p_tran));
+    }
+
+    QualDist& operator=(const QualDist&);
+
+    // update() accumulates counts from one quality string, finish() normalizes them.
+    void update(const std::string&);
+    void finish();
+
+    double getProb(const std::string&);
+
+    void read(FILE*);
+    void write(FILE*);
+
+    void startSimulation();
+    std::string simulate(simul*, int);
+    void finishSimulation();
+
+private:
+    static const int SIZE = 100;
+
+    double p_init[SIZE];
+    double p_tran[SIZE][SIZE]; //p_tran[a][b] = p(b|a)
+
+    // quality character (ASCII 33..126) -> score
+    int c2q(char c) { assert(c >= 33 && c <= 126); return c - 33; }
+
+    double *qc_init, (*qc_trans)[SIZE]; // cumulative tables for simulation
+    // score -> quality character
+    char q2c(int qval) { return (char)(qval + 33); }
+};
+
+// Copy both probability tables from rv; the simulation tables are not touched.
+QualDist& QualDist::operator=(const QualDist& rv) {
+    if (this != &rv) {
+        memcpy(p_init, rv.p_init, sizeof(rv.p_init));
+        memcpy(p_tran, rv.p_tran, sizeof(rv.p_tran));
+    }
+    return *this;
+}
+
+// Count one quality string: its first score in p_init and every consecutive
+// pair of scores in p_tran. The string must not be empty.
+void QualDist::update(const std::string& qual) {
+    const int len = qual.size();
+    assert(len > 0);
+
+    int prev = c2q(qual[0]);
+    ++p_init[prev];
+
+    for (int i = 1; i < len; i++) {
+        const int cur = c2q(qual[i]);
+        ++p_tran[prev][cur];
+        prev = cur;
+    }
+}
+
+// Turn the accumulated counts into probabilities: p_init and every row of
+// p_tran are divided by their sum. A table or row without any counts is left
+// as all zeros.
+void QualDist::finish() {
+    double sum;
+
+    sum = 0.0;
+    for (int i = 0; i < SIZE; i++) sum += p_init[i];
+    // Guard against an empty table (no read was counted): dividing by zero
+    // would fill p_init with NaN. The transition rows below are guarded the same way.
+    if (sum > 0.0) {
+        for (int i = 0; i < SIZE; i++) p_init[i] /= sum;
+    }
+
+    for (int i = 0; i < SIZE; i++) {
+        sum = 0.0;
+        for (int j = 0; j < SIZE; j++) sum += p_tran[i][j];
+        if (sum <= 0.0) continue;
+        //if (isZero(sum)) continue;
+        for (int j = 0; j < SIZE; j++) p_tran[i][j] /= sum;
+    }
+}
+
+// Probability of a quality string under the model: p_init of its first score
+// times p_tran of every consecutive pair. The string must not be empty.
+double QualDist::getProb(const std::string& qual) {
+    const int len = qual.size();
+    assert(len > 0);
+
+    int prev = c2q(qual[0]);
+    double prob = 1.0;
+    prob *= p_init[prev];
+
+    for (int i = 1; i < len; i++) {
+        const int cur = c2q(qual[i]);
+        prob *= p_tran[prev][cur];
+        prev = cur;
+    }
+
+    return prob;
+}
+
+// Load the model from fi: the table size (must equal SIZE), SIZE initial
+// probabilities and SIZE * SIZE transition probabilities.
+void QualDist::read(FILE *fi) {
+    int tmp_size;
+
+    // Keep fscanf outside assert(): with NDEBUG the assert argument is not
+    // evaluated and nothing would be read.
+    int nread = fscanf(fi, "%d", &tmp_size);
+    assert(nread == 1);
+    assert(tmp_size == SIZE);
+
+    for (int i = 0; i < SIZE; i++) {
+        nread = fscanf(fi, "%lf", &p_init[i]);
+        assert(nread == 1);
+    }
+    for (int i = 0; i < SIZE; i++) {
+        for (int j = 0; j < SIZE; j++) {
+            nread = fscanf(fi, "%lf", &p_tran[i][j]);
+            assert(nread == 1);
+        }
+    }
+    (void)nread; // only inspected by assert()
+}
+
+// Write the model to fo: SIZE on its own line, the SIZE initial probabilities
+// on one line, then one line of SIZE transition probabilities per row.
+void QualDist::write(FILE *fo) {
+    fprintf(fo, "%d\n", SIZE);
+
+    for (int i = 0; i < SIZE; i++) {
+        if (i < SIZE - 1) fprintf(fo, "%.10g ", p_init[i]);
+        else fprintf(fo, "%.10g\n", p_init[i]);
+    }
+
+    for (int i = 0; i < SIZE; i++) {
+        const double* row = p_tran[i];
+        for (int j = 0; j < SIZE; j++) {
+            if (j < SIZE - 1) fprintf(fo, "%.10g ", row[j]);
+            else fprintf(fo, "%.10g\n", row[j]);
+        }
+    }
+}
+
+// Allocate the simulation tables and fill them with cumulative sums: qc_init
+// from p_init and each row of qc_trans from the matching row of p_tran.
+void QualDist::startSimulation() {
+    qc_init = new double[SIZE];
+    qc_trans = new double[SIZE][SIZE];
+
+    for (int i = 0; i < SIZE; i++) {
+        qc_init[i] = (i == 0) ? p_init[i] : p_init[i] + qc_init[i - 1];
+    }
+
+    for (int i = 0; i < SIZE; i++) {
+        const double* src = p_tran[i];
+        double* dst = qc_trans[i];
+        for (int j = 0; j < SIZE; j++) {
+            dst[j] = (j == 0) ? src[j] : src[j] + dst[j - 1];
+        }
+    }
+}
+
+// Sample a quality string: the first score from qc_init, every following score
+// from the qc_trans row of the previous score. One character is always
+// produced, then one more for each i in [1, len).
+std::string QualDist::simulate(simul* sampler, int len) {
+    std::string qual;
+
+    int qval = sampler->sample(qc_init, SIZE);
+    qual.push_back(q2c(qval));
+
+    for (int i = 1; i < len; i++) {
+        qval = sampler->sample(qc_trans[qval], SIZE);
+        qual.push_back(q2c(qval));
+    }
+
+    return qual;
+}
+
+// Release the cumulative tables allocated by startSimulation().
+void QualDist::finishSimulation() {
+    delete[] qc_init;
+    delete[] qc_trans;
+    // avoid dangling pointers / double delete if called again
+    qc_init = NULL;
+    qc_trans = NULL;
+}
+#endif /* QUALDIST_H_ */
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8ad8152
--- /dev/null
+++ b/README.md
@@ -0,0 +1,703 @@
+README for RSEM
+===============
+
+[Bo Li](https://lilab-bcb.github.io/) \(bli28 at mgh dot harvard dot edu\)
+
+* * *
+
+Table of Contents
+-----------------
+
+* [Introduction](#introduction)
+* [Compilation & Installation](#compilation)
+* [Usage](#usage)
+ * [Build RSEM references using RefSeq, Ensembl, or GENCODE annotations](#built)
+ * [Build RSEM references for untypical organisms](#untypical)
+* [Example](#example-main)
+* [Simulation](#simulation)
+* [Generate Transcript-to-Gene-Map from Trinity Output](#gen_trinity)
+* [Differential Expression Analysis](#de)
+* [Prior-Enhanced RSEM (pRSEM)](#pRSEM)
+* [Authors](#authors)
+* [Acknowledgements](#acknowledgements)
+* [License](#license)
+
+* * *
+
+## <a name="introduction"></a> Introduction
+
+RSEM is a software package for estimating gene and isoform expression
+levels from RNA-Seq data. The RSEM package provides a user-friendly
+interface, supports threads for parallel computation of the EM
+algorithm, single-end and paired-end read data, quality scores,
+variable-length reads and RSPD estimation. In addition, it provides
+posterior mean and 95% credibility interval estimates for expression
+levels. For visualization, it can generate BAM and Wiggle files in
+both transcript-coordinate and genomic-coordinate. Genomic-coordinate
+files can be visualized by both UCSC Genome browser and Broad
+Institute's Integrative Genomics Viewer (IGV). Transcript-coordinate
+files can be visualized by IGV. RSEM also has its own scripts to
+generate transcript read depth plots in pdf format. A unique feature
+of RSEM is that the read depth plots can be stacked, with read depth
+contributed by unique reads shown in black and read depth contributed by
+multi-reads shown in red. In addition, models learned from data can
+also be visualized. Last but not least, RSEM contains a simulator.
+
+## <a name="compilation"></a> Compilation & Installation
+
+To compile RSEM, simply run
+
+ make
+
+For Cygwin users, run
+
+ make cygwin=true
+
+To compile EBSeq, which is included in the RSEM package, run
+
+ make ebseq
+
+To install RSEM, simply put the RSEM directory in your environment's PATH
+variable. Alternatively, run
+
+ make install
+
+By default, RSEM executables are installed to `/usr/local/bin`. You
+can change the installation location by setting `DESTDIR` and/or
+`prefix` variables. The RSEM executables will be installed to
+`${DESTDIR}${prefix}/bin`. The default values of `DESTDIR` and
+`prefix` are `DESTDIR=` and `prefix=/usr/local`. For example,
+
+ make install DESTDIR=/home/my_name prefix=/software
+
+will install RSEM executables to `/home/my_name/software/bin`.
+
+**Note** that `make install` does not install `EBSeq` related scripts,
+such as `rsem-generate-ngvector`, `rsem-run-ebseq`, and
+`rsem-control-fdr`. But `rsem-generate-data-matrix`, which generates
+count matrix for differential expression analysis, is installed.
+
+### Prerequisites
+
+C++, Perl and R are required to be installed.
+
+To use the `--gff3` option of `rsem-prepare-reference`, Python is also
+required to be installed.
+
+To take advantage of RSEM's built-in support for the Bowtie/Bowtie
+2/STAR/HISAT2 alignment program, you must have
+[Bowtie](http://bowtie-bio.sourceforge.net)/[Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2)/[STAR](https://github.com/alexdobin/STAR)/[HISAT2](https://ccb.jhu.edu/software/hisat2/manual.shtml)
+installed.
+
+## <a name="usage"></a> Usage
+
+### I. Preparing Reference Sequences
+
+RSEM can extract reference transcripts from a genome if you provide it
+with gene annotations in a GTF/GFF3 file. Alternatively, you can provide
+RSEM with transcript sequences directly.
+
+Please note that GTF files generated from the UCSC Table Browser do not
+contain isoform-gene relationship information. However, if you use the
+UCSC Genes annotation track, this information can be recovered by
+downloading the knownIsoforms.txt file for the appropriate genome.
+
+To prepare the reference sequences, you should run the
+`rsem-prepare-reference` program. Run
+
+ rsem-prepare-reference --help
+
+to get usage information or visit the [rsem-prepare-reference
+documentation page](rsem-prepare-reference.html).
+
+#### <a name="built"></a> Build RSEM references using RefSeq, Ensembl, or GENCODE annotations
+
+RefSeq and Ensembl are two frequently used annotations. For human and
+mouse, GENCODE annotations are also available. In this section, we show
+how to build RSEM references using these annotations. Note that it is
+important to pair the genome with the annotation file for each
+annotation source. In addition, we recommend users to use the primary
+assemblies of genomes. Without loss of generality, we use human genome as
+an example and in addition build Bowtie indices.
+
+For **RefSeq**, the genome and annotation file in GFF3 format can be found
+at RefSeq genomes FTP:
+
+```
+ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/
+```
+
+For example, the human genome and GFF3 files are located in the subdirectory
+`vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.31_GRCh38.p5`. `GCF_000001405.31_GRCh38.p5`
+was the latest annotation version when this section was written.
+
+Download and decompress the genome and annotation files to your working directory:
+
+```
+ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.31_GRCh38.p5/GCF_000001405.31_GRCh38.p5_genomic.fna.gz
+ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.31_GRCh38.p5/GCF_000001405.31_GRCh38.p5_genomic.gff.gz
+```
+
+`GCF_000001405.31_GRCh38.p5_genomic.fna` contains all top level
+sequences, including patches and haplotypes. To obtain the primary
+assembly, run the following RSEM python script:
+
+```
+rsem-refseq-extract-primary-assembly GCF_000001405.31_GRCh38.p5_genomic.fna GCF_000001405.31_GRCh38.p5_genomic.primary_assembly.fna
+```
+
+Then type the following command to build RSEM references:
+
+```
+rsem-prepare-reference --gff3 GCF_000001405.31_GRCh38.p5_genomic.gff \
+ --trusted-sources BestRefSeq,Curated\ Genomic \
+ --bowtie \
+ GCF_000001405.31_GRCh38.p5_genomic.primary_assembly.fna \
+ ref/human_refseq
+```
+
+In the above command, `--trusted-sources` tells RSEM to only extract
+transcripts from RefSeq sources like `BestRefSeq` or `Curated Genomic`. By
+default, RSEM trusts all sources. There is also an
+`--gff3-RNA-patterns` option and its default is `mRNA`. Setting
+`--gff3-RNA-patterns mRNA,rRNA` will allow RSEM to extract all mRNAs
+and rRNAs from the genome. Visit [here](rsem-prepare-reference.html)
+for more details.
+
+Because the gene and transcript IDs (e.g. gene1000, rna28655)
+extracted from RefSeq GFF3 files are hard to understand, it is
+recommended to turn on the `--append-names` option in
+`rsem-calculate-expression` for better interpretation of
+quantification results.
+
+For **Ensembl**, the genome and annotation files can be found at
+[Ensembl FTP](http://uswest.ensembl.org/info/data/ftp/index.html).
+
+Download and decompress the human genome and GTF files:
+
+```
+ftp://ftp.ensembl.org/pub/release-83/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
+ftp://ftp.ensembl.org/pub/release-83/gtf/homo_sapiens/Homo_sapiens.GRCh38.83.gtf.gz
+```
+
+Then use the following command to build RSEM references:
+
+```
+rsem-prepare-reference --gtf Homo_sapiens.GRCh38.83.gtf \
+ --bowtie \
+ Homo_sapiens.GRCh38.dna.primary_assembly.fa \
+ ref/human_ensembl
+```
+
+If you want to use GFF3 file instead, which is unnecessary and not
+recommended, you should add option `--gff3-RNA-patterns transcript`
+because `mRNA` is replaced by `transcript` in Ensembl GFF3 files.
+
+**GENCODE** only provides human and mouse annotations. The genome and
+ annotation files can be found from [GENCODE
+ website](http://www.gencodegenes.org/).
+
+Download and decompress the human genome and GTF files:
+
+```
+ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/GRCh38.primary_assembly.genome.fa.gz
+ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.annotation.gtf.gz
+```
+
+Then type the following command:
+
+```
+rsem-prepare-reference --gtf gencode.v24.annotation.gtf \
+ --bowtie \
+ GRCh38.primary_assembly.genome.fa \
+ ref/human_gencode
+```
+
+Similar to Ensembl annotation, if you want to use GFF3 files (not
+recommended), add option `--gff3-RNA-patterns transcript`.
+
+#### <a name="untypical"></a> Build RSEM references for untypical organisms
+
+For untypical organisms, such as viruses, you may only have a GFF3 file that contains only genes and no transcripts. In that case, turn on `--gff3-genes-as-transcripts` so that RSEM will treat each gene as a unique transcript.
+
+Here is an example command:
+
+```
+rsem-prepare-reference --gff3 virus.gff \
+ --gff3-genes-as-transcripts \
+ --bowtie \
+ virus.genome.fa \
+ ref/virus
+```
+
+### II. Calculating Expression Values
+
+To calculate expression values, you should run the
+`rsem-calculate-expression` program. Run
+
+ rsem-calculate-expression --help
+
+to get usage information or visit the [rsem-calculate-expression
+documentation page](rsem-calculate-expression.html).
+
+#### Calculating expression values from single-end data
+
+For single-end models, users have the option of providing a fragment
+length distribution via the `--fragment-length-mean` and
+`--fragment-length-sd` options. The specification of an accurate fragment
+length distribution is important for the accuracy of expression level
+estimates from single-end data. If the fragment length mean and sd are
+not provided, RSEM will not take a fragment length distribution into
+consideration.
+
+#### Using an alternative aligner
+
+By default, RSEM automates the alignment of reads to reference
+transcripts using the Bowtie aligner. Turning on `--bowtie2` for
+`rsem-prepare-reference` and `rsem-calculate-expression` will allow
+RSEM to use the Bowtie 2 alignment program instead. Please note that
+indel alignments, local alignments and discordant alignments are
+disallowed when RSEM uses Bowtie 2 since RSEM currently cannot handle
+them. See the description of the `--bowtie2` option in
+`rsem-calculate-expression` for more details. Similarly, turning on
+`--star` will allow RSEM to use the STAR aligner. Turning on `--hisat2-hca`
+will allow RSEM to use the HISAT2 aligner according to the Human Cell
+Atlas SMART-Seq2 pipeline. To use an alternative alignment program,
+align the input reads against the file
+`reference_name.idx.fa` generated by `rsem-prepare-reference`, and
+format the alignment output in SAM/BAM/CRAM format. Then, instead of
+providing reads to `rsem-calculate-expression`, specify the
+`--alignments` option and provide the SAM/BAM/CRAM file as an
+argument.
+
+RSEM requires the alignments of a read to be adjacent. For paired-end
+reads, RSEM also requires the two mates of any alignment be
+adjacent. To check if your SAM/BAM/CRAM file satisfies the requirements,
+run
+
+ rsem-sam-validator <input.sam/input.bam/input.cram>
+
+If your file does not satisfy the requirements, you can use
+`convert-sam-for-rsem` to convert it into a BAM file which RSEM can
+process. Run
+
+ convert-sam-for-rsem --help
+
+to get usage information or visit the [convert-sam-for-rsem
+documentation
+page](convert-sam-for-rsem.html).
+
+Note that RSEM does **not** support gapped alignments. So make sure
+that your aligner does not produce alignments with
+insertions/deletions. In addition, you should make sure that you use
+`reference_name.idx.fa`, which is generated by RSEM, to build your
+aligner's indices.
+
+### III. Visualization
+
+RSEM includes a copy of SAMtools. When `--no-bam-output` is not
+specified and `--sort-bam-by-coordinate` is specified, RSEM will
+produce these three files: `sample_name.transcript.bam`, the unsorted
+BAM file, `sample_name.transcript.sorted.bam` and
+`sample_name.transcript.sorted.bam.bai` the sorted BAM file and
+indices generated by the SAMtools included. All three files are in
+transcript coordinates. When users in addition specify the
+`--output-genome-bam` option, RSEM will produce three more files:
+`sample_name.genome.bam`, the unsorted BAM file,
+`sample_name.genome.sorted.bam` and
+`sample_name.genome.sorted.bam.bai` the sorted BAM file and
+indices. All these files are in genomic coordinates.
+
+#### a) Converting transcript BAM file into genome BAM file
+
+Normally, RSEM will do this for you via `--output-genome-bam` option
+of `rsem-calculate-expression`. However, if you have run
+`rsem-prepare-reference` and use `reference_name.idx.fa` to build
+indices for your aligner, you can use `rsem-tbam2gbam` to convert your
+transcript coordinate BAM alignments file into a genomic coordinate
+BAM alignments file without the need to run the whole RSEM
+pipeline.
+
+Usage:
+
+ rsem-tbam2gbam reference_name unsorted_transcript_bam_input genome_bam_output
+
+reference_name : The name of reference built by `rsem-prepare-reference`
+unsorted_transcript_bam_input : This file should satisfy: 1) the alignments of the same read are grouped together, 2) for any paired-end alignment, the two mates should be adjacent to each other, 3) this file should not be sorted by samtools
+genome_bam_output : The output genomic coordinate BAM file's name
+
+#### b) Generating a Wiggle file
+
+A wiggle plot representing the expected number of reads overlapping
+each position in the genome/transcript set can be generated from the
+sorted genome/transcript BAM file output. To generate the wiggle
+plot, run the `rsem-bam2wig` program on the
+`sample_name.genome.sorted.bam`/`sample_name.transcript.sorted.bam` file.
+
+Usage:
+
+ rsem-bam2wig sorted_bam_input wig_output wiggle_name [--no-fractional-weight]
+
+sorted_bam_input : Input BAM format file, must be sorted
+wig_output : Output wiggle file's name, e.g. output.wig
+wiggle_name : The name of this wiggle plot
+--no-fractional-weight : If this is set, RSEM will not look for the "ZW" tag and each alignment appearing in the BAM file has weight 1. Set this if your BAM file was not generated by RSEM. Please note that this option must be at the end of the command line
+
+#### c) Loading a BAM and/or Wiggle file into the UCSC Genome Browser or Integrative Genomics Viewer(IGV)
+
+For UCSC genome browser, please refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html).
+
+For the Integrative Genomics Viewer, please refer to the [IGV home page](http://www.broadinstitute.org/software/igv/home). Note: Although IGV can generate a read depth plot from the given BAM file, it cannot recognize the "ZW" tag that RSEM adds. Therefore IGV counts each alignment with weight 1 instead of the expected weight for the plot it generates. So we recommend using the wiggle file generated by RSEM for read depth visualization.
+
+Here is some guidance for visualizing transcript coordinate files using IGV:
+
+1) Import the transcript sequences as a genome
+
+Select File -> Import Genome, then fill in ID, Name and Fasta file. Fasta file should be `reference_name.idx.fa`. After that, click Save button. Suppose ID is filled as `reference_name`, a file called `reference_name.genome` will be generated. Next time, we can use: File -> Load Genome, then select `reference_name.genome`.
+
+2) Load visualization files
+
+Select File -> Load from File, then choose one transcript coordinate visualization file generated by RSEM. IGV might require you to convert wiggle file to tdf file. You should use igvtools to perform this task. One way to perform the conversion is to use the following command:
+
+ igvtools tile reference_name.transcript.wig reference_name.transcript.tdf reference_name.genome
+
+#### d) Generating Transcript Wiggle Plots
+
+To generate transcript wiggle plots, you should run the
+`rsem-plot-transcript-wiggles` program. Run
+
+ rsem-plot-transcript-wiggles --help
+
+to get usage information or visit the [rsem-plot-transcript-wiggles
+documentation page](rsem-plot-transcript-wiggles.html).
+
+#### e) Visualize the model learned by RSEM
+
+RSEM provides an R script, `rsem-plot-model`, for visualizing the model learned.
+
+Usage:
+
+ rsem-plot-model sample_name output_plot_file
+
+sample_name: the name of the sample analyzed
+output_plot_file: the file name for plots generated from the model. It is a pdf file
+
+The plots generated depend on read type and user configuration. They
+may include fragment length distribution, mate length distribution,
+read start position distribution (RSPD), quality score vs observed
+quality given a reference base, position vs percentage of sequencing
+error given a reference base and alignment statistics.
+
+fragment length distribution and mate length distribution: x-axis is fragment/mate length, y axis is the probability of generating a fragment/mate with the associated length
+
+RSPD: Read Start Position Distribution. x-axis is bin number, y-axis is the probability of each bin. RSPD can be used as an indicator of 3' bias
+
+Quality score vs. observed quality given a reference base: x-axis is Phred quality scores associated with data, y-axis is the "observed quality", Phred quality scores learned by RSEM from the data. Q = -10log_10(P), where Q is Phred quality score and P is the probability of sequencing error for a particular base
+
+Position vs. percentage sequencing error given a reference base: x-axis is position and y-axis is percentage sequencing error
+
+Alignment statistics: It includes a histogram and a pie chart. For the histogram, the x-axis shows the number of **isoform-level** alignments a read has and the y-axis provides the number of reads with that many alignments. The inf on the x-axis denotes the number of reads filtered due to too many alignments. For the pie chart, four categories of reads --- unalignable, unique, **isoform-level** multi-mapping, filtered --- are plotted and their percentages are noted. In both the histogram and the pie chart, numbers belonging to unalignable, unique, multi-mapping, and filtered reads are colored green, blue, gray and red, respectively.
+
+## <a name="example-main"></a> Example
+
+Suppose we download the mouse genome from UCSC Genome Browser. We do
+not add poly(A) tails and use `/ref/mouse_0` as the reference name.
+We have a FASTQ-formatted file, `mmliver.fq`, containing single-end
+reads from one sample, which we call `mmliver_single_quals`. We want
+to estimate expression values by using the single-end model with a
+fragment length distribution. We know that the fragment length
+distribution is approximated by a normal distribution with a mean of
+150 and a standard deviation of 35. We wish to generate 95%
+credibility intervals in addition to maximum likelihood estimates.
+RSEM will be allowed 1G of memory for the credibility interval
+calculation. We will visualize the probabilistic read mappings
+generated by RSEM on UCSC genome browser. We will generate a list of
+transcript wiggle plots (`output.pdf`) for the genes provided in `gene_ids.txt`.
+We will visualize the models learned in
+`mmliver_single_quals.models.pdf`
+
+The commands for this scenario are as follows:
+
+ rsem-prepare-reference --gtf mm9.gtf --transcript-to-gene-map knownIsoforms.txt --bowtie --bowtie-path /sw/bowtie /data/mm9 /ref/mouse_0
+ rsem-calculate-expression --bowtie-path /sw/bowtie --phred64-quals --fragment-length-mean 150.0 --fragment-length-sd 35.0 -p 8 --output-genome-bam --calc-ci --ci-memory 1024 /data/mmliver.fq /ref/mouse_0 mmliver_single_quals
+ rsem-bam2wig mmliver_single_quals.sorted.bam mmliver_single_quals.sorted.wig mmliver_single_quals
+ rsem-plot-transcript-wiggles --gene-list --show-unique mmliver_single_quals gene_ids.txt output.pdf
+ rsem-plot-model mmliver_single_quals mmliver_single_quals.models.pdf
+
+## <a name="simulation"></a> Simulation
+
+RSEM provides users the `rsem-simulate-reads` program to simulate RNA-Seq data based on parameters learned from real data sets. Run
+
+ rsem-simulate-reads
+
+to get usage information or read the following subsections.
+
+### Usage:
+
+ rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]
+
+__reference_name:__ The name of RSEM references, which should be already generated by `rsem-prepare-reference`
+
+__estimated_model_file:__ This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using `rsem-calculate-expression`. The file can be found under the `sample_name.stat` folder with the name of `sample_name.model`. `model_file_description.txt` provides the format and meanings of this file.
+
+__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using `rsem-calculate-expression` from real data. The corresponding file users want to use is `sample_name.isoforms.results`. If simulating from user-designed expression profile is desired, start from a learned `sample_name.isoforms.results` file and only modify the `TPM` column. The simulator only reads the TPM column. But keeping the file format the same is required. If the RSEM references built are aware of allele-specific transcripts, `sample_name.alleles.results` should be used instead.
+
+__theta0:__ This parameter determines the fraction of reads that are coming from background "noise" (instead of from a transcript). It can also be estimated using `rsem-calculate-expression` from real data. Users can find it as the first value of the third line of the file `sample_name.stat/sample_name.theta`.
+
+__N:__ The total number of reads to be simulated. If `rsem-calculate-expression` is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file `sample_name.stat/sample_name.cnt`.
+
+__output_name:__ Prefix for all output files.
+
+__--seed seed:__ Set seed for the random number generator used in simulation. The seed should be a 32-bit unsigned integer.
+
+__-q:__ Setting this option stops the output of intermediate information.
+
+### Outputs:
+
+output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.
+output_name.sim.alleles.results: Allele-specific expression levels estimated by counting where each simulated read comes from.
+
+output_name.fa if single-end without quality score;
+output_name.fq if single-end with quality score;
+output_name_1.fa & output_name_2.fa if paired-end without quality
+score;
+output_name_1.fq & output_name_2.fq if paired-end with quality score.
+
+**Format of the header line**: Each simulated read's header line encodes where it comes from. The header line has the format:
+
+ {>/@}_rid_dir_sid_pos[_insertL]
+
+__{>/@}:__ Either '>' or '@' must appear. '>' appears if FASTA files are generated and '@' appears if FASTQ files are generated
+
+__rid:__ Simulated read's index, numbered from 0
+
+__dir:__ The direction of the simulated read. 0 refers to forward strand ('+') and 1 refers to reverse strand ('-')
+
+__sid:__ Represents which transcript this read is simulated from. It ranges between 0 and M, where M is the total number of transcripts. If sid=0, the read is simulated from the background noise. Otherwise, the read is simulated from a transcript with index sid. Transcript sid's transcript name can be found in the `transcript_id` column of the `sample_name.isoforms.results` file (at line sid + 1, line 1 is for column names)
+
+__pos:__ The start position of the simulated read in strand dir of transcript sid. It is numbered from 0
+
+__insertL:__ Only appears for paired-end reads. It gives the insert length of the simulated read.
+
+### Example:
+
+Suppose we want to simulate 50 million single-end reads with quality scores and use the parameters learned from [Example](#example-main). In addition, we set theta0 as 0.2 and output_name as `simulated_reads`. The command is:
+
+ rsem-simulate-reads /ref/mouse_0 mmliver_single_quals.stat/mmliver_single_quals.model mmliver_single_quals.isoforms.results 0.2 50000000 simulated_reads
+
+## <a name="gen_trinity"></a> Generate Transcript-to-Gene-Map from Trinity Output
+
+For Trinity users, RSEM provides a perl script to generate transcript-to-gene-map file from the fasta file produced by Trinity.
+
+### Usage:
+
+ extract-transcript-to-gene-map-from-trinity trinity_fasta_file map_file
+
+trinity_fasta_file: the fasta file produced by trinity, which contains all transcripts assembled.
+map_file: transcript-to-gene-map file's name.
+
+## <a name="de"></a> Differential Expression Analysis
+
+Popular differential expression (DE) analysis tools such as edgeR and
+DESeq do not take variance due to read mapping uncertainty into
+consideration. Because read mapping ambiguity is prevalent among
+isoforms and de novo assembled transcripts, these tools are not ideal
+for DE detection in such conditions.
+
+EBSeq, an empirical Bayesian DE analysis tool developed in UW-Madison,
+can take variance due to read mapping ambiguity into consideration by
+grouping isoforms with parent gene's number of isoforms. In addition,
+it is more robust to outliers. For more information about EBSeq
+(including the paper describing their method), please visit [EBSeq's
+website](http://www.biostat.wisc.edu/~ningleng/EBSeq_Package).
+
+
+RSEM includes EBSeq in its folder named `EBSeq`. To use it, first type
+
+ make ebseq
+
+to compile the EBSeq related codes.
+
+EBSeq requires gene-isoform relationship for its isoform DE
+detection. However, for de novo assembled transcriptome, it is hard to
+obtain an accurate gene-isoform relationship. Instead, RSEM provides a
+script `rsem-generate-ngvector`, which clusters transcripts based on
+measures directly relating to read mapping ambiguity. First, it
+calculates the 'unmappability' of each transcript. The 'unmappability'
+of a transcript is the ratio between the number of k mers with at
+least one perfect match to other transcripts and the total number of k
+mers of this transcript, where k is a parameter. Then, Ng vector is
+generated by applying Kmeans algorithm to the 'unmappability' values
+with number of clusters set as 3. This program will make sure the mean
+'unmappability' scores for clusters are in ascending order. All
+transcripts whose lengths are less than k are assigned to cluster
+3. Run
+
+ rsem-generate-ngvector --help
+
+to get usage information or visit the [rsem-generate-ngvector
+documentation
+page](rsem-generate-ngvector.html).
+
+If your reference is a de novo assembled transcript set, you should
+run `rsem-generate-ngvector` first. Then load the resulting
+`output_name.ngvec` into R. For example, you can use
+
+ NgVec <- scan(file="output_name.ngvec", what=0, sep="\n")
+
+After that, set "NgVector = NgVec" for your differential expression
+test (either `EBTest` or `EBMultiTest`).
+
+
+For users' convenience, RSEM also provides a script
+`rsem-generate-data-matrix` to extract input matrix from expression
+results:
+
+ rsem-generate-data-matrix sampleA.[genes/isoforms].results sampleB.[genes/isoforms].results ... > output_name.counts.matrix
+
+The results files are required to be either all gene level results or
+all isoform level results. You can load the matrix into R by
+
+ IsoMat <- data.matrix(read.table(file="output_name.counts.matrix"))
+
+before running either `EBTest` or `EBMultiTest`.
+
+Lastly, RSEM provides two scripts, `rsem-run-ebseq` and
+`rsem-control-fdr`, to help users find differentially expressed
+genes/transcripts. First, `rsem-run-ebseq` calls EBSeq to calculate related statistics
+for all genes/transcripts. Run
+
+ rsem-run-ebseq --help
+
+to get usage information or visit the [rsem-run-ebseq documentation
+page](rsem-run-ebseq.html). Second,
+`rsem-control-fdr` takes `rsem-run-ebseq` 's result and reports called
+differentially expressed genes/transcripts by controlling the false
+discovery rate. Run
+
+ rsem-control-fdr --help
+
+to get usage information or visit the [rsem-control-fdr documentation
+page](rsem-control-fdr.html). These
+two scripts can perform DE analysis on either 2 conditions or multiple
+conditions.
+
+Please note that `rsem-run-ebseq` and `rsem-control-fdr` use EBSeq's
+default parameters. For advanced use of EBSeq or information about how
+EBSeq works, please refer to [EBSeq's
+manual](http://www.bioconductor.org/packages/devel/bioc/vignettes/EBSeq/inst/doc/EBSeq_Vignette.pdf).
+
+Questions related to EBSeq should
+be sent to <a href="mailto:nleng@wisc.edu">Ning Leng</a>.
+
+## <a name="pRSEM"></a> Prior-Enhanced RSEM (pRSEM)
+
+### I. Overview
+
+[Prior-enhanced RSEM (pRSEM)](https://deweylab.github.io/pRSEM/) uses complementary information (e.g. ChIP-seq data) to allocate RNA-seq multi-mapping fragments. We included pRSEM code in the subfolder `pRSEM/` as well as in RSEM's scripts `rsem-prepare-reference` and `rsem-calculate-expression`.
+
+### II. Demo
+
+To get a quick idea on how to use pRSEM, you can try [this demo](https://github.com/pliu55/pRSEM_demo). It provides a single script, named `run_pRSEM_demo.sh`, which allows you to run all pRSEM's functions. It also contains detailed descriptions of pRSEM's workflow, input and output files.
+
+### III. Installation
+
+To compile pRSEM, type
+
+ make pRSEM
+
+Note that you need to first compile RSEM before compiling pRSEM. Currently, pRSEM has only been tested on Linux.
+
+
+### IV. Example
+
+To run pRSEM on the [RSEM example above](#example-main), you need to provide:
+- __ChIP-seq sequencing file(s) in FASTQ format__ or __a ChIP-seq peak file in BED format__. They will be used by pRSEM to obtain complementary information for allocating RNA-seq multi-mapping fragments.
+- __a genome mappability file in bigWig format__ to let pRSEM build a training
+ set of isoforms to learn prior. Mappability can be obtained from UCSC's
+ ENCODE composite track for [human hg19](http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign36mer.bigWig)
+ and [mouse mm9](http://hgdownload.cse.ucsc.edu/goldenPath/mm9/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign36mer.bigWig). For other genomes, you
+ can generate the mappability file by following [this tutorial](http://wiki.bits.vib.be/index.php/Create_a_mappability_track#Install_and_run_the_GEM_library_tools).
+
+Assume you would like to use RNA Pol II's ChIP-seq sequencing files `/data/mmliver_PolIIRep1.fq.gz` and `/data/mmliver_PolIIRep2.fq.gz`, with ChIP-seq control `/data/mmliver_ChIPseqCtrl.fq.gz`. Also assume the mappability file for the mouse genome is `/data/mm9.bigWig` and you prefer to use STAR located at `/sw/STAR` to align RNA-seq fragments and Bowtie to align ChIP-seq reads. Then you can use the following commands to run pRSEM:
+
+ rsem-prepare-reference --gtf mm9.gtf \
+ --star \
+ --star-path /sw/STAR \
+ -p 8 \
+ --prep-pRSEM \
+ --bowtie-path /sw/bowtie \
+ --mappability-bigwig-file /data/mm9.bigWig \
+ /data/mm9 \
+ /ref/mouse_0
+
+ rsem-calculate-expression --star \
+ --star-path /sw/STAR \
+ --calc-pme \
+ --run-pRSEM \
+ --chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \
+ --chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \
+ --bowtie-path /sw/bowtie \
+ -p 8 \
+ /data/mmliver.fq \
+ /ref/mouse_0 \
+ mmliver_single_quals
+
+
+To find out more about pRSEM options and examples, you can use the commands:
+
+ rsem-prepare-reference --help
+
+and
+
+ rsem-calculate-expression --help
+
+
+### V. System Requirements
+- Linux
+- Perl version >= 5.8.8
+- Python version >= 2.7.3
+- R version >= 3.3.1
+- Bioconductor 3.3
+
+
+### VI. Required External Packages
+All the following packages will be automatically installed when compiling pRSEM.
+- [data.table 1.9.6](https://cran.r-project.org/web/packages/data.table/index.html): an extension of R's data.frame, heavily used by pRSEM.
+- [GenomicRanges 1.24.3](https://bioconductor.org/packages/release/bioc/html/GenomicRanges.html): efficiently representing and manipulating genomic intervals, heavily used by pRSEM.
+- [ShortRead 1.30.0](https://bioconductor.org/packages/release/bioc/html/ShortRead.html): guessing the encoding of ChIP-seq FASTQ file's quality score.
+- [caTools 1.17.1](https://cran.r-project.org/web/packages/caTools/index.html): used for SPP Peak Caller.
+- [SPP Peak Caller](https://code.google.com/archive/p/phantompeakqualtools/):
+ ChIP-seq peak caller. Source code was slightly modified in terms of included headers in order to be compiled under R v3.3.1.
+- [IDR](https://sites.google.com/site/anshulkundaje/projects/idr/idrCode.tar.gz?attredirects=0):
+ calculating Irreproducible Discovery Rate to call peaks from multiple ChIP-seq replicates.
+
+
+## <a name="authors"></a> Authors
+
+[Bo Li](http://bli25ucb.github.io/) and [Colin Dewey](https://www.biostat.wisc.edu/~cdewey/) designed the RSEM algorithm. [Bo Li](http://bli25ucb.github.io/) implemented the RSEM software. [Peng Liu](https://www.biostat.wisc.edu/~cdewey/group.html) contributed the STAR aligner options and prior-enhanced RSEM (pRSEM).
+
+## <a name="acknowledgements"></a> Acknowledgements
+
+RSEM uses the [Boost C++](http://www.boost.org/) and
+[SAMtools](http://www.htslib.org/) libraries. RSEM includes
+[EBSeq](http://www.biostat.wisc.edu/~ningleng/EBSeq_Package/) for
+differential expression analysis.
+
+We thank earonesty, Dr. Samuel Arvidsson, John Marshall, and Michael
+R. Crusoe for contributing patches.
+
+We thank Han Lin, j.miller, Jo&euml;l Fillon, Dr. Samuel G. Younkin,
+Malcolm Cook, Christina Wells, Uro&#353; &#352;ipeti&#263;,
+outpaddling, rekado, and Josh Richer for suggesting possible fixes.
+
+**Note** that `bam_sort.c` of SAMtools is slightly modified so that
+ `samtools sort -n` will not move the two mates of paired-end
+ alignments apart. In addition, we turn on the `--without-curses`
+ option when configuring SAMtools and thus SAMtools' curses-based
+ `tview` subcommand is not built.
+
+## <a name="license"></a> License
+
+RSEM is licensed under the [GNU General Public License
+v3](http://www.gnu.org/licenses/gpl-3.0.html).
diff --git a/RSPD.h b/RSPD.h
new file mode 100644
index 0000000..8c484a5
--- /dev/null
+++ b/RSPD.h
@@ -0,0 +1,206 @@
+#ifndef RSPD_H_
+#define RSPD_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cassert>
+
+#include "utils.h"
+#include "RefSeq.h"
+#include "Refs.h"
+#include "simul.h"
+
+const int RSPD_DEFAULT_B = 20;
+
+/**
+ * Read start position distribution (RSPD): a histogram with B equal-width
+ * bins over the relative start position of a read within a transcript.
+ *
+ * pdf[1..B] stores the mass of each bin and cdf[i] the cumulative mass of
+ * bins 1..i. Both arrays have B + 2 entries: entry 0 stays 0 and entry B + 1
+ * is padding so that evalCDF() can read pdf[i + 1] when i == B.
+ */
+class RSPD {
+public:
+  // Creates a uniform distribution over B bins. estRSPD tells whether the
+  // distribution is meant to be estimated (init/update/finish/collect assert it).
+  RSPD(bool estRSPD, int B = RSPD_DEFAULT_B) {
+    this->estRSPD = estRSPD;
+    this->B = B;
+
+    pdf = new double[B + 2];
+    cdf = new double[B + 2];
+
+    // no simulation tables exist until startSimulation() is called
+    M = 0;
+    rspdDists = NULL;
+
+    //set initial parameters
+    memset(pdf, 0, sizeof(double) * (B + 2)); // use B + 2 for evalCDF
+    memset(cdf, 0, sizeof(double) * (B + 2));
+    for (int i = 1; i <= B; i++) {
+      pdf[i] = 1.0 / B;
+      cdf[i] = i * 1.0 / B;
+    }
+  }
+
+  // Deep copy. Without this, the implicitly generated copy constructor would
+  // make two objects share pdf/cdf and both destructors would delete[] them.
+  // Simulation tables are not copied; the copy starts without any.
+  RSPD(const RSPD& o) {
+    estRSPD = o.estRSPD;
+    B = o.B;
+    pdf = new double[B + 2];
+    cdf = new double[B + 2];
+    memcpy(pdf, o.pdf, sizeof(double) * (B + 2));
+    memcpy(cdf, o.cdf, sizeof(double) * (B + 2));
+    M = 0;
+    rspdDists = NULL;
+  }
+
+  ~RSPD() {
+    delete[] pdf;
+    delete[] cdf;
+  }
+
+  RSPD& operator=(const RSPD& rv);
+
+  void init();
+
+  // Adds a hit of weight frac that starts at 0-based position fpos of a
+  // transcript of length fullLen. The unit interval
+  // [fpos / fullLen, (fpos + 1) / fullLen) may span several bins; each bin
+  // receives frac times the share of that interval falling inside it.
+  void update(int fpos, int fullLen, double frac) {
+    assert(estRSPD);
+
+    if (fpos >= fullLen) return; // if out of range, do not use this hit
+
+    int i;
+    double a = fpos * 1.0 / fullLen; // left end of the part not yet assigned
+    double b;
+
+    // all bins but the last one touched: assign the part up to the bin's right edge
+    for (i = ((long long)fpos) * B / fullLen + 1; i < (((long long)fpos + 1) * B - 1) / fullLen + 1; i++) {
+      b = i * 1.0 / B;
+      pdf[i] += (b - a) * fullLen * frac;
+      a = b;
+    }
+    // last bin touched: assign the remainder up to (fpos + 1) / fullLen
+    b = (fpos + 1.0) / fullLen;
+    pdf[i] += (b - a) * fullLen * frac;
+  }
+
+  void finish();
+
+  // Cumulative mass at relative position fpos / fullLen: the cdf of the
+  // completed bins plus a linear share of the bin containing the position.
+  double evalCDF(int fpos, int fullLen) {
+    int i = ((long long)fpos) * B / fullLen;
+    double val = fpos * 1.0 / fullLen * B;
+
+    return cdf[i] + (val - i) * pdf[i + 1];
+  }
+
+  // Probability of starting at fpos, renormalised by the mass of the first
+  // effL positions. Returns 1 / effL when the RSPD is not estimated and 0
+  // when the normalising mass is below EPSILON.
+  double getAdjustedProb(int fpos, int effL, int fullLen) {
+    assert(fpos >= 0 && fpos < fullLen && effL <= fullLen);
+    if (!estRSPD) return 1.0 / effL;
+    double denom = evalCDF(effL, fullLen);
+    return (denom >= EPSILON ? (evalCDF(fpos + 1, fullLen) - evalCDF(fpos, fullLen)) / denom : 0.0) ;
+  }
+
+  void collect(const RSPD&);
+
+  void read(FILE*);
+  void write(FILE*);
+
+  void startSimulation(int, Refs*);
+  int simulate(simul*, int, int);
+  void finishSimulation();
+
+private:
+  bool estRSPD;
+  int B; // number of bins
+  double *pdf, *cdf; // B + 2 entries each, bins are 1-based
+
+  int M; // number of references covered by rspdDists
+  double **rspdDists; // per-reference cumulative tables, valid between startSimulation() and finishSimulation()
+};
+
+// Copies the bin count and both tables from rv, reallocating only when the
+// bin counts differ. The estRSPD flag of *this is left as it is.
+RSPD& RSPD::operator=(const RSPD& rv) {
+  if (this != &rv) {
+    if (B != rv.B) {
+      delete[] pdf;
+      delete[] cdf;
+      pdf = new double[rv.B + 2];
+      cdf = new double[rv.B + 2];
+    }
+    B = rv.B;
+
+    const size_t nbytes = sizeof(double) * (B + 2);
+    memcpy(pdf, rv.pdf, nbytes);
+    memcpy(cdf, rv.cdf, nbytes);
+  }
+
+  return *this;
+}
+
+// Zeroes both tables so that update() can start accumulating counts.
+void RSPD::init() {
+  assert(estRSPD);
+
+  const size_t nbytes = sizeof(double) * (B + 2);
+  memset(pdf, 0, nbytes);
+  memset(cdf, 0, nbytes);
+}
+
+// Normalises the accumulated bin counts into a pdf and rebuilds the cdf.
+void RSPD::finish() {
+  assert(estRSPD);
+
+  double total = 0.0;
+  for (int bin = 1; bin <= B; ++bin) total += pdf[bin];
+
+  for (int bin = 1; bin <= B; ++bin) {
+    pdf[bin] /= total;
+    cdf[bin] = cdf[bin - 1] + pdf[bin];
+  }
+}
+
+// Adds the bin counts of o to this object's bin counts (bins 1..B).
+void RSPD::collect(const RSPD& o) {
+  assert(estRSPD);
+
+  int bin = 1;
+  while (bin <= B) {
+    pdf[bin] += o.pdf[bin];
+    ++bin;
+  }
+}
+
+// Loads the distribution from fi. The stream holds an integer flag (non-zero
+// means the RSPD was estimated); when the flag is set it is followed by the
+// bin count B and B bin masses. When the flag is 0 a uniform distribution
+// over RSPD_DEFAULT_B bins is installed.
+//
+// Every fscanf() call is made outside assert(): an expression inside assert()
+// is not evaluated when NDEBUG is defined, so reading inside it would make
+// release builds skip the input entirely.
+void RSPD::read(FILE *fi) {
+  int val = 0;
+  int nread = fscanf(fi, "%d", &val);
+  assert(nread == 1);
+  estRSPD = (val != 0);
+
+  int nbins = RSPD_DEFAULT_B;
+  if (estRSPD) {
+    nread = fscanf(fi, "%d", &nbins);
+    assert(nread == 1 && nbins > 0);
+  }
+
+  //release the old tables, then allocate for the new bin count
+  delete[] pdf;
+  delete[] cdf;
+  B = nbins;
+  pdf = new double[B + 2];
+  cdf = new double[B + 2];
+  memset(pdf, 0, sizeof(double) * (B + 2));
+  memset(cdf, 0, sizeof(double) * (B + 2));
+
+  for (int i = 1; i <= B; i++) {
+    if (estRSPD) {
+      nread = fscanf(fi, "%lf", &pdf[i]);
+      assert(nread == 1);
+      cdf[i] = cdf[i - 1] + pdf[i];
+    }
+    else {
+      pdf[i] = 1.0 / B;
+      cdf[i] = i * 1.0 / B;
+    }
+  }
+
+  (void)nread; // only inspected by assert(); avoids an unused-value warning under NDEBUG
+}
+
+// Writes the estRSPD flag and, when it is set, the bin count followed by the
+// B bin masses on one space-separated line.
+void RSPD::write(FILE *fo) {
+  fprintf(fo, "%d\n", estRSPD);
+  if (!estRSPD) return;
+
+  fprintf(fo, "%d\n", B);
+  int bin = 1;
+  while (bin < B) {
+    fprintf(fo, "%.10g ", pdf[bin]);
+    ++bin;
+  }
+  fprintf(fo, "%.10g\n", pdf[B]);
+}
+
+// Builds, for every reference 1..M, a table holding evalCDF(pos + 1, fullLen)
+// for each position pos. Does nothing when the RSPD is not estimated.
+void RSPD::startSimulation(int M, Refs* refs) {
+  if (!estRSPD) return;
+
+  this->M = M;
+  rspdDists = new double*[M + 1];
+  rspdDists[0] = NULL; // index 0 has no table
+
+  for (int sid = 1; sid <= M; ++sid) {
+    const int len = refs->getRef(sid).getFullLen();
+    double *dist = new double[len];
+    rspdDists[sid] = dist;
+    memset(dist, 0, sizeof(double) * len);
+    for (int pos = 0; pos < len; ++pos) {
+      dist[pos] = evalCDF(pos + 1, len);
+    }
+  }
+}
+
+// Draws a start position among the first effL positions of reference sid.
+// Without an estimated RSPD the position is random() * effL truncated to int;
+// otherwise it is sampled from the reference's table, or -1 is returned when
+// the table entry at effL - 1 is not positive.
+int RSPD::simulate(simul *sampler, int sid, int effL) {
+  if (!estRSPD) return int(sampler->random() * effL);
+
+  double *dist = rspdDists[sid];
+  if (dist[effL - 1] > 0.0) return sampler->sample(dist, effL);
+  return -1;
+}
+
+// Frees the tables allocated by startSimulation(); a no-op when the RSPD is
+// not estimated.
+void RSPD::finishSimulation() {
+  if (!estRSPD) return;
+
+  int sid = 1;
+  while (sid <= M) {
+    delete[] rspdDists[sid];
+    ++sid;
+  }
+  delete[] rspdDists;
+}
+
+#endif /* RSPD_H_ */
diff --git a/Read.h b/Read.h
new file mode 100644
index 0000000..009fe95
--- /dev/null
+++ b/Read.h
@@ -0,0 +1,23 @@
+#ifndef READ
+#define READ
+
+/**
+father class of SingleRead, SingleReadQ, PairedEndRead, PairedEndReadQ
+ */
+
+#include<iostream>
+#include<string>
+
+class Read {
+ public:
+  // A fresh read has an empty name and is not flagged as low quality.
+  Read() : name(""), low_quality(false) {}
+
+  // Stub that reads nothing and reports failure.
+  // flags selects the entries to load: 1 : readseq, 2 : quality score, 4 : name
+  bool read(int argc, std::istream* argv[], int flags = 7) { return false; }
+
+  // Stub that writes nothing. (write to files; do not write if does not read fully)
+  void write(int argc, std::ostream* argv[]) {}
+
+  const std::string& getName() const { return name; }
+
+  // true if this read is low quality and should not be used
+  bool isLowQuality() const { return low_quality; }
+
+ protected:
+  std::string name; // name of the read
+  bool low_quality;
+};
+
+#endif
diff --git a/ReadIndex.h b/ReadIndex.h
new file mode 100644
index 0000000..293e4a3
--- /dev/null
+++ b/ReadIndex.h
@@ -0,0 +1,59 @@
+#ifndef READINDEX_H_
+#define READINDEX_H_
+
+#include<cstdio>
+#include<cstdlib>
+#include<iostream>
+#include<fstream>
+
+#include "utils.h"
+
+/**
+ * Sparse index over a read file, loaded from "<readF>.ridx".
+ *
+ * The file holds nReads, gap and nPos in raw binary form followed by nPos
+ * stream positions. locate() uses entry rid / gap, so the entries are spaced
+ * gap reads apart.
+ */
+struct ReadIndex {
+  READ_INT_TYPE nReads;
+  int gap, nPos;
+  std::streampos *index; // NULL when no index is loaded
+
+  ReadIndex () {
+    nReads = 0; gap = nPos = 0;
+    index = NULL;
+  }
+
+  // Loads the index belonging to read file readF; prints a message and exits
+  // if the index file cannot be opened or is truncated/inconsistent.
+  ReadIndex(const char* readF) {
+    char indexF[STRLEN];
+    std::ifstream fin;
+
+    nReads = 0; gap = nPos = 0;
+    index = NULL;
+
+    // bounded formatting: sprintf would overrun indexF for long file names
+    int len = snprintf(indexF, sizeof(indexF), "%s.ridx", readF);
+    if (len < 0 || len >= (int)sizeof(indexF)) { fprintf(stderr, "The index file name of %s is too long!\n", readF); exit(-1); }
+
+    fin.open(indexF, std::ios::binary);
+    if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", indexF); exit(-1); }
+
+    fin.read((char*)&nReads, sizeof(nReads));
+    fin.read((char*)&gap, sizeof(gap));
+    fin.read((char*)&nPos, sizeof(nPos));
+    // locate() divides by gap and nPos sizes the allocation below, so reject
+    // a short read or values that cannot be used
+    if (!fin || gap <= 0 || nPos < 0) { fprintf(stderr, "%s is truncated or corrupted!\n", indexF); exit(-1); }
+
+    index = new std::streampos[nPos];
+    for (int i = 0; i < nPos; i++) {
+      fin.read((char*)&index[i], sizeof(std::streampos));
+    }
+    if (!fin) { fprintf(stderr, "%s is truncated or corrupted!\n", indexF); exit(-1); }
+  }
+
+  ~ReadIndex() {
+    nReads = 0; gap = nPos = 0;
+    if (index != NULL) delete[] index;
+  }
+
+  //rid 0-based , return crid : current seeked rid
+  // Seeks out to the indexed read at or before rid and returns that read's
+  // id; without an index it rewinds out to the beginning and returns 0.
+  READ_INT_TYPE locate(READ_INT_TYPE rid, std::ifstream& out) {
+    if (index == NULL) {
+      out.seekg(0, std::ios::beg);
+      return 0;
+    }
+    assert(rid >= 0 && rid < nReads);
+    out.seekg(index[rid / gap]);
+    return (rid / gap) * gap;
+  }
+};
+
+#endif /* READINDEX_H_ */
diff --git a/ReadReader.h b/ReadReader.h
new file mode 100644
index 0000000..141585a
--- /dev/null
+++ b/ReadReader.h
@@ -0,0 +1,118 @@
+#ifndef READREADER_H_
+#define READREADER_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<iostream>
+#include<cassert>
+#include<fstream>
+#include<vector>
+
+#include "utils.h"
+#include "SingleRead.h"
+#include "SingleReadQ.h"
+#include "PairedEndRead.h"
+#include "PairedEndReadQ.h"
+#include "ReadIndex.h"
+
+
+/**
+ * Reads ReadType records from s parallel input files and can reposition all
+ * of them to a given read id with the help of per-file ReadIndex objects.
+ */
+template<class ReadType>
+class ReadReader {
+public:
+  // Empty reader without files. locations must be set to NULL here as well:
+  // the destructor calls delete[] on it whenever it is not NULL, so leaving it
+  // uninitialised would free a garbage pointer.
+  ReadReader() { s = 0; indices = NULL; arr = NULL; locations = NULL; hasPolyA = false; seedLen = -1; }
+  ReadReader(int s, char readFs[][STRLEN], bool hasPolyA = false, int seedLen = -1);
+  ~ReadReader();
+
+  // Installs the per-file indices used by locate(); the destructor only
+  // clears the pointer and does not free them.
+  void setIndices(ReadIndex** indices) {
+    this->indices = indices;
+  }
+
+  bool locate(READ_INT_TYPE); // You should guarantee that indices exist and rid is valid, otherwise return false; If it fails, you should reset it manually!
+  void reset();
+
+  // Reads the next record into read. When the read succeeds and seedLen > 0,
+  // read.calc_lq(hasPolyA, seedLen) is called on it afterwards.
+  bool next(ReadType& read, int flags = 7) {
+    bool success = read.read(s, (std::istream**)arr, flags);
+    if (success && seedLen > 0) { read.calc_lq(hasPolyA, seedLen); }
+    return success;
+  }
+
+private:
+  int s; // number of files
+  ReadIndex **indices; // one index per file, may be NULL
+  std::ifstream** arr; // the s open input streams
+  std::streampos *locations; // per-file positions restored by reset()
+
+  bool hasPolyA;
+  int seedLen;
+};
+
+// Opens the s files named in readFs and remembers each stream's initial
+// position for reset(). Prints a message and exits if a file cannot be opened.
+template<class ReadType>
+ReadReader<ReadType>::ReadReader(int s, char readFs[][STRLEN], bool hasPolyA, int seedLen) {
+  assert(s > 0);
+
+  this->s = s;
+  this->hasPolyA = hasPolyA;
+  this->seedLen = seedLen;
+  indices = NULL;
+
+  arr = new std::ifstream*[s];
+  locations = new std::streampos[s];
+  for (int fid = 0; fid < s; ++fid) {
+    std::ifstream *stream = new std::ifstream(readFs[fid]);
+    arr[fid] = stream;
+    if (!stream->is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", readFs[fid]); exit(-1); }
+    locations[fid] = stream->tellg();
+  }
+}
+
+// Closes and frees every stream and the saved positions. The indices are only
+// forgotten here, not freed.
+template<class ReadType>
+ReadReader<ReadType>::~ReadReader() {
+  indices = NULL;
+
+  if (arr != NULL) {
+    int fid = 0;
+    while (fid < s) {
+      std::ifstream *stream = arr[fid++];
+      stream->close();
+      delete stream;
+    }
+    delete[] arr;
+  }
+
+  delete[] locations; // delete[] on a NULL pointer does nothing
+}
+
+// Positions all streams at read rid. Each index seeks its stream to an indexed
+// read at or before rid; the remaining reads are skipped one by one. The
+// positions found are saved for reset() only after read rid itself has been
+// read successfully. Returns false when no indices are set or a read fails.
+template<class ReadType>
+bool ReadReader<ReadType>::locate(READ_INT_TYPE rid) {
+  READ_INT_TYPE crid = -1;
+  ReadType read;
+
+  if (indices == NULL) return false;
+
+  //every index must report the same starting read id
+  for (int fid = 0; fid < s; ++fid) {
+    const READ_INT_TYPE start = indices[fid]->locate(rid, *arr[fid]);
+    if (fid == 0) { crid = start; }
+    else { assert(crid == start); }
+  }
+  assert(crid <= rid);
+
+  // skip forward until the streams sit right before read rid
+  while (crid < rid) {
+    if (!read.read(s, (std::istream**)arr, 0)) return false;
+    ++crid;
+  }
+
+  std::vector<std::streampos> marks(s);
+  for (int fid = 0; fid < s; ++fid) { marks[fid] = arr[fid]->tellg(); }
+
+  // make sure read rid really exists before committing the positions
+  if (!read.read(s, (std::istream**)arr, 0)) return false;
+
+  for (int fid = 0; fid < s; ++fid) {
+    locations[fid] = marks[fid];
+    arr[fid]->seekg(locations[fid]);
+  }
+
+  return true;
+}
+
+// Moves every stream back to its saved position (the initial position, or the
+// one stored by the last successful locate()).
+template<class ReadType>
+void ReadReader<ReadType>::reset() {
+  int fid = 0;
+  while (fid < s) {
+    arr[fid]->seekg(locations[fid]);
+    ++fid;
+  }
+}
+
+#endif /* READREADER_H_ */
diff --git a/RefSeq.h b/RefSeq.h
new file mode 100644
index 0000000..88b149e
--- /dev/null
+++ b/RefSeq.h
@@ -0,0 +1,140 @@
+#ifndef REFSEQ
+#define REFSEQ
+
+#include<cassert>
+#include<fstream>
+#include<string>
+#include<vector>
+
+#include "utils.h"
+
+/**
+ * One reference sequence: its name, the forward-strand bases with an optional
+ * poly(A) tail appended, and one mask bit per position of the original
+ * (untailed) sequence, packed NBITS bits per cell.
+ *
+ * Copying, assignment and destruction use the compiler-generated versions;
+ * every member is a value type, so they copy member by member just as the
+ * former hand-written versions did.
+ */
+//Each Object can only be used once
+class RefSeq {
+public:
+  RefSeq() {
+    fullLen = totLen = 0;
+    name = ""; seq = "";
+    fmasks.clear();
+  }
+
+  //Constructor , seq : the forward strand of the reference
+  //tag does not contain ">"
+  //polyALen : length of polyA tail we add
+  RefSeq(const std::string& name, const std::string& seq, int polyALen) {
+    fullLen = seq.length();
+    totLen = fullLen + polyALen;
+    this->name = name;
+    this->seq = seq;
+    this->seq.append(polyALen, 'A');
+
+    assert(fullLen > 0 && totLen >= fullLen);
+
+    int len = (fullLen - 1) / NBITS + 1; // cells needed for fullLen bits
+    fmasks.assign(len, 0);
+    // set mask if poly(A) tail is added: the last OLEN - 1 positions of the
+    // original sequence are masked
+    if (polyALen > 0) {
+      for (int i = std::max(fullLen - OLEN + 1, 0); i < fullLen; i++) setMask(i);
+    }
+  }
+
+  bool read(std::ifstream&, int = 0);
+  void write(std::ofstream&);
+
+  int getFullLen() const { return fullLen; }
+
+  int getTotLen() const { return totLen; }
+
+  const std::string& getName() const { return name; }
+
+  std::string getSeq() const { return seq; }
+
+  // Reverse strand: walks the stored sequence backwards and maps every base
+  // through get_rbase_id()/getCharacter().
+  std::string getRSeq() const {
+    std::string rseq;
+    if (totLen > 0) rseq.reserve(totLen); // final length is known, avoid repeated growth
+    for (int i = totLen - 1; i >= 0; i--) rseq.push_back(getCharacter(get_rbase_id(seq[i])));
+    return rseq;
+  }
+
+  //get the sequence dir 0 : + 1 : -
+  std::string getSeq(int dir) const {
+    return (dir == 0 ? getSeq() : getRSeq());
+  }
+
+  // Base id at position pos of strand dir (0 : +, otherwise -).
+  int get_id(int pos, int dir) const {
+    assert(pos >= 0 && pos < totLen);
+    return (dir == 0 ? get_base_id(seq[pos]) : get_rbase_id(seq[totLen - pos - 1]));
+  }
+
+  // NOTE(review): fmasks holds bits for fullLen positions only while the
+  // asserts below allow seedPos up to totLen - 1 -- confirm callers never
+  // pass a position inside the poly(A) tail.
+  bool getMask(int seedPos) const {
+    assert(seedPos >= 0 && seedPos < totLen);
+    return fmasks[seedPos / NBITS] & mask_codes[seedPos % NBITS];
+  }
+
+  void setMask(int seedPos) {
+    assert(seedPos >= 0 && seedPos < totLen);
+    fmasks[seedPos / NBITS] |= mask_codes[seedPos % NBITS];
+  }
+
+private:
+  int fullLen; // fullLen : the original length of an isoform
+  int totLen; // totLen : the total length, included polyA tails, if any
+  std::string name; // the tag
+  std::string seq; // the raw sequence, in forward strand
+  std::vector<unsigned int> fmasks; // record masks for forward strand, each position occupies 1 bit
+};
+
+//internal read; option 0 : read all 1 : do not read sequences
+// Reads one record: a line with fullLen and totLen, the name line, the
+// sequence line and the mask cells. Returns false as soon as any part cannot
+// be read.
+bool RefSeq::read(std::ifstream& fin, int option) {
+  std::string rest; // receives the unread remainder of a line
+
+  if (!(fin >> fullLen >> totLen)) return false;
+  assert(fullLen > 0 && totLen >= fullLen);
+  getline(fin, rest);
+
+  if (!getline(fin, name) || !getline(fin, seq)) return false;
+
+  const int ncells = (fullLen - 1) / NBITS + 1; // assume each cell contains NBITS bits
+  fmasks.assign(ncells, 0);
+  for (int c = 0; c < ncells; ++c) {
+    if (!(fin >> fmasks[c])) return false;
+  }
+  getline(fin, rest);
+
+  assert(option == 0 || option == 1);
+  if (option == 1) seq.clear(); // caller does not want the bases kept
+
+  return true;
+}
+
+//write to file in "internal" format
+// Emits four lines: "fullLen totLen", the name, the sequence, and the mask
+// cells separated by single spaces. An object without mask cells (for example
+// a default-constructed one) gets an empty mask line instead of indexing
+// fmasks[-1]. Plain '\n' is used so the stream is not flushed after every line.
+void RefSeq::write(std::ofstream& fout) {
+  fout<<fullLen<<" "<<totLen<<'\n';
+  fout<<name<<'\n';
+  fout<<seq<<'\n';
+
+  const size_t ncells = fmasks.size();
+  for (size_t i = 0; i < ncells; i++) {
+    if (i > 0) fout<<" ";
+    fout<<fmasks[i];
+  }
+  fout<<'\n';
+}
+
+#endif
diff --git a/RefSeqPolicy.h b/RefSeqPolicy.h
new file mode 100644
index 0000000..71acb3d
--- /dev/null
+++ b/RefSeqPolicy.h
@@ -0,0 +1,22 @@
+#ifndef REFSEQPOLICY
+#define REFSEQPOLICY
+
+#include<string>
+
+/**
+Convert reference sequences to RSEM format
+ */
+class RefSeqPolicy {
+ public:
+  // Returns rawseq in upper case with every character other than A, C, G and
+  // T replaced by 'N'.
+  std::string convert(const std::string& rawseq) {
+    std::string seq = rawseq;
+    const std::string::size_type size = seq.size();
+    for (std::string::size_type i = 0; i < size; i++) {
+      // toupper() is only defined for unsigned char values and EOF, so a
+      // negative char (non-ASCII byte) must be converted first
+      const char base = (char)toupper((unsigned char)rawseq[i]);
+      seq[i] = (base == 'A' || base == 'C' || base == 'G' || base == 'T') ? base : 'N';
+    }
+    return seq;
+  }
+};
+
+#endif
diff --git a/Refs.h b/Refs.h
new file mode 100644
index 0000000..9c3b254
--- /dev/null
+++ b/Refs.h
@@ -0,0 +1,159 @@
+#ifndef REFS
+#define REFS
+
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<fstream>
+#include<vector>
+
+#include "utils.h"
+#include "RefSeq.h"
+#include "RefSeqPolicy.h"
+#include "PolyARules.h"
+
+
+/**
+ * The set of reference sequences. seqs[1..M] are the references; seqs[0] is a
+ * placeholder for the noise gene.
+ */
+class Refs {
+ public:
+  Refs() {
+    M = 0;
+    seqs.clear();
+    has_polyA = false;
+  }
+
+  ~Refs() {}
+
+  void makeRefs(char*, RefSeqPolicy&, PolyARules&);
+  void loadRefs(char*, int = 0);
+  void saveRefs(char*);
+
+  int getM() { return M; } // get number of isoforms
+
+  //int getNS() { return M + 1; } // get number of parameters, I do not think we need it here.
+
+  RefSeq& getRef(int sid) { return seqs[sid]; } // get a particular reference
+
+  std::vector<RefSeq>& getRefs() { return seqs; } // returns a reference to the internal vector
+
+  bool hasPolyA() { return has_polyA; } // if any of sequence has poly(A) tail added
+
+  // Counts mismatches between readseq[0..LEN) and seq[pos..pos + LEN). A
+  // position counts as a mismatch when either base is 'N' or the bases differ
+  // (the read base is upper-cased first).
+  //lim : >=0 If mismatch > lim , return; -1 find all mismatches
+  int countMismatch(const std::string& seq, int pos, const std::string& readseq, int LEN, int lim = -1) {
+    int nMis = 0; // number of mismatches
+
+    for (int i = 0; i < LEN; i++) {
+      // toupper() requires an unsigned char value
+      char rc = (char)toupper((unsigned char)readseq[i]);
+      if (seq[i + pos] == 'N' || rc == 'N' || seq[i + pos] != rc) nMis++;
+
+      // a speed up tech
+      if (lim >= 0 && nMis > lim) return nMis;
+    }
+
+    return nMis;
+  }
+
+  // true when (sid, dir, pos) addresses LEN bases inside reference sid and the
+  // first LEN read bases match them with at most C mismatches.
+  bool isValid(int sid, int dir, int pos, const std::string& readseq, int LEN, int C) {
+    if (sid <= 0 || sid > M || (dir != 0 && dir != 1) || pos < 0 || pos + LEN > seqs[sid].getTotLen() || LEN > (int)readseq.length()) return false;
+    const std::string seq = seqs[sid].getSeq(dir);
+    return countMismatch(seq, pos, readseq, LEN, C) <= C;
+  }
+
+  // get segment from refs
+  // Returns the LEN bases starting at pos on strand dir of reference sid, or
+  // the string "fail" when sid or the range is out of bounds. A non-positive
+  // LEN inside the bounds yields an empty string.
+  std::string getSegment(int sid, int dir, int pos, int LEN) {
+    if (sid < 0 || sid >= (int)seqs.size()) return "fail";
+    if (pos < 0 || pos + LEN > seqs[sid].getTotLen()) return "fail";
+    if (LEN <= 0) return "";
+
+    return seqs[sid].getSeq(dir).substr(pos, LEN);
+  }
+
+ private:
+  int M; // # of isoforms, id starts from 1
+  std::vector<RefSeq> seqs; // reference sequences, starts from 1; 0 is for noise gene
+  bool has_polyA; // if at least one sequence has polyA added, the value is true; otherwise, the value is false
+};
+
+//inpF in fasta format
+// Rebuilds the reference list from a FASTA file: each '>' header becomes the
+// tag, the following lines up to the next header are concatenated into the
+// sequence, which is converted by policy and given the poly(A) length that
+// rules reports for the tag. Entries without sequence are skipped with a
+// warning.
+void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) {
+  std::ifstream fin;
+  std::string tag, line, rawseq;
+
+  seqs.clear();
+  seqs.push_back(RefSeq()); // noise isoform
+  M = 0;
+  has_polyA = false;
+
+  fin.open(inpF);
+  if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
+
+  getline(fin, line);
+  while (fin && line[0] == '>') {
+    tag = line.substr(1);
+
+    // gather the sequence lines of this entry
+    rawseq = "";
+    while (getline(fin, line) && line[0] != '>') rawseq += line;
+
+    if (rawseq.empty()) {
+      fprintf(stderr, "Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str());
+      continue;
+    }
+
+    seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag)));
+    ++M;
+    if (seqs[M].getFullLen() < seqs[M].getTotLen()) has_polyA = true;
+  }
+  fin.close();
+
+  if (verbose) { printf("Refs.makeRefs finished!\n"); }
+}
+
+//inpF in fasta format, with sequence all in one line together
+//option 0 read all, 1 do not read sequences
+// Replaces the current references with the records read from inpF; reading
+// stops at the first record RefSeq::read() cannot read.
+void Refs::loadRefs(char *inpF, int option) {
+  std::ifstream fin;
+  RefSeq seq;
+
+  fin.open(inpF);
+  if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
+
+  seqs.clear();
+  seqs.push_back(RefSeq()); // slot 0
+  M = 0;
+  has_polyA = false;
+
+  while (seq.read(fin, option)) {
+    seqs.push_back(seq);
+    ++M;
+    if (seq.getFullLen() < seq.getTotLen()) has_polyA = true;
+  }
+
+  fin.close();
+
+  assert(M + 1 == (int)seqs.size());
+
+  if (verbose) { printf("Refs.loadRefs finished!\n"); }
+}
+
+// Writes references 1..M to outF in the internal format produced by
+// RefSeq::write(). Like makeRefs()/loadRefs(), it prints a message and exits
+// when the file cannot be opened, instead of silently writing nothing.
+void Refs::saveRefs(char* outF) {
+  std::ofstream fout;
+
+  fout.open(outF);
+  if (!fout.is_open()) { fprintf(stderr, "Cannot open %s for writing!\n", outF); exit(-1); }
+  for (int i = 1; i <= M; i++) {
+    seqs[i].write(fout);
+  }
+  fout.close();
+
+  if (verbose) { printf("Refs.saveRefs finished!\n"); }
+}
+
+#endif
diff --git a/SamHeader.cpp b/SamHeader.cpp
new file mode 100644
index 0000000..6eb7a88
--- /dev/null
+++ b/SamHeader.cpp
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016
+ Bo Li (University of California, Berkeley)
+ bli25@berkeley.edu
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ USA
+*/
+
+#include<cstdio>
+#include<cassert>
+#include<string>
+#include<set>
+#include<map>
+#include<fstream>
+#include<sstream>
+
+#include "my_assert.h"
+#include "SamHeader.hpp"
+
+// Rebuilds the @SQ section from a tab-separated index file: the first column
+// of every line becomes SN and the second column becomes LN.
+void SamHeader::replaceSQ(const char* faiF) {
+  std::ifstream fin(faiF);
+  general_assert(fin.is_open(), "Cannot open " + cstrtos(faiF) + "! It may not exist.");
+
+  std::string line;
+  size_t pos, pos2;
+
+  SQstr = "";
+  while (getline(fin, line)) {
+    pos = line.find_first_of('\t');
+    assert(pos != std::string::npos && pos > 0 && pos + 1 < line.length() && line[pos + 1] != '\t');
+    // substr() takes (start, count): the count is the distance to the next
+    // tab, or npos to take the rest of the line when there is no further tab
+    pos2 = line.find_first_of('\t', pos + 1);
+    const size_t count = (pos2 == std::string::npos ? std::string::npos : pos2 - (pos + 1));
+    SQstr += "@SQ\tSN:" + line.substr(0, pos) + "\tLN:" + line.substr(pos + 1, count) + "\n";
+  }
+  fin.close();
+}
+
+// Splits a header line (anything but @CO) into its tab-separated TAG:VALUE
+// fields, starting after the 4-character record prefix, and returns them as a
+// tag -> value map. Fields without a ':' are ignored.
+std::map<std::string, std::string> SamHeader::parse_line(const std::string& line) {
+  size_t len = line.length();
+  assert(line.substr(0, 3) != "@CO" && len > 4);
+
+  size_t fr, to, colon;
+  std::string field;
+  std::map<std::string, std::string> dict;
+
+  fr = 4;
+  do {
+    to = line.find_first_of('\t', fr);
+    // substr() takes (start, count), so the field length is to - fr; passing
+    // `to` itself would drag the following fields into this value
+    field = line.substr(fr, (to == std::string::npos ? std::string::npos : to - fr));
+    colon = field.find_first_of(':');
+    if (colon != std::string::npos) {
+      dict[field.substr(0, colon)] = field.substr(colon + 1);
+    }
+    fr = to;
+  } while (fr != std::string::npos && (++fr) < len);
+
+  return dict;
+}
+
+// Splits header text into its sections: each line starting with '@' is
+// appended, newline-terminated, to the string for its record type (HD, SQ,
+// RG, PG, CO, or `other` for anything else). @HD may occur only once and
+// every @PG line must carry an ID not seen before; the IDs are kept in pids.
+void SamHeader::parse_text(const char* text) {
+  std::istringstream strin(text);
+  std::string line, tag;
+
+  std::map<std::string, std::string> dict;
+  std::map<std::string, std::string>::iterator dict_iter;
+
+
+  HDstr = SQstr = RGstr = PGstr = COstr = other = "";
+  pids.clear();
+
+  while (getline(strin, line)) {
+    if (line.empty() || line[0] != '@') continue;
+    tag = line.substr(1, 2);
+    if (tag == "HD") {
+      general_assert(HDstr == "", "@HD tag can only present once!");
+      HDstr = line; HDstr += "\n";
+    }
+    else if (tag == "SQ") {
+      SQstr += line; SQstr += "\n";
+    }
+    else if (tag == "RG") {
+      RGstr += line; RGstr += "\n";
+    }
+    else if (tag == "PG") {
+      dict = parse_line(line);
+      dict_iter = dict.find("ID");
+      general_assert(dict_iter != dict.end(), "\"" + line + "\" does not contain an ID!" );
+
+      general_assert(pids.find(dict_iter->second) == pids.end(), "Program record identifier " + dict_iter->second + " is not unique!");
+      pids.insert(dict_iter->second);
+
+      PGstr += line; PGstr += "\n";
+    }
+    else if (tag == "CO") {
+      COstr += line; COstr += "\n";
+    }
+    else {
+      // the newline belongs to `other`; appending it to `line` would glue
+      // consecutive unknown records together
+      other += line; other += "\n";
+    }
+  }
+}
diff --git a/SamHeader.hpp b/SamHeader.hpp
new file mode 100644
index 0000000..86124e7
--- /dev/null
+++ b/SamHeader.hpp
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016
+ Bo Li (University of California, Berkeley)
+ bli25@berkeley.edu
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ USA
+*/
+
+#ifndef SAMHEADER_H_
+#define SAMHEADER_H_
+
+#include<cstdlib>
+#include<string>
+#include<set>
+#include<map>
+
+#include "htslib/sam.h"
+
+/**
+ * Editable SAM header, kept as one text block per record type, from which a
+ * bam_hdr_t can be generated.
+ */
+class SamHeader {
+public:
+  // Parses text into the per-type sections; a NULL text leaves them empty.
+  SamHeader(const char* text = NULL) {
+    if (text != NULL) parse_text(text);
+  }
+
+  void replaceSQ(const char* faiF);
+
+  // Appends an @PG record for pid (with a CL field when command is given),
+  // unless a record with that id is already known.
+  void insertPG(const std::string& pid, const std::string& command = "") {
+    if (pids.find(pid) != pids.end()) return;
+
+    pids.insert(pid);
+    PGstr += "@PG\tID:" + pid;
+    if (command != "") PGstr += "\tCL:" + command;
+    PGstr += "\n";
+  }
+
+  // Appends an @CO record holding comment.
+  void addComment(const std::string& comment) {
+    COstr += "@CO\t" + comment + "\n";
+  }
+
+  // Concatenates the sections in the order HD, SQ, RG, PG, CO, other, parses
+  // the result with sam_hdr_parse() and stores a calloc'ed copy of the text in
+  // the returned header.
+  bam_hdr_t* create_header() {
+    const std::string text = HDstr + SQstr + RGstr + PGstr + COstr + other;
+    bam_hdr_t *hdr = sam_hdr_parse(text.length(), text.c_str());
+
+    hdr->l_text = text.length();
+    hdr->text = (char*)calloc(hdr->l_text + 1, 1);
+    strcpy(hdr->text, text.c_str());
+
+    return hdr;
+  }
+
+private:
+  std::string HDstr, SQstr, RGstr, PGstr, COstr, other; // one text block per record type
+  std::set<std::string> pids; // @PG identifiers seen so far
+
+  std::map<std::string, std::string> parse_line(const std::string& line);
+  void parse_text(const char* text);
+};
+
+#endif
diff --git a/SamParser.h b/SamParser.h
new file mode 100644
index 0000000..2ae6f55
--- /dev/null
+++ b/SamParser.h
@@ -0,0 +1,268 @@
+/* ReadType here means whether the read is unalignable, alignable, or aligned too many times. It does NOT mean single-end read versus paired-end read. */
+#ifndef SAMPARSER_H_
+#define SAMPARSER_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+
+#include <stdint.h>
+#include "htslib/sam.h"
+#include "sam_utils.h"
+
+#include "utils.h"
+#include "my_assert.h"
+
+#include "SingleRead.h"
+#include "SingleReadQ.h"
+#include "PairedEndRead.h"
+#include "PairedEndReadQ.h"
+#include "SingleHit.h"
+#include "PairedEndHit.h"
+
+#include "Transcripts.h"
+
+
+/**
+ * Reads alignments from a SAM/BAM/CRAM file one record (or one mate pair) at
+ * a time and turns them into RSEM reads and hits.
+ */
+class SamParser {
+public:
+  SamParser(const char* inpF, const char* aux, Transcripts& transcripts, const char* imdName);
+  ~SamParser();
+
+  /**
+   * return value
+   * -1 : no more alignment
+   * 0 : new read , type 0
+   * 1 : new read , type 1 with alignment
+   * 2 : new read , type 2
+   * 5 : new alignment but same read
+   */
+  int parseNext(SingleRead& read, SingleHit& hit);
+  int parseNext(SingleReadQ& read, SingleHit& hit);
+  int parseNext(PairedEndRead& read, PairedEndHit& hit);
+  int parseNext(PairedEndReadQ& read, PairedEndHit& hit);
+
+  // Sets the aligner tag consulted by getReadType(). The copy is bounded by
+  // the size of rtTag and always terminated, so an over-long tag is truncated
+  // instead of overrunning the buffer.
+  static void setReadTypeTag(const char* tag) {
+    strncpy(rtTag, tag, STRLEN - 1);
+    rtTag[STRLEN - 1] = '\0';
+  }
+
+private:
+  samFile *sam_in;
+  bam_hdr_t *header;
+  bam1_t *b, *b2; // current record and, for paired-end input, its mate
+
+  Transcripts& transcripts;
+
+  int n_warns; // Number of warnings
+
+  //tag used by aligner
+  static char rtTag[STRLEN];
+
+  //0 ~ N0, 1 ~ N1, 2 ~ N2
+  // Mapped records are type 1. Unmapped records are type 2 when a tag is
+  // configured and its integer value is positive, otherwise type 0.
+  int getReadType(const bam1_t* b) {
+    if (bam_is_mapped(b)) return 1;
+    if (!strcmp(rtTag, "")) return 0;
+    uint8_t *p = bam_aux_get(b, rtTag);
+    return (p == NULL || bam_aux2i(p) <= 0) ? 0 : 2;
+  }
+
+  // for paired-end reads
+  // Type 1 needs both mates mapped; type 2 needs a positive tag value on
+  // either mate.
+  int getReadType(const bam1_t* b, const bam1_t* b2) {
+    if (bam_is_mapped(b) && bam_is_mapped(b2)) return 1;
+    if (!strcmp(rtTag, "")) return 0;
+
+    uint8_t *p = bam_aux_get(b, rtTag);
+    if (p != NULL && bam_aux2i(p) > 0) return 2;
+
+    p = bam_aux_get(b2, rtTag);
+    if (p != NULL && bam_aux2i(p) > 0) return 2;
+
+    return 0;
+  }
+};
+
+// default : no tag, thus no Type 2 reads
+char SamParser::rtTag[STRLEN] = "";
+
+// Opens inpF for reading, loads its header and lets transcripts build its
+// mappings from the header's target names.
+// aux, if not 0, points to the file name of fn_list
+SamParser::SamParser(const char* inpF, const char* aux, Transcripts& transcripts, const char* imdName)
+  : transcripts(transcripts), n_warns(0)
+{
+  sam_in = sam_open(inpF, "r");
+  general_assert(sam_in != NULL, "Cannot open " + cstrtos(inpF) + "! It may not exist.");
+
+  if (aux != NULL) {
+    hts_set_fai_filename(sam_in, aux);
+  }
+
+  header = sam_hdr_read(sam_in);
+  general_assert(header != NULL, "Fail to parse sam header!");
+
+  transcripts.buildMappings(header->n_targets, header->target_name, imdName);
+
+  // record buffers reused by every parseNext() call
+  b = bam_init1();
+  b2 = bam_init1();
+}
+
+// Reports the number of mate-name mismatches seen, if any, then releases the
+// header, the input file and both record buffers.
+SamParser::~SamParser() {
+  if (n_warns > 0) {
+    fprintf(stderr, "Warning: Detected %d lines containing read pairs whose two mates have different names.\n", n_warns);
+  }
+
+  bam_hdr_destroy(header);
+  sam_close(sam_in);
+  bam_destroy1(b);
+  bam_destroy1(b2);
+}
+
+// If sam_read1 returns 0 , what does it mean?
+//Assume b.core.tid is 0-based
+// Parses the next single-end record. read is replaced unless the record is a
+// further alignment of the read it already holds; hit is set only for aligned
+// records (type 1), with the transcript id negated and the position counted
+// from the other end of the target for reverse-strand alignments.
+int SamParser::parseNext(SingleRead& read, SingleHit& hit) {
+  if (sam_read1(sam_in, header, b) < 0) return -1;
+
+  std::string name = bam_get_canonical_name(b);
+
+  general_assert(!bam_is_paired(b), "Read " + name + ": Find a paired end read in the file!");
+
+  int val; // return value
+  const int readType = getReadType(b);
+  const bool sameRead = (readType == 1 && read.getName().compare(name) == 0);
+
+  if (sameRead) {
+    general_assert(read.getReadLength() == b->core.l_qseq, "Read " + name + " has alignments with inconsistent read lengths!");
+    val = 5;
+  }
+  else {
+    val = readType;
+    read = SingleRead(name, bam_get_read_seq(b));
+  }
+
+  if (readType != 1) return val;
+
+  general_assert(bam_check_cigar(b), "Read " + name + ": RSEM currently does not support gapped alignments, sorry!\n");
+  if (bam_is_rev(b)) {
+    hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
+  }
+  else {
+    hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos);
+  }
+
+  return val;
+}
+
+// Same as the SingleRead overload, but the read also carries the record's
+// quality scores.
+int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) {
+  if (sam_read1(sam_in, header, b) < 0) return -1;
+
+  std::string name = bam_get_canonical_name(b);
+
+  general_assert(!bam_is_paired(b), "Read " + name + ": Find a paired end read in the file!");
+
+  int val; // return value
+  const int readType = getReadType(b);
+  const bool sameRead = (readType == 1 && read.getName().compare(name) == 0);
+
+  if (sameRead) {
+    general_assert(read.getReadLength() == b->core.l_qseq, "Read " + name + " has alignments with inconsistent read lengths!");
+    val = 5;
+  }
+  else {
+    val = readType;
+    read = SingleReadQ(name, bam_get_read_seq(b), bam_get_qscore(b));
+  }
+
+  if (readType != 1) return val;
+
+  general_assert(bam_check_cigar(b), "Read " + name + ": RSEM currently does not support gapped alignments, sorry!\n");
+  if (bam_is_rev(b)) {
+    hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
+  }
+  else {
+    hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos);
+  }
+
+  return val;
+}
+
+//Assume whether aligned or not , two mates of paired-end reads are always get together
+// Parses the next two records as the mates of one paired-end read (b is made
+// to hold mate 1). read is replaced unless the pair is a further alignment of
+// the read it already holds; hit is set only for aligned pairs (type 1).
+int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) {
+  if (sam_read1(sam_in, header, b) < 0) return -1;
+  if (sam_read1(sam_in, header, b2) < 0) return -1;
+
+  // swap if the first record is not read 1
+  if (!bam_is_read1(b)) { bam1_t *other = b; b = b2; b2 = other; }
+  std::string name = bam_get_canonical_name(b);
+
+  general_assert(bam_is_paired(b) && bam_is_paired(b2), "Read " + name + ": One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)");
+  general_assert((bam_is_read1(b) && bam_is_read2(b2)), "Read " + name + ": The adjacent two lines do not represent the two mates of a paired-end read! (RSEM assumes the two mates of a paired-end read should be adjacent)");
+  general_assert((bam_is_mapped(b) && bam_is_mapped(b2)) || (!bam_is_mapped(b) && !bam_is_mapped(b2)), "Read " + name + ": RSEM currently does not support partial alignments!");
+
+  std::string name2 = bam_get_canonical_name(b2);
+  // warn about differently named mates, but only for the first MAX_WARNS cases
+  if (name != name2 && ++n_warns <= MAX_WARNS) {
+    fprintf(stderr, "Warning: Detected a read pair whose two mates have different names--%s and %s!\n", name.c_str(), name2.c_str());
+  }
+
+  int val; // return value
+  const int readType = getReadType(b, b2);
+  const bool sameRead = (readType == 1 && read.getName().compare(name) == 0);
+
+  if (sameRead) {
+    general_assert(read.getMate1().getReadLength() == b->core.l_qseq && read.getMate2().getReadLength() == b2->core.l_qseq, "Paired-end read " + name + " has alignments with inconsistent mate lengths!");
+    val = 5;
+  }
+  else {
+    val = readType;
+    SingleRead mate1(name, bam_get_read_seq(b));
+    SingleRead mate2(name2, bam_get_read_seq(b2));
+    read = PairedEndRead(mate1, mate2);
+  }
+
+  if (readType != 1) return val;
+
+  general_assert(bam_check_cigar(b) && bam_check_cigar(b2), "Read " + name + ": RSEM currently does not support gapped alignments, sorry!");
+  general_assert(b->core.tid == b2->core.tid, "Read " + name + ": The two mates do not align to a same transcript! RSEM does not support discordant alignments.");
+  if (bam_is_rev(b)) {
+    hit = PairedEndHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq, b->core.pos + b->core.l_qseq - b2->core.pos);
+  }
+  else {
+    hit = PairedEndHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos, b2->core.pos + b2->core.l_qseq - b->core.pos);
+  }
+
+  return val;
+}
+
+// Same as the PairedEndRead overload, but both mates also carry their quality
+// scores.
+int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) {
+  if (sam_read1(sam_in, header, b) < 0) return -1;
+  if (sam_read1(sam_in, header, b2) < 0) return -1;
+
+  // swap if the first read is not read 1
+  if (!bam_is_read1(b)) { bam1_t *other = b; b = b2; b2 = other; }
+  std::string name = bam_get_canonical_name(b);
+
+  general_assert(bam_is_paired(b) && bam_is_paired(b2), "Read " + name + ": One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)");
+  general_assert(bam_is_read1(b) && bam_is_read2(b2), "Read " + name + ": The adjacent two lines do not represent the two mates of a paired-end read! (RSEM assumes the two mates of a paired-end read should be adjacent)");
+  general_assert((bam_is_mapped(b) && bam_is_mapped(b2)) || (!bam_is_mapped(b) && !bam_is_mapped(b2)), "Read " + name + ": RSEM currently does not support partial alignments!");
+
+  std::string name2 = bam_get_canonical_name(b2);
+  // warn about differently named mates, but only for the first MAX_WARNS cases
+  if (name != name2 && ++n_warns <= MAX_WARNS) {
+    fprintf(stderr, "Warning: Detected a read pair whose two mates have different names--%s and %s!\n", name.c_str(), name2.c_str());
+  }
+
+  int val; // return value
+  const int readType = getReadType(b, b2);
+  const bool sameRead = (readType == 1 && read.getName().compare(name) == 0);
+
+  if (sameRead) {
+    general_assert(read.getMate1().getReadLength() == b->core.l_qseq && read.getMate2().getReadLength() == b2->core.l_qseq, "Paired-end read " + name + " has alignments with inconsistent mate lengths!");
+    val = 5;
+  }
+  else {
+    val = readType;
+    SingleReadQ mate1(name, bam_get_read_seq(b), bam_get_qscore(b));
+    SingleReadQ mate2(name2, bam_get_read_seq(b2), bam_get_qscore(b2));
+    read = PairedEndReadQ(mate1, mate2);
+  }
+
+  if (readType != 1) return val;
+
+  general_assert(bam_check_cigar(b) && bam_check_cigar(b2), "Read " + name + ": RSEM currently does not support gapped alignments, sorry!");
+  general_assert(b->core.tid == b2->core.tid, "Read " + name + ": The two mates do not align to a same transcript! RSEM does not support discordant alignments.");
+  if (bam_is_rev(b)) {
+    hit = PairedEndHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq, b->core.pos + b->core.l_qseq - b2->core.pos);
+  }
+  else {
+    hit = PairedEndHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos, b2->core.pos + b2->core.l_qseq - b->core.pos);
+  }
+
+  return val;
+}
+
+#endif /* SAMPARSER_H_ */
diff --git a/SingleHit.h b/SingleHit.h
new file mode 100644
index 0000000..b157a15
--- /dev/null
+++ b/SingleHit.h
@@ -0,0 +1,56 @@
+#ifndef SINGLEHIT_H_
+#define SINGLEHIT_H_
+
+#include<cstdlib>
+#include<iostream>
+
+//char dir : 0 +, 1 - , encoding as 1 + , -1 -
+// One alignment of a single-end read: the reference it hits (sid), the
+// position (pos) and a conditional probability (conprb). The direction is
+// folded into the sign of sid (dir 0 = '+', dir 1 = '-'); sid == 0 marks the
+// noise gene.
+class SingleHit {
+public:
+  // Default hit stands for the noise gene.
+  SingleHit() : sid(0), pos(-1), conprb(0.0) {}
+
+  // sid encodes dir here: a negative sid means dir 1.
+  SingleHit(int sid, int pos, double conprb = 0.0) : sid(sid), pos(pos), conprb(conprb) {}
+
+  bool isNoise() const { return 0 == sid; }
+
+  // Returns 1 for a negative sid and 0 otherwise; makes no sense for noise gene.
+  int getDir() const { return (sid < 0) ? 1 : 0; }
+
+  // Reference id with the direction sign removed.
+  int getSid() const { return abs(sid); }
+
+  int getPos() const { return pos; }
+
+  double getConPrb() const { return conprb; }
+
+  void setConPrb(double conprb) { this->conprb = conprb; }
+
+  bool read(std::istream&);
+  void write(std::ostream&);
+
+protected:
+  int sid, pos; // sid encodes dir
+  double conprb; // conditional probability
+};
+
+// Reads "sid pos" from the stream into this hit and resets conprb to 0.
+// Returns true only when both integers were extracted successfully.
+// Declared inline because the definition lives in a header.
+inline bool SingleHit::read(std::istream& in) {
+  conprb = 0.0;
+  // Streams convert to bool only explicitly since C++11, so returning the
+  // stream itself no longer compiles; test the failure state directly.
+  return !(in >> sid >> pos).fail();
+}
+
+// Writes " sid pos" (each value preceded by one space) to the stream; conprb
+// is not written. Declared inline because the definition lives in a header
+// and would otherwise be multiply defined when included from several
+// translation units.
+inline void SingleHit::write(std::ostream& out) {
+  out<<" "<<sid<<" "<<pos;
+}
+
+#endif /* SINGLEHIT_H_ */
+
+
+
diff --git a/SingleModel.h b/SingleModel.h
new file mode 100644
index 0000000..87f4138
--- /dev/null
+++ b/SingleModel.h
@@ -0,0 +1,526 @@
+#ifndef SINGLEMODEL_H_
+#define SINGLEMODEL_H_
+
+#include<cmath>
+#include<cstdio>
+#include<cassert>
+#include<cstring>
+#include<string>
+#include<algorithm>
+#include<sstream>
+#include<iostream>
+#include<vector>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "Orientation.h"
+#include "LenDist.h"
+#include "RSPD.h"
+#include "Profile.h"
+#include "NoiseProfile.h"
+
+#include "ModelParams.h"
+#include "RefSeq.h"
+#include "Refs.h"
+#include "SingleRead.h"
+#include "SingleHit.h"
+#include "ReadReader.h"
+
+#include "simul.h"
+
+// Model for single-end reads held as SingleRead objects. It bundles an
+// orientation model (ori), two length distributions (gld for fragment lengths,
+// mld for read lengths within a fragment; mld is optional), the RSPD (rspd),
+// a sequence profile (pro), a noise profile (npro) and the per-reference
+// masking weights (mw).
+// NOTE(review): the class owns raw pointers and declares no copy constructor
+// or assignment operator, so copying an instance would lead to double deletes.
+class SingleModel {
+public:
+ // Creates a model with default-constructed components. M is taken from refs
+ // when refs is given and is 0 otherwise; mld stays NULL and mean is -1.
+ SingleModel(Refs* refs = NULL) {
+ this->refs = refs;
+ M = (refs != NULL ? refs->getM() : 0);
+ memset(N, 0, sizeof(N));
+ estRSPD = false;
+ needCalcConPrb = true;
+
+ ori = new Orientation();
+ gld = new LenDist();
+ mld = NULL;
+ rspd = new RSPD(estRSPD);
+ pro = new Profile();
+ npro = new NoiseProfile();
+
+ mean = -1.0; sd = 0.0;
+ mw = NULL;
+
+ seedLen = 0;
+ }
+
+ //If it is not a master node, only init & update can be used!
+ // gld, mld and the non-estimated RSPD are created only when isMaster is true;
+ // ori, pro, npro and the estimated RSPD are created for every instance.
+ SingleModel(ModelParams& params, bool isMaster = true) {
+ M = params.M;
+ memcpy(N, params.N, sizeof(params.N));
+ refs = params.refs;
+ estRSPD = params.estRSPD;
+ mean = params.mean; sd = params.sd;
+ seedLen = params.seedLen;
+ needCalcConPrb = true;
+
+ ori = NULL; gld = NULL; mld = NULL; rspd = NULL; pro = NULL; npro = NULL;
+ mw = NULL;
+
+ if (isMaster) {
+ gld = new LenDist(params.minL, params.maxL);
+ // mld is only used when a positive mean was supplied
+ if (mean >= EPSILON) {
+ mld = new LenDist(params.mate_minL, params.mate_maxL);
+ }
+ if (!estRSPD) { rspd = new RSPD(estRSPD); }
+ }
+
+ ori = new Orientation(params.probF);
+ if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
+ pro = new Profile(params.maxL);
+ npro = new NoiseProfile();
+ }
+
+ // Releases every owned component; refs is not owned and is only cleared.
+ ~SingleModel() {
+ refs = NULL;
+ if (ori != NULL) delete ori;
+ if (gld != NULL) delete gld;
+ if (mld != NULL) delete mld;
+ if (rspd != NULL) delete rspd;
+ if (pro != NULL) delete pro;
+ if (npro != NULL) delete npro;
+ if (mw != NULL) delete[] mw;
+ /* delete[] p1, p2 */
+ }
+
+ void estimateFromReads(const char*);
+
+ //if prob is too small, just make it 0
+ // Returns the conditional probability of `read` given the alignment `hit`,
+ // divided by the masking weight mw[sid]. The result is 0 for low-quality
+ // reads, for seeds that start at or beyond fullLen or on a masked position,
+ // when the probability is below EPSILON, or when mw[sid] is below EPSILON.
+ double getConPrb(const SingleRead& read, const SingleHit& hit) {
+ if (read.isLowQuality()) return 0.0;
+
+ double prob;
+ int sid = hit.getSid();
+ RefSeq &ref = refs->getRef(sid);
+ int fullLen = ref.getFullLen();
+ int totLen = ref.getTotLen();
+ int dir = hit.getDir();
+ int pos = hit.getPos();
+ int readLen = read.getReadLength();
+ int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
+
+ general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
+ " from the forward direction, which should be a non-negative number! " + \
+ "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+ general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
+ + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+ general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \
+ + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!");
+
+ int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
+ if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
+
+ int effL;
+ double value;
+
+ // With a mate length distribution the fragment length is unknown, so the
+ // probability is summed over every admissible fragment length.
+ if (mld != NULL) {
+ int minL = std::max(readLen, gld->getMinL());
+ int maxL = std::min(totLen - pos, gld->getMaxL());
+ int pfpos; // possible fpos for fragment
+ value = 0.0;
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
+ }
+ }
+ else {
+ // Without mld the read length is used as the fragment length.
+ effL = std::min(fullLen, totLen - readLen + 1);
+ value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
+ }
+
+ prob = ori->getProb(dir) * value * pro->getProb(read.getReadSeq(), ref, pos, dir);
+
+ if (prob < EPSILON) { prob = 0.0; }
+
+
+ prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
+
+ return prob;
+ }
+
+ // Returns the probability of `read` under the noise model (length
+ // probability times noise profile probability), divided by mw[0]; 0 for
+ // low-quality reads or values below EPSILON.
+ double getNoiseConPrb(const SingleRead& read) {
+ if (read.isLowQuality()) return 0.0;
+ double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
+ prob *= npro->getProb(read.getReadSeq());
+ if (prob < EPSILON) { prob = 0.0; }
+
+ prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
+
+ return prob;
+ }
+
+ double getLogP() { return npro->getLogP(); }
+
+ void init();
+
+ // Adds the alignment (read, hit) with weight frac to the RSPD (when it is
+ // being estimated) and to the profile. Low-quality reads and weights below
+ // EPSILON are ignored.
+ void update(const SingleRead& read, const SingleHit& hit, double frac) {
+ if (read.isLowQuality() || frac < EPSILON) return;
+
+ RefSeq& ref = refs->getRef(hit.getSid());
+ int dir = hit.getDir();
+ int pos = hit.getPos();
+
+ if (estRSPD) {
+ int fullLen = ref.getFullLen();
+
+ // Only use one strand to estimate RSPD
+ if (ori->getProb(0) >= ORIVALVE && dir == 0) {
+ rspd->update(pos, fullLen, frac);
+ }
+
+ if (ori->getProb(0) < ORIVALVE && dir == 1) {
+ int totLen = ref.getTotLen();
+ int readLen = read.getReadLength();
+
+ int pfpos, effL;
+
+ if (mld != NULL) {
+ int minL = std::max(readLen, gld->getMinL());
+ int maxL = std::min(totLen - pos, gld->getMaxL());
+ double sum = 0.0;
+ assert(maxL >= minL);
+ std::vector<double> frag_vec(maxL - minL + 1, 0.0);
+
+ // First pass: weight of every admissible fragment length.
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = totLen - pos - fragLen;
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
+ sum += frag_vec[fragLen - minL];
+ }
+ assert(sum >= EPSILON);
+ // Second pass: distribute frac over the start positions in proportion to those weights.
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = totLen - pos - fragLen;
+ rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
+ }
+ }
+ else {
+ rspd->update(totLen - pos - readLen, fullLen, frac);
+ }
+ }
+ }
+ pro->update(read.getReadSeq(), ref, pos, dir, frac);
+ }
+
+ // Adds `read` with weight frac to the noise profile; low-quality reads and
+ // weights below EPSILON are ignored.
+ void updateNoise(const SingleRead& read, double frac) {
+ if (read.isLowQuality() || frac < EPSILON) return;
+
+ npro->update(read.getReadSeq(), frac);
+ }
+
+ void finish();
+
+ void collect(const SingleModel&);
+
+ bool getNeedCalcConPrb() { return needCalcConPrb; }
+ void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
+
+ //void calcP1();
+ //void calcP2();
+ //double* getP1() { return p1; }
+ //double* getP2() { return p2; }
+
+ void read(const char*);
+ void write(const char*);
+
+ const LenDist& getGLD() { return *gld; }
+
+ void startSimulation(simul*, const std::vector<double>&);
+ bool simulate(READ_INT_TYPE, SingleRead&, int&);
+ void finishSimulation();
+
+ // Returns the masking weights; asserts that they have been allocated.
+ const double* getMW() {
+ assert(mw != NULL);
+ return mw;
+ }
+
+ int getModelType() const { return model_type; }
+
+private:
+ static const int model_type = 0;
+ static const int read_type = 0;
+
+ int M; // refs->getM() or params.M
+ READ_INT_TYPE N[3]; // estimateFromReads only processes read file group i when N[i] > 0
+ Refs *refs; // not owned
+ double mean, sd;
+ int seedLen;
+ //double *p1, *p2; P_i' & P_i''
+
+ bool estRSPD; // true if estimate RSPD
+ bool needCalcConPrb; // true need, false does not need
+
+ Orientation *ori;
+ LenDist *gld, *mld;
+ RSPD *rspd;
+ Profile *pro;
+ NoiseProfile *npro;
+
+ // NOTE(review): sampler and theta_cdf are not initialised by the constructors;
+ // they are only assigned in startSimulation.
+ simul *sampler; // for simulation
+ double *theta_cdf; // for simulation
+
+ double *mw; // for masking
+
+ void calcMW();
+};
+
+// Scans the read files derived from readFN to fill the length distribution
+// (mld when it exists, otherwise gld) and the noise profile counts, then
+// allocates and computes the masking weights mw.
+// Reads flagged as low quality are skipped; those shorter than seedLen are
+// reported on stderr (at most MAX_WARNS individual warnings).
+// Declared inline because the definition lives in a header.
+inline void SingleModel::estimateFromReads(const char* readFN) {
+  int s;
+  char readFs[2][STRLEN];
+  SingleRead read;
+
+  int n_warns = 0;
+
+  // Read lengths are collected into mld when present, otherwise into gld.
+  LenDist* lenDist = (mld != NULL ? mld : gld);
+  lenDist->init();
+
+  for (int i = 0; i < 3; i++) {
+    if (!(N[i] > 0)) continue;
+
+    genReadFileNames(readFN, i, read_type, s, readFs);
+    ReadReader<SingleRead> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
+
+    READ_INT_TYPE cnt = 0;
+    while (reader.next(read)) {
+      if (!read.isLowQuality()) {
+        lenDist->update(read.getReadLength(), 1.0);
+        // only the first read file group feeds the noise profile counts
+        if (i == 0) { npro->updateC(read.getReadSeq()); }
+      }
+      else if (read.getReadLength() < seedLen) {
+        if (++n_warns <= MAX_WARNS)
+          fprintf(stderr, "Warning: Read %s is ignored due to read length (= %d) < seed length (= %d)!\n", read.getName().c_str(), read.getReadLength(), seedLen);
+      }
+
+      ++cnt;
+      if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
+    }
+
+    if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; }
+  }
+
+  if (n_warns > 0) fprintf(stderr, "Warning: There are %d reads ignored in total.\n", n_warns);
+
+  lenDist->finish();
+  if (mean >= EPSILON) { //mean should be > 0
+    assert(mld->getMaxL() <= gld->getMaxL());
+    gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
+  }
+  npro->calcInitParams();
+
+  // Free weights left by an earlier read()/estimateFromReads() call; mw is
+  // NULL after construction, so this is a no-op on first use.
+  delete[] mw;
+  mw = new double[M + 1];
+  calcMW();
+}
+
+// Calls init() on the RSPD (only when it is being estimated), on the profile
+// and on the noise profile.
+void SingleModel::init() {
+  if (estRSPD) {
+    rspd->init();
+  }
+  pro->init();
+  npro->init();
+}
+
+// Calls finish() on the RSPD (only when it is being estimated), the profile
+// and the noise profile, flags the conditional probabilities as needing
+// recalculation and, when the RSPD is estimated, recomputes the masking weights.
+void SingleModel::finish() {
+  if (estRSPD) {
+    rspd->finish();
+  }
+  pro->finish();
+  npro->finish();
+
+  needCalcConPrb = true;
+
+  if (estRSPD) {
+    calcMW();
+  }
+}
+
+// Passes the components of another model `o` to the collect() methods of this
+// model's RSPD (only when it is being estimated), profile and noise profile.
+void SingleModel::collect(const SingleModel& o) {
+  if (estRSPD) {
+    rspd->collect(*o.rspd);
+  }
+  pro->collect(*o.pro);
+  npro->collect(*o.npro);
+}
+
+//Only master node can call
+// Loads the model from the text file inpF: model type, orientation, gld, a
+// flag telling whether mld follows, (mld), rspd, profile, noise profile and,
+// if present, M followed by the M + 1 masking weights.
+// The parsing calls are kept out of assert() because an assert argument is
+// not evaluated when NDEBUG is defined, which would silently skip the reads.
+// Declared inline because the definition lives in a header.
+inline void SingleModel::read(const char* inpF) {
+  int val;
+  int nParsed;
+  FILE *fi = fopen(inpF, "r");
+
+  general_assert(fi != NULL, "Cannot open " + cstrtos(inpF) + "! It may not exist.");
+
+  nParsed = fscanf(fi, "%d", &val);
+  general_assert(nParsed == 1, "Cannot read the model type from " + cstrtos(inpF) + "!");
+  general_assert(val == model_type, "The model type stored in " + cstrtos(inpF) + " does not match this model!");
+
+  ori->read(fi);
+  gld->read(fi);
+  nParsed = fscanf(fi, "%d", &val);
+  general_assert(nParsed == 1, "Cannot read the mate length distribution flag from " + cstrtos(inpF) + "!");
+  if (val > 0) {
+    if (mld == NULL) mld = new LenDist();
+    mld->read(fi);
+  }
+  rspd->read(fi);
+  pro->read(fi);
+  npro->read(fi);
+
+  // The masking weights are optional; they are only loaded when the stored M
+  // agrees with this model's M (or M has not been set yet).
+  if (fscanf(fi, "%d", &val) == 1) {
+    if (M == 0) M = val;
+    if (M == val) {
+      delete[] mw; // mw is NULL after construction; avoids leaking an earlier buffer
+      mw = new double[M + 1];
+      for (int i = 0; i <= M; i++) {
+        nParsed = fscanf(fi, "%lf", &mw[i]);
+        general_assert(nParsed == 1, "Cannot read the masking weights from " + cstrtos(inpF) + "!");
+      }
+    }
+  }
+
+  fclose(fi);
+}
+
+//Only master node can call. Only be called at EM.cpp
+// Writes the model to outF as text: model type, orientation, gld, a 1/0 flag
+// for mld (followed by mld when present), rspd, profile, noise profile and,
+// when mw has been computed, M followed by the M + 1 masking weights.
+// Declared inline because the definition lives in a header.
+inline void SingleModel::write(const char* outF) {
+  FILE *fo = fopen(outF, "w");
+
+  // Stop with a message instead of handing a NULL stream to fprintf.
+  general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + " for writing!");
+
+  fprintf(fo, "%d\n", model_type);
+  fprintf(fo, "\n");
+
+  ori->write(fo); fprintf(fo, "\n");
+  gld->write(fo); fprintf(fo, "\n");
+  if (mld != NULL) {
+    fprintf(fo, "1\n");
+    mld->write(fo);
+  }
+  else { fprintf(fo, "0\n"); }
+  fprintf(fo, "\n");
+  rspd->write(fo); fprintf(fo, "\n");
+  pro->write(fo); fprintf(fo, "\n");
+  npro->write(fo);
+
+  if (mw != NULL) {
+    fprintf(fo, "\n%d\n", M);
+    // weights 0..M-1 are space separated, the last one ends the line
+    for (int i = 0; i < M; i++) {
+      fprintf(fo, "%.15g ", mw[i]);
+    }
+    fprintf(fo, "%.15g\n", mw[M]);
+  }
+
+  fclose(fo);
+}
+
+// Prepares the model for simulate(): stores the sampler, builds the running
+// sum of theta[0..M] in theta_cdf and calls startSimulation on the RSPD, the
+// profile and the noise profile.
+void SingleModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
+  this->sampler = sampler;
+
+  theta_cdf = new double[M + 1];
+  for (int idx = 0; idx <= M; ++idx) {
+    // first entry is copied, every later entry adds the previous partial sum
+    theta_cdf[idx] = (idx == 0) ? theta[idx] : theta[idx] + theta_cdf[idx - 1];
+  }
+
+  rspd->startSimulation(M, refs);
+  pro->startSimulation();
+  npro->startSimulation();
+}
+
+// Draws one simulated read. sid is sampled from theta_cdf; sid 0 produces a
+// noise read, any other sid produces a read from that reference using the
+// sampled direction, fragment length and position. Returns false when one of
+// the length/position samplers returns a negative value, true otherwise.
+// The read name is "<rid>_<dir>_<sid>_<pos>".
+bool SingleModel::simulate(READ_INT_TYPE rid, SingleRead& read, int& sid) {
+ int dir, pos, readLen, fragLen;
+ std::string name;
+ std::string readseq;
+ std::ostringstream strout;
+
+ sid = sampler->sample(theta_cdf, M + 1);
+
+ if (sid == 0) {
+ // noise read: only a length and a sequence from the noise profile
+ dir = pos = 0;
+ readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
+ readseq = npro->simulate(sampler, readLen);
+ }
+ else {
+ RefSeq &ref = refs->getRef(sid);
+ dir = ori->simulate(sampler);
+ fragLen = gld->simulate(sampler, ref.getTotLen());
+ if (fragLen < 0) return false;
+ int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
+ pos = rspd->simulate(sampler, sid, effL);
+ if (pos < 0) return false;
+ // for dir > 0 the position is converted using the total length and fragment length
+ if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
+
+ if (mld != NULL) {
+ readLen = mld->simulate(sampler, fragLen);
+ if (readLen < 0) return false;
+ readseq = pro->simulate(sampler, readLen, pos, dir, ref);
+ }
+ else {
+ // without mld the whole fragment is the read
+ readseq = pro->simulate(sampler, fragLen, pos, dir, ref);
+ }
+ }
+
+ strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
+ name = strout.str();
+
+ read = SingleRead(name, readseq);
+
+ return true;
+}
+
+// Releases the cumulative theta array built by startSimulation and calls
+// finishSimulation on the RSPD, the profile and the noise profile.
+// Declared inline because the definition lives in a header.
+inline void SingleModel::finishSimulation() {
+  delete[] theta_cdf;
+  theta_cdf = NULL; // avoid a dangling pointer / double delete on a repeated call
+
+  rspd->finishSimulation();
+  pro->finishSimulation();
+  npro->finishSimulation();
+}
+
+// Fills the masking weights. mw[0] is set to 1. For every reference i in 1..M,
+// mw[i] = 1 - value, where value accumulates
+// orientation prob * gld prob * rspd prob * factor over:
+//   - every masked seed position below `end`, for both orientations, and
+//   - every seed position from `end` to totLen - seedLen, reverse orientation only.
+// Weights below 1e-8 are set to 0. mw must already be allocated with M + 1 entries.
+void SingleModel::calcMW() {
+ double probF, probR;
+
+ assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
+
+ memset(mw, 0, sizeof(double) * (M + 1));
+ mw[0] = 1.0;
+
+ probF = ori->getProb(0);
+ probR = ori->getProb(1);
+
+ for (int i = 1; i <= M; i++) {
+ RefSeq& ref = refs->getRef(i);
+ int totLen = ref.getTotLen();
+ int fullLen = ref.getFullLen();
+ double value = 0.0;
+ int minL, maxL;
+ int effL, pfpos;
+ int end = std::min(fullLen, totLen - seedLen + 1);
+ double factor;
+
+ for (int seedPos = 0; seedPos < end; seedPos++)
+ if (ref.getMask(seedPos)) {
+ //forward
+ minL = gld->getMinL();
+ maxL = std::min(gld->getMaxL(), totLen - seedPos);
+ pfpos = seedPos;
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+ value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ }
+ //reverse
+ minL = gld->getMinL();
+ maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ // fragment start in forward coordinates when the seed sits at the fragment's end
+ pfpos = seedPos - (fragLen - seedLen);
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+ value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ }
+ }
+
+ //for reverse strand masking
+ for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
+ minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
+ maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = seedPos - (fragLen - seedLen);
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+ value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ }
+ }
+
+ mw[i] = 1.0 - value;
+
+ // treat a (numerically) fully masked reference as having weight 0
+ if (mw[i] < 1e-8) {
+ // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+ mw[i] = 0.0;
+ }
+ }
+}
+
+#endif /* SINGLEMODEL_H_ */
diff --git a/SingleQModel.h b/SingleQModel.h
new file mode 100644
index 0000000..4989ed1
--- /dev/null
+++ b/SingleQModel.h
@@ -0,0 +1,546 @@
+#ifndef SINGLEQMODEL_H_
+#define SINGLEQMODEL_H_
+
+#include<cmath>
+#include<cstdio>
+#include<cassert>
+#include<cstring>
+#include<string>
+#include<algorithm>
+#include<sstream>
+#include<iostream>
+#include<vector>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "Orientation.h"
+#include "LenDist.h"
+#include "RSPD.h"
+#include "QualDist.h"
+#include "QProfile.h"
+#include "NoiseQProfile.h"
+
+#include "ModelParams.h"
+#include "RefSeq.h"
+#include "Refs.h"
+#include "SingleReadQ.h"
+#include "SingleHit.h"
+#include "ReadReader.h"
+
+#include "simul.h"
+
+// Model for single-end reads with quality scores, held as SingleReadQ objects.
+// It bundles an orientation model (ori), two length distributions (gld for
+// fragment lengths, mld for read lengths within a fragment; mld is optional),
+// the RSPD (rspd), a quality score distribution (qd), a quality-aware profile
+// (qpro), a quality-aware noise profile (nqpro) and the masking weights (mw).
+// NOTE(review): the class owns raw pointers and declares no copy constructor
+// or assignment operator, so copying an instance would lead to double deletes.
+class SingleQModel {
+public:
+ // Creates a model with default-constructed components. M is taken from refs
+ // when refs is given and is 0 otherwise; mld stays NULL and mean is -1.
+ SingleQModel(Refs* refs = NULL) {
+ this->refs = refs;
+ M = (refs != NULL ? refs->getM() : 0);
+ memset(N, 0, sizeof(N));
+ estRSPD = false;
+ needCalcConPrb = true;
+
+ ori = new Orientation();
+ gld = new LenDist();
+ mld = NULL;
+ rspd = new RSPD(estRSPD);
+ qd = new QualDist();
+ qpro = new QProfile();
+ nqpro = new NoiseQProfile();
+
+ mean = -1.0; sd = 0.0;
+ mw = NULL;
+
+ seedLen = 0;
+ }
+
+ //If it is not a master node, only init & update can be used!
+ // gld, mld, qd and the non-estimated RSPD are created only when isMaster is
+ // true; ori, qpro, nqpro and the estimated RSPD are created for every instance.
+ SingleQModel(ModelParams& params, bool isMaster = true) {
+ M = params.M;
+ memcpy(N, params.N, sizeof(params.N));
+ refs = params.refs;
+ estRSPD = params.estRSPD;
+ mean = params.mean; sd = params.sd;
+ seedLen = params.seedLen;
+ needCalcConPrb = true;
+
+ ori = NULL; gld = NULL; mld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL;
+ mw = NULL;
+
+ if (isMaster) {
+ gld = new LenDist(params.minL, params.maxL);
+ // mld is only used when a positive mean was supplied
+ if (mean >= EPSILON) {
+ mld = new LenDist(params.mate_minL, params.mate_maxL);
+ }
+ if (!estRSPD) { rspd = new RSPD(estRSPD); }
+ qd = new QualDist();
+ }
+
+ ori = new Orientation(params.probF);
+ if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
+ qpro = new QProfile();
+ nqpro = new NoiseQProfile();
+ }
+
+ // Releases every owned component; refs is not owned and is only cleared.
+ ~SingleQModel() {
+ refs = NULL;
+ if (ori != NULL) delete ori;
+ if (gld != NULL) delete gld;
+ if (mld != NULL) delete mld;
+ if (rspd != NULL) delete rspd;
+ if (qd != NULL) delete qd;
+ if (qpro != NULL) delete qpro;
+ if (nqpro != NULL) delete nqpro;
+ if (mw != NULL) delete[] mw;
+ /*delete[] p1, p2;*/
+ }
+
+ //SingleQModel& operator=(const SingleQModel&);
+
+ void estimateFromReads(const char*);
+
+ //if prob is too small, just make it 0
+ // Returns the conditional probability of `read` given the alignment `hit`,
+ // divided by the masking weight mw[sid]. The result is 0 for low-quality
+ // reads, for seeds that start at or beyond fullLen or on a masked position,
+ // when the probability is below EPSILON, or when mw[sid] is below EPSILON.
+ double getConPrb(const SingleReadQ& read, const SingleHit& hit) const {
+ if (read.isLowQuality()) return 0.0;
+
+ double prob;
+ int sid = hit.getSid();
+ RefSeq &ref = refs->getRef(sid);
+ int fullLen = ref.getFullLen();
+ int totLen = ref.getTotLen();
+ int dir = hit.getDir();
+ int pos = hit.getPos();
+ int readLen = read.getReadLength();
+ int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
+
+ general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
+ " from the forward direction, which should be a non-negative number! " + \
+ "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+ general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
+ + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
+ general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \
+ + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!");
+
+ int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
+ if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
+
+ int effL;
+ double value;
+
+ // With a mate length distribution the fragment length is unknown, so the
+ // probability is summed over every admissible fragment length.
+ if (mld != NULL) {
+ int minL = std::max(readLen, gld->getMinL());
+ int maxL = std::min(totLen - pos, gld->getMaxL());
+ int pfpos; // possible fpos for fragment
+ value = 0.0;
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
+ }
+ }
+ else {
+ // Without mld the read length is used as the fragment length.
+ effL = std::min(fullLen, totLen - readLen + 1);
+ value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
+ }
+
+ prob = ori->getProb(dir) * value * qpro->getProb(read.getReadSeq(), read.getQScore(), ref, pos, dir);
+
+ if (prob < EPSILON) { prob = 0.0; }
+
+ prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
+
+ return prob;
+ }
+
+ // Returns the probability of `read` under the noise model (length
+ // probability times noise profile probability), divided by mw[0]; 0 for
+ // low-quality reads or values below EPSILON.
+ double getNoiseConPrb(const SingleReadQ& read) {
+ if (read.isLowQuality()) return 0.0;
+ double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
+ prob *= nqpro->getProb(read.getReadSeq(), read.getQScore());
+ if (prob < EPSILON) { prob = 0.0; }
+
+ prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
+
+ return prob;
+ }
+
+ double getLogP() { return nqpro->getLogP(); }
+
+ void init();
+
+ // Adds the alignment (read, hit) with weight frac to the RSPD (when it is
+ // being estimated) and to the quality profile. Low-quality reads and weights
+ // below EPSILON are ignored.
+ void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
+ if (read.isLowQuality() || frac < EPSILON) return;
+
+ const RefSeq& ref = refs->getRef(hit.getSid());
+
+ int dir = hit.getDir();
+ int pos = hit.getPos();
+
+ if (estRSPD) {
+ int fullLen = ref.getFullLen();
+
+ // Only use one strand to estimate RSPD
+ if (ori->getProb(0) >= ORIVALVE && dir == 0) {
+ rspd->update(pos, fullLen, frac);
+ }
+
+ if (ori->getProb(0) < ORIVALVE && dir == 1) {
+ int totLen = ref.getTotLen();
+ int readLen = read.getReadLength();
+
+ int pfpos, effL;
+
+ if (mld != NULL) {
+ int minL = std::max(readLen, gld->getMinL());
+ int maxL = std::min(totLen - pos, gld->getMaxL());
+ double sum = 0.0;
+ assert(maxL >= minL);
+ std::vector<double> frag_vec(maxL - minL + 1, 0.0);
+
+ // First pass: weight of every admissible fragment length.
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = totLen - pos - fragLen;
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
+ sum += frag_vec[fragLen - minL];
+ }
+ assert(sum >= EPSILON);
+ // Second pass: distribute frac over the start positions in proportion to those weights.
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = totLen - pos - fragLen;
+ rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
+ }
+ }
+ else {
+ rspd->update(totLen - pos - readLen, fullLen, frac);
+ }
+ }
+ }
+ qpro->update(read.getReadSeq(), read.getQScore(), ref, pos, dir, frac);
+ }
+
+ // Adds `read` with weight frac to the noise profile; low-quality reads and
+ // weights below EPSILON are ignored.
+ void updateNoise(const SingleReadQ& read, double frac) {
+ if (read.isLowQuality() || frac < EPSILON) return;
+
+ nqpro->update(read.getReadSeq(), read.getQScore(), frac);
+ }
+
+ void finish();
+
+ void collect(const SingleQModel&);
+
+ //void copy(const SingleQModel&);
+
+ bool getNeedCalcConPrb() { return needCalcConPrb; }
+ void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
+
+ //void calcP1();
+ //void calcP2();
+ //double* getP1() { return p1; }
+ //double* getP2() { return p2; }
+
+ void read(const char*);
+ void write(const char*);
+
+ const LenDist& getGLD() { return *gld; }
+
+ void startSimulation(simul*, const std::vector<double>&);
+ bool simulate(READ_INT_TYPE, SingleReadQ&, int&);
+ void finishSimulation();
+
+ //Use it after function 'read' or 'estimateFromReads'
+ const double* getMW() {
+ assert(mw != NULL);
+ return mw;
+ }
+
+ int getModelType() const { return model_type; }
+
+private:
+ static const int model_type = 1;
+ static const int read_type = 1;
+
+ int M; // refs->getM() or params.M
+ READ_INT_TYPE N[3]; // estimateFromReads only processes read file group i when N[i] > 0
+ Refs *refs; // not owned
+ double mean, sd;
+ int seedLen;
+ //double *p1, *p2; P_i' & P_i'';
+
+ bool estRSPD; // true if estimate RSPD
+ bool needCalcConPrb; //true need, false does not need
+
+ Orientation *ori;
+ LenDist *gld, *mld;
+ RSPD *rspd;
+ QualDist *qd;
+ QProfile *qpro;
+ NoiseQProfile *nqpro;
+
+ // NOTE(review): sampler and theta_cdf are not initialised by the constructors;
+ // they are only assigned in startSimulation.
+ simul *sampler; // for simulation
+ double *theta_cdf; // for simulation
+
+ double *mw; // for masking
+
+ void calcMW();
+};
+
+// Scans the read files derived from readFN to fill the length distribution
+// (mld when it exists, otherwise gld), the quality score distribution and the
+// noise profile counts, then allocates and computes the masking weights mw.
+// Reads flagged as low quality are skipped; those shorter than seedLen are
+// reported on stderr (at most MAX_WARNS individual warnings).
+// Declared inline because the definition lives in a header.
+inline void SingleQModel::estimateFromReads(const char* readFN) {
+  int s;
+  char readFs[2][STRLEN];
+  SingleReadQ read;
+
+  int n_warns = 0;
+
+  // Read lengths are collected into mld when present, otherwise into gld.
+  LenDist* lenDist = (mld != NULL ? mld : gld);
+  lenDist->init();
+
+  for (int i = 0; i < 3; i++) {
+    if (!(N[i] > 0)) continue;
+
+    genReadFileNames(readFN, i, read_type, s, readFs);
+    ReadReader<SingleReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
+
+    READ_INT_TYPE cnt = 0;
+    while (reader.next(read)) {
+      if (!read.isLowQuality()) {
+        lenDist->update(read.getReadLength(), 1.0);
+        qd->update(read.getQScore());
+        // only the first read file group feeds the noise profile counts
+        if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
+      }
+      else if (read.getReadLength() < seedLen) {
+        if (++n_warns <= MAX_WARNS)
+          fprintf(stderr, "Warning: Read %s is ignored due to read length (= %d) < seed length (= %d)!\n", read.getName().c_str(), read.getReadLength(), seedLen);
+      }
+
+      ++cnt;
+      if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
+    }
+
+    if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; }
+  }
+
+  if (n_warns > 0) fprintf(stderr, "Warning: There are %d reads ignored in total.\n", n_warns);
+
+  lenDist->finish();
+  if (mean >= EPSILON) { //mean should be > 0
+    assert(mld->getMaxL() <= gld->getMaxL());
+    gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
+  }
+  qd->finish();
+  nqpro->calcInitParams();
+
+  // Free weights left by an earlier read()/estimateFromReads() call; mw is
+  // NULL after construction, so this is a no-op on first use.
+  delete[] mw;
+  mw = new double[M + 1];
+  calcMW();
+}
+
+// Calls init() on the RSPD (only when it is being estimated), on the quality
+// profile and on the noise profile.
+void SingleQModel::init() {
+  if (estRSPD) {
+    rspd->init();
+  }
+  qpro->init();
+  nqpro->init();
+}
+
+// Calls finish() on the RSPD (only when it is being estimated), the quality
+// profile and the noise profile, flags the conditional probabilities as
+// needing recalculation and, when the RSPD is estimated, recomputes the
+// masking weights.
+void SingleQModel::finish() {
+  if (estRSPD) {
+    rspd->finish();
+  }
+  qpro->finish();
+  nqpro->finish();
+
+  needCalcConPrb = true;
+
+  if (estRSPD) {
+    calcMW();
+  }
+}
+
+// Passes the components of another model `o` to the collect() methods of this
+// model's RSPD (only when it is being estimated), quality profile and noise
+// profile.
+void SingleQModel::collect(const SingleQModel& o) {
+  if (estRSPD) {
+    rspd->collect(*o.rspd);
+  }
+  qpro->collect(*o.qpro);
+  nqpro->collect(*o.nqpro);
+}
+
+//Only master node can call
+// Loads the model from the text file inpF: model type, orientation, gld, a
+// flag telling whether mld follows, (mld), rspd, quality distribution,
+// quality profile, noise profile and, if present, M followed by the M + 1
+// masking weights.
+// The parsing calls are kept out of assert() because an assert argument is
+// not evaluated when NDEBUG is defined, which would silently skip the reads.
+// Declared inline because the definition lives in a header.
+inline void SingleQModel::read(const char* inpF) {
+  int val;
+  int nParsed;
+  FILE *fi = fopen(inpF, "r");
+
+  general_assert(fi != NULL, "Cannot open " + cstrtos(inpF) + "! It may not exist.");
+
+  nParsed = fscanf(fi, "%d", &val);
+  general_assert(nParsed == 1, "Cannot read the model type from " + cstrtos(inpF) + "!");
+  general_assert(val == model_type, "The model type stored in " + cstrtos(inpF) + " does not match this model!");
+
+  ori->read(fi);
+  gld->read(fi);
+  nParsed = fscanf(fi, "%d", &val);
+  general_assert(nParsed == 1, "Cannot read the mate length distribution flag from " + cstrtos(inpF) + "!");
+  if (val > 0) {
+    if (mld == NULL) mld = new LenDist();
+    mld->read(fi);
+  }
+  rspd->read(fi);
+  qd->read(fi);
+  qpro->read(fi);
+  nqpro->read(fi);
+
+  // The masking weights are optional; they are only loaded when the stored M
+  // agrees with this model's M (or M has not been set yet).
+  if (fscanf(fi, "%d", &val) == 1) {
+    if (M == 0) M = val;
+    if (M == val) {
+      delete[] mw; // mw is NULL after construction; avoids leaking an earlier buffer
+      mw = new double[M + 1];
+      for (int i = 0; i <= M; i++) {
+        nParsed = fscanf(fi, "%lf", &mw[i]);
+        general_assert(nParsed == 1, "Cannot read the masking weights from " + cstrtos(inpF) + "!");
+      }
+    }
+  }
+
+  fclose(fi);
+}
+
+//Only master node can call. Only be called at EM.cpp
+// Writes the model to outF as text: model type, orientation, gld, a 1/0 flag
+// for mld (followed by mld when present), rspd, quality distribution, quality
+// profile, noise profile and, when mw has been computed, M followed by the
+// M + 1 masking weights.
+// Declared inline because the definition lives in a header.
+inline void SingleQModel::write(const char* outF) {
+  FILE *fo = fopen(outF, "w");
+
+  // Stop with a message instead of handing a NULL stream to fprintf.
+  general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + " for writing!");
+
+  fprintf(fo, "%d\n", model_type);
+  fprintf(fo, "\n");
+
+  ori->write(fo); fprintf(fo, "\n");
+  gld->write(fo); fprintf(fo, "\n");
+  if (mld != NULL) {
+    fprintf(fo, "1\n");
+    mld->write(fo);
+  }
+  else { fprintf(fo, "0\n"); }
+  fprintf(fo, "\n");
+  rspd->write(fo); fprintf(fo, "\n");
+  qd->write(fo); fprintf(fo, "\n");
+  qpro->write(fo); fprintf(fo, "\n");
+  nqpro->write(fo);
+
+  if (mw != NULL) {
+    fprintf(fo, "\n%d\n", M);
+    // weights 0..M-1 are space separated, the last one ends the line
+    for (int i = 0; i < M; i++) {
+      fprintf(fo, "%.15g ", mw[i]);
+    }
+    fprintf(fo, "%.15g\n", mw[M]);
+  }
+
+  fclose(fo);
+}
+
+// Prepares the model for simulate(): stores the sampler, builds the running
+// sum of theta[0..M] in theta_cdf and calls startSimulation on the RSPD, the
+// quality distribution, the quality profile and the noise profile.
+void SingleQModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
+  this->sampler = sampler;
+
+  theta_cdf = new double[M + 1];
+  for (int idx = 0; idx <= M; ++idx) {
+    // first entry is copied, every later entry adds the previous partial sum
+    theta_cdf[idx] = (idx == 0) ? theta[idx] : theta[idx] + theta_cdf[idx - 1];
+  }
+
+  rspd->startSimulation(M, refs);
+  qd->startSimulation();
+  qpro->startSimulation();
+  nqpro->startSimulation();
+}
+
+// Draws one simulated read with quality scores. sid is sampled from
+// theta_cdf; sid 0 produces a noise read, any other sid produces a read from
+// that reference using the sampled direction, fragment length and position.
+// Returns false when one of the length/position samplers returns a negative
+// value, true otherwise. The read name is "<rid>_<dir>_<sid>_<pos>".
+bool SingleQModel::simulate(READ_INT_TYPE rid, SingleReadQ& read, int& sid) {
+ int dir, pos, readLen, fragLen;
+ std::string name;
+ std::string qual, readseq;
+ std::ostringstream strout;
+
+ sid = sampler->sample(theta_cdf, M + 1);
+
+ if (sid == 0) {
+ // noise read: a length, a quality string and a sequence from the noise profile
+ dir = pos = 0;
+ readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
+ qual = qd->simulate(sampler, readLen);
+ readseq = nqpro->simulate(sampler, readLen, qual);
+ }
+ else {
+ RefSeq &ref = refs->getRef(sid);
+ dir = ori->simulate(sampler);
+ fragLen = gld->simulate(sampler, ref.getTotLen());
+ if (fragLen < 0) return false;
+
+ int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
+ pos = rspd->simulate(sampler, sid, effL);
+ if (pos < 0) return false;
+ // for dir > 0 the position is converted using the total length and fragment length
+ if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
+
+ if (mld != NULL) {
+ readLen = mld->simulate(sampler, fragLen);
+ if (readLen < 0) return false;
+ qual = qd->simulate(sampler, readLen);
+ readseq = qpro->simulate(sampler, readLen, pos, dir, qual, ref);
+ }
+ else {
+ // without mld the whole fragment is the read
+ qual = qd->simulate(sampler, fragLen);
+ readseq = qpro->simulate(sampler, fragLen, pos, dir, qual, ref);
+ }
+ }
+
+ strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
+ name = strout.str();
+
+ read = SingleReadQ(name, readseq, qual);
+
+ return true;
+}
+
+// Releases the cumulative theta array built by startSimulation and calls
+// finishSimulation on the RSPD, the quality distribution, the quality profile
+// and the noise profile.
+// Declared inline because the definition lives in a header.
+inline void SingleQModel::finishSimulation() {
+  delete[] theta_cdf;
+  theta_cdf = NULL; // avoid a dangling pointer / double delete on a repeated call
+
+  rspd->finishSimulation();
+  qd->finishSimulation();
+  qpro->finishSimulation();
+  nqpro->finishSimulation();
+}
+
+// Fills the masking weights. mw[0] is set to 1. For every reference i in 1..M,
+// mw[i] = 1 - value, where value accumulates
+// orientation prob * gld prob * rspd prob * factor over:
+//   - every masked seed position below `end`, for both orientations, and
+//   - every seed position from `end` to totLen - seedLen, reverse orientation only.
+// Weights below 1e-8 are set to 0. mw must already be allocated with M + 1 entries.
+void SingleQModel::calcMW() {
+ double probF, probR;
+
+ assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
+
+ memset(mw, 0, sizeof(double) * (M + 1));
+ mw[0] = 1.0;
+
+ probF = ori->getProb(0);
+ probR = ori->getProb(1);
+
+ for (int i = 1; i <= M; i++) {
+ RefSeq& ref = refs->getRef(i);
+ int totLen = ref.getTotLen();
+ int fullLen = ref.getFullLen();
+ double value = 0.0;
+ int minL, maxL;
+ int effL, pfpos;
+ int end = std::min(fullLen, totLen - seedLen + 1);
+ double factor;
+
+ for (int seedPos = 0; seedPos < end; seedPos++)
+ if (ref.getMask(seedPos)) {
+ //forward
+ minL = gld->getMinL();
+ maxL = std::min(gld->getMaxL(), totLen - seedPos);
+ pfpos = seedPos;
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+ value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ }
+ //reverse
+ minL = gld->getMinL();
+ maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ // fragment start in forward coordinates when the seed sits at the fragment's end
+ pfpos = seedPos - (fragLen - seedLen);
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+ value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ }
+ }
+
+ //for reverse strand masking
+ for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
+ minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
+ maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+ for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+ pfpos = seedPos - (fragLen - seedLen);
+ effL = std::min(fullLen, totLen - fragLen + 1);
+ factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+ value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+ }
+ }
+
+ mw[i] = 1.0 - value;
+
+ // treat a (numerically) fully masked reference as having weight 0
+ if (mw[i] < 1e-8) {
+ // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+ mw[i] = 0.0;
+ }
+ }
+}
+
+#endif /* SINGLEQMODEL_H_ */
diff --git a/SingleRead.h b/SingleRead.h
new file mode 100644
index 0000000..8ea4eec
--- /dev/null
+++ b/SingleRead.h
@@ -0,0 +1,92 @@
+#ifndef SINGLEREAD
+#define SINGLEREAD
+
+#include<cmath>
+#include<cstdio>
+#include<cstdlib>
+#include<cassert>
+#include<iostream>
+#include<string>
+
+#include "utils.h"
+#include "Read.h"
+
+// A single-end read without quality scores: a read name plus its base sequence.
+// The members "name" and "low_quality" used by this class are not declared here;
+// presumably they are inherited from Read (Read.h) -- confirm there.
+class SingleRead : public Read {
+public:
+  SingleRead() { readseq = ""; len = 0; }
+  SingleRead(const std::string& name, const std::string& readseq) {
+    this->name = name;
+    this->readseq = readseq;
+    this->len = readseq.length();
+  }
+
+  // Reads one record from argv[0]; flags bit 1 keeps the sequence, bit 4 keeps the name.
+  bool read(int argc, std::istream* argv[], int flags = 7);
+  // Writes this read to argv[0].
+  void write(int argc, std::ostream* argv[]);
+
+  // Returns the cached read length. The length is cached in len because read() may drop the
+  // sequence itself (when flags bit 1 is unset), in which case readseq.length() would be 0.
+  // Returned as plain int: a const qualifier on a by-value return has no effect.
+  int getReadLength() const { return len; }
+  const std::string& getReadSeq() const { return readseq; }
+
+  void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false
+
+private:
+  int len; // read length
+  std::string readseq; // read sequence
+};
+
+// Reads one two-line record (a '>' header line followed by one sequence line) from argv[0].
+//   argc  : number of input streams, must be 1
+//   argv  : input streams, only argv[0] is used
+//   flags : bit 1 keeps the read sequence, bit 4 keeps the read name; the read length is always set
+// Returns false if the stream ends before a complete record is read.
+// If return false, you should not trust the value of any member.
+// Exits the program if the header line does not start with '>'.
+bool SingleRead::read(int argc, std::istream* argv[], int flags) {
+  std::string line;
+
+  assert(argc == 1);
+  if (!getline((*argv[0]), line)) return false;
+  // An empty header line is rejected explicitly instead of indexing line[0] on an empty string.
+  // The message is newline-terminated so that it does not run into whatever is printed next.
+  if (line.empty() || line[0] != '>') { fprintf(stderr, "Read file does not look like a FASTA file!\n"); exit(-1); }
+  name = "";
+  if (flags & 4) { name = line.substr(1); }
+  if (!getline((*argv[0]), readseq)) return false;
+  len = readseq.length(); // set read length before the sequence is possibly discarded
+  if (!(flags & 1)) { readseq = ""; }
+
+  return true;
+}
+
+// Writes this read to argv[0] as two lines: ">" followed by the name, then the sequence.
+// argc must be 1; only argv[0] is used.
+void SingleRead::write(int argc, std::ostream* argv[]) {
+  assert(argc == 1);
+
+  std::ostream& out = *argv[0];
+  out << ">" << name << std::endl;
+  out << readseq << std::endl;
+}
+
+// Decides whether this read is low quality and stores the verdict in low_quality.
+//   hasPolyA : when false, only the length test below is applied
+//   seedLen  : reads shorter than this are always low quality
+// With hasPolyA set, a read dominated by 'A' is low quality if at least half (rounded up) of its
+// first OLEN bases are 'A'; a read dominated by 'T' is low quality if at least half (rounded up)
+// of its last OLEN bases are 'T'.
+void SingleRead::calc_lq(bool hasPolyA, int seedLen) {
+  low_quality = (len < seedLen);
+  if (low_quality || !hasPolyA) return;
+
+  // the composition test needs the actual bases
+  assert(readseq != "");
+
+  const int dominanceCutoff = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
+  const int overlapCutoff = (OLEN - 1) / 2 + 1;
+
+  int totalA = 0, totalT = 0;   // counts over the whole read
+  int headA = 0, tailT = 0;     // counts inside the leading / trailing OLEN bases
+  for (int pos = 0; pos < len; ++pos) {
+    const char base = readseq[pos];
+    if (base == 'A') {
+      ++totalA;
+      if (pos < OLEN) ++headA;
+    }
+    else if (base == 'T') {
+      ++totalT;
+      if (pos >= len - OLEN) ++tailT;
+    }
+  }
+
+  if (totalA >= dominanceCutoff) low_quality = (headA >= overlapCutoff);
+  else if (totalT >= dominanceCutoff) low_quality = (tailT >= overlapCutoff);
+  else low_quality = false;
+}
+
+#endif
diff --git a/SingleReadQ.h b/SingleReadQ.h
new file mode 100644
index 0000000..9fd3dd1
--- /dev/null
+++ b/SingleReadQ.h
@@ -0,0 +1,97 @@
+#ifndef SINGLEREADQ
+#define SINGLEREADQ
+
+#include<cmath>
+#include<cstdio>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<iostream>
+
+#include "utils.h"
+#include "Read.h"
+
+// A single-end read with quality scores: a read name, its base sequence and its quality string.
+// The members "name" and "low_quality" used by this class are not declared here;
+// presumably they are inherited from Read (Read.h) -- confirm there.
+class SingleReadQ : public Read {
+public:
+  SingleReadQ() : len(0) {}
+  SingleReadQ(const std::string& name, const std::string& readseq, const std::string& qscore)
+    : len(readseq.length()), readseq(readseq), qscore(qscore) {
+    this->name = name;
+  }
+
+  // Reads one record from argv[0]; flags bit 1 keeps the sequence, bit 2 the quality scores, bit 4 the name.
+  bool read(int argc, std::istream* argv[], int flags = 7);
+  // Writes this read to argv[0].
+  void write(int argc, std::ostream* argv[]);
+
+  // cached length of the sequence, valid even when the sequence itself was not kept
+  int getReadLength() const { return len; }
+  const std::string& getReadSeq() const { return readseq; }
+  const std::string& getQScore() const { return qscore; }
+
+  void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false
+
+private:
+  int len; // read length
+  std::string readseq, qscore; // qscore : quality scores
+};
+
+// Reads one four-line record from argv[0]: '@' + name, sequence, a line starting with '+', quality scores.
+//   argc  : number of input streams, must be 1
+//   flags : bit 1 keeps the sequence, bit 2 keeps the quality scores, bit 4 keeps the name;
+//           the read length is always set from the sequence line
+// Returns false if the stream ends before the record is complete; the members are then unreliable.
+// Exits the program if the first line does not start with '@' or the third does not start with '+'.
+bool SingleReadQ::read(int argc, std::istream* argv[], int flags) {
+  std::string line;
+
+  assert(argc == 1);
+  std::istream& in = *argv[0];
+
+  // line 1: header
+  if (!getline(in, line)) return false;
+  if (line[0] != '@') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); }
+  name = "";
+  if (flags & 4) { name = line.substr(1); }
+
+  // line 2: bases; remember the length before the bases are possibly discarded
+  if (!getline(in, readseq)) return false;
+  len = readseq.length();
+  if (!(flags & 1)) { readseq = ""; }
+
+  // line 3: separator
+  if (!getline(in, line)) return false;
+  if (line[0] != '+') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); }
+
+  // line 4: quality scores
+  if (!getline(in, qscore)) return false;
+  if (!(flags & 2)) { qscore = ""; }
+
+  return true;
+}
+
+// Writes this read to argv[0] as four lines: "@" + name, the sequence, "+", the quality scores.
+// argc must be 1; only argv[0] is used.
+void SingleReadQ::write(int argc, std::ostream* argv[]) {
+  assert(argc == 1);
+
+  std::ostream& out = *argv[0];
+  out << "@" << name << std::endl;
+  out << readseq << std::endl;
+  out << "+\n" << qscore << std::endl;
+}
+
+// Decides whether this read is low quality and stores the verdict in low_quality.
+//   hasPolyA : when false, only the length test below is applied
+//   seedLen  : reads shorter than this are always low quality
+// With hasPolyA set, a read dominated by 'A' is low quality if at least half (rounded up) of its
+// first OLEN bases are 'A'; a read dominated by 'T' is low quality if at least half (rounded up)
+// of its last OLEN bases are 'T'.
+void SingleReadQ::calc_lq(bool hasPolyA, int seedLen) {
+  low_quality = (len < seedLen);
+  if (low_quality || !hasPolyA) return;
+
+  // the composition test needs the actual bases
+  assert(readseq != "");
+
+  const int dominanceCutoff = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
+  const int overlapCutoff = (OLEN - 1) / 2 + 1;
+
+  int totalA = 0, totalT = 0;   // counts over the whole read
+  int headA = 0, tailT = 0;     // counts inside the leading / trailing OLEN bases
+  for (int pos = 0; pos < len; ++pos) {
+    const char base = readseq[pos];
+    if (base == 'A') {
+      ++totalA;
+      if (pos < OLEN) ++headA;
+    }
+    else if (base == 'T') {
+      ++totalT;
+      if (pos >= len - OLEN) ++tailT;
+    }
+  }
+
+  if (totalA >= dominanceCutoff) low_quality = (headA >= overlapCutoff);
+  else if (totalT >= dominanceCutoff) low_quality = (tailT >= overlapCutoff);
+  else low_quality = false;
+}
+
+#endif
diff --git a/Transcript.h b/Transcript.h
new file mode 100644
index 0000000..e8de8a3
--- /dev/null
+++ b/Transcript.h
@@ -0,0 +1,169 @@
+#ifndef TRANSCRIPT_H_
+#define TRANSCRIPT_H_
+
+#include<cstdio>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<vector>
+#include<fstream>
+#include<sstream>
+
+#include "utils.h"
+
+/**
+ If no genome is provided, seqname field is used to store the allele name.
+ */
+
+// One contiguous stretch of a transcript's structure; both ends are inclusive
+// (its length is computed as end + 1 - start by Transcript).
+struct Interval {
+  int start, end;
+
+  Interval(int start, int end) : start(start), end(end) {}
+};
+
+// One transcript: its identifiers, strand, exon structure and total length.
+class Transcript {
+public:
+  // An empty transcript: zero length, no intervals, strand 0, all strings empty.
+  Transcript() : length(0), strand(0) {}
+
+  // Builds a transcript from its parts. Leading spaces of "left" are removed and the length is
+  // the sum of the (inclusive) interval lengths.
+  Transcript(const std::string& transcript_id, const std::string& gene_id, const std::string& seqname,
+      const char& strand, const std::vector<Interval>& structure, const std::string& left,
+      const std::string& transcript_name = "", const std::string& gene_name = "")
+    : structure(structure), strand(strand), seqname(seqname), gene_id(gene_id), transcript_id(transcript_id),
+      gene_name(gene_name), transcript_name(transcript_name) {
+    // eliminate prefix spaces in string variable "left"; an all-space string becomes empty
+    const std::string::size_type firstKept = left.find_first_not_of(' ');
+    if (firstKept != std::string::npos) this->left = left.substr(firstKept);
+
+    length = 0;
+    for (std::vector<Interval>::const_iterator it = structure.begin(); it != structure.end(); ++it) {
+      length += it->end + 1 - it->start;
+    }
+  }
+
+  // Orders by gene_id, then transcript_id, then seqname.
+  bool operator< (const Transcript& o) const {
+    if (gene_id != o.gene_id) return gene_id < o.gene_id;
+    if (transcript_id != o.transcript_id) return transcript_id < o.transcript_id;
+    return seqname < o.seqname;
+  }
+
+  const std::string& getTranscriptID() const { return transcript_id; }
+  const std::string& getTranscriptName() const { return transcript_name; }
+
+  const std::string& getGeneID() const { return gene_id; }
+  const std::string& getGeneName() const { return gene_name; }
+
+  const std::string& getSeqName() const { return seqname; }
+  char getStrand() const { return strand; }
+  const std::string& getLeft() const { return left; }
+  int getLength() const { return length; }
+  const std::vector<Interval>& getStructure() const { return structure; }
+
+  // Extracts this transcript's sequence from a genomic sequence.
+  void extractSeq (const std::string&, std::string&) const;
+
+  void read(std::ifstream&);
+  void write(std::ofstream&);
+
+private:
+  int length; // transcript length
+  std::vector<Interval> structure; // transcript structure , coordinate starts from 1
+  char strand;
+  std::string seqname, gene_id, transcript_id; // follow GTF definition
+  std::string gene_name, transcript_name;
+  std::string left;
+};
+
+// Extracts this transcript's sequence from the sequence of the chromosome it lies on.
+//   gseq : genomic sequence; exon coordinates start from 1 while gseq is indexed from 0
+//   seq  : output, overwritten with the concatenated exon sequences. For a '-' strand transcript
+//          the exons are walked from last to first, each from end to start, and every base is
+//          passed through getOpp() (presumably the complementary base -- see utils.h).
+// Exits the program if the transcript has no exon or if any exon lies outside gseq.
+void Transcript::extractSeq(const std::string& gseq, std::string& seq) const {
+  seq = "";
+  const int s = structure.size();
+  const size_t glen = gseq.length();
+
+  // Without this guard an exon-less transcript would index structure[0] of an empty vector.
+  if (s == 0) {
+    fprintf(stderr, "Transcript %s does not contain any exon!\n", transcript_id.c_str());
+    exit(-1);
+  }
+
+  // Every exon is checked, so the test does not depend on the exons being sorted by coordinate.
+  for (int i = 0; i < s; i++) {
+    if (structure[i].start < 1 || (size_t)structure[i].end > glen) {
+      fprintf(stderr, "Transcript %s is out of chromosome %s's boundary!\n", transcript_id.c_str(), seqname.c_str());
+      exit(-1);
+    }
+  }
+
+  switch(strand) {
+  case '+':
+    for (int i = 0; i < s; i++) {
+      seq += gseq.substr(structure[i].start - 1, structure[i].end - structure[i].start + 1); // gseq starts from 0!
+    }
+    break;
+  case '-':
+    for (int i = s - 1; i >= 0; i--) {
+      for (int j = structure[i].end; j >= structure[i].start; j--) {
+        seq += getOpp(gseq[j - 1]);
+      }
+    }
+    break;
+  default: assert(false);
+  }
+
+  assert(seq.length() > 0);
+}
+
+// Reads one transcript record from fin, in the layout produced by Transcript::write():
+//   line 1: transcript_id [TAB transcript_name]
+//   line 2: gene_id [TAB gene_name]
+//   line 3: seqname
+//   line 4: strand ('+' or '-') and length
+//   line 5: number of intervals followed by start/end pairs
+//   line 6: the "left" string
+// Stream errors are not checked here.
+void Transcript::read(std::ifstream& fin) {
+  int s;
+  std::string tmp;
+  std::istringstream strin;
+
+  // The two name fields are optional. When a line holds no TAB, the second getline() below finds
+  // the string stream already exhausted and leaves its target untouched, so the names are reset
+  // here to avoid keeping values from a previous read() on the same object.
+  transcript_name = "";
+  gene_name = "";
+
+  getline(fin, tmp);
+  strin.str(tmp);
+  getline(strin, transcript_id, '\t');
+  getline(strin, transcript_name);
+
+  getline(fin, tmp);
+  strin.clear(); strin.str(tmp); // clear the eof/fail state left by the previous line
+  getline(strin, gene_id, '\t');
+  getline(strin, gene_name);
+
+  getline(fin, seqname);
+
+  fin>>tmp>>length;
+  assert(tmp.length() == 1 && (tmp[0] == '+' || tmp[0] == '-'));
+  strand = tmp[0];
+  structure.clear();
+  fin>>s;
+  for (int i = 0; i < s; i++) {
+    int start, end;
+    fin>>start>>end;
+    structure.push_back(Interval(start, end));
+  }
+  getline(fin, tmp); //get the end of this line
+  getline(fin, left);
+}
+
+// Writes this transcript to fout as six lines:
+//   transcript_id [TAB transcript_name], gene_id [TAB gene_name], seqname, "strand length",
+//   the interval count followed by the start/end pairs, and the "left" string.
+// A name is written (with its TAB) only when it is not empty.
+void Transcript::write(std::ofstream& fout) {
+  fout << transcript_id;
+  if (!transcript_name.empty()) fout << '\t' << transcript_name;
+  fout << std::endl;
+
+  fout << gene_id;
+  if (!gene_name.empty()) fout << '\t' << gene_name;
+  fout << std::endl;
+
+  fout << seqname << std::endl;
+  fout << strand << " " << length << std::endl;
+
+  const int numIntervals = structure.size();
+  fout << numIntervals;
+  for (int k = 0; k < numIntervals; ++k) {
+    const Interval& iv = structure[k];
+    fout << " " << iv.start << " " << iv.end;
+  }
+  fout << std::endl;
+
+  fout << left << std::endl;
+}
+
+#endif /* TRANSCRIPT_H_ */
diff --git a/Transcripts.h b/Transcripts.h
new file mode 100644
index 0000000..700633f
--- /dev/null
+++ b/Transcripts.h
@@ -0,0 +1,145 @@
+/*
+ * transcripts are numbered from 1. 0 is reserved for noise isoform
+ */
+#ifndef TRANSCRIPTS_H_
+#define TRANSCRIPTS_H_
+
+#include<cstdio>
+#include<cstdlib>
+#include<cassert>
+#include<fstream>
+#include<vector>
+#include<algorithm>
+#include<map>
+#include<string>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "Transcript.h"
+
+// The set of transcripts RSEM works with. Transcripts occupy slots 1..M of the underlying
+// vector; slot 0 always holds a default-constructed placeholder.
+class Transcripts {
+public:
+  Transcripts(int type = 0) : M(0), type(type), transcripts(1, Transcript()) {}
+
+  int getM() { return M; }
+
+  // used in shrinking the transcripts: sets M and resizes the storage to M + 1 slots
+  void setM(int M) {
+    this->M = M;
+    transcripts.resize(M + 1);
+  }
+
+  // Copies the transcript in slot "from" into slot "to"; requires from >= to.
+  void move(int from, int to) {
+    assert(from >= to);
+    if (from <= to) return;
+    transcripts[to] = transcripts[from];
+  }
+
+  int getType() { return type; }
+  void setType(int type) { this->type = type; }
+
+  bool isAlleleSpecific() { return type == 2; }
+
+  // pos is an internal id in 1..M
+  const Transcript& getTranscriptAt(int pos) {
+    assert(pos > 0 && pos <= M);
+    return transcripts[pos];
+  }
+
+  // Appends a transcript, which receives the next internal id.
+  void add(const Transcript& transcript) {
+    transcripts.push_back(transcript);
+    ++M;
+  }
+
+  // Sorts the whole vector, slot 0 included, with Transcript::operator<.
+  void sort() {
+    std::sort(transcripts.begin(), transcripts.end());
+  }
+
+  void readFrom(const char*);
+  void writeTo(const char*);
+
+  //Eid: external sid
+  // Only meaningful after buildMappings() has filled the mapping tables.
+  int getInternalSid(int eid) {
+    assert(eid > 0 && eid <= M);
+    return e2i[eid];
+  }
+
+  const Transcript& getTranscriptViaEid(int eid) {
+    const int sid = getInternalSid(eid);
+    return transcripts[sid];
+  }
+
+  void buildMappings(int, char**, const char* = NULL);
+
+private:
+  int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific
+  std::vector<Transcript> transcripts;
+
+  std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
+};
+
+// Loads the transcripts from file inpF: a header line "M type" followed by M transcript records.
+// Slot 0 is left as a default-constructed placeholder; records fill slots 1..M.
+// Exits the program if the file cannot be opened or its header cannot be parsed.
+void Transcripts::readFrom(const char* inpF) {
+  std::string line;
+  std::ifstream fin(inpF);
+
+  if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
+
+  // A header that fails to parse (or a negative count) would otherwise size the vector from a
+  // meaningless M and leave every record silently empty.
+  if (!(fin>>M>>type) || M < 0) {
+    fprintf(stderr, "Cannot parse the header of %s! The file may be corrupted.\n", inpF);
+    exit(-1);
+  }
+  getline(fin, line); // skip the rest of the header line
+  transcripts.assign(M + 1, Transcript());
+  for (int i = 1; i <= M; i++) {
+    transcripts[i].read(fin);
+  }
+  fin.close();
+}
+
+// Writes the transcripts to file outF: a header line "M type" followed by the records of
+// slots 1..M in order. Whether the file could be opened is not checked.
+void Transcripts::writeTo(const char* outF) {
+  std::ofstream out(outF);
+
+  out << M << " " << type << std::endl;
+  int idx = 1;
+  while (idx <= M) {
+    transcripts[idx].write(out);
+    ++idx;
+  }
+
+  out.close();
+}
+
+// Builds the mappings between external ids (1-based order of the reference sequences declared
+// in the SAM/BAM header) and internal ids (slots 1..M of this container).
+//   n_targets   : number of reference sequences declared in the SAM/BAM file
+//   target_name : their names; matched against the seqname (allele-specific type) or the
+//                 transcript_id (otherwise) of every known transcript
+//   imdName     : if not NULL, the internal ids of all transcripts that were not declared are
+//                 written, one per line, to the file "<imdName>.omit"
+// The checks below go through general_assert (my_assert.h); the ".omit" file is opened with a
+// plain fopen and the program exits if that fails.
+void Transcripts::buildMappings(int n_targets, char** target_name, const char* imdName) {
+  std::map<std::string, int> dict;
+  std::map<std::string, int>::iterator iter;
+  std::vector<bool> appeared;
+
+  general_assert(n_targets > 0, "The SAM/BAM file declares less than one reference sequence!");
+  general_assert(n_targets <= M, "The SAM/BAM file declares more reference sequences (" + itos(n_targets) + ") than RSEM knows (" + itos(M) + ")!");
+  if (n_targets < M) fprintf(stderr, "Warning: The SAM/BAM file declares less reference sequences (%d) than RSEM knows (%d)! Please make sure that you aligned your reads against transcript sequences instead of genome.\n", n_targets, M);
+
+  // name -> internal id; every name must be unique
+  for (int i = 1; i <= M; i++) {
+    const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
+    iter = dict.find(tid);
+    general_assert(iter == dict.end(), "RSEM's indices might be corrupted, " + tid + " appears more than once!");
+    dict[tid] = i;
+  }
+
+  e2i.assign(M + 1, 0);
+  i2e.assign(M + 1, 0);
+  appeared.assign(M + 1, false);
+  for (int i = 0; i < n_targets; i++) {
+    iter = dict.find(std::string(target_name[i]));
+    general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
+    general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " appears more than once in the SAM/BAM file!");
+    e2i[i + 1] = iter->second;
+    i2e[iter->second] = i + 1;
+    iter->second = -1; // mark the name as used so that a second occurrence is detected above
+    appeared[e2i[i + 1]] = true;
+  }
+
+  if (imdName != NULL) {
+    // The file name is built in a std::string, so a long imdName cannot overflow a fixed buffer.
+    const std::string omitF = std::string(imdName) + ".omit";
+    FILE *fo = fopen(omitF.c_str(), "w");
+    // fprintf/fclose must not be handed a NULL stream
+    if (fo == NULL) { fprintf(stderr, "Cannot open %s for writing!\n", omitF.c_str()); exit(-1); }
+    for (int i = 1; i <= M; i++)
+      if (!appeared[i]) fprintf(fo, "%d\n", i);
+    fclose(fo);
+  }
+}
+
+#endif /* TRANSCRIPTS_H_ */
diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW
new file mode 100644
index 0000000..babd096
--- /dev/null
+++ b/WHAT_IS_NEW
@@ -0,0 +1,327 @@
+RSEM v1.3.0
+
+- Added Prior-Enhanced RSEM (pRSEM) as a submodule
+- Introduced `--strandedness <none|forward|reverse>` option, `--strand-specific` and `--forward-prob` are deprecated (but still supported)
+- Revised documentation for `rsem-plot-model`, made it clear that in alignment statistics, isoform-level (instead of genome-level) multi-mapping reads are shown
+- Significantly improved the output information of `rsem-sam-validator`: if indels/clippings/skips are detected in alignments or alignments exceed transcript boundaries, `rsem-sam-validator` will report them instead of telling you the input is valid
+- Updated the warning message to ask users to make sure that they align their reads against a set of transcripts instead of a genome when RSEM finds fewer sequences in the BAM file than in RSEM's indices
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.31
+
+- Rewrote `rsem-gff3-to-gtf` to handle a more general set of GFF3 files
+- Added safety checks to make sure poly(A) tails are not added to the reference when `--star` is set
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.30
+
+- Fixed a bug that can cause SAMtools sort to fail
+- Improved the appearance of warning messages: for the same type of warning messages, only show the first 50 messages and then provide the total number of such warnings
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.29
+
+- Reformatted Makefile to be more professional, and `make install` is ready to use
+- Enabled `./configure --without-curses` for configuring SAMtools to avoid potential compilation problems due to curses library
+- Fixed bugs for installing EBSeq
+- Improved the readability of RSEM documentation
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.28
+
+- Fixed a bug in RSEM v1.2.27 that can lead to assertion errors for parsing GTF files
+- Fixed a bug in Makefile
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.27
+
+- Upgraded SAMtools to v1.3; RSEM now supports input alignments in SAM/BAM/CRAM format
+- '--sam/--bam' options of 'rsem-calculate-expression' are obsoleted; use '--alignments' instead; '--sam/--bam' can still be used for compatibility with previous versions
+- Some 'rsem-calculate-expression' options are renamed for better interpretability
+- Documents are updated to reflect the SAMtools upgrade
+- Fixed a bug for parsing GTF files
+- Fixed a bug for generating transcript wiggle plots
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.26
+
+- RSEM supports GFF3 annotation format now
+- Added instructions to build RSEM references using RefSeq, Ensembl, or GENCODE annotations
+- Added an option to extract only transcripts from certain resources given a GTF/GFF3 file
+- Fixed a bug and improved the clarity of error messages for extracting transcripts from genome
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.25
+
+- RSEM will extract gene_name/transcript_name from GTF file when possible; however, it only appends them to the 'sample_name.*.results' files if '--append-names' option is specified; unlike v1.2.24, this version is compatible with STAR aligner even when '--append-names' is set
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.24
+
+- RSEM will extract gene_name/transcript_name from GTF file when possible; if extracted, gene_name/transcript_name will be appended to the end of gene_id/transcript_id with an underscore in between
+- Modified 'rsem-plot-model' to indicate the modes of fragment length and read length distributions
+- Modified 'rsem-plot-model' to present alignment statistics better using both barplot and pie chart
+- Updated 'EBSeq' to version 1.2.0
+- Added coefficient of quartile variation in addition to credibility intervals when '--calc-ci' is turned on
+- Added '--single-cell-prior' option to notify RSEM to use a sparse prior (Dir(0.1)) for single cell data; this option only makes sense if '--calc-pme' or '--calc-ci' is set
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.23
+
+- Moved version information from WHAT_IS_NEW to rsem_perl_utils.pm in order to make sure the '--version' option always output the version information
+- Fixed a typo in 'rsem-calculate-expression' that can lead to an error when '--star' is set and '--star-path' is not set
+- Fixed a bug that can occasionally crash the RSEM simulator
+- Added user-friendly error messages that are triggered when RSEM detects invalid bases in the input FASTA file during reference building
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.22
+
+- Added options to run the STAR aligner
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.21
+
+- Strip read names of extra words to avoid mismatches of paired-end read names
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.20
+
+- Fixed a problem that can lead to assertion error if any paired-end read's insert size > 32767 (by changing the type of insertL in PairedEndHit.h from short to int)
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.19
+
+- Modified 'rsem-prepare-reference' such that by default it does not add any poly(A) tails. To add poly(A) tails, use '--polyA' option
+- Added an annotation of the 'sample_name.stat/sample_name.cnt' file, see 'cnt_file_description.txt'
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.18
+
+- Only generate warning message if two mates of a read pair have different names
+- Only parse attributes of a GTF record if its feature is "exon" to avoid unnecessary warning messages
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.17
+
+- Added error detection for cases such as a read's two mates having different names or a read is both alignable and unalignable
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.16
+
+- Corrected a typo in 'rsem-generate-data-matrix', this script extracts 'expected_count' column instead of 'TPM' column
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.15
+
+- Allowed for a subset of reference sequences to be declared in an input SAM/BAM file
+- For any transcript not declared in the SAM/BAM file, its PME estimates and credibility intervals are set to zero
+- Added advanced options for customizing Gibbs sampler and credibility interval calculation behaviors
+- Split options in 'rsem-calculate-expression' into basic and advanced options
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.14
+
+- Changed RSEM's behaviors for building Bowtie/Bowtie 2 indices. In 'rsem-prepare-reference', '--no-bowtie' and '--no-ntog' options are removed. By default, RSEM does not build either Bowtie or Bowtie 2 indices. Instead, it generates two index Multi-FASTA files, 'reference_name.idx.fa' and 'reference_name.n2g.idx.fa'. Compared to the former file, the latter one in addition converts all 'N's into 'G's. These two files can be used to build aligner indices for customized aligners. In addition, 'reference_name.transcripts.fa' does not have poly(A) tails added. To enable RSEM to build Bowtie/Bowtie 2 indices, '--bowtie' or '--bowtie2' must be set explicitly. The most significant benefit of this change is that now we can build Bowtie and Bowtie 2 indices simultaneously by turning both '--bowtie' and '--bowtie2' on. Type 'rsem-prepare-reference --help' for more information
+- If transcript coordinate files are visualized using IGV, 'reference_name.idx.fa' should be imported as a genome (instead of 'reference_name.transcripts.fa'). For more information, see the third subsection of Visualization in 'README.md'
+- Modified RSEM perl scripts so that RSEM directory will be added in the beginning of the PATH variable. This also means RSEM will try to use its own samtools first
+- Added --seed option to set random number generator seeds in 'rsem-calculate-expression'
+- Added posterior standard deviation of counts as output if either '--calc-pme' or '--calc-ci' is set
+- Updated boost to v1.55.0
+- Renamed makefile as Makefile
+- If '--output-genome-bam' is set, in the genome BAM file, each alignment's 'MD' field will be adjusted to match the CIGAR string
+- 'XS:A:value' field is required by Cufflinks for spliced alignments. If '--output-genome-bam' is set, in the genome BAM file, first each alignment's 'XS' field will be deleted. Then if the alignment is a spliced alignment, a 'XS:A:value' field will be added accordingly
+- Added instructions for users who want to put all RSEM executables into a bin directory (see Compilation & Installation section of 'README.md')
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.13
+
+- Allowed users to use the SAMtools in the PATH first and enabled RSEM to find its executables via a symbolic link
+- Changed the behavior of parsing GTF file. Now if a GTF line's feature is not "exon" and it does not contain a "gene_id" or "transcript_id" attribute, only a warning message will be produced (instead of failing the RSEM)
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.12
+
+- Enabled allele-specific expression estimation
+- Added '--calc-pme' option for 'rsem-calculate-expression' to calculate posterior mean estimates only (no credibility intervals)
+- Modified the shebang line of RSEM perl scripts to make them more portable
+- Added '--seed' option for 'rsem-simulate-reads' to enable users to set the seed of the random number generator used by the simulation
+- Modified the transcript extraction behavior of 'rsem-prepare-reference'. For transcripts that cannot be extracted, instead of failing the whole script, warning information is produced. Those transcripts are ignored
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.11
+
+- Enabled RSEM to use Bowtie 2 aligner (indel, local and discordant alignments are not supported yet)
+- Changed option names '--bowtie-phred33-quals', '--bowtie-phred64-quals' and '--bowtie-solexa-quals' back to '--phred33-quals', '--phred64-quals' and '--solexa-quals'
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.10
+
+- Fixed a bug which will lead to out-of-memory error when RSEM computes ngvector for EBSeq
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.9
+
+- Fixed a compilation error problem in Mac OS
+- Fixed a problem in makefile that affects 'make ebseq'
+- Added 'model_file_description.txt', which describes the format and meanings of file 'sample_name.stat/sample_name.model'
+- Updated samtools to version 0.1.19
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.8
+
+- Provided a more detailed description for how to simulate RNA-Seq data using 'rsem-simulate-reads'
+- Provided more user-friendly error message if RSEM fails to extract transcript sequences due to the failure of reading certain chromosome sequences
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.7
+
+- 'rsem-find-DE' is replaced by 'rsem-run-ebseq' and 'rsem-control-fdr' for a more friendly user experience
+- Added support for differential expression testing on more than 2 conditions in RSEM's EBSeq wrappers 'rsem-run-ebseq' and 'rsem-control-fdr'
+- Renamed '--phred33-quals', '--phred64-quals', and '--solexa-quals' in 'rsem-calculate-expression' to '--bowtie-phred33-quals', '--bowtie-phred64-quals', and '--bowtie-solexa-quals' to avoid confusion
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.6
+
+- Install the latest version of EBSeq from Bioconductor and if fails, try to install EBSeq v1.1.5 locally
+- Fixed a bug in 'rsem-gen-transcript-plots', which makes 'rsem-plot-transcript-wiggles' fail
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.5
+
+- Updated EBSeq from v1.1.5 to v1.1.6
+- Fixed a bug in 'rsem-generate-data-matrix', which can cause 'rsem-find-DE' to crash
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.4
+
+- Fixed a bug that leads to poor parallelization performance in Mac OS systems
+- Fixed a problem that may halt 'rsem-gen-transcript-plots', thanks to Han Lin for pointing out the problem and suggesting possible fixes
+- Added some user-friendly error messages for converting transcript BAM files into genomic BAM files
+- Modified rsem-tbam2gbam so that the original alignment quality MAPQ will be preserved if the input bam is not from RSEM
+- Added user-friendly error messages if users forget to compile the source codes
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.3
+
+- Fixed a bug in 'EBSeq/rsem-for-ebseq-generate-ngvector-from-clustering-info' which may crash the script
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.2
+
+- Updated EBSeq to v1.1.5
+- Modified 'rsem-find-DE' to generate extra output files (type 'rsem-find-DE' to see more information)
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.1
+
+- Added poly(A) tails to 'reference_name.transcripts.fa' so that the RSEM generated transcript unsorted BAM file can be fed into RSEM as an input file. However, users need to rebuild their references if they want to visualize the transcript level wiggle files and BAM files using IGV
+- Modified 'rsem-tbam2gbam' to convert users' alignments from transcript BAM files into genome BAM files, provided users use 'reference_name.idx.fa' to build indices for their aligners
+- Updated EBSeq from v1.1.3 to v1.1.4
+- Corrected several typos in warning messages
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.2.0
+
+- Changed output formats, added FPKM field etc.
+- Fixed a bug related to paired-end reads data
+- Added a script to run EBSeq automatically and updated EBSeq to v1.1.3
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.1.21
+
+- Removed optional field "Z0:A:!" in the BAM outputs
+- Added --no-fractional-weight option to rsem-bam2wig, if the BAM file is not generated by RSEM, this option is recommended to be set
+- Fixed a bug for generating transcript level wiggle files using 'rsem-plot-transcript-wiggles'
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.1.20
+
+- Added an option to set the temporary folder name
+- Removed sample_name.sam.gz. Instead, RSEM uses samtools to convert bowtie outputted SAM file into a BAM file under the temporary folder
+- RSEM generated BAM files now contain all alignment lines produced by bowtie or user-specified aligners, including unalignable reads. Please note that for paired-end reads, if one mate has alignments but the other does not, RSEM will mark the alignable mate as "unmappable" (flag bit 0x4) and append an optional field "Z0:A:!"
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.1.19
+
+- Allowed > 2^31 hits
+- Added some instructions on how to visualize transcript coordinate BAM/WIG files using IGV
+- Included EBSeq for downstream differential expression analysis
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.1.18
+
+- Added some user-friendly error messages
+- Added program 'rsem-sam-validator', users can use this program to check if RSEM can process their SAM/BAM files
+- Modified 'convert-sam-for-rsem' so that this program will convert users' SAM/BAM files into acceptable BAM files for RSEM
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.1.17
+
+- Fixed a bug related to parallelization of credibility intervals calculation
+- Added --no-bam-output option to rsem-calculate-expression
+- The order of @SQ tags in SAM/BAM files can be arbitrary now
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.1.16
+
+- Added --time option to show time consumed by each phase
+- Moved the alignment file out of the temporary folder
+- Enabled pthreads for calculating credibility intervals
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.1.15
+
+- Fixed several bugs causing compilation error
+- Modified samtools' Makefile for cygwin. For cygwin users, please uncomment the 4th and 8th lines in sam/Makefile before compiling RSEM
+
+--------------------------------------------------------------------------------------------
+
+RSEM v1.1.14
+
+- Added --chunkmbs option to rsem-calculate-expression (patch contributed by earonesty)
+- Added --sampling-for-bam option to rsem-calculate-expression, in the bam file, instead of providing expected weights, for each read RSEM samples one alignment based on the expected weights
+- RSEM can generate BAM and Wiggle files in both genomic-coordinate and transcript-coordinate
+- Added rsem-plot-transcript-wiggles. This script can generate transcript-coordinate wiggle plots in pdf format. One unique feature is, a stacked plot can be generated, with unique read contribution shown as black and multi-read contribution shown as red
+- Added convert_sam_for_rsem script for users who do not use the bowtie aligner
+- Modified RSEM's GTF file parser. Now RSEM does not require "transcript_id" and "gene_id" be the first two attributes shown
+- Improved descriptions for thread related errors
+
diff --git a/WriteResults.h b/WriteResults.h
new file mode 100644
index 0000000..d3100d5
--- /dev/null
+++ b/WriteResults.h
@@ -0,0 +1,637 @@
+#ifndef WRITERESULTS_H_
+#define WRITERESULTS_H_
+
+#include<cmath>
+#include<cstdio>
+#include<vector>
+#include<string>
+#include<fstream>
+#include<algorithm>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "GroupInfo.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+#include "Refs.h"
+
+#include "Model.h"
+#include "SingleModel.h"
+#include "SingleQModel.h"
+#include "PairedEndModel.h"
+#include "PairedEndQModel.h"
+
+// Fills eel[1..M] with the expected effective length of every reference, based on the
+// length distribution returned by model.getGLD(). eel[0] is set to 0, and any value
+// below MINEEL is stored as 0.
+template<class ModelType>
+void calcExpectedEffectiveLengths(int M, Refs& refs, ModelType& model, std::vector<double>& eel) {
+	int lb, ub, span;
+	double *pdf = NULL, *cdf = NULL; // filled in by copyTo(), released at the end
+
+	model.getGLD().copyTo(pdf, cdf, lb, ub, span);
+
+	// weighted[k] = sum_{j=1..k} pdf[j] * (lb + j)
+	std::vector<double> weighted(span + 1);
+	weighted[0] = 0.0;
+	for (int k = 1; k <= span; ++k)
+		weighted[k] = weighted[k - 1] + pdf[k] * (lb + k);
+
+	eel.assign(M + 1, 0.0);
+	for (int sid = 1; sid <= M; ++sid) {
+		const int totLen = refs.getRef(sid).getTotLen();
+		const int fullLen = refs.getRef(sid).getFullLen();
+		// offsets into cdf / weighted; never negative
+		const int lo = std::max(std::min(totLen - fullLen + 1, ub) - lb, 0);
+		const int hi = std::max(std::min(totLen, ub) - lb, 0);
+
+		// hi == 0 leaves the 0.0 that assign() already stored
+		if (hi != 0) {
+			const double mass = cdf[hi] - cdf[lo];
+			const double lenSum = weighted[hi] - weighted[lo];
+			const double value = fullLen * cdf[lo] + (mass * (totLen + 1) - lenSum);
+			assert(value >= 0);
+			eel[sid] = (value < MINEEL) ? 0.0 : value;
+		}
+	}
+
+	delete[] pdf;
+	delete[] cdf;
+}
+
+// Divides every theta[i] by its weight mw[i] and renormalizes theta[0..M] to sum to 1.
+// For i > 0, entries whose weight or effective length is below EPSILON are set to 0.
+// Aborts through general_assert when the rescaled sum is below EPSILON.
+void polishTheta(int M, std::vector<double>& theta, const std::vector<double>& eel, const double* mw) {
+	double sum = 0.0;
+
+	/* The reason that for noise gene, mw value is 1 is :
+	 * currently, all masked positions are for poly(A) sites, which in theory should be filtered out.
+	 * So the theta0 does not containing reads from any masked position
+	 */
+
+	for (int i = 0; i <= M; i++) {
+		// i == 0, mw[i] == 1
+		if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) {
+			theta[i] = 0.0;
+			continue;
+		}
+		theta[i] = theta[i] / mw[i];
+		sum += theta[i];
+	}
+	// currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked
+	// The message previously ran the number straight into "than"; a separating space was added.
+	general_assert(sum >= EPSILON, "No effective length is no less than " + ftos(MINEEL, 6) + " !");
+	for (int i = 0; i <= M; i++) theta[i] /= sum;
+}
+
+// Derives FPKM and TPM for transcripts 1..M from theta and the effective lengths.
+// Transcripts whose effective length is not at least EPSILON get 0 in both outputs;
+// index 0 of tpm and fpkm is always 0.
+void calcExpressionValues(int M, const std::vector<double>& theta, const std::vector<double>& eel, std::vector<double>& tpm, std::vector<double>& fpkm) {
+	// share[k]: theta[k] renormalized over the transcripts with a usable effective length
+	std::vector<double> share(M + 1, 0.0);
+	double total = 0.0;
+	for (int k = 1; k <= M; ++k) {
+		if (!(eel[k] >= EPSILON)) continue;
+		share[k] = theta[k];
+		total += share[k];
+	}
+	// general_assert(denom >= EPSILON, "No alignable reads?!");
+	if (total < EPSILON) total = 1.0; // keeps the division below well defined
+	for (int k = 1; k <= M; ++k) share[k] /= total;
+
+	// FPKM
+	fpkm.assign(M + 1, 0.0);
+	for (int k = 1; k <= M; ++k) {
+		if (eel[k] >= EPSILON) fpkm[k] = share[k] * 1e9 / eel[k];
+	}
+
+	// TPM: FPKM rescaled so that the values sum to 1e6
+	tpm.assign(M + 1, 0.0);
+	double fpkmSum = 0.0;
+	for (int k = 1; k <= M; ++k) fpkmSum += fpkm[k];
+	if (fpkmSum < EPSILON) fpkmSum = 1.0;
+	for (int k = 1; k <= M; ++k) tpm[k] = fpkm[k] / fpkmSum * 1e6;
+}
+
+// Reports whether the reference is allele-specific, i.e. whether both "<refName>.gt" and
+// "<refName>.ta" can be opened for reading. When they can, the files are loaded into gt and
+// ta (each only if the pointer is non-NULL). Aborts through general_assert if refName is too
+// long for the STRLEN-sized path buffers (the original sprintf would have overflowed them).
+inline bool isAlleleSpecific(const char* refName, GroupInfo* gt = NULL, GroupInfo* ta = NULL) {
+	char gtF[STRLEN], taF[STRLEN];
+
+	// snprintf returns the length it wanted to write, so >= buffer size means truncation
+	const int gtLen = snprintf(gtF, sizeof(gtF), "%s.gt", refName);
+	const int taLen = snprintf(taF, sizeof(taF), "%s.ta", refName);
+	general_assert(gtLen >= 0 && gtLen < (int)sizeof(gtF) && taLen >= 0 && taLen < (int)sizeof(taF),
+			"Reference name " + std::string(refName) + " is too long!");
+
+	bool alleleS;
+	{
+		// the streams are only probes; they close when this scope ends
+		std::ifstream gtIF(gtF), taIF(taF);
+		alleleS = gtIF.is_open() && taIF.is_open();
+	}
+
+	if (alleleS) {
+		if (gt != NULL) gt->load(gtF);
+		if (ta != NULL) ta->load(taF);
+	}
+
+	return alleleS;
+}
+
+// Writes v[first..last] to fo as one row of "%.2f" values, tab-separated and ended by a
+// newline, each value multiplied by scale. Nothing is written when first > last.
+template<class Seq>
+inline void writeScaledRowEM(FILE* fo, const Seq& v, int first, int last, double scale = 1.0) {
+	for (int i = first; i <= last; i++)
+		fprintf(fo, "%.2f%c", v[i] * scale, (i < last ? '\t' : '\n'));
+}
+
+// Opens path for writing (truncating it) and aborts through general_assert on failure.
+inline FILE* openForWriteEM(const char* path) {
+	FILE* fo = fopen(path, "w");
+	general_assert(fo != NULL, "Cannot open " + std::string(path) + "!");
+	return fo;
+}
+
+// Writes the EM results as row-oriented text files named after imdName:
+//   <imdName>.iso_res    isoform-level rows,
+//   <imdName>.allele_res allele-level rows (only when the reference is allele-specific),
+//   <imdName>.gene_res   gene-level rows.
+// Every row holds one quantity for all entries, tab-separated.
+//   M           number of transcripts; per-transcript arrays are indexed 1..M
+//   refName     reference prefix; "<refName>.grp" (and .gt/.ta if present) are loaded
+//   theta, eel  inputs to calcExpressionValues()
+//   counts      per-transcript counts, indexed like eel
+//   appendNames when true, non-empty gene/transcript names are appended to the IDs as "_name"
+// Each output file is now checked after fopen(); before, a failed open led straight to
+// fprintf() on a NULL stream.
+void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts& transcripts, std::vector<double>& theta, std::vector<double>& eel, double* counts, bool appendNames) {
+	char outF[STRLEN];
+	FILE *fo;
+
+	int m;
+	GroupInfo gi;
+	char groupF[STRLEN];
+
+	std::vector<int> tlens;
+	std::vector<double> fpkm, tpm, isopct;
+	std::vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
+
+	// Load group info
+	snprintf(groupF, sizeof(groupF), "%s.grp", refName);
+	gi.load(groupF);
+	m = gi.getm();
+
+	// For allele-specific expression
+	int m_trans = 0;
+	GroupInfo gt, ta;
+	std::vector<double> trans_lens, trans_eels, trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct;
+
+	bool alleleS = isAlleleSpecific(refName, &gt, &ta); // if allele-specific
+
+	calcExpressionValues(M, theta, eel, tpm, fpkm);
+
+	//calculate IsoPct, etc.
+	isopct.assign(M + 1, 0.0);
+	tlens.assign(M + 1, 0);
+
+	glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
+	gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+	for (int i = 0; i < m; i++) {
+		int b = gi.spAt(i), e = gi.spAt(i + 1);
+		for (int j = b; j < e; j++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(j);
+			tlens[j] = transcript.getLength();
+
+			gene_counts[i] += counts[j];
+			gene_tpm[i] += tpm[j];
+			gene_fpkm[i] += fpkm[j];
+		}
+
+		if (gene_tpm[i] < EPSILON) {
+			// unexpressed gene: length and effective length are plain averages over its members
+			double frac = 1.0 / (e - b);
+			for (int j = b; j < e; j++) {
+				glens[i] += tlens[j] * frac;
+				gene_eels[i] += eel[j] * frac;
+			}
+		}
+		else {
+			// expressed gene: averages weighted by each member's share of the gene TPM
+			for (int j = b; j < e; j++) {
+				isopct[j] = gene_tpm[i] > EPSILON ? tpm[j] / gene_tpm[i] : 0.0;
+				glens[i] += tlens[j] * isopct[j];
+				gene_eels[i] += eel[j] * isopct[j];
+			}
+		}
+	}
+
+	if (alleleS) {
+		m_trans = ta.getm();
+		ta_pct.assign(M + 1, 0.0);
+		trans_lens.assign(m_trans, 0.0); trans_eels.assign(m_trans, 0.0);
+		trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0);
+
+		for (int i = 0; i < m_trans; i++) {
+			int b = ta.spAt(i), e = ta.spAt(i + 1);
+			for (int j = b; j < e; j++) {
+				trans_counts[i] += counts[j];
+				trans_tpm[i] += tpm[j];
+				trans_fpkm[i] += fpkm[j];
+			}
+
+			if (trans_tpm[i] < EPSILON) {
+				double frac = 1.0 / (e - b);
+				for (int j = b; j < e; j++) {
+					trans_lens[i] += tlens[j] * frac;
+					trans_eels[i] += eel[j] * frac;
+				}
+			}
+			else {
+				for (int j = b; j < e; j++) {
+					ta_pct[j] = trans_tpm[i] > EPSILON ? tpm[j] / trans_tpm[i] : 0.0;
+					trans_lens[i] += tlens[j] * ta_pct[j];
+					trans_eels[i] += eel[j] * ta_pct[j];
+				}
+			}
+		}
+
+		gt_pct.assign(m_trans, 0.0);
+		for (int i = 0; i < m; i++)
+			if (gene_tpm[i] >= EPSILON) {
+				int b = gt.spAt(i), e = gt.spAt(i + 1);
+				for (int j = b; j < e; j++) gt_pct[j] = gene_tpm[i] > EPSILON ? trans_tpm[j] / gene_tpm[i] : 0.0;
+			}
+	}
+
+	if (!alleleS) {
+		//isoform level results
+		snprintf(outF, sizeof(outF), "%s.iso_res", imdName);
+		fo = openForWriteEM(outF);
+		for (int i = 1; i <= M; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(i);
+
+			fprintf(fo, "%s", transcript.getTranscriptID().c_str());
+			if (appendNames && transcript.getTranscriptName() != "")
+				fprintf(fo, "_%s", transcript.getTranscriptName().c_str());
+			fprintf(fo, "%c", (i < M ? '\t' : '\n'));
+		}
+		for (int i = 1; i <= M; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(i);
+
+			fprintf(fo, "%s", transcript.getGeneID().c_str());
+			if (appendNames && transcript.getGeneName() != "")
+				fprintf(fo, "_%s", transcript.getGeneName().c_str());
+			fprintf(fo, "%c", (i < M ? '\t' : '\n'));
+		}
+		for (int i = 1; i <= M; i++)
+			fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
+		writeScaledRowEM(fo, eel, 1, M);
+		writeScaledRowEM(fo, counts, 1, M);
+		writeScaledRowEM(fo, tpm, 1, M);
+		writeScaledRowEM(fo, fpkm, 1, M);
+		writeScaledRowEM(fo, isopct, 1, M, 1e2);
+		fclose(fo);
+	}
+	else {
+		// allele level results
+		snprintf(outF, sizeof(outF), "%s.allele_res", imdName);
+		fo = openForWriteEM(outF);
+		for (int i = 1; i <= M; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(i);
+			fprintf(fo, "%s%c", transcript.getSeqName().c_str(), (i < M ? '\t' : '\n'));
+		}
+		for (int i = 1; i <= M; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(i);
+			fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
+		}
+		for (int i = 1; i <= M; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(i);
+			fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
+		}
+		for (int i = 1; i <= M; i++)
+			fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
+		writeScaledRowEM(fo, eel, 1, M);
+		writeScaledRowEM(fo, counts, 1, M);
+		writeScaledRowEM(fo, tpm, 1, M);
+		writeScaledRowEM(fo, fpkm, 1, M);
+		writeScaledRowEM(fo, ta_pct, 1, M, 1e2);
+		writeScaledRowEM(fo, isopct, 1, M, 1e2);
+		fclose(fo);
+
+		// isoform level results
+		snprintf(outF, sizeof(outF), "%s.iso_res", imdName);
+		fo = openForWriteEM(outF);
+		for (int i = 0; i < m_trans; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i));
+			fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < m_trans - 1 ? '\t' : '\n'));
+		}
+		for (int i = 0; i < m_trans; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i));
+			fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m_trans - 1 ? '\t' : '\n'));
+		}
+		writeScaledRowEM(fo, trans_lens, 0, m_trans - 1);
+		writeScaledRowEM(fo, trans_eels, 0, m_trans - 1);
+		writeScaledRowEM(fo, trans_counts, 0, m_trans - 1);
+		writeScaledRowEM(fo, trans_tpm, 0, m_trans - 1);
+		writeScaledRowEM(fo, trans_fpkm, 0, m_trans - 1);
+		writeScaledRowEM(fo, gt_pct, 0, m_trans - 1, 1e2);
+		fclose(fo);
+	}
+
+	//gene level results
+	snprintf(outF, sizeof(outF), "%s.gene_res", imdName);
+	fo = openForWriteEM(outF);
+	for (int i = 0; i < m; i++) {
+		const Transcript& transcript = transcripts.getTranscriptAt(gi.spAt(i));
+
+		fprintf(fo, "%s", transcript.getGeneID().c_str());
+		if (appendNames && transcript.getGeneName() != "")
+			fprintf(fo, "_%s", transcript.getGeneName().c_str());
+		fprintf(fo, "%c", (i < m - 1 ? '\t' : '\n'));
+	}
+	for (int i = 0; i < m; i++) {
+		int b = gi.spAt(i), e = gi.spAt(i + 1);
+		std::string curtid = "", tid;
+		// comma-separated transcript IDs; consecutive entries with the same ID are listed once
+		for (int j = b; j < e; j++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(j);
+			tid = transcript.getTranscriptID();
+			if (curtid != tid) {
+				if (curtid != "") fprintf(fo, ",");
+				fprintf(fo, "%s", tid.c_str());
+				if (appendNames && transcript.getTranscriptName() != "")
+					fprintf(fo, "_%s", transcript.getTranscriptName().c_str());
+				curtid = tid;
+			}
+		}
+		fprintf(fo, "%c", (i < m - 1 ? '\t' : '\n'));
+	}
+	writeScaledRowEM(fo, glens, 0, m - 1);
+	writeScaledRowEM(fo, gene_eels, 0, m - 1);
+	writeScaledRowEM(fo, gene_counts, 0, m - 1);
+	writeScaledRowEM(fo, gene_tpm, 0, m - 1);
+	writeScaledRowEM(fo, gene_fpkm, 0, m - 1);
+	fclose(fo);
+
+	if (verbose) { printf("Expression Results are written!\n"); }
+}
+
+// Appends the Gibbs-based estimates as extra rows to the existing result files
+// "<imdName>.iso_res", "<imdName>.gene_res" and, when alleleS is set, "<imdName>.allele_res"
+// (all opened in append mode). Each row is tab-separated and ends with a newline.
+//   M, m, m_trans  number of per-transcript entries (indexed 1..M), of groups in gi, and of groups in ta
+//   gi, gt, ta     group boundaries, read through spAt(); gt and ta are only used when alleleS is true
+//   pme_c, pme_fpkm, pme_tpm  per-transcript values that are written out and summed per group
+//   pve_c, pve_c_genes, pve_c_trans  values whose square roots are written out
+// Aborts through general_assert if an output file cannot be opened.
+void writeResultsGibbs(int M, int m, int m_trans, GroupInfo& gi, GroupInfo &gt, GroupInfo &ta, bool alleleS, char* imdName, std::vector<double>& pme_c, std::vector<double>& pme_fpkm, std::vector<double>& pme_tpm, std::vector<double>& pve_c, std::vector<double>& pve_c_genes, std::vector<double>& pve_c_trans) {
+ char outF[STRLEN];
+ FILE *fo;
+
+ std::vector<double> isopct;
+ std::vector<double> gene_counts, gene_tpm, gene_fpkm;
+
+ // For allele-specific expression
+ std::vector<double> trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct;
+
+ //calculate IsoPct, etc.
+ isopct.assign(M + 1, 0.0);
+ gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+ // Sum the per-transcript values over each group of gi; isopct is the member's share of its group's TPM
+ for (int i = 0; i < m; i++) {
+ int b = gi.spAt(i), e = gi.spAt(i + 1);
+ for (int j = b; j < e; j++) {
+ gene_counts[i] += pme_c[j];
+ gene_tpm[i] += pme_tpm[j];
+ gene_fpkm[i] += pme_fpkm[j];
+ }
+ // groups with (almost) no TPM keep isopct == 0 instead of dividing by ~0
+ if (gene_tpm[i] < EPSILON) continue;
+ for (int j = b; j < e; j++)
+ isopct[j] = pme_tpm[j] / gene_tpm[i];
+ }
+
+ if (alleleS) {
+ ta_pct.assign(M + 1, 0.0);
+ trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0);
+
+ // Same aggregation over the groups of ta
+ for (int i = 0; i < m_trans; i++) {
+ int b = ta.spAt(i), e = ta.spAt(i + 1);
+ for (int j = b; j < e; j++) {
+ trans_counts[i] += pme_c[j];
+ trans_tpm[i] += pme_tpm[j];
+ trans_fpkm[i] += pme_fpkm[j];
+ }
+ if (trans_tpm[i] < EPSILON) continue;
+ for (int j = b; j < e; j++)
+ ta_pct[j] = pme_tpm[j] / trans_tpm[i];
+ }
+
+ // gt_pct[j]: share of the group-i TPM held by entry j of trans_tpm, with j ranging over gt's group i
+ gt_pct.assign(m_trans, 0.0);
+ for (int i = 0; i < m; i++)
+ if (gene_tpm[i] >= EPSILON) {
+ int b = gt.spAt(i), e = gt.spAt(i + 1);
+ for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i];
+ }
+ }
+
+ if (!alleleS) {
+ //isoform level results
+ sprintf(outF, "%s.iso_res", imdName);
+ fo = fopen(outF, "a");
+ general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+ // Row order: pme_c, sqrt(pve_c), pme_tpm, pme_fpkm, isopct as a percentage
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", sqrt(pve_c[i]), (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+ fclose(fo);
+ }
+ else {
+ //allele level results
+ sprintf(outF, "%s.allele_res", imdName);
+ fo = fopen(outF, "a");
+ general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+ // Row order: pme_c, sqrt(pve_c), pme_tpm, pme_fpkm, ta_pct and isopct as percentages
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", sqrt(pve_c[i]), (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", ta_pct[i] * 1e2, (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+ fclose(fo);
+
+ //isoform level results
+ sprintf(outF, "%s.iso_res", imdName);
+ fo = fopen(outF, "a");
+ general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+ // Row order: trans_counts, sqrt(pve_c_trans), trans_tpm, trans_fpkm, gt_pct as a percentage
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_counts[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", sqrt(pve_c_trans[i]), (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_tpm[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_fpkm[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", gt_pct[i] * 1e2, (i < m_trans - 1 ? '\t' : '\n'));
+ fclose(fo);
+ }
+
+ //gene level results
+ sprintf(outF, "%s.gene_res", imdName);
+ fo = fopen(outF, "a");
+ general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+ // Row order: gene_counts, sqrt(pve_c_genes), gene_tpm, gene_fpkm
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", sqrt(pve_c_genes[i]), (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
+ fclose(fo);
+
+ if (verbose) { printf("Gibbs based expression values are written!\n"); }
+}
+
+// Writes the simulated expression values as tab-separated tables with a header line:
+//   <outFN>.sim.isoforms.results, <outFN>.sim.genes.results and, when the reference is
+//   allele-specific, <outFN>.sim.alleles.results.
+//   M        number of transcripts; eel and counts are indexed 1..M
+//   refName  reference prefix; "<refName>.grp" (and .gt/.ta if present) are loaded
+//   counts   simulated counts; also passed to calcExpressionValues() in place of theta
+// Aborts through general_assert if a transcript whose effective length is not above EPSILON
+// has a count above EPSILON, or (new) if an output file cannot be opened. Before, a failed
+// fopen() led straight to fprintf() on a NULL stream.
+void writeResultsSimulation(int M, char* refName, char* outFN, Transcripts& transcripts, std::vector<double>& eel, std::vector<double>& counts) {
+	char outF[STRLEN];
+	FILE *fo;
+
+	int m;
+	GroupInfo gi;
+	char groupF[STRLEN];
+
+	// Load group info
+	snprintf(groupF, sizeof(groupF), "%s.grp", refName);
+	gi.load(groupF);
+	m = gi.getm();
+
+	std::vector<int> tlens;
+	std::vector<double> tpm, fpkm, isopct;
+	std::vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
+
+	// For allele-specific expression
+	int m_trans = 0;
+	GroupInfo gt, ta;
+	std::vector<double> trans_lens, trans_eels, trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct;
+
+	bool alleleS = isAlleleSpecific(refName, &gt, &ta); // if allele-specific
+
+	for (int i = 1; i <= M; i++)
+		general_assert(eel[i] > EPSILON || counts[i] <= EPSILON, "An isoform whose effective length < " + ftos(MINEEL, 6) + " got sampled!");
+
+	calcExpressionValues(M, counts, eel, tpm, fpkm);
+
+	//calculate IsoPct, etc.
+	isopct.assign(M + 1, 0.0);
+	tlens.assign(M + 1, 0);
+
+	glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
+	gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+	for (int i = 0; i < m; i++) {
+		int b = gi.spAt(i), e = gi.spAt(i + 1);
+		for (int j = b; j < e; j++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(j);
+			tlens[j] = transcript.getLength();
+
+			gene_counts[i] += counts[j];
+			gene_tpm[i] += tpm[j];
+			gene_fpkm[i] += fpkm[j];
+		}
+
+		if (gene_tpm[i] < EPSILON) {
+			// unexpressed gene: plain averages over its members
+			double frac = 1.0 / (e - b);
+			for (int j = b; j < e; j++) {
+				glens[i] += tlens[j] * frac;
+				gene_eels[i] += eel[j] * frac;
+			}
+		}
+		else {
+			// expressed gene: averages weighted by each member's share of the gene TPM
+			for (int j = b; j < e; j++) {
+				isopct[j] = tpm[j] / gene_tpm[i];
+				glens[i] += tlens[j] * isopct[j];
+				gene_eels[i] += eel[j] * isopct[j];
+			}
+		}
+	}
+
+	if (alleleS) {
+		m_trans = ta.getm();
+		ta_pct.assign(M + 1, 0.0);
+		trans_lens.assign(m_trans, 0.0); trans_eels.assign(m_trans, 0.0);
+		trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0);
+
+		for (int i = 0; i < m_trans; i++) {
+			int b = ta.spAt(i), e = ta.spAt(i + 1);
+			for (int j = b; j < e; j++) {
+				trans_counts[i] += counts[j];
+				trans_tpm[i] += tpm[j];
+				trans_fpkm[i] += fpkm[j];
+			}
+
+			if (trans_tpm[i] < EPSILON) {
+				double frac = 1.0 / (e - b);
+				for (int j = b; j < e; j++) {
+					trans_lens[i] += tlens[j] * frac;
+					trans_eels[i] += eel[j] * frac;
+				}
+			}
+			else {
+				for (int j = b; j < e; j++) {
+					ta_pct[j] = tpm[j] / trans_tpm[i];
+					trans_lens[i] += tlens[j] * ta_pct[j];
+					trans_eels[i] += eel[j] * ta_pct[j];
+				}
+			}
+		}
+
+		gt_pct.assign(m_trans, 0.0);
+		for (int i = 0; i < m; i++)
+			if (gene_tpm[i] >= EPSILON) {
+				int b = gt.spAt(i), e = gt.spAt(i + 1);
+				for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i];
+			}
+	}
+
+	//allele level
+	if (alleleS) {
+		snprintf(outF, sizeof(outF), "%s.sim.alleles.results", outFN);
+		fo = fopen(outF, "w");
+		general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+		fprintf(fo, "allele_id\ttranscript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tAlleleIsoPct\tAlleleGenePct\n");
+		for (int i = 1; i <= M; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(i);
+			fprintf(fo, "%s\t%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getSeqName().c_str(), transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
+				eel[i], counts[i], tpm[i], fpkm[i], ta_pct[i] * 1e2, isopct[i] * 1e2);
+		}
+		fclose(fo);
+	}
+
+	//isoform level
+	snprintf(outF, sizeof(outF), "%s.sim.isoforms.results", outFN);
+	fo = fopen(outF, "w");
+	general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+	fprintf(fo, "transcript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tIsoPct\n");
+	if (!alleleS) {
+		for (int i = 1; i <= M; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(i);
+			fprintf(fo, "%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
+				eel[i], counts[i], tpm[i], fpkm[i], isopct[i] * 1e2);
+		}
+	}
+	else {
+		// one line per group of ta, labelled with the IDs of the group's first member
+		for (int i = 0; i < m_trans; i++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i));
+			fprintf(fo, "%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), trans_lens[i],
+				trans_eels[i], trans_counts[i], trans_tpm[i], trans_fpkm[i], gt_pct[i] * 1e2);
+		}
+	}
+	fclose(fo);
+
+	//gene level
+	snprintf(outF, sizeof(outF), "%s.sim.genes.results", outFN);
+	fo = fopen(outF, "w");
+	general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+	fprintf(fo, "gene_id\ttranscript_id(s)\tlength\teffective_length\tcount\tTPM\tFPKM\n");
+	for (int i = 0; i < m; i++) {
+		int b = gi.spAt(i), e = gi.spAt(i + 1);
+		const std::string& gene_id = transcripts.getTranscriptAt(b).getGeneID();
+		fprintf(fo, "%s\t", gene_id.c_str());
+		std::string curtid = "", tid;
+		// comma-separated transcript IDs; consecutive entries with the same ID are listed once
+		for (int j = b; j < e; j++) {
+			tid = transcripts.getTranscriptAt(j).getTranscriptID();
+			if (curtid != tid) {
+				if (curtid != "") fprintf(fo, ",");
+				fprintf(fo, "%s", tid.c_str());
+				curtid = tid;
+			}
+		}
+		fprintf(fo, "\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", glens[i], gene_eels[i], gene_counts[i], gene_tpm[i], gene_fpkm[i]);
+	}
+	fclose(fo);
+}
+
+#endif
diff --git a/bam2readdepth.cpp b/bam2readdepth.cpp
new file mode 100644
index 0000000..d252e2e
--- /dev/null
+++ b/bam2readdepth.cpp
@@ -0,0 +1,27 @@
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <fstream>
+
+#include "my_assert.h"
+#include "wiggle.h"
+
+using namespace std;
+
+// rsem-bam2readdepth: runs build_wiggles() on the input named by argv[1], sending the
+// output through a ReadDepthWriter bound to the file named by argv[2].
+int main(int argc, char* argv[]) {
+	const bool argCountOk = (argc == 3);
+	if (!argCountOk) {
+		printf("Usage: rsem-bam2readdepth sorted_bam_input readdepth_output\n");
+		exit(-1);
+	}
+
+	char* const inputPath = argv[1];
+	char* const outputPath = argv[2];
+
+	ofstream depthFile(outputPath);
+	general_assert(depthFile.is_open(), "Cannot write to " + cstrtos(outputPath) + "!");
+
+	ReadDepthWriter writer(depthFile);
+	build_wiggles(inputPath, writer);
+
+	depthFile.close();
+	return 0;
+}
diff --git a/bam2wig.cpp b/bam2wig.cpp
new file mode 100644
index 0000000..393b08e
--- /dev/null
+++ b/bam2wig.cpp
@@ -0,0 +1,27 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "wiggle.h"
+
+using namespace std;
+
+// Prints the command-line help to stdout and terminates the program with exit(-1).
+void printUsage() {
+	// the help text has no conversion specifiers, so fputs emits the same bytes printf would
+	fputs("Usage: rsem-bam2wig sorted_alignment_file wig_output wiggle_name [--no-fractional-weight]\n", stdout);
+	fputs("sorted_alignment_file\t: Can be either in SAM/BAM/CRAM format, must be sorted\n", stdout);
+	fputs("wig_output\t\t: Output wiggle file's name, e.g. output.wig\n", stdout);
+	fputs("wiggle_name\t\t: the name of this wiggle plot\n", stdout);
+	fputs("--no-fractional-weight\t: If this is set, RSEM will not look for \"ZW\" tag and each alignment appeared in the BAM file has weight 1. Set this if your BAM file is not generated by RSEM. Please note that this option must be at the end of the command line.\n", stdout);
+	exit(-1);
+}
+
+// rsem-bam2wig: validates the arguments, records whether --no-fractional-weight was given,
+// then runs build_wiggles() on argv[1] with a UCSCWiggleTrackWriter built from argv[2] and argv[3].
+int main(int argc, char* argv[]) {
+	if (argc < 4 || argc > 5) {
+		printf("Number of arguments is not correct!\n");
+		printUsage();
+	}
+
+	// the only accepted fifth argument is the flag itself
+	const bool flagGiven = (argc == 5) && (strcmp(argv[4], "--no-fractional-weight") == 0);
+	if (argc == 5 && !flagGiven) {
+		printf("Cannot recognize option %s!\n", argv[4]);
+		printUsage();
+	}
+
+	no_fractional_weight = flagGiven;
+
+	UCSCWiggleTrackWriter track_writer(argv[2], argv[3]);
+	build_wiggles(argv[1], track_writer);
+
+	return 0;
+}
diff --git a/bc_aux.h b/bc_aux.h
new file mode 100644
index 0000000..54f4d67
--- /dev/null
+++ b/bc_aux.h
@@ -0,0 +1,120 @@
+#ifndef BC_AUX_H_
+#define BC_AUX_H_
+
+#include<map>
+
+#include <stdint.h>
+#include "htslib/sam.h"
+
+// Wraps a BAM record pointer and orders records by reference id, position, strand and CIGAR.
+// The struct has no destructor, so it never frees the record it points to.
+struct SingleEndT {
+	bam1_t *b;
+
+	SingleEndT(bam1_t *b) : b(b) {}
+
+	// Sign used by compare(): -1 when value is true, otherwise 1.
+	int getSign(bool value) const {
+		if (value) return -1;
+		return 1;
+	}
+
+	// Three-way comparison: negative if *this sorts before o, positive if after,
+	// 0 when tid, pos, strand and the whole CIGAR are equal.
+	int compare(const SingleEndT& o) const {
+		if (b->core.tid != o.b->core.tid) return getSign(b->core.tid < o.b->core.tid);
+		if (b->core.pos != o.b->core.pos) return getSign(b->core.pos < o.b->core.pos);
+
+		const int myStrand = bam_is_rev(b);
+		const int otherStrand = bam_is_rev(o.b);
+		if (myStrand != otherStrand) return getSign(myStrand < otherStrand);
+
+		// fewer CIGAR operations sorts first; equal counts are compared operation by operation
+		if (b->core.n_cigar != o.b->core.n_cigar) return getSign(b->core.n_cigar < o.b->core.n_cigar);
+		const uint32_t *mine = bam_get_cigar(b);
+		const uint32_t *theirs = bam_get_cigar(o.b);
+		const int nOps = (int)b->core.n_cigar;
+		for (int k = 0; k < nOps; ++k) {
+			if (mine[k] != theirs[k]) return getSign(mine[k] < theirs[k]);
+		}
+
+		return 0;
+	}
+
+	bool operator< (const SingleEndT& o) const {
+		return compare(o) < 0;
+	}
+};
+
+// A pair of mates ordered lexicographically: by mate1 first, then by mate2.
+struct PairedEndT {
+	SingleEndT mate1, mate2;
+
+	PairedEndT(const SingleEndT& mate1, const SingleEndT& mate2) : mate1(mate1), mate2(mate2) {
+	}
+
+	bool operator< (const PairedEndT& o) const {
+		const int firstCmp = mate1.compare(o.mate1);
+		if (firstCmp != 0) return firstCmp < 0;
+		// first mates tie, so the second mates decide
+		return mate2 < o.mate2;
+	}
+};
+
+// Accumulates a float weight per distinct alignment (single-end) or alignment pair
+// (paired-end), where "distinct" is defined by the ordering of SingleEndT / PairedEndT.
+// The first time a key is seen, the records are copied with bam_dup1() and the copies are
+// stored in the map; this class never frees them.
+class CollapseMap {
+public:
+	CollapseMap() : isPaired(false) {}
+
+	// Selects the mode and empties the map that belongs to it (the other map is left alone).
+	void init(bool isPaired) {
+		this->isPaired = isPaired;
+		if (isPaired) pmap.clear();
+		else smap.clear();
+	}
+
+	// Adds prb to the entry matching b (and b2 in paired mode), creating the entry if needed.
+	// b2 is not looked at in single-end mode.
+	void insert(bam1_t *b, bam1_t *b2, float prb) {
+		if (isPaired) {
+			pmapIter = pmap.find(PairedEndT(SingleEndT(b), SingleEndT(b2)));
+			if (pmapIter != pmap.end()) { pmapIter->second += prb; return; }
+			pmap.insert(std::make_pair(PairedEndT(SingleEndT(bam_dup1(b)), SingleEndT(bam_dup1(b2))), prb));
+			return;
+		}
+
+		smapIter = smap.find(SingleEndT(b));
+		if (smapIter != smap.end()) { smapIter->second += prb; return; }
+		smap.insert(std::make_pair(SingleEndT(bam_dup1(b)), prb));
+	}
+
+	//once this function is called, "insert" cannot be called anymore
+	// Reports through par which mode is active, rewinds the iteration used by next(),
+	// and returns whether the active map is empty.
+	bool empty(bool& par) {
+		par = isPaired;
+		if (isPaired) {
+			pmapIter = pmap.begin();
+			return pmap.empty();
+		}
+		smapIter = smap.begin();
+		return smap.empty();
+	}
+
+	// Hands out the next stored entry and advances. Returns false, leaving the outputs
+	// untouched, when the iteration is exhausted. b2 is only set in paired mode.
+	bool next(bam1_t*& b, bam1_t*& b2, float& prb) {
+		if (isPaired) {
+			if (pmapIter == pmap.end()) return false;
+			b = pmapIter->first.mate1.b;
+			b2 = pmapIter->first.mate2.b;
+			prb = pmapIter->second;
+			++pmapIter;
+			return true;
+		}
+
+		if (smapIter == smap.end()) return false;
+		b = smapIter->first.b;
+		prb = smapIter->second;
+		++smapIter;
+		return true;
+	}
+
+private:
+	bool isPaired;
+
+	std::map<SingleEndT, float> smap;
+	std::map<SingleEndT, float>::iterator smapIter;
+
+	std::map<PairedEndT, float> pmap;
+	std::map<PairedEndT, float>::iterator pmapIter;
+};
+
+#endif /* BC_AUX_H_ */
diff --git a/buildReadIndex.cpp b/buildReadIndex.cpp
new file mode 100644
index 0000000..3837079
--- /dev/null
+++ b/buildReadIndex.cpp
@@ -0,0 +1,86 @@
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<string>
+#include<fstream>
+#include<iostream>
+
+#include "utils.h"
+using namespace std;
+
+// Progress messages are printed only while this is true; main() sets it from the "quiet" argument.
+bool verbose = true;
+
+// Command-line settings: parsed in main() and handed to buildIndex().
+int gap;
+bool hasQ;
+
+// Builds "<readF>.ridx" for the read file readF.
+// The index starts with a header (number of reads, gap, number of stored positions) and is
+// followed by the stream position of every gap-th read (reads 0, gap, 2*gap, ...).
+// A read occupies 4 lines when hasQ is true and 2 lines otherwise; an incomplete read at
+// the end of the file is neither counted nor indexed.
+// Exits with an error message if gap is not positive, the index path does not fit its
+// buffer, or either file cannot be opened.
+void buildIndex(char* readF, int gap, bool hasQ) {
+	int nPos;
+	READ_INT_TYPE nReads;
+	string line;
+	char idxF[STRLEN];
+	char buf[sizeof(nReads) + sizeof(gap) + sizeof(nPos)];
+	streampos startPos;
+
+	// gap is used as a modulus below; zero would be a division by zero
+	if (gap <= 0) { fprintf(stderr, "gap must be a positive integer, but %d is given!\n", gap); exit(-1); }
+
+	const int pathLen = snprintf(idxF, sizeof(idxF), "%s.ridx", readF);
+	if (pathLen < 0 || pathLen >= (int)sizeof(idxF)) { fprintf(stderr, "The name of %s is too long!\n", readF); exit(-1); }
+
+	ifstream fin(readF);
+	if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", readF); exit(-1); }
+	ofstream fout(idxF, ios::binary);
+	if (!fout.is_open()) { fprintf(stderr, "Cannot write to %s!\n", idxF); exit(-1); }
+
+	// reserve room for the header; the real values are filled in once the counts are known
+	startPos = fout.tellp();
+	memset(buf, 0, sizeof(buf));
+	fout.write((char*)buf, sizeof(buf));
+
+	const int linesPerRead = hasQ ? 4 : 2;
+
+	nReads = 0; nPos = 0;
+	while (true) {
+		streampos pos = fin.tellg();
+
+		// consume one whole read; stop at the first line that cannot be read
+		bool complete = true;
+		for (int k = 0; k < linesPerRead && complete; ++k) {
+			if (!getline(fin, line)) complete = false;
+		}
+		if (!complete) break;
+
+		if (nReads % gap == 0) {
+			++nPos;
+			fout.write((char*)&pos, sizeof(pos));
+		}
+		++nReads;
+
+		if (verbose && nReads % 1000000 == 0) { cout<< "FIN "<< nReads<< endl; }
+	}
+
+	fout.seekp(startPos);
+	fout.write((char*)&nReads, sizeof(nReads));
+	fout.write((char*)&gap, sizeof(gap));
+	fout.write((char*)&nPos, sizeof(nPos));
+
+	fin.close();
+	fout.close();
+
+	if (verbose) { cout<< "Build Index "<< readF<< " is Done!"<< endl; }
+}
+
+// rsem-build-read-index: parses gap, hasQ and quiet, then builds an index for every read
+// file named on the rest of the command line.
+int main(int argc, char* argv[]) {
+	const int firstFileArg = 4;
+
+	if (argc <= firstFileArg) {
+		printf("Usage : rsem-build-read-index gap hasQ quiet readFile1, readFile2, ...\n");
+		exit(-1);
+	}
+
+	gap = atoi(argv[1]);
+	hasQ = (atoi(argv[2]) != 0);
+	verbose = (atoi(argv[3]) == 0); // the third argument is "quiet", so it is inverted
+
+	int fileArg = firstFileArg;
+	while (fileArg < argc) {
+		buildIndex(argv[fileArg], gap, hasQ);
+		++fileArg;
+	}
+
+	return 0;
+}
diff --git a/calcCI.cpp b/calcCI.cpp
new file mode 100644
index 0000000..4edcc35
--- /dev/null
+++ b/calcCI.cpp
@@ -0,0 +1,581 @@
+#include<ctime>
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<fstream>
+#include<algorithm>
+#include<vector>
+#include<pthread.h>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "sampling.h"
+
+#include "Model.h"
+#include "SingleModel.h"
+#include "SingleQModel.h"
+#include "PairedEndModel.h"
+#include "PairedEndQModel.h"
+
+#include "Refs.h"
+#include "GroupInfo.h"
+#include "WriteResults.h"
+
+#include "Buffer.h"
+
+using namespace std;
+
+bool verbose = true;
+
+struct Params { // per-thread arguments handed to sample_theta_from_c
+ int no; // thread number, printed in progress messages
+ FILE *fi; // count-vector input file this thread reads with fscanf
+ engine_type *engine; // random engine the thread's gamma generators are built on
+ const double *mw; // array returned by model.getMW(); each gamma draw is divided by mw[j]
+};
+
+// Work assignment of one calcCI_batch thread: it handles the genes in the
+// half-open range [start_gene_id, end_gene_id).
+struct CIParams {
+  int no; // thread number, printed in progress messages
+  int start_gene_id; // first gene handled by the thread
+  int end_gene_id; // one past the last gene handled by the thread
+};
+
+// One credibility-interval result; every field starts at zero.
+struct CIType {
+  float lb; // lower bound of the interval [lb, ub]
+  float ub; // upper bound of the interval [lb, ub]
+  float cqv; // coefficient of quartile variation
+
+  CIType() : lb(0.0), ub(0.0), cqv(0.0) {}
+};
+
+int model_type; // first integer of the model file; main dispatches on it (0-3) to pick the model class
+
+double pseudoC; // pseudo count added to every count before sampling, default is 1 (--pseudo-count)
+
+int nMB; // parsed from argv[7] and handed to the Buffer constructor
+double confidence; // parsed from argv[4]; calcCI derives from it how many samples may fall outside an interval
+int nCV, nSpC, nSamples; // nCV: number of count vectors; nSpC: number of theta vectors sampled per count vector; nSamples: nCV * nSpC
+int nThreads; // thread count requested with -p, default 1
+
+float *l_bars; // nSamples floats shared with Buffer; calcCI_batch reads l_bars[k] to turn TPM sample k into FPKM
+
+char cvsF[STRLEN], tmpF[STRLEN], command[STRLEN]; // cvsF: "<imdName>.countvectors", thread i reads cvsF + i; tmpF: "<imdName>.tmp" sample file; command: not used in this file's visible code
+
+CIType *tpm, *fpkm; // per-transcript results, M + 1 entries, written out for indices 1..M
+CIType *iso_tpm = NULL, *iso_fpkm = NULL; // isoform-level results, m_trans entries; allocated only when alleleS is true
+CIType *gene_tpm, *gene_fpkm; // gene-level results, m entries
+
+int M, m; // M = refs.getM(), m = gi.getm()
+Refs refs; // loaded from "<refName>.seq"
+GroupInfo gi; // loaded from "<refName>.grp"; gi.spAt(g) .. gi.spAt(g + 1) - 1 are the transcripts of gene g
+char refName[STRLEN], imdName[STRLEN], statName[STRLEN]; // copies of argv[1], argv[2], argv[3]
+char modelF[STRLEN], groupF[STRLEN], refF[STRLEN]; // "<statName>.model", "<refName>.grp", "<refName>.seq"
+
+bool alleleS; // result of isAlleleSpecific(refName, NULL, &ta)
+int m_trans; // ta.getm(); set only when alleleS is true
+GroupInfo ta; // filled by isAlleleSpecific; ta.gidAt(j) gives the isoform that entry j belongs to
+
+vector<double> eel; //expected effective lengths, filled by calcExpectedEffectiveLengths
+
+Buffer *buffer; // receives every sampled vector in Phase I; deleted at the end of Phase I to force the final write
+
+bool quiet; // set by -q; verbose is !quiet
+
+Params *paramsArray; // one entry per Phase I thread
+pthread_t *threads; // thread handles, reallocated for each phase
+pthread_attr_t attr; // attribute object used to create joinable threads
+int rc; // return code of the latest pthread call
+
+bool hasSeed; // true when --seed was given
+seedType seed; // seed parsed digit by digit from the --seed argument
+
+CIParams *ciParamsArray; // one entry per Phase II thread
+
+/**
+ * Thread routine of Phase I. Reads count vectors (M + 1 integers each) from
+ * params->fi and, for every vector, draws nSpC samples: each theta[j] is a
+ * gamma draw with shape cvec[j] + pseudoC divided by mw[j], the vector is
+ * normalized, converted to TPM through the expected effective lengths, and
+ * written to the shared buffer together with the sample's mean effective
+ * length.
+ *
+ * @param arg pointer to this thread's Params
+ * @return always NULL
+ */
+void* sample_theta_from_c(void* arg) {
+  Params *params = (Params*)arg;
+  FILE *fi = params->fi;
+  const double *mw = params->mw;
+
+  int *cvec = new int[M + 1];
+  double *theta = new double[M + 1];
+  gamma_dist **gammas = new gamma_dist*[M + 1];
+  gamma_generator **rgs = new gamma_generator*[M + 1];
+  float *tpm = new float[M + 1]; // local scratch; hides the global CIType* tpm
+  float l_bar; // the mean transcript length over the sample
+
+  int cnt = 0;
+  while (fscanf(fi, "%d", &cvec[0]) == 1) {
+    for (int j = 1; j <= M; j++) {
+      // The read must not sit inside assert(): it would disappear under NDEBUG
+      int nRead = fscanf(fi, "%d", &cvec[j]);
+      general_assert(nRead == 1, "Thread " + itos(params->no) + " cannot read a complete count vector!");
+    }
+    assert(cvec[0] >= 0);
+
+    ++cnt;
+
+    // One gamma generator per entry with a non-negative count
+    for (int j = 0; j <= M; j++) {
+      gammas[j] = NULL; rgs[j] = NULL;
+      if (cvec[j] >= 0) {
+        gammas[j] = new gamma_dist(cvec[j] + pseudoC);
+        rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
+      }
+    }
+
+    for (int i = 0; i < nSpC; i++) {
+      double sum = 0.0;
+      for (int j = 0; j <= M; j++) {
+        theta[j] = ((j == 0 || (cvec[j] >= 0 && eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
+        sum += theta[j];
+      }
+      assert(sum >= EPSILON);
+      for (int j = 0; j <= M; j++) theta[j] /= sum;
+
+      sum = 0.0;
+      tpm[0] = 0.0;
+      for (int j = 1; j <= M; j++) {
+        if (eel[j] >= EPSILON) {
+          tpm[j] = theta[j] / eel[j];
+          sum += tpm[j];
+        }
+        else {
+          assert(theta[j] < EPSILON);
+          // Without this the entry stays uninitialized, yet it is normalized,
+          // folded into l_bar and written to the buffer below
+          tpm[j] = 0.0;
+        }
+      }
+      assert(sum >= EPSILON);
+      l_bar = 0.0; // store mean effective length of the sample
+      for (int j = 1; j <= M; j++) { tpm[j] /= sum; l_bar += tpm[j] * eel[j]; tpm[j] *= 1e6; }
+      buffer->write(l_bar, tpm + 1); // omit the first element in tpm
+    }
+
+    for (int j = 0; j <= M; j++) {
+      if (gammas[j] != NULL) delete gammas[j];
+      if (rgs[j] != NULL) delete rgs[j];
+    }
+
+    if (verbose && cnt % 100 == 0) { printf("Thread %d, %d count vectors are processed!\n", params->no, cnt); }
+  }
+
+  delete[] cvec;
+  delete[] theta;
+  delete[] gammas;
+  delete[] rgs;
+  delete[] tpm;
+
+  return NULL;
+}
+
+/**
+ * Phase I driver. Loads the model from modelF, computes the expected
+ * effective lengths, then runs min(nThreads, nCV) threads of
+ * sample_theta_from_c; thread i reads the count-vector file named cvsF
+ * followed by i. All samples go through the shared Buffer, which is deleted
+ * at the end so that whatever it still holds is written to disk.
+ */
+template<class ModelType>
+void sample_theta_vectors_from_count_vectors() {
+  ModelType model;
+  model.read(modelF);
+  calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
+
+  int num_threads = min(nThreads, nCV);
+  // A non-positive count would reach new[] below as an invalid array size
+  general_assert(num_threads > 0, "The number of threads and the number of count vectors must be positive!");
+
+  buffer = new Buffer(nMB, nSamples, M, l_bars, tmpF);
+
+  paramsArray = new Params[num_threads];
+  threads = new pthread_t[num_threads];
+
+  char inpF[STRLEN];
+  hasSeed ? engineFactory::init(seed) : engineFactory::init();
+  for (int i = 0; i < num_threads; i++) {
+    paramsArray[i].no = i;
+    snprintf(inpF, sizeof(inpF), "%s%d", cvsF, i);
+    paramsArray[i].fi = fopen(inpF, "r");
+    // A missing file would otherwise crash inside fscanf on a NULL stream
+    general_assert(paramsArray[i].fi != NULL, "Cannot open " + cstrtos(inpF) + "!");
+    paramsArray[i].engine = engineFactory::new_engine();
+    paramsArray[i].mw = model.getMW();
+  }
+  engineFactory::finish();
+
+  /* set thread attribute to be joinable */
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+  for (int i = 0; i < num_threads; i++) {
+    rc = pthread_create(&threads[i], &attr, &sample_theta_from_c, (void*)(&paramsArray[i]));
+    pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) in sample_theta_vectors_from_count_vectors!");
+  }
+  for (int i = 0; i < num_threads; i++) {
+    rc = pthread_join(threads[i], NULL);
+    pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) in sample_theta_vectors_from_count_vectors!");
+  }
+
+  /* destroy attribute */
+  pthread_attr_destroy(&attr);
+  delete[] threads;
+
+  for (int i = 0; i < num_threads; i++) {
+    fclose(paramsArray[i].fi);
+    delete paramsArray[i].engine;
+  }
+  delete[] paramsArray;
+
+  delete buffer; // Must delete here, force the content left in the buffer be written into the disk
+
+  if (verbose) { printf("Sampling is finished!\n"); }
+}
+
+void calcCI(int nSamples, float *samples, CIType& ci) { // fills ci with the narrowest interval found over the nSamples values and with their coefficient of quartile variation; samples is sorted in place
+ int p, q; // indices into the sorted samples: p marks the lower bound, q the upper bound
+ int newp, newq; // candidate next values of p and q, moved past runs of equal samples
+ int threshold = nSamples - (int(confidence * nSamples - 1e-8) + 1); // largest number of samples allowed to fall outside the interval
+ int nOutside = 0; // number of samples currently outside [samples[p], samples[q]]
+
+ // sort values in ascending order; the caller's array is reordered
+ sort(samples, samples + nSamples);
+
+ // calculate credibility interval
+ p = 0; q = nSamples - 1;
+ newq = nSamples - 1;
+ do { // lower q one run of equal values at a time while the samples above it still fit within threshold
+ q = newq;
+ while (newq > 0 && samples[newq - 1] == samples[newq]) newq--;
+ newq--;
+ } while (newq >= 0 && nSamples - (newq + 1) <= threshold);
+
+ nOutside = nSamples - (q + 1); // p is still 0, so only the samples above q are outside
+
+ ci.lb = -1e30; ci.ub = 1e30; // sentinel wider than any real interval, so the first candidate is accepted
+ do { // slide the window upward, remembering the narrowest one
+ if (samples[q] - samples[p] < ci.ub - ci.lb) {
+ ci.lb = samples[p];
+ ci.ub = samples[q];
+ }
+
+ newp = p;
+ while (newp < nSamples - 1 && samples[newp] == samples[newp + 1]) newp++; // skip the run of values equal to samples[p]
+ newp++;
+ if (newp <= threshold) {
+ nOutside += newp - p; // the skipped samples now lie below the interval
+ p = newp;
+ while (nOutside > threshold && q < nSamples - 1) { // too many outside: extend the upper end by whole runs of equal values
+ newq = q + 1;
+ while (newq < nSamples - 1 && samples[newq] == samples[newq + 1]) newq++;
+ nOutside -= newq - q;
+ q = newq;
+ }
+ assert(nOutside <= threshold);
+ }
+ else p = newp; // p now exceeds threshold, so the loop condition below ends the search
+ } while (p <= threshold);
+
+
+ // calculate coefficient of quartile variation
+ float Q1, Q3; // the first and third quartiles
+
+ // calculate Tukey's hinges
+ int quotient = nSamples / 4;
+ int residue = nSamples % 4; // selects which sorted entries the quartiles are taken from
+
+ if (residue == 0) { // each quartile is the mean of two neighbouring entries
+ Q1 = (samples[quotient - 1] + samples[quotient]) / 2.0;
+ Q3 = (samples[3 * quotient - 1] + samples[3 * quotient]) / 2.0;
+ }
+ else if (residue == 3) { // again the mean of two neighbouring entries, shifted by one
+ Q1 = (samples[quotient] + samples[quotient + 1]) / 2.0;
+ Q3 = (samples[quotient * 3 + 1] + samples[quotient * 3 + 2]) / 2.0;
+ }
+ else { // residue 1 or 2: each quartile is a single entry
+ Q1 = samples[quotient];
+ Q3 = samples[3 * quotient];
+ }
+
+ ci.cqv = (Q3 - Q1 > 0.0 ? (Q3 - Q1) / (Q3 + Q1) : 0.0); // 0 when the quartiles coincide, which also avoids dividing by Q3 + Q1 == 0
+}
+
+/**
+ * Thread routine of Phase II. For every gene in
+ * [start_gene_id, end_gene_id) it reads, from tmpF, the nSamples TPM values
+ * of each transcript of the gene, derives the FPKM samples as
+ * 1e3 / l_bars[k] * TPM, and fills tpm[j] / fpkm[j] with calcCI. Per-sample
+ * sums give the gene-level intervals and, in allele-specific mode, the
+ * isoform-level intervals. A gene or isoform with a single member copies
+ * that member's result instead of recomputing it.
+ *
+ * @param arg pointer to this thread's CIParams
+ * @return always NULL
+ */
+void* calcCI_batch(void* arg) {
+  CIParams *ciParams = (CIParams*)arg;
+  int curtid = -1, curaid = -1, tid; // current isoform, its first entry, isoform of entry j
+
+  float *tsamples = new float[nSamples];
+  float *fsamples = new float[nSamples];
+  float *itsamples = NULL, *ifsamples = NULL;
+  if (alleleS) {
+    itsamples = new float[nSamples];
+    ifsamples = new float[nSamples];
+    memset(itsamples, 0, FLOATSIZE * nSamples);
+    memset(ifsamples, 0, FLOATSIZE * nSamples);
+  }
+  float *gtsamples = new float[nSamples];
+  float *gfsamples = new float[nSamples];
+
+  ifstream fin;
+  fin.open(tmpF, ios::binary);
+  general_assert(fin.is_open(), "Cannot open " + cstrtos(tmpF) + "!");
+  // minus 1 here for that theta0 is not written!
+  streampos pos = streampos(gi.spAt(ciParams->start_gene_id) - 1) * nSamples * FLOATSIZE;
+  fin.seekg(pos, ios::beg);
+
+  int cnt = 0;
+  for (int i = ciParams->start_gene_id; i < ciParams->end_gene_id; i++) {
+    int b = gi.spAt(i), e = gi.spAt(i + 1);
+    memset(gtsamples, 0, FLOATSIZE * nSamples);
+    memset(gfsamples, 0, FLOATSIZE * nSamples);
+    for (int j = b; j < e; j++) {
+      if (alleleS) {
+        tid = ta.gidAt(j);
+        if (curtid != tid) {
+          if (curtid >= 0) {
+            if (j - curaid > 1) {
+              calcCI(nSamples, itsamples, iso_tpm[curtid]);
+              calcCI(nSamples, ifsamples, iso_fpkm[curtid]);
+            }
+            else {
+              iso_tpm[curtid] = tpm[curaid];
+              iso_fpkm[curtid] = fpkm[curaid];
+            }
+            // Start the next isoform from zero. Otherwise its sums would
+            // still hold the previous isoform's values, which calcCI has
+            // moreover just reordered by sorting in place.
+            memset(itsamples, 0, FLOATSIZE * nSamples);
+            memset(ifsamples, 0, FLOATSIZE * nSamples);
+          }
+          curtid = tid;
+          curaid = j;
+        }
+      }
+
+      for (int k = 0; k < nSamples; k++) {
+        fin.read((char*)(&tsamples[k]), FLOATSIZE);
+        fsamples[k] = 1e3 / l_bars[k] * tsamples[k];
+        if (alleleS) {
+          itsamples[k] += tsamples[k];
+          ifsamples[k] += fsamples[k];
+        }
+        gtsamples[k] += tsamples[k];
+        gfsamples[k] += fsamples[k];
+      }
+      calcCI(nSamples, tsamples, tpm[j]);
+      calcCI(nSamples, fsamples, fpkm[j]);
+    }
+
+    if (e - b > 1) {
+      calcCI(nSamples, gtsamples, gene_tpm[i]);
+      calcCI(nSamples, gfsamples, gene_fpkm[i]);
+    }
+    else {
+      gene_tpm[i] = tpm[b];
+      gene_fpkm[i] = fpkm[b];
+    }
+
+    ++cnt;
+    if (verbose && cnt % 1000 == 0) { printf("In thread %d, %d genes are processed for CI calculation!\n", ciParams->no, cnt); }
+  }
+  fin.close();
+
+  // Flush the isoform that was still open when the gene range ended
+  if (alleleS && (curtid >= 0)) {
+    if (gi.spAt(ciParams->end_gene_id) - curaid > 1) {
+      calcCI(nSamples, itsamples, iso_tpm[curtid]);
+      calcCI(nSamples, ifsamples, iso_fpkm[curtid]);
+    }
+    else {
+      iso_tpm[curtid] = tpm[curaid];
+      iso_fpkm[curtid] = fpkm[curaid];
+    }
+  }
+
+  delete[] tsamples;
+  delete[] fsamples;
+  if (alleleS) {
+    delete[] itsamples;
+    delete[] ifsamples;
+  }
+  delete[] gtsamples;
+  delete[] gfsamples;
+
+  return NULL;
+}
+
+// Append six tab-separated lines to the file at path: lb, ub and cqv of the
+// n entries of tpmCIs, then the same three lines for fpkmCIs.
+static void appendCIs(char* path, const CIType* tpmCIs, const CIType* fpkmCIs, int n) {
+  FILE *fo = fopen(path, "a");
+  // Without this check a failed open would pass NULL to fprintf
+  general_assert(fo != NULL, "Cannot open " + cstrtos(path) + "!");
+
+  const CIType* groups[2] = { tpmCIs, fpkmCIs };
+  for (int g = 0; g < 2; g++) {
+    const CIType* cis = groups[g];
+    for (int i = 0; i < n; i++) fprintf(fo, "%.6g%c", cis[i].lb, (i < n - 1 ? '\t' : '\n'));
+    for (int i = 0; i < n; i++) fprintf(fo, "%.6g%c", cis[i].ub, (i < n - 1 ? '\t' : '\n'));
+    for (int i = 0; i < n; i++) fprintf(fo, "%.6g%c", cis[i].cqv, (i < n - 1 ? '\t' : '\n'));
+  }
+
+  fclose(fo);
+}
+
+/**
+ * Phase II driver. Splits the genes into contiguous ranges, runs one
+ * calcCI_batch thread per range, then appends the results to
+ * "<imdName>.iso_res" ("<imdName>.allele_res" plus "<imdName>.iso_res" in
+ * allele-specific mode) and "<imdName>.gene_res".
+ *
+ * @param imdName prefix of the result files to append to
+ */
+void calculate_credibility_intervals(char* imdName) {
+  char outF[STRLEN];
+  int num_threads = nThreads;
+
+  tpm = new CIType[M + 1];
+  fpkm = new CIType[M + 1];
+  if (alleleS) {
+    iso_tpm = new CIType[m_trans];
+    iso_fpkm = new CIType[m_trans];
+  }
+  gene_tpm = new CIType[m];
+  gene_fpkm = new CIType[m];
+
+  assert(M > 0);
+  if (num_threads < 1) num_threads = 1; // "M / num_threads" below must not divide by zero
+  int quotient = M / num_threads;
+  if (quotient < 1) { num_threads = M; quotient = 1; }
+  int cur_gene_id = 0;
+  int num_isoforms = 0;
+
+  // A just so so strategy for paralleling: a thread takes genes until it
+  // holds at least quotient transcripts, while one gene is kept back for each
+  // remaining thread as long as enough genes are left; the last thread takes
+  // everything that remains
+  ciParamsArray = new CIParams[num_threads];
+  for (int i = 0; i < num_threads; i++) {
+    ciParamsArray[i].no = i;
+    ciParamsArray[i].start_gene_id = cur_gene_id;
+    num_isoforms = 0;
+
+    while ((m - cur_gene_id > num_threads - i - 1) && (i == num_threads - 1 || num_isoforms < quotient)) {
+      num_isoforms += gi.spAt(cur_gene_id + 1) - gi.spAt(cur_gene_id);
+      ++cur_gene_id;
+    }
+
+    ciParamsArray[i].end_gene_id = cur_gene_id;
+  }
+
+  threads = new pthread_t[num_threads];
+
+  /* set thread attribute to be joinable */
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+  // paralleling
+  for (int i = 0; i < num_threads; i++) {
+    rc = pthread_create(&threads[i], &attr, &calcCI_batch, (void*)(&ciParamsArray[i]));
+    pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) in calculate_credibility_intervals!");
+  }
+  for (int i = 0; i < num_threads; i++) {
+    rc = pthread_join(threads[i], NULL);
+    pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) in calculate_credibility_intervals!");
+  }
+
+  // releasing resources
+
+  /* destroy attribute */
+  pthread_attr_destroy(&attr);
+  delete[] threads;
+
+  delete[] ciParamsArray;
+
+  // transcript (or allele) level results; entry 0 is skipped
+  alleleS ? sprintf(outF, "%s.allele_res", imdName) : sprintf(outF, "%s.iso_res", imdName);
+  appendCIs(outF, tpm + 1, fpkm + 1, M);
+
+  if (alleleS) {
+    //isoform level results
+    sprintf(outF, "%s.iso_res", imdName);
+    appendCIs(outF, iso_tpm, iso_fpkm, m_trans);
+  }
+
+  //gene level results
+  sprintf(outF, "%s.gene_res", imdName);
+  appendCIs(outF, gene_tpm, gene_fpkm, m);
+
+  delete[] tpm;
+  delete[] fpkm;
+  if (alleleS) {
+    delete[] iso_tpm;
+    delete[] iso_fpkm;
+  }
+  delete[] gene_tpm;
+  delete[] gene_fpkm;
+
+  if (verbose) { printf("All credibility intervals are calculated!\n"); }
+}
+
+/**
+ * Entry point of rsem-calculate-credibility-intervals.
+ * Phase I samples theta vectors from the count vectors; Phase II turns the
+ * samples into credibility intervals and appends them to the result files.
+ */
+int main(int argc, char* argv[]) {
+  if (argc < 8) {
+    printf("Usage: rsem-calculate-credibility-intervals reference_name imdName statName confidence nCV nSpC nMB [-p #Threads] [--seed seed] [--pseudo-count pseudo_count] [-q]\n");
+    exit(-1);
+  }
+
+  strcpy(refName, argv[1]);
+  strcpy(imdName, argv[2]);
+  strcpy(statName, argv[3]);
+
+  confidence = atof(argv[4]);
+  nCV = atoi(argv[5]);
+  nSpC = atoi(argv[6]);
+  nMB = atoi(argv[7]);
+
+  nThreads = 1;
+  quiet = false;
+  hasSeed = false;
+  pseudoC = 1.0;
+  for (int i = 8; i < argc; i++) {
+    // argv[argc] is NULL, so an option given last must not read its value
+    const bool hasValue = (i + 1 < argc);
+    if (!strcmp(argv[i], "-p")) {
+      general_assert(hasValue, "Option -p requires a value!");
+      nThreads = atoi(argv[i + 1]);
+    }
+    if (!strcmp(argv[i], "--seed")) {
+      general_assert(hasValue, "Option --seed requires a value!");
+      hasSeed = true;
+      int len = strlen(argv[i + 1]);
+      seed = 0;
+      for (int k = 0; k < len; k++) seed = seed * 10 + (argv[i + 1][k] - '0');
+    }
+    if (!strcmp(argv[i], "--pseudo-count")) {
+      general_assert(hasValue, "Option --pseudo-count requires a value!");
+      pseudoC = atof(argv[i + 1]);
+    }
+    if (!strcmp(argv[i], "-q")) quiet = true;
+  }
+  general_assert(nThreads > 0, "The number of threads must be positive!");
+  verbose = !quiet;
+
+  sprintf(refF, "%s.seq", refName);
+  refs.loadRefs(refF, 1);
+  M = refs.getM();
+
+  sprintf(groupF, "%s.grp", refName);
+  gi.load(groupF);
+  m = gi.getm();
+
+  // allele-specific
+  alleleS = isAlleleSpecific(refName, NULL, &ta);
+  if (alleleS) m_trans = ta.getm();
+
+  nSamples = nCV * nSpC;
+  assert(nSamples > 0 && M > 0); // for Buffer.h: (bufsize_type)nSamples
+  l_bars = new float[nSamples];
+
+  sprintf(tmpF, "%s.tmp", imdName);
+  sprintf(cvsF, "%s.countvectors", imdName);
+
+  sprintf(modelF, "%s.model", statName);
+  FILE *fi = fopen(modelF, "r");
+  general_assert(fi != NULL, "Cannot open " + cstrtos(modelF) + "!");
+  // The read must not sit inside assert(): it would disappear under NDEBUG
+  int nRead = fscanf(fi, "%d", &model_type);
+  fclose(fi);
+  general_assert(nRead == 1, "Cannot read the model type from " + cstrtos(modelF) + "!");
+
+  // Phase I
+  switch(model_type) {
+  case 0 : sample_theta_vectors_from_count_vectors<SingleModel>(); break;
+  case 1 : sample_theta_vectors_from_count_vectors<SingleQModel>(); break;
+  case 2 : sample_theta_vectors_from_count_vectors<PairedEndModel>(); break;
+  case 3 : sample_theta_vectors_from_count_vectors<PairedEndQModel>(); break;
+  // Phase II reads the samples Phase I writes, so do not continue without them
+  default : general_assert(false, "Unknown model type " + itos(model_type) + "!");
+  }
+
+  // Phase II
+  calculate_credibility_intervals(imdName);
+
+  delete[] l_bars; // allocated with new[]
+
+  return 0;
+}
diff --git a/cnt_file_description.txt b/cnt_file_description.txt
new file mode 100644
index 0000000..c39a67b
--- /dev/null
+++ b/cnt_file_description.txt
@@ -0,0 +1,15 @@
+# '#' marks the start of comments (till the end of the line)
+# *.cnt file contains alignment statistics based purely on the alignment results obtained from aligners
+N0 N1 N2 N_tot # N0, number of unalignable reads; N1, number of alignable reads; N2, number of filtered reads due to too many alignments; N_tot = N0 + N1 + N2
+nUnique nMulti nUncertain # nUnique, number of reads aligned uniquely to a gene; nMulti, number of reads aligned to multiple genes; nUnique + nMulti = N1;
+ # nUncertain, number of reads aligned to multiple locations in the given reference sequences, which include isoform-level multi-mapping reads
+nHits read_type # nHits, number of total alignments.
+ # read_type: 0, single-end read, no quality score; 1, single-end read, with quality score; 2, paired-end read, no quality score; 3, paired-end read, with quality score
+
+# The next section counts reads by the number of alignments they have. Each line contains two values separated by a TAB character. The first value is the number of alignments; 'Inf' refers to reads filtered out because they have too many alignments. The second value is the number of reads that have that many alignments.
+
+0 N0
+...
+number_of_alignments number_of_reads_with_that_many_alignments
+...
+Inf N2
diff --git a/convert-sam-for-rsem b/convert-sam-for-rsem
new file mode 100755
index 0000000..1354cc3
--- /dev/null
+++ b/convert-sam-for-rsem
@@ -0,0 +1,111 @@
+#!/usr/bin/env perl
+
+use Getopt::Long;
+use Pod::Usage;
+use File::Basename;
+
+use FindBin;
+use lib $FindBin::RealBin;
+use rsem_perl_utils qw(runCommand getSAMTOOLS);
+
+use Env qw(@PATH);
+@PATH = ($FindBin::RealBin, "$FindBin::RealBin/" . getSAMTOOLS(), @PATH);
+
+use strict;
+
+
+# Defaults; each can be overridden on the command line.
+my $p = 1;       # number of threads (-p/--num-threads)
+my $mem = "1G";  # memory per thread passed to 'samtools sort' (--memory-per-thread)
+my $help = 0;
+
+# On an invalid option, print the full usage and exit with status 2.
+GetOptions("p|num-threads=i" => \$p,
+           "memory-per-thread=s" => \$mem,
+           "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
+
+
+pod2usage(-verbose => 2) if ($help == 1);
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
+
+my $command;
+
+my $in_file = $ARGV[0];
+my $tmp_file = "$ARGV[1].tmp.bam";
+my $out_file = "$ARGV[1].bam";
+
+# Phase I, sort entries by read name so that all entries of the same read are grouped together
+
+$command = "samtools sort -n -@ $p -m $mem -o $tmp_file $in_file";
+&runCommand($command);
+
+# Phase II, parse the temporary BAM file to make paired-end alignments' two mates adjacent to each other
+
+$command = "rsem-scan-for-paired-end-reads $p $tmp_file $out_file";
+&runCommand($command);
+
+# delete temporary file
+unlink $tmp_file or die "Could not delete $tmp_file.";
+
+print "Conversion is completed. $out_file will be checked by 'rsem-sam-validator'.\n";
+
+# Phase III, validate if the resulting bam file is correct
+
+$command = "rsem-sam-validator $out_file";
+&runCommand($command);
+
+__END__
+
+=head1 NAME
+
+convert-sam-for-rsem - Make a RSEM compatible BAM file.
+
+=head1 SYNOPSIS
+
+ convert-sam-for-rsem [options] <input.sam/input.bam/input.cram> output_file_name
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<input.sam/input.bam/input.cram>
+
+The SAM/BAM/CRAM file generated by the user's aligner. This file must contain the header section.
+
+=item B<output_file_name>
+
+The output name for the converted file. 'convert-sam-for-rsem' will output a BAM with the name 'output_file_name.bam'.
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<-p/--num-threads> <int>
+
+Set the number of threads to be used for converting. (Default: 1)
+
+=item B<--memory-per-thread> <string>
+
+Set the maximum allowable memory per thread. <string> represents the memory and accepts the suffixes 'K/M/G'. (Default: 1G)
+
+=item B<-h/--help>
+
+Show help information.
+
+=back
+
+=head1 DESCRIPTION
+
+This program converts the SAM/BAM/CRAM file generated by user's aligner into a BAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references and output header sections. After the conversion, this program will call 'rsem-sam-validator' to validate the resulting BAM file.
+
+Note: You do not need to run this script if `rsem-sam-validator' reports that your SAM/BAM/CRAM file is valid.
+
+=head1 EXAMPLES
+
+Suppose input is set to 'input.sam' and output file name is "output"
+
+ convert-sam-for-rsem input.sam output
+
+We will get a file called 'output.bam' as output.
+
+=cut
diff --git a/extract-transcript-to-gene-map-from-trinity b/extract-transcript-to-gene-map-from-trinity
new file mode 100755
index 0000000..1ec63d5
--- /dev/null
+++ b/extract-transcript-to-gene-map-from-trinity
@@ -0,0 +1,34 @@
+#!/usr/bin/env perl
+
+use strict;
+
+# Reads a Trinity FASTA file and writes one "gene_id<TAB>transcript_id" line
+# per entry that has a sequence. The transcript id is the first
+# space-separated word of the header; the gene id is the transcript id up to
+# its last '_' (or the whole id if it contains none).
+if (scalar(@ARGV) != 2) {
+    print "Usage: extract-transcript-to-gene-map-from-trinity trinity_fasta_file map_file\n";
+    exit(-1);
+}
+
+# Fail loudly instead of silently producing an empty map when a file cannot be opened
+open(INPUT, "<", $ARGV[0]) or die "Cannot open $ARGV[0]: $!\n";
+open(OUTPUT, ">", $ARGV[1]) or die "Cannot open $ARGV[1] for writing: $!\n";
+
+my ($tag, $line);
+$tag = <INPUT>;
+$tag = "" unless defined($tag); # empty input file
+chomp($tag);
+while (substr($tag, 0, 1) eq ">") {
+    $tag = substr($tag, 1);
+    # Count the lines that belong to this entry; defined() keeps a final line "0" from ending the scan early
+    my $cnt = 0;
+    while (defined($line = <INPUT>) && substr($line, 0, 1) ne ">") {
+        $cnt++;
+    }
+    if ($cnt == 0) { print "Warning: Fasta entry $tag has an empty sequence, it is omitted.\n"; }
+    else {
+        my ($tid, @tmp) = split(/ /, $tag);
+        my $pos = rindex($tid, "_");
+        my $gid = "";
+        if ($pos >= 0) { $gid = substr($tid, 0, $pos); }
+        else { $gid = $tid; }
+        print OUTPUT "$gid\t$tid\n";
+    }
+    # $line holds the next header, or is undefined at end of file
+    $tag = (defined($line) ? $line : ""); chomp($tag);
+}
+
+close(INPUT);
+close(OUTPUT);
diff --git a/extractRef.cpp b/extractRef.cpp
new file mode 100644
index 0000000..e29592a
--- /dev/null
+++ b/extractRef.cpp
@@ -0,0 +1,376 @@
+#include<cstdio>
+#include<cstring>
+#include<cctype>
+#include<cstdlib>
+#include<fstream>
+#include<sstream>
+#include<set>
+#include<map>
+#include<vector>
+#include<algorithm>
+#include<string>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "GTFItem.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+
+using namespace std;
+
+bool verbose = true;
+
+// Name and length of one chromosome sequence; instances order by name.
+struct ChrInfo {
+  string name;
+  size_t len;
+
+  ChrInfo(const string& name, size_t len) : name(name), len(len) {}
+
+  bool operator< (const ChrInfo& o) const { return name.compare(o.name) < 0; }
+};
+
+int M; // number of transcripts: transcripts.getM() after parsing, lowered by shrink()
+
+vector<GTFItem> items; // exon records kept by parse_gtf_file; sorted, grouped into transcripts, then cleared
+vector<string> seqs; // extracted sequence of transcript i at index i (1-based); "" until extracted
+vector<int> starts; // used to generate .grp: index of the first transcript of each gene, then M + 1
+map<string, vector<int> > sn2tr; // map from seqname to the ids of the transcripts located on it
+map<string, vector<int> >::iterator iter; // scratch iterator over sn2tr
+vector<ChrInfo> chrvec; // chromosomes that carry at least one transcript; written to the .chrlist file
+
+Transcripts transcripts; // every transcript built from the GTF file
+
+char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN]; // "<refName>.grp", "<refName>.ti", "<refName>.transcripts.fa"
+char chromListF[STRLEN]; // "<refName>.chrlist"
+
+bool hasMappingFile; // atoi(argv[5]); when true argv[6] is the mapping file
+char mappingFile[STRLEN]; // not used in this file's visible code
+
+map<string, string> mi_table; // mapping info table: second column of the mapping file -> first column; looked up by transcript id to obtain the gene id
+map<string, string>::iterator mi_iter; //mapping info table's iterator
+
+set<string> sources; // trusted GTF sources; when empty every source is trusted
+
+// Splits the comma-separated list sstr (modified in place by strtok) and adds
+// every item to the set of trusted sources.
+void parseSources(char* sstr) {
+  for (char* token = strtok(sstr, ","); token != NULL; token = strtok(NULL, ",")) {
+    sources.insert(token);
+  }
+}
+
+// A source is trusted when no source list was given or when it is on the list.
+inline bool isTrusted(const string& source) {
+  if (sources.empty()) return true;
+  return sources.count(source) > 0;
+}
+
+/**
+ * Load the mapping file into mi_table. Every data line holds two
+ * whitespace-separated columns; the second column becomes the key and the
+ * first one the value. Lines starting with '#', blank lines and lines with
+ * fewer than two columns are skipped.
+ *
+ * @param mappingF path of the mapping file
+ */
+void loadMappingInfo(char* mappingF) {
+  ifstream fin(mappingF);
+  string line;
+
+  general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist.");
+
+  mi_table.clear();
+  while (getline(fin, line)) {
+    line = cleanStr(line);
+    if (line.empty() || line[0] == '#') continue;
+
+    // Fresh strings per line, and no insertion unless both columns were read,
+    // so an incomplete line cannot add an empty or stale entry
+    string key, value;
+    istringstream strin(line);
+    if (!(strin>>value>>key)) continue;
+    mi_table[key] = value;
+  }
+
+  fin.close();
+}
+
+/**
+ * Build one transcript from items[sp..ep] and add it to transcripts.
+ * Exons that overlap or touch are merged into a single interval. All items
+ * must share strand and sequence name, and at most one gene name and one
+ * transcript name may appear among them.
+ *
+ * @param sp index of the transcript's first item
+ * @param ep index of the transcript's last item (inclusive)
+ * @return always true
+ */
+bool buildTranscript(int sp, int ep) {
+  vector<Interval> intervals;
+
+  string transcript_id = items[sp].getTranscriptID();
+  string gene_id = items[sp].getGeneID();
+  string gene_name = "";
+  string transcript_name = "";
+
+  char strand = items[sp].getStrand();
+  string seqname = items[sp].getSeqName();
+  string left = items[sp].getLeft();
+
+  // Bounds of the run of merged exons being grown; -1 means no run yet
+  int runStart = -1;
+  int runEnd = -1;
+  for (int i = sp; i <= ep; i++) {
+    int start = items[i].getStart();
+    int end = items[i].getEnd();
+
+    general_assert(strand == items[i].getStrand(), "According to the GTF file given, transcript " + transcript_id + " has exons from different orientations!");
+    general_assert(seqname == items[i].getSeqName(), "According to the GTF file given, transcript " + transcript_id + " has exons on multiple chromosomes!");
+
+    const string itemGeneName = items[i].getGeneName();
+    if (itemGeneName != "") {
+      if (gene_name == "") gene_name = itemGeneName;
+      else general_assert(gene_name == itemGeneName, "Transcript " + transcript_id + " is associated with multiple gene names!");
+    }
+
+    const string itemTranscriptName = items[i].getTranscriptName();
+    if (itemTranscriptName != "") {
+      if (transcript_name == "") transcript_name = itemTranscriptName;
+      else general_assert(transcript_name == itemTranscriptName, "Transcript " + transcript_id + " is associated with multiple transcript names!");
+    }
+
+    // A gap before this exon closes the current run and opens a new one
+    if (runEnd + 1 < start) {
+      if (runStart > 0) intervals.push_back(Interval(runStart, runEnd));
+      runStart = start;
+    }
+    if (runEnd < end) runEnd = end;
+  }
+  if (runStart > 0) intervals.push_back(Interval(runStart, runEnd));
+
+  transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, intervals, left, transcript_name, gene_name));
+
+  return true;
+}
+
+/**
+ * Parse the GTF file gtfF. Exon records from trusted sources are collected
+ * (exons whose start is larger than their end or less than 1 are discarded
+ * with a warning), sorted, and grouped by transcript id into transcripts.
+ * sn2tr is filled with the transcript ids located on each sequence name and
+ * M is set to the number of transcripts.
+ *
+ * @param gtfF path of the GTF file
+ */
+void parse_gtf_file(char* gtfF) {
+  ifstream fin(gtfF);
+  string line, tid, gid;
+  GTFItem item;
+
+  general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist.");
+
+  int cnt = 0;
+
+  int n_warns = 0; // discarded exons; only the first MAX_WARNS are reported individually
+
+  items.clear();
+  while (getline(fin, line)) {
+    if (line[0] == '#') continue; // if this line is comment, jump it
+    item.parse(line);
+    if (item.getFeature() == "exon" && isTrusted(item.getSource())) {
+      if (item.getStart() > item.getEnd()) {
+        if (++n_warns <= MAX_WARNS) {
+          fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n");
+          fprintf(stderr, "\t%s\n\n", line.c_str());
+        }
+      }
+      else if (item.getStart() < 1) {
+        if (++n_warns <= MAX_WARNS) {
+          fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n");
+          fprintf(stderr, "\t%s\n\n", line.c_str());
+        }
+      }
+      else {
+        item.parseAttributes(line);
+        if (hasMappingFile) {
+          // The gene id from the mapping file replaces the one in the GTF
+          tid = item.getTranscriptID();
+          mi_iter = mi_table.find(tid);
+          general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!");
+          gid = mi_iter->second;
+          item.setGeneID(gid);
+        }
+        items.push_back(item);
+      }
+    }
+
+    ++cnt;
+    if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); }
+  }
+  fin.close();
+
+  // The trailing newline keeps this summary from running into later stderr output
+  if (n_warns > 0) fprintf(stderr, "Warning: In total, %d exons are discarded.\n", n_warns);
+
+  sort(items.begin(), items.end());
+
+  int sp = 0, ep; // start pointer, end pointer
+  int nItems = items.size();
+
+  sn2tr.clear();
+  while (sp < nItems) {
+    tid = items[sp].getTranscriptID();
+
+    // items[sp..ep] is the run of items that share this transcript id
+    ep = sp + 1;
+    while (ep < nItems && items[ep].getTranscriptID() == tid) ep++;
+    ep--;
+
+    buildTranscript(sp, ep);
+
+    // The transcript just added is the last one, so its id equals getM()
+    int sid = transcripts.getM();
+    const Transcript& transcript = transcripts.getTranscriptAt(sid);
+
+    // operator[] creates the empty list on first use of a sequence name
+    sn2tr[transcript.getSeqName()].push_back(sid);
+
+    sp = ep + 1;
+  }
+
+  items.clear();
+
+  M = transcripts.getM();
+  general_assert(M > 0, "The reference contains no transcripts!");
+
+  if (verbose) { printf("Parsing gtf File is done!\n"); }
+}
+
+/**
+ * Drop every transcript whose sequence could not be extracted (seqs[i] is
+ * empty), compacting transcripts and seqs toward the front, then rebuild
+ * starts: the index of the first transcript of each gene followed by M + 1.
+ */
+void shrink() {
+  int kept = 0; // transcripts retained so far; also the slot of the latest one
+  int missing = 0; // transcripts without a sequence
+
+  for (int i = 1; i <= M; i++) {
+    if (seqs[i] != "") {
+      ++kept;
+      transcripts.move(i, kept);
+      if (i > kept) seqs[kept] = seqs[i];
+      continue;
+    }
+
+    // Only the first MAX_WARNS missing transcripts are reported one by one
+    if (++missing <= MAX_WARNS) {
+      const Transcript& transcript = transcripts.getTranscriptAt(i);
+      fprintf(stderr, "Warning: Cannot extract transcript %s's sequence since the chromosome it locates, %s, is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str());
+    }
+  }
+
+  if (missing > 0) fprintf(stderr, "Warning: %d transcripts are failed to extract because their chromosome sequences are absent.\n", missing);
+  if (verbose) printf("%d transcripts are extracted.\n", kept);
+
+  transcripts.setM(kept);
+  M = transcripts.getM();
+  general_assert(M > 0, "The reference contains no transcripts!");
+
+  // A new group starts wherever the gene id differs from the previous one
+  starts.clear();
+  string previousGid = "";
+  for (int i = 1; i <= M; i++) {
+    string gid = transcripts.getTranscriptAt(i).getGeneID();
+    if (previousGid == gid) continue;
+    starts.push_back(i);
+    previousGid = gid;
+  }
+  starts.push_back(M + 1);
+}
+
+/**
+ * Write the four output files of the extraction: "<refName>.grp" (group
+ * starts), "<refName>.ti" (transcript information), "<refName>.chrlist"
+ * (chromosome names and lengths) and "<refName>.transcripts.fa" (extracted
+ * sequences).
+ *
+ * @param refName prefix of the output files
+ */
+void writeResults(char* refName) {
+  int s;
+  ofstream fout;
+
+  sprintf(groupF, "%s.grp", refName);
+  sprintf(tiF, "%s.ti", refName);
+  sprintf(refFastaF, "%s.transcripts.fa", refName);
+  sprintf(chromListF, "%s.chrlist", refName);
+
+  // '\n' is used instead of endl below: same bytes, without a flush per line
+
+  fout.open(groupF);
+  general_assert(fout.is_open(), "Cannot open " + cstrtos(groupF) + " for writing!");
+  s = starts.size();
+  for (int i = 0; i < s; i++) fout<<starts[i]<<'\n';
+  fout.close();
+  if (verbose) { printf("Group File is generated!\n"); }
+
+  transcripts.writeTo(tiF);
+  if (verbose) { printf("Transcript Information File is generated!\n"); }
+
+  fout.open(chromListF);
+  general_assert(fout.is_open(), "Cannot open " + cstrtos(chromListF) + " for writing!");
+  s = chrvec.size();
+  for (int i = 0; i < s; i++) {
+    fout<<chrvec[i].name<<'\t'<<chrvec[i].len<<'\n';
+  }
+  fout.close();
+  if (verbose) { printf("Chromosome List File is generated!\n"); }
+
+  fout.open(refFastaF);
+  general_assert(fout.is_open(), "Cannot open " + cstrtos(refFastaF) + " for writing!");
+  for (int i = 1; i <= M; i++) {
+    fout<<">"<<transcripts.getTranscriptAt(i).getTranscriptID()<<'\n';
+    fout<<seqs[i]<<'\n';
+  }
+  fout.close();
+  if (verbose) { printf("Extracted Sequences File is generated!\n"); }
+}
+
+// Location of the FASTA character being validated; check() quotes it in its
+// error message.
+struct CursorPos {
+  char *filename; // FASTA file being read
+  int line_no; // line number quoted in the error message
+  int pos; // 0-based position within the line; reported as pos + 1
+};
+
+CursorPos cursor;
+
+/**
+ * Validate and normalize one FASTA sequence character. Any non-alphabetic
+ * character stops the program with a message quoting the cursor position.
+ * Upper-case letters other than A, C, G, T become 'N'; lower-case letters
+ * other than a, c, g, t become 'n'.
+ *
+ * @param c character read from the FASTA file
+ * @return the character to store in the sequence
+ */
+inline char check(char c) {
+  // The <cctype> functions are undefined for negative values other than EOF,
+  // which a plain char holds for bytes >= 0x80 where char is signed
+  general_assert(isalpha((unsigned char)c), "FASTA file " + cstrtos(cursor.filename) + " contains an unknown character, " + \
+    ctos(c) + " (ASCII code " + itos(c) + "), at line " + itos(cursor.line_no) + ", position " + itos(cursor.pos + 1) + "!");
+  if (isupper((unsigned char)c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
+  if (islower((unsigned char)c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
+  return c;
+}
+
+int main(int argc, char* argv[]) { // entry point: parses the GTF file, extracts every transcript's sequence from the chromosome FASTA files, writes the reference files
+ if (argc < 7 || ((hasMappingFile = atoi(argv[5])) && argc < 8)) { // hasMappingFile is assigned inside the condition; a mapping file needs one more argument
+ printf("Usage: rsem-extract-reference-transcripts refName quiet gtfF sources hasMappingFile [mappingFile] chromosome_file_1 [chromosome_file_2 ...]\n");
+ exit(-1);
+ }
+
+ verbose = !atoi(argv[2]); // argv[2] is the quiet flag
+ if (hasMappingFile) {
+ loadMappingInfo(argv[6]);
+ }
+
+ sources.clear();
+ if (strcmp(argv[4], "None")) parseSources(argv[4]); // "None" leaves the set empty, which isTrusted treats as trusting every source
+
+ parse_gtf_file(argv[3]); // fills transcripts, sn2tr and M
+
+ ifstream fin;
+ string line, gseq, seqname; // current line, sequence accumulated for the current entry, name of the current entry
+ int len;
+ size_t seqlen; // number of characters accumulated in gseq
+
+ chrvec.clear();
+
+ seqs.clear();
+ seqs.resize(M + 1, ""); // indexed by transcript id starting at 1; "" marks not extracted
+ int start = hasMappingFile ? 7 : 6; // index of the first chromosome file in argv
+ for (int i = start; i < argc; i++) {
+ fin.open(argv[i]);
+ general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");
+ cursor.filename = argv[i]; cursor.line_no = cursor.pos = 0; // reset the position that check() quotes in its error message
+
+ getline(fin, line);
+ while ((fin) && (line[0] == '>')) { // one iteration per FASTA entry; line holds its header
+ istringstream strin(line.substr(1));
+ strin>>seqname; // the name is the first word after '>'
+ ++cursor.line_no;
+
+ gseq = ""; seqlen = 0;
+ while((getline(fin, line)) && (line[0] != '>')) { // stops at the next header, which stays in line, or at end of file
+ ++cursor.line_no;
+ len = line.length();
+ for (cursor.pos = 0; cursor.pos < len; ++cursor.pos) line[cursor.pos] = check(line[cursor.pos]); // validate and normalize every character
+ seqlen += len;
+ gseq += line;
+ }
+ assert(seqlen > 0);
+
+ iter = sn2tr.find(seqname);
+ if (iter == sn2tr.end()) continue; // no transcript on this sequence; the loop condition then examines the header already in line
+
+ chrvec.push_back(ChrInfo(seqname, seqlen));
+
+ vector<int>& vec = iter->second; // ids of the transcripts located on this sequence
+ int s = vec.size();
+ for (int j = 0; j < s; j++) {
+ assert(vec[j] > 0 && vec[j] <= M);
+ transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]);
+ }
+ }
+ fin.close();
+
+ if (verbose) { printf("%s is processed!\n", argv[i]); }
+ }
+ sort(chrvec.begin(), chrvec.end()); // ChrInfo orders by name
+
+ shrink(); // drops the transcripts whose sequence is still empty
+ if (verbose) { printf("Extracting sequences is done!\n"); }
+
+ writeResults(argv[1]);
+
+ return 0;
+}
diff --git a/getUnique.cpp b/getUnique.cpp
new file mode 100644
index 0000000..3d61589
--- /dev/null
+++ b/getUnique.cpp
@@ -0,0 +1,83 @@
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<vector>
+
+#include <stdint.h>
+#include "htslib/sam.h"
+#include "sam_utils.h"
+
+#include "utils.h"
+#include "my_assert.h"
+
+using namespace std;
+
+int nThreads;
+string cqname;
+samFile *in, *out;
+bam_hdr_t *header;
+bam1_t *b;
+vector<bam1_t*> arr;
+bool unaligned;
+
+void output() {
+ if (unaligned || arr.size() == 0) return;
+ bool isPaired = bam_is_paired(arr[0]);
+ if ((isPaired && arr.size() != 2) || (!isPaired && arr.size() != 1)) return;
+ for (size_t i = 0; i < arr.size(); ++i) sam_write1(out, header, arr[i]);
+}
+
+int main(int argc, char* argv[]) {
+ if (argc != 4) {
+ printf("Usage: rsem-get-unique number_of_threads unsorted_transcript_bam_input bam_output\n");
+ exit(-1);
+ }
+
+ nThreads = atoi(argv[1]);
+ in = sam_open(argv[2], "r");
+ assert(in != 0);
+ header = sam_hdr_read(in);
+ assert(header != 0);
+ out = sam_open(argv[3], "wb");
+ assert(out != 0);
+ sam_hdr_write(out, header);
+ if (nThreads > 1) general_assert(hts_set_threads(out, nThreads) == 0, "Fail to create threads for writing the BAM file!");
+
+ HIT_INT_TYPE cnt = 0;
+
+ cqname = "";
+ arr.clear();
+ b = bam_init1();
+ unaligned = false;
+
+ while (sam_read1(in, header, b) >= 0) {
+ if (cqname != bam_get_qname(b)) {
+ output();
+ cqname = bam_get_qname(b);
+ for (size_t i = 0; i < arr.size(); ++i) bam_destroy1(arr[i]);
+ arr.clear();
+ unaligned = false;
+ }
+
+ unaligned = unaligned || bam_is_unmapped(b);
+ arr.push_back(bam_dup1(b));
+
+ ++cnt;
+ if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
+ }
+
+ if (cnt >= 1000000) printf("\n");
+
+ output();
+
+ bam_destroy1(b);
+ bam_hdr_destroy(header);
+ sam_close(in);
+ sam_close(out);
+
+ printf("done!\n");
+
+ return 0;
+}
diff --git a/model_file_description.txt b/model_file_description.txt
new file mode 100644
index 0000000..47356bc
--- /dev/null
+++ b/model_file_description.txt
@@ -0,0 +1,74 @@
+model_type # 0, single-end, no quality score; 1, single-end, quality score; 2, paired-end, no quality score; 3, paired-end, quality score
+
+forward_prob # The probability of generating a read from forward strand
+
+# Fragment length distribution or read length distribution (if reads are single-end reads and no fragment length distribution parameters are provided)
+lower_bound upper_bound span # Fragment/read length should be in the range of (lower_bound, upper_bound] and span = upper_bound - lower_bound
+prob_1 prob_2 ... prob_span # This line contains span probabilities separated by a single space. prob_i is the probability of generating a length of (lower_bound + i)
+
+[
+# Read length distribution, this section is optional for single-end reads model, the "[" and "]" suggest this section is optional.
+has_optional_length_dist # This line only appears if model_type < 2. 0 means there is no optional length distribution, because the previous section already defines the read length distribution; 1 means this optional read length distribution is present. If has_optional_length_dist = 0, the next two lines will not appear
+lower_bound upper_bound span # Fragment/read length should be in the range of (lower_bound, upper_bound] and span = upper_bound - lower_bound
+prob_1 prob_2 ... prob_span # This line contains span probabilities separated by a single space. prob_i is the probability of generating a length of (lower_bound + i)
+]
+
+# Read start position distribution
+estimate_RSPD # 0 means that RSEM assumes a uniform read generating distribution across a transcript, 1 means that RSEM estimated a RSPD from data
+number_of_bins # number of bins used for estimating RSPD, please refer to Li, B., Ruotti, V., Stewart, R. M., Thomson, J. A., Dewey, C. N. (2010). RNA-Seq gene expression estimation with read mapping uncertainty. Bioinformatics, 26(4), 493-500. for more details
+bin_prob_1 bin_prob_2 ... bin_prob_number_of_bins # number_of_bins single space separated numbers representing the probabilities of starting a read from each bin
+
+[
+# Quality score distribution, only present if model_type == 1 or 3. Quality scores are generated according to a first order Markov model
+# Quality scores are numbered from 0 internally, thus if phred+33 quality scores are used in the data, then i corresponds to the encoded phred score (i+33)
+size # total number of quality scores modeled
+p_init_0 p_init_1 ... p_init_(size-1) # size real-valued, single space separated numbers representing the initial probabilities of each state (quality score)
+p_trans_0,0 ... p_trans_0,(size-1)
+...
+p_trans_(size-1),0 ... p_trans_(size-1),(size-1) # size times size matrix representing the transition matrix for the Markov chain
+]
+
+[
+# Sequencing error model for quality scores, only presented if model_type == 1 or 3
+size ncodes # size is the total number of quality scores and ncodes is the number of different bases (ncodes = 5, 0: A; 1: C; 2: G; 3: T; 4: N)
+# In the following, RSEM gives size blocks. The i th block represents the sequencing error model for quality score i. Blocks are separated by single blank lines
+...
+
+p_i,0,0 p_i,0,1 p_i,0,2 p_i,0,3 p_i,0,4
+...
+p_i,4,0 p_i,4,1 p_i,4,2 p_i,4,3 p_i,4,4 # The i th block contains a ncodes times ncodes matrix. Each cell (j,k) gives the probability of generating a base k given the quality score is i and the reference base is j
+
+...
+]
+
+[
+# Model for generating a noise read based on quality scores. Only presented if model_type == 1 or 3
+size ncodes # the same as sequencing error model for quality score
+p_0,0 p_0,1 p_0,2 p_0,3 p_0,4
+...
+p_(size-1),0 p_(size-1),1 p_(size-1),2 p_(size-1),3 p_(size-1),4 # A size times ncodes matrix giving the probability of generating a base in a noise read given a quality score
+]
+
+[
+# Sequencing error model without quality score, only presented if model_type == 0 or 2. It records the probability of generating a base k at position i given the corresponding reference base is j. Positions are numbered from 0
+profile_length ncodes # profile_length should be equal to the maximum length of any read in the data set
+# There will be profile_length blocks separated by single blank lines
+...
+
+p_i,0,0 p_i,0,1 p_i,0,2 p_i,0,3 p_i,0,4
+...
+p_i,4,0 p_i,4,1 p_i,4,2 p_i,4,3 p_i,4,4 # The i th block contains a ncodes times ncodes matrix. Each cell (j,k) gives the probability of generating a base k at position i given the reference base is j
+
+...
+]
+
+[
+# Model for generating a noise read without quality scores. Only presented if model_type == 0 or 2
+ncodes # ncodes = 5
+p_0 p_1 p_2 p_3 p_4 # probabilities of generating a particular base for the noise read, all positions use the same base generating probabilities
+]
+
+[
+# Some other numbers, which have nothing to do with simulation
+]
+
diff --git a/my_assert.h b/my_assert.h
new file mode 100644
index 0000000..3688d96
--- /dev/null
+++ b/my_assert.h
@@ -0,0 +1,107 @@
+#ifndef MY_ASSERT_H
+#define MY_ASSERT_H
+
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cerrno>
+#include<string>
+#include<sstream>
+#include<iomanip>
+
// Format an int as a decimal string using a stream with default settings.
inline std::string itos(int i) {
	std::ostringstream buffer;
	buffer << i;
	const std::string text = buffer.str();
	return text;
}
+
// Format a double as a string.
// f : value to format
// n : number of significant digits (the stream's precision in its default
//     floating-point notation)
inline std::string ftos(double f, int n) {
	std::ostringstream buffer;
	buffer.precision(n);
	buffer << f;
	return buffer.str();
}
+
// Build a one-character string from c.
inline std::string ctos(char c) {
	std::string single;
	single.push_back(c);
	return single;
}
+
// Convert a C string to std::string.
// A null pointer yields an empty string instead of the undefined behaviour
// that constructing std::string from a null pointer would cause.
inline std::string cstrtos(const char* s) {
	return s ? std::string(s) : std::string();
}
+
+
// general_assert(expr, errmsg)   : if expr is false, print errmsg to stderr and exit.
// general_assert_1(expr, errmsg) : same, but first emit a newline on stdout
//                                  (useful after progress dots).
//
// The bodies are wrapped in do { ... } while (0) so that each macro expands to
// exactly one statement. With a bare `if`, a following `else` in the caller
// would silently attach to the macro's hidden `if` instead of the caller's.
#define general_assert(expr, errmsg) do { if (!(expr)) general_report((errmsg), false); } while (0)
#define general_assert_1(expr, errmsg) do { if (!(expr)) general_report((errmsg), true); } while (0)

// Print errmsg (followed by a newline) to stderr and terminate the process
// with exit(-1). When putEnter is true a newline is written to stdout first.
inline void general_report(const std::string& errmsg, bool putEnter) {
	if (putEnter) printf("\n");
	fprintf(stderr, "%s\n", errmsg.c_str());
	exit(-1);
}
+
// pthread_assert(rc, func_name, errmsg): if the pthread return code rc is
// non-zero, report errmsg plus an explanation of the code and exit.
//
// rc is evaluated exactly once and stored, so passing a call expression
// (e.g. pthread_create(...)) does not run it a second time on failure. The
// do { ... } while (0) wrapper makes the macro a single statement, which keeps
// a following `else` in the caller attached to the caller's own `if`.
#define pthread_assert(rc, func_name, errmsg) do { const int pthread_assert_rc_ = (rc); if (pthread_assert_rc_ != 0) pthread_report(pthread_assert_rc_, (func_name), (errmsg)); } while (0)

// Print errmsg to stderr, followed by a description of the error code rc as
// returned by the pthread function named func_name, then exit(-1).
// Recognised function names: pthread_create, pthread_join,
// pthread_mutex_lock, pthread_mutex_unlock. Any other name, or an error code
// not listed for the function, is reported as unknown.
inline void pthread_report(int rc, const std::string& func_name, const std::string& errmsg) {
	fprintf(stderr, "%s\n", errmsg.c_str());

	if (func_name == "pthread_create") {
		switch(rc) {
		case EAGAIN:
			fprintf(stderr, "Error code: EAGAIN. Insufficient resources to create another thread, or a system-imposed limit on the number of threads was encountered.\n");
			break;
		case EINVAL:
			fprintf(stderr, "Error code: EINVAL. Invalid settings in attr.\n");
			break;
		case EPERM:
			fprintf(stderr, "Error code: EPERM. No permission to set the scheduling policy and parameters specified in attr.\n");
			break;
		default: fprintf(stderr, "Unknown error code: %d.\n", rc);
		}
	} else if (func_name == "pthread_join") {
		switch(rc) {
		case EDEADLK:
			fprintf(stderr, "Error code: EDEADLK. A deadlock was detected (e.g., two threads tried to join with each other); or thread_id specifies the calling thread.\n");
			break;
		case EINVAL:
			fprintf(stderr, "Error code: EINVAL. The implementation has detected that the value specified by thread_id does not refer to a joinable thread.\n");
			break;
		case ESRCH:
			fprintf(stderr, "Error code: ESRCH. No thread with thread_id could be found.\n");
			break;
		default: fprintf(stderr, "Unknown error code: %d.\n", rc);
		}
	} else if (func_name == "pthread_mutex_lock") {
		switch(rc) {
		case EAGAIN:
			fprintf(stderr, "Error code: EAGAIN. The mutex could not be acquired because the maximum number of recursive locks for mutex has been exceeded.\n");
			break;
		case EDEADLK:
			fprintf(stderr, "Error code: EDEADLK. The current thread already owns the mutex.\n");
			break;
		case EINVAL:
			fprintf(stderr, "Error code: EINVAL. The mutex was created with the protocol attribute having the value PTHREAD_PRIO_PROTECT and the calling thread's priority is higher than the mutex's current priority ceiling. Or the value specified by mutex does not refer to an initialized mutex object.\n");
			break;
		default: fprintf(stderr, "Unknown error code: %d.\n", rc);
		}
	} else if (func_name == "pthread_mutex_unlock") {
		switch(rc) {
		case EAGAIN:
			fprintf(stderr, "Error code: EAGAIN. The mutex could not be acquired because the maximum number of recursive locks for mutex has been exceeded.\n");
			break;
		case EINVAL:
			fprintf(stderr, "Error code: EINVAL. The value specified by mutex does not refer to an initialized mutex object.\n");
			break;
		case EPERM:
			fprintf(stderr, "Error code: EPERM. The current thread does not own the mutex.\n");
			break;
		default: fprintf(stderr, "Unknown error code: %d.\n", rc);
		}
	} else {
		fprintf(stderr, "Unknown function name: %s.\n", func_name.c_str());
	}

	exit(-1);
}
+
+#endif
diff --git a/pRSEM/ChIPSeqExperiment.py b/pRSEM/ChIPSeqExperiment.py
new file mode 100644
index 0000000..e4b2b59
--- /dev/null
+++ b/pRSEM/ChIPSeqExperiment.py
@@ -0,0 +1,257 @@
+__doc__="""
+
+ pliu 20150511
+
+ python module for a ChIP-seq experiment that contains
+ replicates of ChIP-seq data for target and/or control
+"""
+
+import os
+import sys
+import multiprocessing as mp
+
+import Util
+
+
class ChIPSeqExperiment:
  """
  One ChIP-seq experiment (target or control): its replicates, the pooled
  tagAlign built from them, and the peak files derived from it.
  """

  def __init__(self):
    self.param           = None ## reference to input parameters
    self.reps            = []   ## list of ChIPSeqReplicate objects
    self.is_control      = None ## if is control
    self.pooled_tagalign = None ## File obj of pooled tagAlign
    self.peaks           = None ## File obj of targetRep0_VS_controlRep0 peaks
    self.final_peaks     = None ## File obj of final peaks


  @classmethod
  def initFromParam(cls, param, is_control, param_attr):
    """
    Build an experiment from `param`.

    param      -- parameter object; attribute `param_attr` holds a
                  comma-separated list of FASTQ file names
    is_control -- True for a control experiment, False for a target one
    param_attr -- name of the attribute of `param` listing the read files

    Returns the new ChIPSeqExperiment with one replicate per FASTQ file.
    """
    import ChIPSeqReplicate
    import File
    cse = cls()
    cse.param = param
    cse.is_control = is_control
    ftgts = getattr(param, param_attr).split(',')
    cse.reps = [ ChIPSeqReplicate.initFromFastqFile(ffq) for ffq in ftgts ]
    for (i, rep) in enumerate(cse.reps):
      rep.param = param
      rep.index = i+1
      rep.chipseqexp = cse
      tgt_fta = "%s/%s.tagAlign.gz" % (param.temp_dir, rep.name)
      rep.tagalign = File.initFromFullFileName(tgt_fta)

    if cse.is_control:
      frep0 = param.fchipseq_control_signals
    else:
      frep0 = param.fchipseq_target_signals
    cse.pooled_tagalign = File.initFromFullFileName(frep0)

    cse.peaks = File.initFromFullFileName(param.fall_chipseq_peaks)

    cse.final_peaks = File.initFromFullFileName(param.fidr_chipseq_peaks)

    return cse


  def getFastqEncoding(self):
    """
    Run the 'guessFqEncoding' step of the ChIP-seq R script on all replicate
    FASTQ files and store the reported encoding on each replicate.
    Exits if the R script does not produce its output file.
    """
    nthr = self.param.num_threads
    fin = ','.join([ f.fastq.fullname for f in self.reps])
    if self.is_control:
      fenc = self.param.imd_name + '_prsem.chipseq_control_encoding'
    else:
      fenc = self.param.imd_name + '_prsem.chipseq_target_encoding'

    ## /usr/bin/env is the conventional location of env; /bin/env is absent
    ## on systems without a merged /usr
    Util.runCommand('/usr/bin/env', 'Rscript', self.param.chipseq_rscript,
                    'guessFqEncoding', nthr, fin, fenc,
                    self.param.prsem_rlib_dir, quiet=self.param.quiet )

    if not os.path.exists(fenc):
      sys.exit("Failed to generate file: %s\n" % fenc)

    with open(fenc, 'r') as f_fenc:
      next(f_fenc)  ## skip the header line
      file2enc = dict([ line.rstrip("\n").split("\t") for line in f_fenc ])

    for f in self.reps:
      f.encoding = file2enc[f.fastq.fullname]


  def alignReadByBowtie(self):
    """
    Align every replicate's FASTQ with Bowtie, filter the SAM output into BED
    with filterSam2Bed, and gzip it into the replicate's tagAlign file.
    Exits if an input FASTQ is missing or an output file is not produced.
    """
    ## running zcat, filterSam2Bed and gzip takes about 1 thread
    if self.param.num_threads > 2:
      nthr_bowtie = self.param.num_threads - 1
    else:
      nthr_bowtie = 1

    bowtie_ref_name = "%s_prsem" % self.param.ref_name
    for rep in self.reps:
      cmd_cat = Util.getCatCommand(rep.fastq.is_gz)

      if not os.path.exists(rep.fastq.fullname):
        sys.exit("File not found: %s\n" % rep.fastq.fullname)

      if self.param.quiet:
        s_quiet = ' --quiet '
      else:
        s_quiet = ''

      ## many pipes, have to use os.system
      cmds = [ "%s %s |" % (cmd_cat, rep.fastq.fullname) ] + \
             [ "%s/bowtie " % self.param.bowtie_path ] + \
             [ "%s -q -v 2 -a --best --strata -m 1 %s -S -p %d %s - | " % (
               s_quiet, rep.encoding, nthr_bowtie, bowtie_ref_name ) ] + \
             [ "%s - | " % self.param.filterSam2Bed ] + \
             [ "gzip -c > %s " % rep.tagalign.fullname ]

      cmd = ' '.join(cmds)

      ## use all threads to align ChIP-seq reads sequentially
      Util.runCommand(cmd, quiet=self.param.quiet)

      if not os.path.exists(rep.tagalign.fullname):
        sys.exit("failed to generate file: %s\n" % rep.tagalign.fullname)


  def poolTagAlign(self):
    """
    Concatenate all replicates' tagAlign files into the pooled tagAlign,
    replacing any existing pooled file.
    Exits if a replicate tagAlign is missing or the pooled file is not made.
    """
    frep0 = self.pooled_tagalign.fullname
    if os.path.exists(frep0):
      os.remove(frep0)
    for rep in self.reps:
      ## the file being read here is the tagAlign, so its own gz flag (not the
      ## FASTQ's) must select the cat command; otherwise a gzipped tagAlign
      ## made from a plain FASTQ would be compressed a second time
      ## NOTE(review): assumes Util.getCatCommand returns a decompressing cat
      ## for True and a plain cat for False -- confirm against Util
      cat_cmd = Util.getCatCommand(rep.tagalign.is_gz)
      if not os.path.exists(rep.tagalign.fullname):
        sys.exit("File not found: %s\n" % rep.tagalign.fullname)

      cmd = "%s %s | gzip -c >> %s" % (cat_cmd, rep.tagalign.fullname, frep0)
      Util.runCommand(cmd, quiet=self.param.quiet)

    if not os.path.exists(frep0):
      sys.exit("Failed to generate file: %s\n" % frep0)


  def callPeaksBySPP(self, ctrl_tagalign):
    """
    in principle, this function is only for ChIP-seq target experiment
    should make target and control inherit from ChIPSeqExperiment, will do

    Runs runSPP in one process per tagAlign (pooled + each replicate) against
    the control tagAlign `ctrl_tagalign` (a File object).
    """
    if self.is_control:
      sys.exit( "ChIPSeqExperiment::callPeaksBySPP() can't be applied to control" )

    tgt_tagaligns = [self.pooled_tagalign] + [rep.tagalign for rep in self.reps]
    prm = self.param

    ## need to add pRSEM's R_LIBS path so that run_spp.R can load spp library
    if 'R_LIBS' in os.environ:
      os.environ['R_LIBS'] = "%s:%s" % (os.environ['R_LIBS'],
                                        prm.prsem_rlib_dir)
    else:
      os.environ['R_LIBS'] = prm.prsem_rlib_dir

    ## integer share of threads per process, never less than one
    nthr = max(1, prm.num_threads // len(tgt_tagaligns))
    fctrl_tagalign = ctrl_tagalign.fullname
    procs = [ mp.Process(target=runSPP, args=(tgt_tagalign, fctrl_tagalign,
              prm, nthr)) for tgt_tagalign in tgt_tagaligns ]
    for p in procs:
      p.start()
    for p in procs:
      p.join()


  def getPeaksByIDR(self, ctrl_tagalign):
    """
    in principle, this function is only for ChIP-seq target experiment
    should make target and control inherit from ChIPSeqExperiment, will do

    Runs the IDR analysis on every pair of replicates, takes the largest
    number of peaks passing the IDR threshold, and writes that many of the
    strongest peaks (by column 7) from self.peaks to self.final_peaks.
    """
    import gzip
    import itertools
    if self.is_control:
      sys.exit( "ChIPSeqExperiment::getPeaksByIDR() can't be applied to control" )

    procs = []
    out_q = mp.Queue()
    prm = self.param
    for (repa, repb) in itertools.combinations(self.reps, 2):
      fpeaka = prm.temp_dir + repa.tagalign.filename_sans_ext + '_VS_' + \
               ctrl_tagalign.filename_sans_ext + '.regionPeak.gz'
      fpeakb = prm.temp_dir + repb.tagalign.filename_sans_ext + '_VS_' + \
               ctrl_tagalign.filename_sans_ext + '.regionPeak.gz'
      if not os.path.exists(fpeaka):
        sys.exit("File not found: %s\n" % fpeaka)
      if not os.path.exists(fpeakb):
        sys.exit("File not found: %s\n" % fpeakb)

      idr_prefix = prm.temp_dir + 'idr_' + repa.tagalign.basename + '_vs_' + \
                   repb.tagalign.basename
      proc = mp.Process(target=getNPeaksByIDR,
                        args=(fpeaka, fpeakb, idr_prefix, prm, out_q))
      procs.append(proc)
      proc.start()

    fidr2npeaks = {}
    for p in procs:
      fidr2npeaks.update(out_q.get())
      p.join()

    max_npeaks = max(fidr2npeaks.values())
    if not os.path.exists(self.peaks.fullname):
      sys.exit("File not found: %s\n" % self.peaks.fullname)

    ## the gzip files are opened in binary mode, so lines are byte strings;
    ## a bytes separator works on both Python 2 and Python 3
    with gzip.open(self.peaks.fullname, 'rb') as f_fin:
      sig_line = [ (float(line.split(b"\t")[6]), line) for line in f_fin ]
      sorted_sig_line = sorted(sig_line, key=lambda t: t[0], reverse=True)

    with gzip.open(self.final_peaks.fullname, 'wb') as f_fout:
      for (sig, line) in sorted_sig_line[:max_npeaks]:
        f_fout.write(line)
+
+
def getNPeaksByIDR(fpeaka, fpeakb, idr_prefix, prm, out_q):
  """
  Run the IDR consistency script on two peak files and report how many
  overlapped peaks have IDR <= prm.IDR_THRESHOLD.

  fpeaka, fpeakb -- peak files to compare
  idr_prefix     -- output prefix handed to the IDR script; the result is
                    read from idr_prefix + '-overlapped-peaks.txt'
  prm            -- parameter object (idr_script, idr_scr_dir, fgenome_table,
                    quiet, IDR_THRESHOLD are read)
  out_q          -- queue; a dict {result file name: peak count} is put on it
  """
  ## /usr/bin/env is the conventional location of env; /bin/env is absent on
  ## systems without a merged /usr
  Util.runCommand('/usr/bin/env', 'Rscript', prm.idr_script, fpeaka, fpeakb,
                  '-1', idr_prefix, '0', 'F', 'signal.value', prm.idr_scr_dir,
                  prm.fgenome_table, quiet=prm.quiet)
  fidr = idr_prefix + '-overlapped-peaks.txt'
  outdict = {}
  with open(fidr, 'r') as f_fidr:
    next(f_fidr)  ## skip the header line
    ## count the number of peaks w/ IDR <= IDR_THRESHOLD
    ## (the IDR value is the 11th whitespace-separated field)
    npk = sum( float(line.split()[10]) <= prm.IDR_THRESHOLD for line in f_fidr )
    outdict[fidr] = npk
  out_q.put(outdict)
+
+
def runSPP(tgt_tagalign, fctrl_tagalign, prm, nthr):
  """
  Run the SPP peak-calling script for one target tagAlign against a control.

  tgt_tagalign   -- File object of the target tagAlign
  fctrl_tagalign -- full name of the control tagAlign file
  prm            -- parameter object (temp_dir, spp_script, N_PEAK, PEAK_TYPE,
                    EXCLUSION_ZONE, quiet are read)
  nthr           -- number of threads passed to the script via -p

  A scratch directory is created under prm.temp_dir for the run and removed
  afterwards. Exits if the expected '_phantom.tab' output is missing.
  """
  spp_tmpdir = prm.temp_dir + tgt_tagalign.basename + '_spp_tmp/'
  if not os.path.exists(spp_tmpdir):
    os.mkdir(spp_tmpdir)
  fout = prm.temp_dir + tgt_tagalign.basename + '_phantom.tab'
  ## /usr/bin/env is the conventional location of env; /bin/env is absent on
  ## systems without a merged /usr
  Util.runCommand('/usr/bin/env', 'Rscript', prm.spp_script,
                  "-c=%s" % tgt_tagalign.fullname,
                  "-i=%s" % fctrl_tagalign,
                  "-npeak=%d" % prm.N_PEAK,
                  prm.PEAK_TYPE,
                  '-savp',
                  "-x=%s" % prm.EXCLUSION_ZONE,
                  '-rf',
                  "-odir=%s" % prm.temp_dir,
                  "-p=%d" % nthr,
                  "-tmpdir=%s" % spp_tmpdir,
                  "-out=%s" % fout,
                  quiet=prm.quiet)
  Util.runCommand('rm', '-fr', spp_tmpdir, quiet=prm.quiet)

  if not os.path.exists(fout):
    sys.exit("Failed to generate file: %s\n" % fout)
+
+
def initFromParam(param, typ):
  """
  Build a ChIPSeqExperiment of the requested kind from `param`.

  typ (case-insensitive) selects the experiment:
    'target'             -- target experiment, reads from
                            param.chipseq_target_read_files
    'control' or 'input' -- control experiment, reads from
                            param.chipseq_control_read_files
    'multi-targets'      -- target experiment, reads from
                            param.chipseq_read_files_multi_targets

  Raises ValueError for any other `typ`; previously such a value fell through
  every branch and failed later with an UnboundLocalError.
  """
  kind = typ.lower()
  if kind == 'target':
    is_ctrl = False
    param_attr = 'chipseq_target_read_files'
  elif kind in [ 'control', 'input' ]:
    is_ctrl = True
    param_attr = 'chipseq_control_read_files'
  elif kind == 'multi-targets':
    is_ctrl = False
    param_attr = 'chipseq_read_files_multi_targets'
  else:
    raise ValueError("Unknown ChIP-seq experiment type: %s" % typ)

  return ChIPSeqExperiment.initFromParam(param, is_ctrl, param_attr)
diff --git a/pRSEM/ChIPSeqReplicate.py b/pRSEM/ChIPSeqReplicate.py
new file mode 100644
index 0000000..79b873e
--- /dev/null
+++ b/pRSEM/ChIPSeqReplicate.py
@@ -0,0 +1,43 @@
+__doc__="""
+
+ pliu 20150510
+
+ python module for one ChIP-seq replicate
+"""
+
+import File
+
class ChIPSeqReplicate:
  """
  A single ChIP-seq replicate, identified either by its FASTQ file or by an
  already-aligned BED/tagAlign file.
  """

  def __init__(self):
    ## files and identity of the replicate
    self.fastq    = None  ## File object for fastq
    self.name     = None  ## default: fastq file's basename
    self.index    = None  ## replicate's index number
    self.tagalign = None  ## File object for tagAlign
    self.encoding = None  ## fastq encoding, not sure if needed

    ## back-references
    self.param      = None  ## reference to parameters
    self.chipseqexp = None  ## reference to ChIPSeqExperiment object

  #def __str__(self):
  #  return "%s %s %d %s" % (self.fastq.fullname, self.name, self.index,
  #                          self.encoding)

  @classmethod
  def initFromFastqFile(cls, ffq):
    """Create a replicate for FASTQ file `ffq`, named after its basename."""
    replicate = cls()
    fastq_file = File.initFromFullFileName(ffq)
    replicate.fastq = fastq_file
    replicate.name = fastq_file.basename
    return replicate

  @classmethod
  def initFromBedFile(cls, fbed):
    """Create a replicate for BED file `fbed`, named after its basename."""
    replicate = cls()
    bed_file = File.initFromFullFileName(fbed)
    replicate.tagalign = bed_file
    replicate.name = bed_file.basename
    return replicate
+
def initFromFastqFile(ffq):
  """Module-level shortcut for ChIPSeqReplicate.initFromFastqFile(ffq)."""
  replicate = ChIPSeqReplicate.initFromFastqFile(ffq)
  return replicate
+
def initFromBedFile(fbed):
  """Module-level shortcut for ChIPSeqReplicate.initFromBedFile(fbed)."""
  replicate = ChIPSeqReplicate.initFromBedFile(fbed)
  return replicate
diff --git a/pRSEM/File.py b/pRSEM/File.py
new file mode 100644
index 0000000..34427a8
--- /dev/null
+++ b/pRSEM/File.py
@@ -0,0 +1,47 @@
+__doc__="""
+
+ pliu 20150511
+
+ module for file-related definitions and functions
+"""
+
class File:
  """
  Name components of a file path. Nothing is read from disk; every field is
  derived from the path string alone.
  """

  def __init__(self):
    self.fullname = None  ## file's full name, include dir, base, and all ext
    self.is_gz    = None  ## if file is gzipped
    self.dirname  = None  ## directory name
    self.basename = None  ## base name sans all extension separated by dot
    self.filename_sans_ext = None ## no path, no last extension sep by dot


  def __str__(self):
    lines = [ "fullname: %s\n" % self.fullname,
              "dirname: %s\n" % self.dirname,
              "basename: %s\n" % self.basename,
              "filename_sans_ext: %s\n" % self.filename_sans_ext ]

    if self.is_gz:
      lines.append("is gzipped")
    else:
      lines.append("not gzipped")
    return ''.join(lines)


  @classmethod
  def initFromFullFileName(cls, filename):
    """
    Split `filename` into its components and return the resulting File.

    For '/a/b/reads.fq.gz': dirname '/a/b', basename 'reads',
    filename_sans_ext 'reads.fq', is_gz True.

    A name without any dot has no extension: filename_sans_ext is then the
    whole file name (it used to come out as an empty string) and is_gz is
    False.
    """
    import os
    f = cls()
    f.fullname = filename
    (f.dirname, fname) = os.path.split(filename)
    words = fname.split('.')
    has_ext = len(words) > 1
    f.basename = words[0]
    if has_ext:
      f.filename_sans_ext = '.'.join(words[:-1])
    else:
      f.filename_sans_ext = fname
    f.is_gz = has_ext and words[-1] in ['gz', 'gzip']
    return f
+
+
def initFromFullFileName(ffq):
  """Module-level shortcut for File.initFromFullFileName(ffq)."""
  file_obj = File.initFromFullFileName(ffq)
  return file_obj
diff --git a/pRSEM/Gene.py b/pRSEM/Gene.py
new file mode 100644
index 0000000..c5c177e
--- /dev/null
+++ b/pRSEM/Gene.py
@@ -0,0 +1,143 @@
+__doc__="""
+
+ pliu 20131002
+
+ module for gene
+"""
+
+
class Gene:
  """
  A gene: its identifier, genomic location and the transcripts (and
  transcript groups) that belong to it.
  """

  def __init__(self):
    self.gene_id = None
    #self.rsem_result = None

    self.chrom  = None
    self.strand = None
    self.tss    = None  ## transcription starting sites, strand ori considered
    self.tes    = None  ## transcription ending sites, strand ori considered
    self.start  = None  ## genomic starting position regardless of strand
                        ## direction, always have a number smaller than self.end
    self.end    = None  ## genomic ending position

    self.transcripts = []
    self.gtfs = []
    self.transcript_tss_groups = [] ## a list of list of transcripts having TSS
                                    ## within user-specified distance
    self.transcript_groups = []     ## a list of TranscriptionGroup objects

    ## 'all', 'mixed' or 'no'; set by definePeakTypeByTranscriptGroups().
    ## Declared here so reading it before that call gives None rather than
    ## raising AttributeError
    self.peak_type = None


  def __str__(self):
    s = "%s" % self.gene_id
    return s


  ## should be moved to TranscriptGroup file
  #def groupTranscriptsByTSS(self):
  #  """
  #  put transcripts that have TSS within certain distance into a group
  #  """
  #  #import Param;

  #  #cutoff = 100; ## TSS within 100 bp
  #  #cutoff = Param.TSS_GROUP_CUTOFF; ## cutoff for grouping TSS
  #  cutoff = 500  ## cutoff for grouping TSS

  #  group = [self.transcripts[0]]
  #  self.transcript_tss_groups.append(group)
  #  for tr in self.transcripts[1:]:
  #    is_assigned = False
  #    for grp in self.transcript_tss_groups:
  #      if (self.strand == '+') and (abs(tr.start - grp[0].start)<=cutoff):
  #        grp.append(tr)
  #        is_assigned = True;
  #      elif (self.strand == '-') and (abs(tr.end - grp[0].end)<=cutoff):
  #        grp.append(tr)
  #        is_assigned = True

  #      if is_assigned:
  #        break

  #    if not is_assigned:
  #      self.transcript_tss_groups.append([tr])


  ## should be moved to TranscriptGroup file
  #def constructTranscriptGroups(self):
  #  """
  #  construct a list of TranscriptGroup objects
  #  """
  #  import TranscriptGroup

  #  if len(self.transcript_tss_groups) == 0:
  #    self.groupTranscriptsByTSS()

  #  for transcripts in self.transcript_tss_groups:
  #    grp = TranscriptGroup.TranscriptGroup()
  #    grp.chrom = self.chrom
  #    grp.gene_id = self.gene_id
  #    grp.strand = self.strand
  #    grp.transcripts = transcripts
  #    self.transcript_groups.append(grp)


  def getStartEndTSSTESFromTranscripts(self):
    """
    define start and end from gene's transcripts
    start = min{all starts for transcripts};
    end   = max{all ends for transcripts};

    tss/tes follow the strand: on '+' tss is start and tes is end, on '-' it
    is the other way round. For any other strand value they are left as is.
    """
    starts = [tr.start for tr in self.transcripts]
    ends   = [tr.end for tr in self.transcripts]
    self.start = min(starts)
    self.end   = max(ends)

    if self.strand == '+':
      self.tss = self.start
      self.tes = self.end
    elif self.strand == '-':
      self.tss = self.end
      self.tes = self.start


  def definePeakTypeByTranscriptGroups(self):
    """
    set self.peak_type to one of
    'all':   all its transcript groups have peaks
    'no':    none of its transcript groups has peak
    'mixed': some of its transcript groups have peaks, the others do not
    """
    has_tss_peaks = [grp.has_peak_around_TSS for grp in self.transcript_groups]
    if all(has_tss_peaks):  ## all groups have peaks
      self.peak_type = 'all'
    else:
      if any(has_tss_peaks):  ## some groups have peaks, the others not
        self.peak_type = 'mixed'
      else:  ## no group has peak
        self.peak_type = 'no'
+
+
def constructGenesFromTranscripts(transcripts):
  """
  return a list of genes constructed from input transcripts

  Transcripts sharing a gene_id are attached to the same Gene, in input
  order; chrom and strand of a gene come from its first transcript. Each
  transcript gets a back-reference in tr.gene. Genes are returned in order of
  first appearance, with start/end/TSS/TES already computed.
  """
  genes = []
  gene_dict_id = {}
  for tr in transcripts:
    ## `in` replaces dict.has_key(), which no longer exists in Python 3
    if tr.gene_id in gene_dict_id:
      gene_dict_id[tr.gene_id].transcripts.append(tr)
      tr.gene = gene_dict_id[tr.gene_id]
    else:
      gene = Gene()
      gene.gene_id = tr.gene_id
      gene.chrom = tr.chrom
      gene.strand = tr.strand
      gene.transcripts.append(tr)
      genes.append(gene)
      gene_dict_id[tr.gene_id] = gene
      tr.gene = gene

  ## explicit loop: map() is lazy in Python 3, so using it only for its side
  ## effect would never call the method
  for gene in genes:
    gene.getStartEndTSSTESFromTranscripts()

  return genes
+
diff --git a/pRSEM/Makefile b/pRSEM/Makefile
new file mode 100644
index 0000000..70a8a4c
--- /dev/null
+++ b/pRSEM/Makefile
@@ -0,0 +1,35 @@
# Build helper programs and R libraries needed by pRSEM.
CXX = g++

# samtools/htslib copies bundled one directory up
SAMTOOLS = ../samtools-1.3
HTSLIB = htslib-1.3
SAMLIBS = $(SAMTOOLS)/$(HTSLIB)/libhts.a

PROGRAMS = bigWigSummary RLib filterSam2Bed

.PHONY : all clean

all : $(PROGRAMS)

# Pick the UCSC binary directory matching the host OS; it stays empty on any
# other platform, which the bigWigSummary recipe reports as an error.
OS := $(shell uname)
ifeq ($(OS), Darwin)
  UCSCEXEDIR = http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.x86_64
endif
ifeq ($(OS), Linux)
  UCSCEXEDIR = http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64.v287
endif

# Download the prebuilt bigWigSummary binary unless it is already present.
# curl -f makes HTTP errors fail the recipe instead of saving an error page
# as the "binary"; -L follows redirects.
bigWigSummary :
	if [ ! -e "bigWigSummary" ]; then \
		test -n "$(UCSCEXEDIR)" || { echo "No prebuilt bigWigSummary for OS '$(OS)'" >&2; exit 1; }; \
		curl -fLO $(UCSCEXEDIR)/bigWigSummary && \
		chmod +x bigWigSummary; \
	fi

# Install the required R libraries into RLib/. Each step runs only if the
# previous one succeeded, so Rscript never runs outside RLib/.
RLib : installRLib.R
	mkdir -p RLib && cd RLib && Rscript ../installRLib.R

filterSam2Bed : filterSam2Bed.c $(SAMTOOLS)/libbam.a $(SAMLIBS)
	$(CXX) $@.c $(SAMTOOLS)/libbam.a $(SAMLIBS) -lz -lpthread -I$(SAMTOOLS) -I$(SAMTOOLS)/$(HTSLIB) -o $@

clean :
	rm -fr $(PROGRAMS) *.pyc
diff --git a/pRSEM/Param.py b/pRSEM/Param.py
new file mode 100644
index 0000000..d93a234
--- /dev/null
+++ b/pRSEM/Param.py
@@ -0,0 +1,178 @@
+__doc__="""
+
+ pliu 20150511
+
+ python module for all parameters, input arguments
+"""
+
class Param:
  """
  Container for every pRSEM setting: fixed constants, command-line options
  and the file names derived from them.
  """

  IDR_THRESHOLD  = 0.05
  N_PEAK         = 300000
  PEAK_TYPE      = '-savr'
  EXCLUSION_ZONE = '-500:85' ## Anshul recommend -500:85
  TRAINING_GENE_MIN_LEN = 1003
  TRAINING_MIN_MAPPABILITY = 0.8
  FLANKING_WIDTH = 500  ## in nt, flanking region around TSS and TES
  INFORMATIVE_DATA_MAX_P_VALUE = 0.01 ## external data set is informative if
                                      ## p-value is not more than this value

  def __init__(self):
    self.argdict = None

    ## has to be in the same naming convention as prsem-calculate-expression
    self.num_threads                      = None
    self.chipseq_target_read_files        = None
    self.chipseq_control_read_files       = None
    self.chipseq_read_files_multi_targets = None
    self.chipseq_bed_files_multi_targets  = None
    self.cap_stacked_chipseq_reads        = None
    self.n_max_stacked_chipseq_reads      = None
    self.bowtie_path                      = None
    self.chipseq_peak_file                = None
    self.mappability_bigwig_file          = None
    self.partition_model                  = None
    self.gibbs_burnin                     = None
    self.gibbs_number_of_samples          = None
    self.gibbs_sampling_gap               = None
    self.quiet                            = False

    ## arguments
    self.ref_fasta   = None
    self.ref_name    = None
    self.sample_name = None
    self.stat_name   = None
    self.imd_name    = None

    ## path and pRSEM scripts
    self.temp_dir       = None ## dir to save RSEM/pRSEM intermediate files
    self.prsem_scr_dir  = None ## pRSEM scripts dir
    self.prsem_rlib_dir = None ## place to install pRSEM required R libraries

    ## genome reference: training set isoforms
    self.fall_exon_crd    = None
    self.fall_tr_crd      = None ## tr info + mappability
    self.ftraining_tr_crd = None ## training set tr

    ## ChIP-seq
    self.chipseqexperiment_target  = None ## reference to ChIP-seq experiment
    self.chipseqexperiment_control = None ## reference to ChIP-seq experiment
    self.chipseq_rscript = None ## full name of process-chipseq.R
    self.filterSam2Bed   = None ## full name of filterSam2Bed binary
    self.spp_tgz         = None
    self.spp_script      = None
    self.idr_scr_dir     = None
    self.idr_script      = None
    self.fgenome_table   = None
    self.fidr_chipseq_peaks = None
    self.fall_chipseq_peaks = None
    self.fchipseq_peaks  = None ## full name of user supplied ChIP-seq peak
                                ## file, otherwise is fidr_chipseq_peaks
    self.chipseq_target_fraglen = None ## spp-estimated fragment length
    self.fsppout_target  = None ## full name of SPP output
    ## this implementation needs to be refined since
    ## the var is define in both Param and ChIPSeqExp
    self.fchipseq_target_signals  = None
    self.fchipseq_control_signals = None

    ## transcripts and RNA-seq
    self.transcripts = None ## reference to all transcripts to be quantified
    self.genes       = None ## reference to all genes to be quantified

    self.rnaseq_rscript    = None ## fullname of R script for dealing RNA-seq
    self.fti               = None ## RSEM's reference .ti file
    self.ffasta            = None ## ref_name + '.transcripts.fa'
    self.bigwigsummary_bin = None ## bigWigSummary binary
    self.fall_tr_gc        = None ## imd_name + '_prsem.all_tr_gc'
    self.fall_tr_features  = None ## file for all isoforms' features
    self.fall_tr_prior     = None ## file for all isoforms' priors
    self.fisoforms_results = None ## file for RSEM .isoforms.results
    self.fpvalLL           = None ## file for p-value on if informative
                                  ## and for log-likelihood
    self.fall_pvalLL = None ## file to store all the p-val and log-likelihood

    ## for multiple external data sets
    self.targetid2fchipseq_alignment = {}
    self.finfo_multi_targets         = None
    self.flgt_model_multi_targets    = None

    ## for testing procedure
    self.targetids = []


  def __str__(self):
    ss = [ "%-33s %s\n" % (key, val) for (key, val) in self.argdict.items()] + \
         [ "%-33s %s\n" % ('RSEM_temp_dir', self.temp_dir ) ] + \
         [ "%-33s %s\n" % ('pRSEM_scr_dir', self.prsem_scr_dir) ]
    return ''.join(ss)


  @classmethod
  def initFromCommandLineArguments(cls, argdict):
    """
    Build a Param from `argdict` (option name -> value): every key becomes an
    attribute, then all derived file names are filled in.

    Side effect: creates the 'RLib/' directory next to this module if it does
    not exist yet.
    """
    import os
    prm = cls()
    prm.argdict = argdict
    for (key, val) in argdict.items():
      setattr(prm, key, val)

    if prm.imd_name is not None:
      ## an imd_name without a directory part refers to the current
      ## directory; appending '/' to the empty dirname would point at the
      ## filesystem root instead
      imd_dir = os.path.split(prm.imd_name)[0]
      prm.temp_dir = (imd_dir or '.') + '/'
    prm.prsem_scr_dir = os.path.dirname(os.path.realpath(__file__)) + '/'
    prm.prsem_rlib_dir = prm.prsem_scr_dir + 'RLib/'
    if not os.path.exists(prm.prsem_rlib_dir):
      os.mkdir(prm.prsem_rlib_dir)

    ## genome reference: pRSEM training set isoforms
    prm.fall_exon_crd    = prm.ref_name + '_prsem.all_exon_crd'
    prm.fall_tr_crd      = prm.ref_name + '_prsem.all_tr_crd'
    prm.ftraining_tr_crd = prm.ref_name + '_prsem.training_tr_crd'

    ## ChIP-seq
    prm.chipseq_rscript = prm.prsem_scr_dir + 'process-chipseq.R'
    prm.filterSam2Bed = prm.prsem_scr_dir + 'filterSam2Bed'
    prm.spp_tgz = prm.prsem_scr_dir + 'phantompeakqualtools/spp_1.10.1.tar.gz'
    prm.spp_script = prm.prsem_scr_dir + 'phantompeakqualtools/run_spp.R'
    prm.idr_scr_dir = prm.prsem_scr_dir + 'idrCode/'
    prm.idr_script = prm.idr_scr_dir + 'batch-consistency-analysis.r'
    prm.fgenome_table = prm.ref_name + '.chrlist'

    if prm.temp_dir is not None:
      prm.fsppout_target = prm.temp_dir + 'target_phantom.tab'
      prm.fchipseq_target_signals  = prm.temp_dir + 'target.tagAlign.gz'
      prm.fchipseq_control_signals = prm.temp_dir + 'control.tagAlign.gz'
      prm.fidr_chipseq_peaks = "%s/%s" % (prm.temp_dir,
                               'idr_target_vs_control.regionPeak.gz')
      ## have to name it this way due to run_spp.R's weird naming convention
      ## this name depends on the two signal file names above
      prm.fall_chipseq_peaks = "%s/%s" % (prm.temp_dir,
                   'target.tagAlign_VS_control.tagAlign.regionPeak.gz')

    if prm.chipseq_peak_file is not None:
      prm.fchipseq_peaks = prm.chipseq_peak_file
    else:
      prm.fchipseq_peaks = prm.fidr_chipseq_peaks


    ## transcripts and RNA-seq
    prm.rnaseq_rscript = prm.prsem_scr_dir + 'process-rnaseq.R'
    prm.fti = prm.ref_name + '.ti'
    prm.ffasta = prm.ref_name + '.transcripts.fa'
    prm.bigwigsummary_bin = prm.prsem_scr_dir + 'bigWigSummary'
    #prm.fall_exon_crd    = prm.imd_name + '_prsem.all_exon_crd'
    #prm.fall_tr_crd      = prm.imd_name + '_prsem.all_tr_crd'
    #prm.ftraining_tr_crd = prm.imd_name + '_prsem.training_tr_crd'
    if prm.sample_name is not None: ## for calc-expr
      prm.fall_tr_gc       = prm.imd_name + '_prsem.all_tr_gc'
      prm.fall_tr_features = prm.stat_name + '_prsem.all_tr_features'
      prm.fall_tr_prior    = prm.stat_name + '_prsem.all_tr_prior'
      prm.fpvalLL          = prm.stat_name + '_prsem.pval_LL'

      prm.fisoforms_results = prm.sample_name + '.isoforms.results'
      prm.fall_pvalLL       = prm.sample_name + '.all.pval_LL'

      ## for multiple external data sets
      prm.finfo_multi_targets = prm.temp_dir + 'multi_targets.info'
      prm.flgt_model_multi_targets = prm.stat_name + '_prsem.lgt_mdl.RData'

    return prm
+
+
def initFromCommandLineArguments(argdict):
  """Module-level shortcut for Param.initFromCommandLineArguments(argdict)."""
  prm = Param.initFromCommandLineArguments(argdict)
  return prm
diff --git a/pRSEM/Prsem.py b/pRSEM/Prsem.py
new file mode 100644
index 0000000..16f060a
--- /dev/null
+++ b/pRSEM/Prsem.py
@@ -0,0 +1,253 @@
+#!/bin/env python
+
+__doc__="""
+
+ pliu 20150304
+
+ python function for pRSEM
+"""
+
+import os
+import sys
+import Util
+
+
+def genChIPSeqSignalFilesFromBed(param):
+ """
+ for each comma-separated BED file in param.chipseq_bed_files_multi_targets,
+ build a ChIPSeqReplicate and record its tagAlign file in
+ param.targetid2fchipseq_alignment (key: tagAlign basename, value: full name)
+ """
+ import ChIPSeqReplicate
+ fbeds = param.chipseq_bed_files_multi_targets.split(',')
+ for fbed in fbeds:
+ csr = ChIPSeqReplicate.initFromBedFile(fbed)
+ ta = csr.tagalign
+ param.targetid2fchipseq_alignment[ta.basename] = ta.fullname
+
+
+def genChIPSeqSignalFilesFromReads(param):
+ """
+ build the 'multi-targets' ChIPSeqExperiment from param, detect the FASTQ
+ encoding and align the reads with Bowtie; the experiment is stored in
+ param.chipseqexperiment_target and the tagAlign file of every replicate is
+ recorded in param.targetid2fchipseq_alignment (basename -> full name)
+ """
+ import ChIPSeqExperiment
+ cse_target = ChIPSeqExperiment.initFromParam(param, 'multi-targets')
+ cse_target.getFastqEncoding()
+ cse_target.alignReadByBowtie()
+
+ param.chipseqexperiment_target = cse_target
+ for rep in cse_target.reps:
+ ta = rep.tagalign
+ param.targetid2fchipseq_alignment[ta.basename] = ta.fullname
+
+
+
+def genChIPSeqPeakFileBySPPIDR(param):
+ """
+ align and pool the target ChIP-seq reads; if control reads are given, do
+ the same for the control, call peaks by SPP and select them by IDR
+
+ sets param.chipseqexperiment_target always; sets param.chipseq_peak_file and
+ param.chipseqexperiment_control only when control read files are given
+ """
+ import ChIPSeqExperiment
+
+ cse_target = ChIPSeqExperiment.initFromParam(param, 'target')
+ cse_target.getFastqEncoding()
+ cse_target.alignReadByBowtie()
+ cse_target.poolTagAlign()
+
+ param.chipseqexperiment_target = cse_target
+
+ if param.chipseq_control_read_files is not None:
+ cse_control = ChIPSeqExperiment.initFromParam(param, 'control')
+ cse_control.getFastqEncoding()
+ cse_control.alignReadByBowtie()
+ cse_control.poolTagAlign()
+ ## both peak-calling steps use the pooled control alignment
+ cse_target.callPeaksBySPP(cse_control.pooled_tagalign)
+ cse_target.getPeaksByIDR(cse_control.pooled_tagalign)
+
+ param.chipseq_peak_file = cse_target.final_peaks.fullname
+ param.chipseqexperiment_control = cse_control
+ else:
+ ## no peak file is produced in this branch
+ pass ## to-be-implemented, call peaks by MOSAiCS without ChIP-seq control
+
+
+def buildTrainingSet(prm):
+ """
+ write training set in file Param.ftraining_tr_crd
+ transcripts are listed in the same order as in RSEM's .ti file
+ The order is required by rsem-run-gibbs so that prior can be assigned to
+ transcript correctly
+
+ steps: compute TSS/body/TES mappability for the transcripts of
+ single-transcript genes, write prm.fall_tr_crd and prm.fall_exon_crd, then
+ run the 'selTrainingTr' routine of prm.rnaseq_rscript; exits if
+ prm.ftraining_tr_crd does not exist afterwards
+ """
+ ## candidate genes: exactly one transcript and a genomic span of at least
+ ## prm.TRAINING_GENE_MIN_LEN
+ ogot_genes = filter(lambda g: len(g.transcripts) == 1 and
+ (g.end - g.start + 1) >=
+ prm.TRAINING_GENE_MIN_LEN, prm.genes)
+
+ trs = [tr for g in ogot_genes for tr in g.transcripts]
+
+ ## transcript_id -> (TSS, body, TES) mappability, computed in parallel
+ trid2mpps = Util.runMPOverAList(prm.num_threads, calTSSBodyTESMappability,
+ [trs, prm])
+
+ with open(prm.fall_tr_crd, 'w') as f_fout:
+ f_fout.write("geneid\ttrid\tchrom\tstrand\tstart\tend\t")
+ f_fout.write("tss_mpp\tbody_mpp\ttes_mpp\n")
+ for tr in prm.transcripts: ## in the same order as RSEM's .ti file
+ f_fout.write("%s\t%s\t%s\t%s\t%d\t%d\t" % ( tr.gene_id,
+ tr.transcript_id, tr.chrom, tr.strand, tr.start, tr.end))
+ if tr.transcript_id in trid2mpps:
+ mpps = trid2mpps[tr.transcript_id]
+ f_fout.write("%5.3f\t%5.3f\t%5.3f\n" % mpps)
+ else:
+ ## transcripts outside the candidate set have no mappability values
+ f_fout.write("NA\tNA\tNA\n")
+
+ with open(prm.fall_exon_crd, 'w') as f_fexon:
+ f_fexon.write("trid\texon_index\tchrom\tstrand\tstart\tend\n")
+ for tr in prm.transcripts:
+ ## exon_index written to the file is 1-based
+ for (i, (exon_start, exon_end)) in enumerate(tr.exon_ranges):
+ f_fexon.write("%s\t%d\t%s\t%s\t%d\t%d\n" % (tr.transcript_id, i+1,
+ tr.chrom, tr.strand, exon_start, exon_end))
+
+ Util.runCommand('/bin/env', 'Rscript', prm.rnaseq_rscript, 'selTrainingTr',
+ prm.prsem_rlib_dir, prm.fall_tr_crd, prm.fall_exon_crd,
+ prm.TRAINING_MIN_MAPPABILITY, prm.FLANKING_WIDTH,
+ prm.ftraining_tr_crd, quiet=prm.quiet)
+
+ if not os.path.exists(prm.ftraining_tr_crd):
+ sys.exit("Failed to generate file: %s\n" % prm.ftraining_tr_crd)
+
+
+def calTSSBodyTESMappability(trs, prm, out_q):
+ """
+ calculate average mappability around TSS, body, and TES for every
+ transcript in trs
+
+ save results in transcript's attribute and put one dict
+ {transcript_id: (TSS, body, TES) average mappability} on out_q
+
+ trs: list of transcript objects providing calculateMappability()
+ prm: parameter object (bigwigsummary_bin, mappability_bigwig_file,
+ FLANKING_WIDTH and quiet are read)
+ out_q: queue-like object; only its put() method is used
+ """
+ outdict = {}
+ for tr in trs:
+ tr.calculateMappability(prm.bigwigsummary_bin, prm.mappability_bigwig_file,
+ prm.FLANKING_WIDTH, prm.quiet)
+ outdict[tr.transcript_id] = (tr.ave_mpp_around_TSS, tr.ave_mpp_around_body,
+ tr.ave_mpp_around_TES)
+ out_q.put(outdict)
+
+
+def genPriorByCombinedTSSSignals(prm):
+ """
+ calculate TSS signals for all external data sets
+ compute informative p-value, LL for individual data set and combined one
+ learn prior from training set partitioned by combined TSS signals
+ derive priors for all isoforms
+
+ writes prm.finfo_multi_targets (one row per entry of
+ prm.targetid2fchipseq_alignment), runs two routines of prm.rnaseq_rscript,
+ and stops the program if the p-value read from prm.fpvalLL exceeds
+ prm.INFORMATIVE_DATA_MAX_P_VALUE or prm.fall_tr_prior was not generated
+ """
+ f_fout = open(prm.finfo_multi_targets, 'w')
+ f_fout.write("targetid\tfaln\tfftrs\n")
+ for (tgtid, faln) in prm.targetid2fchipseq_alignment.items():
+ ## per-target feature file name passed on to the R script
+ fftrs = prm.imd_name + '_prsem.' + tgtid + '.all_tr_features'
+ f_fout.write("%s\t%s\t%s\n" % (tgtid, faln, fftrs))
+ f_fout.close()
+
+ Util.runCommand('/bin/env', 'Rscript', prm.rnaseq_rscript,
+ 'prepMultiTargetsFeatures', prm.prsem_rlib_dir,
+ prm.fall_tr_crd, prm.ftraining_tr_crd,
+ prm.fisoforms_results, prm.FLANKING_WIDTH,
+ prm.cap_stacked_chipseq_reads,
+ prm.n_max_stacked_chipseq_reads,
+ prm.finfo_multi_targets, prm.num_threads, quiet=prm.quiet)
+
+ ## learn prior from partitioning by combined external data set
+ Util.runCommand('/bin/env', 'Rscript', prm.rnaseq_rscript,
+ 'genPriorByCombinedTSSSignals', prm.prsem_rlib_dir,
+ prm.finfo_multi_targets, prm.flgt_model_multi_targets,
+ prm.fall_tr_features, prm.fpvalLL, prm.fall_tr_prior,
+ quiet=prm.quiet)
+
+ ## p-value: first tab-separated field of the second line of prm.fpvalLL
+ ## (presumably the first line is a header -- confirm against the R script)
+ pval = float(Util.readFile(prm.fpvalLL)[1].split("\t")[0])
+
+ if pval > prm.INFORMATIVE_DATA_MAX_P_VALUE:
+ err_msg = "\nError: current external data is NOT informative for RNA-seq quantification\n" + \
+ "\tp-value %.10e > %.3f\n" % (pval, prm.INFORMATIVE_DATA_MAX_P_VALUE) + \
+ "pRSEM STOPs here. Please use other external data set(s)\n\n"
+ sys.stderr.write(err_msg)
+ ## NOTE(review): exit status is 0 although an error message was printed;
+ ## confirm that callers are meant to see this as a successful run
+ sys.exit(0)
+
+ if not os.path.exists(prm.fall_tr_prior):
+ sys.exit("Failed to generate file: %s\n" % prm.fall_tr_prior)
+
+
+
+def genPriorByPeakSignalGCLen(prm):
+ """
+ calculate peaks/signals for the TSS, body, and TES regions
+ calculate GC contenct and effective length
+ learn prior from training set and derived priors for all isoforms
+ """
+ ## calculate GC contect for isoforms
+ trid2seq = Util.getFastaID2Seq(prm.ffasta)
+ with open(prm.fall_tr_gc, 'w') as f_fall_tr_gc:
+ f_fall_tr_gc.write("trid\tGC_fraction\n")
+ for tr in prm.transcripts:
+ gc_frac = Util.getGCFraction(trid2seq[tr.transcript_id])
+ f_fall_tr_gc.write("%s\t%.2f\n" % (tr.transcript_id, gc_frac) )
+
+ with open(prm.fsppout_target, 'r') as f_fsppout_target:
+ words = f_fsppout_target.read().split("\t")
+ prm.chipseq_target_fraglen = int(words[2])
+
+ ## prepare a feature file of peaks and signals for all isoforms,
+ ## isoforms in training set will be labeled
+ if not os.path.exists(prm.fchipseq_peaks):
+ sys.exit("File not exists: %s\n" % prm.fchipseq_peaks)
+ Util.runCommand('/bin/env', 'Rscript', prm.rnaseq_rscript,
+ 'prepPeakSignalGCLenFeatures', prm.prsem_rlib_dir,
+ prm.fall_tr_crd, prm.ftraining_tr_crd, prm.fall_tr_features,
+ prm.fisoforms_results, prm.FLANKING_WIDTH,
+ prm.partition_model, prm.fchipseq_peaks,
+ prm.fchipseq_target_signals, prm.fall_tr_gc, prm.num_threads,
+ prm.chipseq_target_fraglen, quiet=prm.quiet)
+
+ if not os.path.exists(prm.fall_tr_gc):
+ sys.exit("Failed to generate file: %s\n" % prm.fall_tr_gc)
+
+ ## learn and generate prior for all transcripts
+ Util.runCommand('/bin/env', 'Rscript', prm.rnaseq_rscript,
+ 'genPriorByPeakSignalGCLen', prm.prsem_rlib_dir,
+ prm.fall_tr_features, prm.partition_model, prm.fall_tr_prior,
+ quiet=prm.quiet)
+
+ if not os.path.exists(prm.fall_tr_prior):
+ sys.exit("Failed to generate file: %s\n" % prm.fall_tr_prior)
+
+
+def genPriorByTSSPeak(prm):
+ """
+ determine if isoform have TSS peak or not
+ learn priors from training set and derived priors for all isoforms
+
+ exits if prm.fchipseq_peaks is missing, if prm.fall_tr_features or
+ prm.fall_tr_prior is not generated, or if the p-value read from prm.fpvalLL
+ exceeds prm.INFORMATIVE_DATA_MAX_P_VALUE
+ """
+ ## prepare a feature file of TSS peaks for all isoforms,
+ ## isoforms in training set will be labeled
+ if not os.path.exists(prm.fchipseq_peaks):
+ sys.exit("File not exists: %s\n" % prm.fchipseq_peaks)
+ Util.runCommand('/bin/env', 'Rscript', prm.rnaseq_rscript,
+ 'prepTSSPeakFeatures', prm.prsem_rlib_dir,
+ prm.fall_tr_crd, prm.ftraining_tr_crd, prm.fall_tr_features,
+ prm.fisoforms_results, prm.FLANKING_WIDTH,
+ prm.fchipseq_peaks, quiet=prm.quiet)
+
+ if not os.path.exists(prm.fall_tr_features):
+ sys.exit("Failed to generate file: %s\n" % prm.fall_tr_features)
+
+ Util.runCommand('/bin/env', 'Rscript', prm.rnaseq_rscript,
+ 'genPriorByTSSPeak', prm.prsem_rlib_dir,
+ prm.fall_tr_features, prm.fpvalLL, prm.fall_tr_prior,
+ quiet=prm.quiet)
+
+ ## p-value: first tab-separated field of the second line of prm.fpvalLL
+ ## (presumably the first line is a header -- confirm against the R script)
+ pval = float(Util.readFile(prm.fpvalLL)[1].split("\t")[0])
+
+ if pval > prm.INFORMATIVE_DATA_MAX_P_VALUE:
+ err_msg = "\nError: current external data is NOT informative for RNA-seq quantification\n" + \
+ "\tp-value %.10e > %.3f\n" % (pval, prm.INFORMATIVE_DATA_MAX_P_VALUE) + \
+ "pRSEM STOPs here. Please use other external data set(s)\n\n"
+ sys.stderr.write(err_msg)
+ ## NOTE(review): exit status is 0 although an error message was printed;
+ ## confirm that callers are meant to see this as a successful run
+ sys.exit(0)
+
+ if not os.path.exists(prm.fall_tr_prior):
+ sys.exit("Failed to generate file: %s\n" % prm.fall_tr_prior)
+
+
+def runGibbsSampling(prm):
+ """
+ run rsem-run-gibbs (located one directory above prm.prsem_scr_dir) with the
+ Gibbs settings stored in prm and with prm.fall_tr_prior as '--prior'
+ """
+ if prm.quiet:
+ run_gibbs_quiet = '-q'
+ else:
+ ## note: the empty string is still passed on as a separate argument
+ run_gibbs_quiet = ''
+ Util.runCommand("%s/../rsem-run-gibbs" % prm.prsem_scr_dir,
+ prm.ref_name, prm.imd_name, prm.stat_name, prm.gibbs_burnin,
+ prm.gibbs_number_of_samples, prm.gibbs_sampling_gap,
+ '-p', prm.num_threads, run_gibbs_quiet,
+ '--prior', prm.fall_tr_prior,
+ quiet=prm.quiet)
diff --git a/pRSEM/Transcript.py b/pRSEM/Transcript.py
new file mode 100644
index 0000000..9638ffe
--- /dev/null
+++ b/pRSEM/Transcript.py
@@ -0,0 +1,189 @@
+__doc__="""
+
+ peng 20131009
+
+ Data structure copied from RSEM, made some name changes
+"""
+
+
+class Transcript:
+ def __init__(self):
+ self.transcript_id = None
+ self.gene_id = None
+ self.gene = None
+ self.transcript_group = None
+
+ self.chrom = None ## RSEM Transcript's string seqname
+ self.strand = None
+ self.length = None
+ self.exon_ranges = []; ## RSEM Transcript's vector<Interval> structure
+ self.gtf_attr = {}; ## RSEM Transcript's string left
+ self.gtf_additional_info = None;
+
+ self.start = None; ## genomic starting postion,
+ ## regardless of strand direction
+ ## always have a number smaller than self.end
+ self.end = None; ## genomic ending postion
+
+ self.tss = None; ## genomic coordinate of transcription starting site
+ self.tes = None; ## genomic coordinate of transcription ending site
+
+ ## mappability
+ self.ave_mpp_around_TSS = None ## [TSS-flanking_width, TSS+flanking_width]
+ self.ave_mpp_around_body = None ## (TSS+flanking_width, TES-flanking_width)
+ self.ave_mpp_around_TES = None ## [TES-flanking_width, TES+flanking_width]
+
+
+ def __str__(self):
+ s = "%s\n%s\n%s\n%s %d\n" % (self.transcript_id, self.gene_id, self.chrom,
+ self.strand, self.length);
+ s += "%d" % len(self.exon_ranges);
+ for (start, end) in self.exon_ranges:
+ s += " %d %d" % (start, end);
+ s += "\n";
+ for key in self.gtf_attr.keys():
+ for val in self.gtf_attr[key]:
+ s += '%s "%s"; ' % (key, val);
+ s = s.rstrip();
+ return s;
+
+
+ def constructFromRSEMTI(self, ti_lines):
+ """
+ construct Transcript from the 6 lines from RSEM .TI file
+ """
+ self.quicklyConstructFromRSEMTI(ti_lines);
+
+ feature_words = ti_lines[5].rstrip(';').split(';');
+ for feature_word in feature_words:
+ feature_word.lstrip();
+ (key, val) = feature_word.split();
+ if not self.gtf_attr.has_key(key):
+ self.gtf_attr[key] = [];
+ self.gtf_attr[key].append(val.strip('"'));
+
+
+ def quicklyConstructFromRSEMTI(self, ti_lines):
+ """
+ quickly construct Transcript from the 6 lines from RSEM .TI file, the last
+ line won't be parsed.
+ """
+ self.transcript_id = ti_lines[0].split("\t")[0]
+ self.gene_id = ti_lines[1].split("\t")[0]
+ self.chrom = ti_lines[2];
+ (self.strand, self.length) = ti_lines[3].split();
+ self.length = int(self.length);
+ words = ti_lines[4].split();
+ for j in range(0, int(words[0])):
+ start = int(words[j*2+1]);
+ end = int(words[j*2+2]);
+ self.exon_ranges.append( (start, end) );
+
+ self.start = self.exon_ranges[0][0];
+ self.end = self.exon_ranges[-1][-1];
+ if self.strand == '+':
+ self.tss = self.start
+ self.tes = self.end
+ elif self.strand == '-':
+ self.tss = self.end
+ self.tes = self.start
+ self.gtf_additional_info = ti_lines[5];
+
+
+ def defineTSSAndTES(self):
+ """
+ define TSS and TES
+ """
+ if (self.tss is None) or (self.tes is None):
+ if self.strand == '+':
+ self.tss = self.start;
+ self.tes = self.end;
+ elif self.strand == '-':
+ self.tss = self.end;
+ self.tes = self.start;
+
+
+ def calculateMappability(self, bin_bigwigsummary, fbigwig, width=500,
+ quiet=True):
+ """
+ calculate average mappability for a transcript's
+ TSS region: [TSS-width, TSS+width],
+ body region: [start+width+1, end-width-1],
+ TES region: [TES-width, TES+width]
+
+ if start+width+1 > end-width-1, then define body region as
+ [end-width-1, start+width+1]
+
+ assign the values for
+ self.ave_mpp_around_TSS, self.max_mpp_around_TSS
+ self.ave_mpp_around_body, self.max_mpp_around_body
+ self.ave_mpp_around_TES, self.max_mpp_around_TES
+ """
+ import Util
+
+ if (self.tss is None) or (self.tes is None):
+ self.defineTSSAndTES()
+
+ self.ave_mpp_around_TSS = Util.calculateMappability('mean', self.chrom,
+ self.tss - width, self.tss + width,
+ bin_bigwigsummary, fbigwig, quiet)
+
+ if (self.start + width + 1) < (self.end - width - 1):
+ self.ave_mpp_around_body = Util.calculateMappability('mean', self.chrom,
+ self.start+width+1, self.end-width-1,
+ bin_bigwigsummary, fbigwig, quiet)
+ elif (self.start + width + 1) > (self.end - width - 1):
+ self.ave_mpp_around_body = Util.calculateMappability('mean', self.chrom,
+ self.end-width-1, self.start+width+1,
+ bin_bigwigsummary, fbigwig, quiet)
+ elif (self.start + width + 1) == (self.end - width - 1):
+ self.ave_mpp_around_body = 1.0
+
+ self.ave_mpp_around_TES = Util.calculateMappability('mean', self.chrom,
+ self.tes - width, self.tes + width,
+ bin_bigwigsummary, fbigwig, quiet)
+
+
+
+def readRSEMTI(fin):
+ """
+ read RSEM's .ti file, return a list of Transcripts objects
+
+ the first field of the first line is the number of transcripts; each
+ transcript then occupies 6 consecutive lines, all of which are parsed
+ (including the attribute line)
+ """
+ import Util
+
+ lines = Util.readFile(fin);
+ (ntranscripts, foo) = lines[0].split();
+ ntranscripts = int(ntranscripts);
+ transcripts = [];
+ for i in range(0, ntranscripts):
+ tr = Transcript();
+ ## lines i*6+1 .. i*6+6 form the block of the i-th transcript
+ tr.constructFromRSEMTI(lines[i*6+1:i*6+7]);
+ transcripts.append(tr);
+ ## progress message every 20000 transcripts
+ if (i > 0) and (i % 20000 == 0):
+ print "processed %d transcripts" % i;
+
+ return transcripts;
+
+
+def quicklyReadRSEMTI(fin):
+ """
+ read RSEM's .ti file without parsing the additional information line (the last
+ line in a transcript's block)
+
+ the first field of the first line is the number of transcripts; each
+ transcript then occupies 6 consecutive lines
+
+ return a list of Transcripts objects
+ """
+ import Util
+
+ lines = Util.readFile(fin);
+ (ntranscripts, foo) = lines[0].split();
+ ntranscripts = int(ntranscripts);
+ transcripts = [];
+ for i in range(0, ntranscripts):
+ tr = Transcript();
+ ## lines i*6+1 .. i*6+6 form the block of the i-th transcript
+ tr.quicklyConstructFromRSEMTI(lines[i*6+1:i*6+7]);
+ transcripts.append(tr);
+ ## progress message every 20000 transcripts
+ if (i > 0) and (i % 20000 == 0):
+ print "processed %d transcripts" % i;
+
+ return transcripts;
+
diff --git a/pRSEM/Util.py b/pRSEM/Util.py
new file mode 100644
index 0000000..9107a36
--- /dev/null
+++ b/pRSEM/Util.py
@@ -0,0 +1,173 @@
+__doc__="""
+
+ pliu 20150605
+
+ utility module for pRSEM
+ no class is defined here
+"""
+
+def runCommand(*args, **kwargs):
+ import os
+ import subprocess
+ import sys
+
+ is_quiet = False
+ if 'quiet' in kwargs:
+ if kwargs['quiet']:
+ is_quiet = True
+
+ str_args = [ str(arg) for arg in args ]
+ if is_quiet:
+ pass
+ else:
+ sys.stdout.write("\n%s\n" % (' '.join(str_args)))
+
+ f_null = open(os.devnull, 'w')
+
+ try:
+ if len(str_args) == 1:
+ if is_quiet:
+ retcode = subprocess.call(str_args[0], stdout=f_null, shell=True)
+ else:
+ retcode = subprocess.call(str_args[0], shell=True)
+ else:
+ if is_quiet:
+ #print '##', is_quiet, '##';
+ retcode = subprocess.call(str_args, stdout=f_null)
+ else:
+ #print '##', is_quiet, '##';
+ retcode = subprocess.call(str_args)
+ if retcode < 0:
+ sys.exit("\nTerminated by singal %d\n" % -retcode)
+ elif retcode > 0:
+ sys.exit("\nFailed with return code %d\n" % retcode)
+ except OSError as e:
+ sys.exit("\nExecution failed: %s\n" % e)
+
+ f_null.close()
+
+
+def runCommandAndGetOutput(*args, **kwargs):
+ import subprocess
+ import sys
+
+ str_args = [ str(arg) for arg in args ]
+ if 'quiet' in kwargs:
+ if not kwargs['quiet']:
+ sys.stdout.write("\n%s\n" % (' '.join(str_args)))
+ else:
+ sys.stdout.write("\n%s\n" % (' '.join(str_args)))
+
+ try:
+ output = subprocess.check_output(str_args)
+ except subprocess.CalledProcessError, e:
+ sys.exit("\nExecution failed: %s\n" % e.output)
+
+ return output
+
+
+def getCatCommand(is_gzipped):
+ if is_gzipped:
+ cat_cmd = 'zcat'
+ else:
+ cat_cmd = 'cat'
+ return cat_cmd
+
+
+def readFile(fin):
+ """
+ return all the lines of the input file.
+ """
+ import os
+ assert os.path.exists(fin), "File not found: %s\n" % fin
+
+ lines = [];
+ f_fin = open(fin, 'r');
+ lines = f_fin.read().split('\n');
+ f_fin.close();
+ lines.pop();
+
+ newlines = [];
+ for line in lines:
+ if line[-1] == '\r':
+ newline = line[:-1];
+ else:
+ newline = line;
+ newlines.append(newline);
+
+ return newlines;
+
+
+def calculateMappability(mpp_type, chrom, start, end, bigwigsummary_bin,
+ fbigwig, quiet=True):
+ """
+ calculate mappability for the given genomic coordinate interval
+ mpp_type = {mean|max}
+
+ runs: bigwigsummary_bin -type=<mpp_type> fbigwig chrom start end 1
+ and returns the command's output converted to float
+ """
+ ## NOTE(review): this initial value is always overwritten by the call below
+ mpp = -10.0
+ ## NOTE(review): the quiet argument is not forwarded, the command is always
+ ## run with quiet=True -- confirm this is intended
+ mpp = runCommandAndGetOutput(bigwigsummary_bin, '-type=%s' % mpp_type,
+ fbigwig, chrom, start, end, '1', quiet=True)
+ return float(mpp)
+
+
+def runMPOverAList(nprocs, func, args):
+ """
+ run multiprocessing for the given function and arguments on nprocs CPUs
+ args[0] must be a list to-be-split and run func
+ func must return a dict
+ """
+ import multiprocessing as mp
+
+ out_q = mp.Queue()
+ chunksize = 1
+ if len(args[0]) > nprocs:
+ chunksize = len(args[0])/nprocs + 1
+ procs = []
+ for i in xrange(nprocs):
+ list_args = [args[0][chunksize*i:chunksize*(i+1)]] + args[1:] + [out_q]
+ p = mp.Process(target = func, args = tuple(list_args))
+ procs.append(p)
+ p.start()
+
+ dict_to_return = {}
+ for i in xrange(nprocs):
+ dict_to_return.update(out_q.get())
+
+ for p in procs:
+ p.join()
+
+ return dict_to_return
+
+
+def getFastaID2Seq(ffasta):
+ """
+ read fasta file, return a dict with key as seq_id and value as seq
+ """
+ import os
+ assert os.path.exists(ffasta), "File not found: %s\n" % ffasta
+ fastas = {};
+ f_fin = open(ffasta, 'r');
+ entries = f_fin.read().split('>');
+ f_fin.close();
+ for entry in entries[1:]:
+ words = entry.split("\n");
+ fastas[words[0]] = words[1];
+
+ return fastas;
+
+
+def getGCFraction(seq):
+ """
+ return the percetage of GC in the given sequence
+ """
+ length = len(seq);
+ if length == 0:
+ sys.stderr.write("Util::getFraction(): sequence length is 0\n");
+ return 0;
+ else:
+ seq = seq.upper();
+ n_G = seq.count('G');
+ n_C = seq.count('C');
+
+ return (n_G + n_C) * 1.0/length;
+
diff --git a/pRSEM/filterSam2Bed.c b/pRSEM/filterSam2Bed.c
new file mode 100644
index 0000000..29d6a53
--- /dev/null
+++ b/pRSEM/filterSam2Bed.c
@@ -0,0 +1,52 @@
+/*
+ * pliu 20150621
+ *
+ * filter Sam file by flag 1548 and output alignment in Bed format
+ *
+ * this code is modified from sam/examples/bam2bed.c
+ *
+*/
+
+#include <stdio.h>
+#include "sam.h"
+//#include "../samtools-1.3/htslib-1.3/htslib/sam.h"
+
+/*
+ * Print one alignment as a BED line on stdout:
+ *   target name, pos, pos + reference length, "N", mapping quality, strand.
+ * Records without a target (tid < 0) or with any of the flag bits
+ * 0x4, 0x8, 0x200, 0x400 set are skipped. `data` must be the samfile_t the
+ * record was read from. Always returns 0.
+ */
+static int fetch_func(const bam1_t *b, void *data) {
+  const int excluded_flags = 0x4 | 0x8 | 0x200 | 0x400;
+  samfile_t *fp = (samfile_t*)data;
+  const bam1_core_t *core = &b->core;
+  uint32_t *cigar = bam1_cigar(b);
+  int k, ref_len = 0;
+
+  if (core->tid < 0) return 0;
+  if (core->flag & excluded_flags) return 0;
+
+  /* only match, deletion and reference-skip operations add to the length */
+  for (k = 0; k < core->n_cigar; ++k) {
+    switch (cigar[k] & 0xf) {
+      case BAM_CMATCH:
+      case BAM_CDEL:
+      case BAM_CREF_SKIP:
+        ref_len += cigar[k] >> 4;
+        break;
+      default:
+        break;
+    }
+  }
+
+  printf("%s\t%d\t%d\tN\t%d\t%c\n", fp->header->target_name[core->tid],
+         core->pos, core->pos + ref_len, core->qual,
+         (core->flag & BAM_FREVERSE) ? '-' : '+');
+  return 0;
+}
+
+
+/*
+ * Usage: filterSam2Bed <in.sam>
+ * Reads every record of the given SAM file and passes it to fetch_func().
+ * Returns 1 on a usage error or if the file cannot be opened, 0 otherwise.
+ */
+int main(int argc, char *argv[]) {
+  samfile_t *fp;
+  bam1_t *rec;
+
+  if (argc != 2) {
+    fprintf(stderr,
+            "\nUsage: filterSam2Bed <in.sam>\n"
+            "\n"
+            "Filter SAM file by flag 1548 (0x4, 0x8, 0x200, 0x400)\n"
+            "and write to STDOUT in BED format\n"
+            "<in.sam>: input SAM file name, '-' for STDIN\n\n");
+    return 1;
+  }
+
+  fp = samopen(argv[1], "r", 0);
+  if (fp == 0) {
+    fprintf(stderr, "filterSam2Bed: Fail to open SAM file %s\n", argv[1]);
+    return 1;
+  }
+
+  rec = bam_init1();
+  while (samread(fp, rec) >= 0) {
+    fetch_func(rec, fp);
+  }
+  bam_destroy1(rec);
+  samclose(fp);
+  return 0;
+}
diff --git a/pRSEM/idrCode/README.txt b/pRSEM/idrCode/README.txt
new file mode 100644
index 0000000..fae2ba0
--- /dev/null
+++ b/pRSEM/idrCode/README.txt
@@ -0,0 +1,146 @@
+pliu 20150607
+genome_table.txt and genome_tables/ are removed. The same info can be obtained
+from rsem-prepare-reference
+
+===========================
+README for consistency analysis of peak calling on replicates
+Qunhua Li and Anshul Kundaje (Oct,2010)
+===========================
+This set of programs is used for consistency analysis of peak calling results on multiple replicates of a dataset
+
+================
+DEPENDENCIES
+================
+unix, R version 2.9 or higher
+
+================
+FILES:
+================
+batch-consistency-analysis.r : for pairwise IDR analysis of replicates
+batch-consistency-plot.r: for creating diagnostic and IDR plots
+functions-all-clayton-12-13.r: helper function
+genome_table.txt: This file MUST contain the size of each chromosome of the genome of the organism that the peak files are referring to
+
+================
+INPUT FILE FORMATS
+================
+(1) genome_table.txt
+It contains two space delimited fields
+Col1: chromosome name (These MUST match the chromosome names in the peak files)
+Col2: chromosome size (in bp)
+
+(2) Peak Files
+Peak files MUST be in narrowPeak format (and unzipped ... the code currently doesn't handle gzipped peak files directly)
+
+NarrowPeak files are in BED6+4 format. It consists of 10 tab-delimited columns
+
+chrom string Name of the chromosome
+chromStart int The starting position of the feature in the chromosome. The first base in a chromosome is numbered 0.
+chromEnd int The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature.
+ For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+name string Name given to a region (preferably unique). Use '.' if no name is assigned.
+score int Indicates how dark the peak will be displayed in the browser (1-1000). If '0', the DCC will assign this based on signal value. Ideally average signalValue per base spread between 100-1000.
+strand char +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
+signalValue float Measurement of overall (usually, average) enrichment for the region.
+pValue float Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
+qValue float Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
+peak int Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
+
+*NOTE*: the p-value and q-value columns MUST be in -log10() scale
+
+The narrowPeak format has 3 columns that can be used to rank peaks
+(signal.value, p.value (-log_10) and q.value (-log_10)).
+The peak summit column must have values relative to the start coordinate of the peaks.
+You can use any of these columns, but make sure that whichever measure you use to rank peaks is relatively continuous, without too many ties.
+e.g. For the SPP peak caller it is recommended to use signal.value column
+e.g. PeakSeq peak caller has relatively continuous q.values without too many ties. So for PeakSeq it is better to use q.value
+
+================
+RUNNING INSTRUCTIONS
+================
+First make sure the genome_table.txt file contains the appropriate chromosome names and sizes. If not replace the contents of this file. Make sure the file continues to be named 'genome_table.txt'.
+The file name is currently hardcoded. We will change this in the next release of the code.
+
+(1) batch-consistency-analysis.r
+
+This is used to run pairwise consistency analysis on a pair of replicate peak files
+
+----------------
+GENERAL USAGE:
+----------------
+Rscript batch-consistency-analysis.r [peakfile1] [peakfile2] [peak.half.width] [outfile.prefix] [min.overlap.ratio] [is.broadpeak] [ranking.measure]
+
+Typical usage for SPP peak caller peaks
+Rscript batch-consistency-analysis.r [peakfile1] [peakfile2] -1 [outfile.prefix] 0 F q.value
+
+Typical usage for MACS peak caller peaks
+Rscript batch-consistency-analysis.r [peakfile1] [peakfile2] 200 [outfile.prefix] 0 F p.value
+
+[peakfile1] and [peakfile2] are the peak calls for the pair of replicates in narrowPeak format. They must be uncompressed files.
+e.g. /peaks/reps/chipSampleRep1_VS_controlSampleRep0.narrowPeak AND
+ /peaks/reps/chipSampleRep2_VS_controlSampleRep0.narrowPeak
+
+[peak.half.width]: Set this to -1 if you want to use the reported peak width in the peak files.
+If you want to truncate peak widths to say 400 bp max then use a value of 200.
+
+[outfile.prefix] is a prefix that will be used to name the output data for this pair of replicates.
+The prefix must also include the PATH to the directory where you want to store the output data.
+e.g. /consistency/reps/chipSampleRep1_VS_chipSampleRep2
+
+[min.overlap.ratio]: fractional bp overlap (ranges from 0 to 1) between peaks in replicates to be considered as overlapping peaks.
+Set to 0 if you want to allow overlap to be defined as >= 1 bp overlap.
+If set to, say, 0.5, this would mean that at least 50% of the peak in one replicate should be covered by a peak in the other replicate to count as an overlap.
+
+[is.broadpeak]: Is the peak file format narrowPeak or broadPeak. Set to F if it is narrowPeak/regionPeak or T if it is broadPeak.
+
+[ranking.measure] is the ranking measure to use. It can take only one of the following values
+signal.value , p.value or q.value
+
+OUTPUT:
+The results will be written to the directory contained in [outfile.prefix]
+a. The output from EM fitting: suffixed by -em.sav
+b. The output for plotting empirical curves: suffixed by -uri.sav
+ Note: a and b are objects that can be loaded back into R for plotting or other purposes (e.g. to retrieve data)
+c. The parameters estimated from EM and the log of consistency analysis, suffixed by -Rout.txt
+d. The number of peaks that pass specific IDR thresholds for the pairwise analysis: suffixed by npeaks-aboveIDR.txt
+e. The full set of peaks that overlap between the replicates with local and global IDR scores: suffixed by overlapped-peaks.txt
+
+
+(2) batch-consistency-plot.r
+
+This is used to plot the IDR plots and diagnostic plots for a single or multiple pairs of replicates.
+
+----------------
+GENERAL USAGE:
+----------------
+Rscript batch-consistency-plot.r [npairs] [output.prefix] [input.file.prefix1] [input.file.prefix2] [input.file.prefix3] ....
+
+[npairs] is the number of pairs of replicates that you want to plot on the same plot
+e.g. 1 or 3 or ...
+
+[output.prefix] is a prefix that will be used to name output data from this analysis.
+NOT TO BE CONFUSED with [outfile.prefix] in batch-consistency-analysis.r
+The prefix must also include the PATH to the directory where you want to store the output data.
+e.g. /consistency/plots/chipSampleAllReps
+
+[input.file.prefix 1, 2, 3 ...] are the [outfile.prefix] values used to name the output from pairwise analysis on all replicates
+e.g. /consistency/reps/chipSampleRep1_VS_chipSampleRep2
+ /consistency/reps/chipSampleRep1_VS_chipSampleRep3
+ /consistency/reps/chipSampleRep2_VS_chipSampleRep3
+
+OUTPUT:
+1. summary consistency plots in .ps format: suffixed by -plot.ps
+These plots are very informative about the quality and similarity of the replicates.
+
+===================================================
+GETTING NUMBER OF PEAKS THAT PASS AN IDR THRESHOLD
+===================================================
+For each pairwise analysis, we have a *overlapped-peaks.txt file
+
+The last column (Column 11) of the overlapped-peaks.txt file has the global IDR score for each pair of overlapping peaks
+To get the number of peaks that pass an IDR threshold of T (e.g. 0.01) you simply find the number of lines that have a global IDR score <= T
+
+awk '$11 <= 0.01 {print $0}' [overlappedPeaksFileName] | wc -l
+
+
+
diff --git a/pRSEM/idrCode/batch-consistency-analysis.r b/pRSEM/idrCode/batch-consistency-analysis.r
new file mode 100644
index 0000000..1adc19f
--- /dev/null
+++ b/pRSEM/idrCode/batch-consistency-analysis.r
@@ -0,0 +1,164 @@
+# modified 06/07/2015 pliu
+#
+# modified 3-29-10: Qunhua Li
+# add 2 columns in the output of "-overlapped-peaks.txt": local.idr and IDR
+
+# 01-20-2010 Qunhua Li
+#
+# This program performs consistency analysis for a pair of peak calling outputs
+# It takes narrowPeak or broadPeak formats.
+#
+# usage: Rscript batch-consistency-analysis.r peakfile1 peakfile2 half.width outfile.prefix overlap.ratio is.broadpeak sig.value idr.code.path chr.file
+#
+# peakfile1 and peakfile2 : the output from peak callers in narrowPeak or broadPeak format
+# half.width: -1 if using the reported peak width,
+# a numerical value to truncate the peaks to
+# outfile.prefix: prefix of output file
+# overlap.ratio: a value between 0 and 1. It controls how much overlaps two peaks need to have to be called as calling the same region. It is the ratio of overlap / short peak of the two. When setting at 0, it means as long as overlapped width >=1bp, two peaks are deemed as calling the same region.
+# is.broadpeak: a logical value. If broadpeak is used, set as T; if narrowpeak is used, set as F
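+# sig.value: type of significant values, "q.value", "p.value" or "signal.value" (default, i.e. fold of enrichment)
+# idr.code.path: directory containing functions-all-clayton-12-13.r, which is sourced by this script
+# chr.file: table of chromosome sizes, read with read.table() (replaces the previously hard-coded genome_table.txt)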
+
+args <- commandArgs(trailingOnly=T)
+
+# consistency between peakfile1 and peakfile2
+#input1.dir <- args[1]
+#input2.dir <- args[2] # directories of the two input files
+peakfile1 <- args[1]
+peakfile2 <- args[2]
+
+if(as.numeric(args[3])==-1){ # enter -1 when using the reported length
+ half.width <- NULL
+}else{
+ half.width <- as.numeric(args[3])
+}
+
+
+output.prefix <- args[4]
+# command-line arguments are character strings; convert so that overlap.ratio
+# is the numeric value (between 0 and 1) described in the header, and is not
+# compared as text further down
+overlap.ratio <- as.numeric(args[5])
+
+if(args[6] == "T"){
+ is.broadpeak <- T
+}else{
+ is.broadpeak <- F
+}
+
+sig.value <- args[7]
+
+
+#dir1 <- "~/ENCODE/anshul/data/"
+#dir2 <- dir1
+#peakfile1 <- "../data/SPP.YaleRep1Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak"
+#peakfile2 <- "../data/SPP.YaleRep3Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak"
+#half.width <- NULL
+#overlap.ratio <- 0.1
+#sig.value <- "signal.value"
+
+## pliu
+idr_code_path <- args[8]
+source(paste0(idr_code_path, "/functions-all-clayton-12-13.r"))
+#source("functions-all-clayton-12-13.r")
+##
+
+# read the length of the chromosomes, which will be used to concatenate chr's
+## pliu 20150607
+#chr.file <- "genome_table.txt"
+chr.file <- args[9]
+##
+
+chr.size <- read.table(chr.file)
+
+
+sink(paste(output.prefix, "-Rout.txt", sep=""))
+
+############# process the data
+cat("is.broadpeak", is.broadpeak, "\n")
+# process data, summit: the representation of the location of summit
+rep1 <- process.narrowpeak(paste(peakfile1, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak)
+rep2 <- process.narrowpeak(paste(peakfile2, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak)
+
+cat(paste("read", peakfile1, ": ", nrow(rep1$data.ori), "peaks\n", nrow(rep1$data.cleaned), "peaks are left after cleaning\n", peakfile2, ": ", nrow(rep2$data.ori), "peaks\n", nrow(rep2$data.cleaned), " peaks are left after cleaning"))
+
+if(args[3]==-1){
+ cat(paste("half.width=", "reported", "\n"))
+}else{
+ cat(paste("half.width=", half.width, "\n"))
+}
+cat(paste("significant measure=", sig.value, "\n"))
+
+# compute correspondence profile (URI)
+uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned, sig.value1=sig.value, sig.value2=sig.value, overlap.ratio=overlap.ratio)
+
+#uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned)
+
+cat(paste("URI is done\n"))
+
+# save output
+save(uri.output, file=paste(output.prefix, "-uri.sav", sep=""))
+cat(paste("URI is saved at: ", output.prefix, "-uri.sav \n", sep=""))
+
+
+# EM procedure for inference
+em.output <- fit.em(uri.output$data12.enrich, fix.rho2=T)
+
+#em.output <- fit.2copula.em(uri.output$data12.enrich, fix.rho2=T, "gaussian")
+
+cat(paste("EM is done\n\n"))
+
+
+save(em.output, file=paste(output.prefix, "-em.sav", sep=""))
+cat(paste("EM is saved at: ", output.prefix, "-em.sav \n", sep=""))
+
+
+# write em output into a file
+
+cat(paste("EM estimation for the following files\n", peakfile1, "\n", peakfile2, "\n", sep=""))
+
+print(em.output$em.fit$para)
+
+# add on 3-29-10
+# output both local idr and IDR
+idr.local <- 1-em.output$em.fit$e.z
+IDR <- c()
+o <- order(idr.local)
+IDR[o] <- cumsum(idr.local[o])/c(1:length(o))
+
+
+write.out.data <- data.frame(chr1=em.output$data.pruned$sample1[, "chr"],
+ start1=em.output$data.pruned$sample1[, "start.ori"],
+ stop1=em.output$data.pruned$sample1[, "stop.ori"],
+ sig.value1=em.output$data.pruned$sample1[, "sig.value"],
+ chr2=em.output$data.pruned$sample2[, "chr"],
+ start2=em.output$data.pruned$sample2[, "start.ori"],
+ stop2=em.output$data.pruned$sample2[, "stop.ori"],
+ sig.value2=em.output$data.pruned$sample2[, "sig.value"],
+ idr.local=1-em.output$em.fit$e.z, IDR=IDR)
+
+write.table(write.out.data, file=paste(output.prefix, "-overlapped-peaks.txt", sep=""))
+cat(paste("Write overlapped peaks and local idr to: ", output.prefix, "-overlapped-peaks.txt\n", sep=""))
+
+# number of peaks passing IDR range (0.01-0.25)
+IDR.cutoff <- seq(0.01, 0.25, by=0.01)
+idr.o <- order(write.out.data$idr.local)
+idr.ordered <- write.out.data$idr.local[idr.o]
+IDR.sum <- cumsum(idr.ordered)/c(1:length(idr.ordered))
+
+IDR.count <- c()
+n.cutoff <- length(IDR.cutoff)
+for(i in 1:n.cutoff){
+ IDR.count[i] <- sum(IDR.sum <= IDR.cutoff[i])
+}
+
+
+# write the number of peaks passing various IDR range into a file
+idr.cut <- data.frame(peakfile1, peakfile2, IDR.cutoff=IDR.cutoff, IDR.count=IDR.count)
+write.table(idr.cut, file=paste(output.prefix,"-npeaks-aboveIDR.txt", sep=""), append=T, quote=F, row.names=F, col.names=F)
+cat(paste("Write number of peaks above IDR cutoff [0.01, 0.25]: ","npeaks-aboveIDR.txt\n", sep=""))
+
+mar.mean <- get.mar.mean(em.output$em.fit)
+
+cat(paste("Marginal mean of two components:\n"))
+print(mar.mean)
+
+sink()
+
+
diff --git a/pRSEM/idrCode/batch-consistency-plot-merged2.r b/pRSEM/idrCode/batch-consistency-plot-merged2.r
new file mode 100644
index 0000000..7dadeb6
--- /dev/null
+++ b/pRSEM/idrCode/batch-consistency-plot-merged2.r
@@ -0,0 +1,213 @@
+# 1-20-10 Qunhua Li
+#
+# This program first plots correspondence curve and IDR threshold plot
+# (i.e. number of selected peaks vs IDR) for each pair of sample
+#
+# It then performs consistency analysis on merged data
+# It takes the parameters estimated from pairwise consistency analysis, and
+# use the same parameters to determine threshold on the merged data
+#
+# usage:
+# Rscript batch-consistency-plot-merged2.r [npairs] [output.dir] [input.file.prefix 1, 2, 3 ...] [half.width] [idr.level] [significant measure] [pooled.filename] [write out option] [overlap.ratio] [is.broadpeak]
+# [npairs]: integer, number of consistency analyses
+# (e.g. if 2 replicates, npairs=1; if 3 replicates, npairs=3)
+# [output.dir]: output directory for plot
+# [input.file.prefix 1, 2, 3]: prefix for the output from batch-consistency-analysis2. They are the input files for merged analysis see below for examples (i.e. saved.file.prefix). It can be multiple files
+#
+# The parameters below are for processing merged data
+# [half.width]: -1 if using reported interval
+# [idr.level]: threshold for idr
+# [significant measure]: choose from "p.value", "q.value" or "signal.value"
+# [pooled.filename]: peak caller output in narrowpeak or broadpeak format
+# [write out option]: logical, T: write out selected peaks in merged data, F: not write out
+# [overlap.ratio]: minimum overlap for two peaks to be called as calling the
+# same region. A numerical value between 0 and 1. If 0, minimum overlap
+# is >=1bp.
+# [is.broadpeak]: a logical value. If broadpeak is used, set as T;
+# if narrowpeak is used, set as F
+
+# Positional command-line arguments (everything after the script name).
+args <- commandArgs(trailingOnly=T)
+
+# npair is kept as the raw argument string here; it is converted with
+# as.numeric() further down (ntemp) where arithmetic on it is needed.
+npair <- args[1] # number of curves to plot on the same figure
+output.file.prefix <- args[2] # file name for plot, generated from script at the outer level
+
+# df handed to get.uri.matched() for the matched-peak curves
+df.txt <- 10
+
+## examples for debugging
+#npair <- 3
+#output.file.prefix <- "~/ENCODE/anshul/results/gm12878-cfos-YALE-combined-threshold/consistency-plot"
+#combofile <- "~/ENCODE/anshul/data/SPP.YaleGm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak"
+#saved.file.prefix <- list()
+#saved.file.prefix[[1]] <- "~/ENCODE/anshul/results/gm12878-cfos-YALE-combined-threshold/SPP.YaleRep1Gm12878Cfos.VS.SPP.YaleRep2Gm12878Cfos"
+#saved.file.prefix[[2]] <- "~/ENCODE/anshul/results/gm12878-cfos-YALE-combined-threshold/SPP.YaleRep1Gm12878Cfos.VS.SPP.YaleRep3Gm12878Cfos"
+#saved.file.prefix[[3]] <- "~/ENCODE/anshul/results/gm12878-cfos-YALE-combined-threshold/SPP.YaleRep2Gm12878Cfos.VS.SPP.YaleRep3Gm12878Cfos"
+
+#npair <- 1
+#output.file.prefix <- "~/ENCODE/anshul/results/gm12878-pol2-YALE-combined-threshold/consistency-plot"
+#combofile <- "~/ENCODE/anshul/data/SPP.YaleGm12878Pol2.VS.Gm12878Input.PointPeak.narrowPeak"
+#saved.file.prefix <- "~/ENCODE/anshul/results/gm12878-pol2-YALE-combined-threshold/SPP.YaleRep1Gm12878Pol2.VS.SPP.YaleRep2Gm12878Pol2"
+
+
+#ori.sig.value <- "signal.value"
+# nominal.sig.value <- "q.value"
+# idr.level <- 0.05
+# half.width <- NULL
+###################
+
+# the df for plotting the smooth spline on the consistency curve
+#if(length(args)-3> npair){ # if df is specified
+# df.txt <- as.numeric(args[length(args)]) # df for plotting, default is 10
+#}else{
+# df.txt <- 10
+#}
+
+# Number of pairwise analyses as a number: the options for the pooled data
+# sit after the [npairs] input prefixes, i.e. from position 3+ntemp onwards.
+ntemp <- as.numeric(npair)
+
+###### this is needed for pooled data
+
+# [half.width]: echo the value, then translate -1 into NULL
+# (NULL means "use the reported peak interval")
+cat(as.numeric(args[3+ntemp]))
+if(as.numeric(args[3+ntemp])==-1){ # enter -1 when using the reported length
+  half.width <- NULL
+}else{
+  half.width <- as.numeric(args[3+ntemp])
+}
+
+
+idr.level <- as.numeric(args[4+ntemp]) # this is the consistency FDR, e.g. 0.05
+# a string: "signal.value", "p.value" or "q.value", for specifying which
+# significant value to use for thresholding the merged data
+ori.sig.value <- args[5+ntemp]
+
+
+# pooled data file
+combofile <- args[6+ntemp]
+is.write.out <- as.logical(args[7+ntemp])
+overlap.ratio <- as.numeric(args[8+ntemp]) # the minimum amount of overlap to be called as an overlap
+
+# Convert the flag to a logical, exactly like is.write.out above, instead of
+# passing the raw "T"/"F" string on to process.narrowpeak()
+is.broadpeak <- as.logical(args[9+ntemp])
+
+saved.file.prefix <- list() # identifier of filenames that contain the em and URI results
+
+
+# Load the helper functions; the path is relative to the working directory.
+source("functions-all-clayton-12-13.r")
+
+# Per-pair containers, filled inside the loop below (index i = pair number).
+uri.list <- list()
+uri.list.match <- list()
+ez.list <- list()
+legend.txt <- c()
+#fdr.map <- c()
+sig.map <- list()
+em.output.list <- list()
+uri.output.list <- list()
+
+# One iteration per pairwise consistency analysis.
+for(i in 1:npair){
+ # the i-th input prefix is command-line argument 2+i
+ saved.file.prefix[i] <- args[2+i]
+
+ # load() restores saved objects into the workspace; the code below relies on
+ # these two files defining `uri.output` and `em.output`
+ # (presumably written by the pairwise analysis script - verify)
+ load(paste(saved.file.prefix[i], "-uri.sav", sep=""))
+ load(paste(saved.file.prefix[i], "-em.sav", sep=""))
+
+ uri.output.list[[i]] <- uri.output
+ em.output.list[[i]] <- em.output
+
+ ez.list[[i]] <- get.ez.tt.all(em.output, uri.output.list[[i]]$data12.enrich$merge1,
+ uri.output.list[[i]]$data12.enrich$merge2, idr.level=idr.level) # reverse =T for error rate
+
+ # URI for all peaks
+ uri.list[[i]] <- uri.output$uri.n
+ # URI for matched peaks
+ uri.match <- get.uri.matched(em.output$data.pruned, df=df.txt)
+ uri.list.match[[i]] <- uri.match$uri.n
+
+ # legend entry: "<i> = <last path component of the prefix>"
+ file.name <- unlist(strsplit(as.character(saved.file.prefix[i]), "/"))
+
+ legend.txt[i] <- paste(i, "=", file.name[length(file.name)])
+ # idr.level next to the map.uv component returned by get.ez.tt.all()
+ sig.map[[i]] <- cbind(idr.level, ez.list[[i]]$map.uv)
+
+
+ # map idr computed from consistency back to the original significant measure
+ #
+ # if(is.null(nominal.sig.value)){
+ # sig.map <- cbind(idr.level, ez.list[[i]]$map.uv)
+ #} else {
+
+ # for SPP, need find the significant value based on FDR
+ # temp.map <- map.sig.value(uri.output$data12.enrich, ez.list[[i]]$map.uv, nominal.value=nominal.sig.value)
+ # sig.map <- cbind(idr.level, ez.list[[i]]$map.uv)
+ # this is the corresponding FDR mapped from the significant value
+ # you don't need this in general
+ # fdr.map <- cbind(idr.level, temp.map)
+ #}
+
+}
+
+# PostScript file that receives all consistency plots
+plot.uri.file <- paste0(output.file.prefix, "-plot.ps")
+
+cat("plot consistency plots\n")
+
+# Everything goes onto one page laid out as a 2 x 3 grid (filled by column):
+#   correspondence curves for all peaks and for matched peaks,
+#   number of selected peaks vs IDR,
+#   and an empty panel that only carries the legend.
+postscript(plot.uri.file)
+par(mfcol=c(2,3), mar=c(5,6,4,2)+0.1)
+
+plot.uri.group(uri.list, NULL, file.name=NULL, c(1:npair), title.txt="all peaks")
+plot.uri.group(uri.list.match, NULL, file.name=NULL, c(1:npair), title.txt="matched peaks")
+plot.ez.group(ez.list, plot.dir=NULL, file.name=NULL, legend.txt=c(1:npair), y.lim=c(0, 0.6))
+
+# blank panel used as a canvas for the legend
+plot(0, 1, type="n", xlim=c(0,1), ylim=c(0,1), xlab="", ylab="", xaxt="n", yaxt="n")
+legend(0, 1, legend.txt, cex=0.6)
+
+dev.off()
+
+############### consistency cutoff on the replicates #############
+
+cat("read pooled sample \n")
+##################################################
+########## now this part is for combined dataset
+##################################################
+
+# Chromosome table, read from the working directory
+chr.file <- "genome_table.txt"
+chr.size <- read.table(chr.file)
+
+# Read the pooled peak file and keep the cleaned table
+combined.ori <- process.narrowpeak(paste0(combofile), chr.size,
+                                   half.width=half.width, summit="offset",
+                                   broadpeak=is.broadpeak)$data.cleaned
+
+# Put the chosen significance measure first and call it "sig.value"
+combined <- combined.ori[, c(ori.sig.value, "start", "stop",
+                             "signal.value", "p.value", "q.value",
+                             "chr", "start.ori", "stop.ori")]
+colnames(combined) <- c("sig.value", "start", "stop",
+                        "signal.value", "p.value", "q.value",
+                        "chr", "start.ori", "stop.ori")
+combined$frac.ratio <- NA
+
+########
+# map by the matched structure
+########
+cat("Selecting peaks using parameters from consistency analysis\n")
+sig.select.method2 <- pass.structure(uri.output.list, em.output.list, combined,
+                                     idr.level=idr.level, sig.value.impute=0, chr.size)
+
+# Optionally write the selected pooled peaks as a text table
+if(is.write.out){
+  write.table(sig.select.method2$combined.selected,
+              file=paste0(output.file.prefix, "-combined.selection.txt"),
+              quote=F, row.names=F)
+}
+
+# The full selection result is always saved
+save(sig.select.method2, file=paste0(output.file.prefix, "-select.sav"))
+
+
+# Text report: everything printed between sink(<file>) and sink() goes to
+# <output.file.prefix>-Rout.txt instead of the console.
+sink(paste(output.file.prefix, "-Rout.txt", sep=""))
+cat("IDR Map for specified sig.value", "\n")
+print(sig.map)
+
+# cat("IDR Map for", nominal.sig.value, "\n")
+# print(fdr.map)
+
+
+# output for merged dataset
+# (the unit used to be printed as a truncated "p"; spell it out)
+cat("Merged dataset has ", nrow(combined), "peaks\n")
+
+cat("Apply parameters estimated from consistency analysis to merged data: select by ", ori.sig.value, "\n")
+print(sig.select.method2$npeak.stat)
+cat("Range of significant values on the selected pooled data", "\n")
+print(sig.select.method2$sig.combined)
+
+sink()
+
diff --git a/pRSEM/idrCode/batch-consistency-plot.r b/pRSEM/idrCode/batch-consistency-plot.r
new file mode 100644
index 0000000..f2fc2ee
--- /dev/null
+++ b/pRSEM/idrCode/batch-consistency-plot.r
@@ -0,0 +1,67 @@
+# 1-20-10 Qunhua Li
+#
+# This program first plots correspondence curve and IDR threshold plot
+# (i.e. number of selected peaks vs IDR) for each pair of sample
+#
+# usage:
+# Rscript batch-consistency-plot.r [npairs] [output.dir] [input.file.prefix 1, 2, 3 ...]
+# [npairs]: integer, number of consistency analyses
+# (e.g. if 2 replicates, npairs=1; if 3 replicates, npairs=3)
+# [output.dir]: output directory for plot
+# [input.file.prefix 1, 2, 3]: prefix for the output from batch-consistency-analysis2. They are the input files for merged analysis see below for examples (i.e. saved.file.prefix). It can be multiple files
+#
+
+# Positional command-line arguments (everything after the script name).
+args <- commandArgs(trailingOnly=T)
+npair <- args[1] # number of curves to plot on the same figure
+output.file.prefix <- args[2] # file name for plot, generated from script at the outer level
+# df handed to get.uri.matched() for the matched-peak curves
+df.txt <- 10
+# numeric copy of npair (not referenced again in this script)
+ntemp <- as.numeric(npair)
+saved.file.prefix <- list() # identifier of filenames that contain the em and URI results
+# helper functions; the path is relative to the working directory
+source("functions-all-clayton-12-13.r")
+
+# Per-pair containers, filled inside the loop below (index i = pair number).
+uri.list <- list()
+uri.list.match <- list()
+ez.list <- list()
+legend.txt <- c()
+em.output.list <- list()
+uri.output.list <- list()
+
+# One iteration per pairwise consistency analysis.
+for(i in 1:npair){
+ # the i-th input prefix is command-line argument 2+i
+ saved.file.prefix[i] <- args[2+i]
+
+ # load() restores saved objects into the workspace; the code below relies on
+ # these two files defining `uri.output` and `em.output`
+ # (presumably written by the pairwise analysis script - verify)
+ load(paste(saved.file.prefix[i], "-uri.sav", sep=""))
+ load(paste(saved.file.prefix[i], "-em.sav", sep=""))
+
+ uri.output.list[[i]] <- uri.output
+ em.output.list[[i]] <- em.output
+
+ ez.list[[i]] <- get.ez.tt.all(em.output, uri.output.list[[i]]$data12.enrich$merge1,
+ uri.output.list[[i]]$data12.enrich$merge2) # reverse =T for error rate
+
+ # URI for all peaks
+ uri.list[[i]] <- uri.output$uri.n
+ # URI for matched peaks
+ uri.match <- get.uri.matched(em.output$data.pruned, df=df.txt)
+ uri.list.match[[i]] <- uri.match$uri.n
+
+ # legend entry: "<i> = <last path component of the prefix>"
+ file.name <- unlist(strsplit(as.character(saved.file.prefix[i]), "/"))
+
+ legend.txt[i] <- paste(i, "=", file.name[length(file.name)])
+
+}
+
+# Path of the plot file (the same path is rebuilt in the postscript() call below)
+plot.uri.file <- paste(output.file.prefix, "-plot.ps", sep="")
+
+############# plot and report output
+# plot correspondence curve for each pair,
+# plot number of selected peaks vs IDR
+# plot all into 1 file
+postscript(paste(output.file.prefix, "-plot.ps", sep=""))
+# 2 x 3 grid of panels, filled column by column
+par(mfcol=c(2,3), mar=c(5,6,4,2)+0.1)
+plot.uri.group(uri.list, NULL, file.name=NULL, c(1:npair), title.txt="all peaks")
+plot.uri.group(uri.list.match, NULL, file.name=NULL, c(1:npair), title.txt="matched peaks")
+plot.ez.group(ez.list, plot.dir=NULL, file.name=NULL, legend.txt=c(1:npair), y.lim=c(0, 0.6))
+# empty panel that only serves as a canvas for the legend
+plot(0, 1, type="n", xlim=c(0,1), ylim=c(0,1), xlab="", ylab="", xaxt="n", yaxt="n") # legends
+legend(0, 1, legend.txt, cex=0.6)
+
+dev.off()
diff --git a/pRSEM/idrCode/functions-all-clayton-12-13.r b/pRSEM/idrCode/functions-all-clayton-12-13.r
new file mode 100644
index 0000000..2497398
--- /dev/null
+++ b/pRSEM/idrCode/functions-all-clayton-12-13.r
@@ -0,0 +1,3182 @@
+# modified by pliu
+#
+# revised on 2-20-10
+# - fix error in pass.structure: reverse rank.combined, so that big sig.value
+# are ranked with small numbers (1, 2, ...)
+# - fix error on get.ez.tt.all: get ez.cutoff from sorted e.z
+
+#
+# modified EM procedure to compute empirical CDF more precisely - 09/2009
+
+
+
+# this file contains the functions for
+# 1. computing the correspondence profile (upper rank intersection and derivatives)
+# 2. inference of copula mixture model
+#
+# It also has functions for
+# 1. reading peak caller results
+# 2. processing and matching called peaks
+# 3. plotting results
+
+
+################ read peak caller results
+
+# process narrow peak format
+# some peak callers may not report q-values, p-values or fold of enrichment
+# need further process before comparison
+#
+# stop.exclusive: Is the basepair of peak.list$stop exclusive? In narrowpeak and broadpeak format they are exclusive.
+# If it is exclusive, we need subtract peak.list$stop by 1 to avoid the same basepair being both a start and a stop of two
+# adjacent peaks, which creates trouble for finding correct intersect
+process.narrowpeak <- function(narrow.file, chr.size, half.width=NULL, summit="offset", stop.exclusive=T, broadpeak=F){
+
+  # Read a narrowPeak/broadPeak file and prepare it for overlap analysis.
+  #
+  # Arguments:
+  #   narrow.file:    path of the peak file; a 'gz'/'gzip' extension is read
+  #                   through gzfile()
+  #   chr.size:       chromosome table handed to concatenate.chr()
+  #   half.width:     NULL keeps the reported intervals; otherwise peaks wider
+  #                   than 2*half.width are cut to +/- half.width around the
+  #                   summit (or around the midpoint when summit="unknown")
+  #   summit:         "offset"  - column 10 is an offset from start
+  #                   "summit"  - column 10 is a position; start is subtracted
+  #                   "unknown" - no summit information, use the midpoint
+  #   stop.exclusive: if TRUE, stop is decremented by 1
+  #   broadpeak:      if TRUE, column 10 (summit) is not read
+  #
+  # Value (invisible): list(data.ori = table as read,
+  #                         data.cleaned = processed table)
+  #
+  # NOTE(review): with broadpeak=TRUE there is no summit column, so the
+  # summit-based trimming has nothing to work with - confirm that broadPeak
+  # input is only used with half.width=NULL or summit="unknown".
+
+  ## pliu 20150607
+  ## to read gzipped files
+  library(tools)
+  narrow.file.ext <- file_ext(narrow.file)
+  if ( narrow.file.ext %in% c('gz', 'gzip') ) {
+    aa <- read.table(gzfile(narrow.file))
+  } else {
+    aa <- read.table(narrow.file)
+  }
+
+  if(broadpeak){
+    bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9)
+  }else{
+    bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9, summit=aa$V10)
+  }
+
+  if(summit=="summit"){
+    bb.ori$summit <- bb.ori$summit-bb.ori$start # change summit to offset to avoid error when concatenating chromosomes
+  }
+
+  bb <- concatenate.chr(bb.ori, chr.size)
+
+  # Remove the peaks that have the same start and stop value. Remember which
+  # rows survive: bb.ori keeps every row, so anything copied from bb.ori into
+  # bb further down must be subset the same way.
+  is.kept <- bb$start != bb$stop
+  bb <- bb[is.kept,]
+
+  if(stop.exclusive==T){
+    bb$stop <- bb$stop-1
+  }
+
+  if(!is.null(half.width)){
+    # work on copies of the concatenated coordinates while trimming
+    bb$start.ori <- bb$start #Anshul changed this
+    bb$stop.ori <- bb$stop #Anshul changed this
+
+    # if peak is narrower than the specified window, stay with its width
+    # otherwise chop wider peaks to specified width
+    width <- bb$stop-bb$start +1
+    is.wider <- width > 2*half.width
+
+    if(summit=="offset" | summit=="summit"){ # if summit is offset from start
+      bb$start[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]-half.width
+      bb$stop[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]+half.width
+    } else {
+      if(summit=="unknown"){
+        bb$start[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) - half.width
+        bb$stop[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) + half.width
+      }
+    }
+
+    # Restore the coordinates as read from the file. They are taken from
+    # bb.ori restricted to the surviving rows; assigning the unfiltered
+    # columns failed with a row-count mismatch whenever a zero-width peak
+    # had been dropped above.
+    bb$start.ori <- bb.ori$start[is.kept] #Anshul changed this
+    bb$stop.ori <- bb.ori$stop[is.kept] #Anshul changed this
+  }
+
+  bb <- clean.data(bb)
+  invisible(list(data.ori=bb.ori, data.cleaned=bb))
+}
+
+# clean data
+# and concatenate chromosomes if needed
+clean.data <- function(adata){
+
+  # Tidy a peak table (columns start, stop) before overlaps are searched:
+  #   1. drop records whose start equals their stop
+  #   2. move back by one every stop that is also the start of some peak,
+  #      reporting how many stops were changed
+  # Returns the tidied table.
+
+  has.width <- adata$start != adata$stop
+  adata <- adata[has.width,]
+
+  shared.with.start <- adata$stop %in% adata$start
+  n.fix <- sum(shared.with.start)
+  if(n.fix > 0){
+    print(paste("Fix", n.fix, "stops\n"))
+    adata$stop[shared.with.start] <- adata$stop[shared.with.start] - 1
+  }
+
+  adata
+}
+
+# concatenate peaks
+# peaks: the dataframe to have all the peaks
+# chr.file: the file to keep the length of each chromosome
+# chr files should come from the species that the data is from
+#concatenate.chr <- function(peaks, chr.size){
+
+ # chr.size <- read.table(chr.file)
+# chr.o <- order(chr.size[,1])
+# chr.size <- chr.size[chr.o,]
+#
+# chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))
+# chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift)
+#
+# for(i in 1:nrow(chr.size)){
+# is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i])
+# if(sum(is.in)>0){
+# peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i]
+# peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i]
+# }
+# }
+#
+# invisible(peaks)
+#}
+
+
+
+
+# concatenate peaks
+# peaks: the dataframe to have all the peaks
+# chr.file: the file to keep the length of each chromosome
+# chr files should come from the species that the data is from
+concatenate.chr <- function(peaks, chr.size){
+
+  # Place all chromosomes on one axis.
+  #   peaks:    data frame with columns chr, start, stop
+  #   chr.size: table with the chromosome name in column 1 and its length in
+  #             column 2
+  # Chromosomes are laid out in the sort order of their names; each peak is
+  # shifted by the summed length of the chromosomes before its own. The
+  # incoming coordinates are kept in start.ori / stop.ori.
+  # Returns the shifted table invisibly.
+
+  sorted.size <- chr.size[order(chr.size[,1]),]
+  n.chr <- nrow(sorted.size)
+  chr.name <- as.character(sorted.size[,1])
+
+  # offset of a chromosome = total length of all chromosomes sorted before it
+  chr.offset <- cumsum(c(0, sorted.size[-n.chr,2]))
+
+  peaks$start.ori <- peaks$start
+  peaks$stop.ori <- peaks$stop
+
+  peak.chr <- as.character(peaks$chr)
+  for(k in 1:n.chr){
+    on.this.chr <- peak.chr == chr.name[k]
+    if(sum(on.this.chr) > 0){
+      peaks$start[on.this.chr] <- peaks$start[on.this.chr] + chr.offset[k]
+      peaks$stop[on.this.chr] <- peaks$stop[on.this.chr] + chr.offset[k]
+    }
+  }
+
+  invisible(peaks)
+}
+
+
+deconcatenate.chr <- function(peaks, chr.size){
+
+  # Inverse of concatenate.chr(): turn coordinates on the concatenated axis
+  # back into per-chromosome coordinates.
+  #   peaks:    data frame with concatenated start and stop
+  #   chr.size: table with the chromosome name in column 1 and its length in
+  #             column 2 (same table as used for concatenation)
+  # A peak belongs to chromosome i when its start lies in
+  # (offset[i], offset[i+1]]; for the last chromosome only the lower bound
+  # applies. start gets the offset subtracted, stop gets the offset
+  # subtracted and 1 added, and chr receives the chromosome name.
+  # Peaks that fall in no interval keep chr = NA and their coordinates.
+  # Returns the converted table invisibly.
+
+  chr.o <- order(chr.size[,1])
+  chr.size <- chr.size[chr.o,]
+  n.chr <- nrow(chr.size)
+
+  # as.character() keeps the chromosome label whether column 1 was read as
+  # character or as factor
+  chr.name <- as.character(chr.size[,1])
+  chr.shift <- cumsum(c(0, chr.size[-n.chr,2]))
+
+  peaks$chr <- rep(NA, nrow(peaks))
+
+  # classify on the untouched concatenated starts, so a peak that has already
+  # been shifted back can never be picked up by another chromosome
+  start.concat <- peaks$start
+
+  # The last chromosome is handled inside the same loop. Previously it was
+  # guarded by a test on the loop counter that could never be true, so its
+  # peaks were returned unconverted with chr = NA; seq_len() also copes with
+  # a table that lists a single chromosome.
+  for(i in seq_len(n.chr)){
+    if(i < n.chr){
+      is.in <- start.concat > chr.shift[i] & start.concat <= chr.shift[i+1]
+    } else {
+      is.in <- start.concat > chr.shift[i]
+    }
+
+    if(sum(is.in)>0){
+      peaks$start[is.in] <- peaks$start[is.in] - chr.shift[i]
+      peaks$stop[is.in] <- peaks$stop[is.in] - chr.shift[i] + 1
+      peaks$chr[is.in] <- chr.name[i]
+    }
+  }
+
+  invisible(peaks)
+}
+
+################ preprocessing peak calling output
+
+
+#
+# read two calling results and sort by peak starting locations,
+# then find overlap between peaks
+# INPUT:
+# rep1: the 1st replicate
+# rep2: the 2nd replicate
+# OUTPUT:
+# id1, id2: the labels for the identified peaks on the replicates
+find.overlap <- function(rep1, rep2){
+
+ # Label the peaks of two replicates with common IDs by walking both lists,
+ # sorted by start, in parallel (i indexes rep1, j indexes rep2).
+ # Two peaks count as overlapping when
+ # rep1$start[i] <= rep2$stop[j] && rep2$start[j] <= rep1$stop[i];
+ # overlapping peaks end up with the same ID.
+ # Returns list(id1, id2) invisibly. The IDs follow the start-sorted order of
+ # each replicate, not the order of the input rows. An ID of 0 means the peak
+ # never received a label.
+
+ o1 <- order(rep1$start)
+ rep1 <- rep1[o1,]
+
+ o2 <- order(rep2$start)
+ rep2 <- rep2[o2,]
+
+ n1 <- length(o1)
+ n2 <- length(o2)
+
+ # assign common ID to peaks
+ id1 <- rep(0, n1) # ID assigned on rep1
+ id2 <- rep(0, n2) # ID assigned on rep2
+ id <- 1 # keep track common id's
+
+ # check if two replicates overlap with each other
+ i <- 1
+ j <- 1
+
+ while(i <= n1|| j <= n2){
+
+ # && (id1[n1] ==0 || id2[n2] ==0)
+
+ # if one list runs out
+ # NOTE(review): in both "ran out" branches the index is advanced before the
+ # ID is written, so the peak at the current index is not labelled here -
+ # confirm this is intended.
+ if(i > n1 && j < n2){
+
+ j <- j+1
+ id2[j] <- id
+ id <- id +1
+ next
+ } else{
+ if(j > n2 && i < n1){
+ i <- i+1
+ id1[i] <- id
+ id <- id +1
+ next
+ } else {
+ # NOTE(review): this also stops the walk when both indices sit on the
+ # last peak of their list, i.e. before that final pair is compared -
+ # confirm this is intended.
+ if(i >= n1 && j >=n2)
+ break
+ }
+ }
+
+ # if not overlap
+
+ if(!(rep1$start[i] <= rep2$stop[j] && rep2$start[j] <= rep1$stop[i])){
+
+ # at the start of loop, when both are not assigned an ID
+ # the one locates in front is assigned first
+ if(id1[i] ==0 && id2[j]==0){
+ if(rep1$stop[i] < rep2$stop[j]){
+ id1[i] <- id
+ } else {
+ id2[j] <- id
+ }
+ } else { # in the middle of the loop, when one is already assigned
+ # The one that has not assigned gets assigned
+ # if(id1[i] ==0){ # id1[i] is not assigned
+ # id1[i] <- id
+ # } else { # id2[i] is not assigned
+ # id2[j] <- id
+ # }
+
+ # order the id according to location
+ # (the peak that ends first keeps the larger of the two current labels,
+ # the other one receives the fresh id)
+ if(rep1$stop[i] <= rep2$stop[j]){
+ id1[i] <- max(id2[j], id1[i])
+ id2[j] <- id
+ } else {
+ if(rep1$stop[i] > rep2$stop[j]){
+ id2[j] <- max(id1[i], id2[j])
+ id1[i] <- id
+ }
+ }
+
+ }
+
+ id <- id +1
+
+ } else { # if overlap
+
+ if(id1[i] == 0 && id2[j] == 0){ # not assign label yet
+ id1[i] <- id
+ id2[j] <- id
+ id <- id +1
+ } else { # one peak is already assigned label, the other is 0
+
+ id1[i] <- max(id1[i], id2[j]) # this is a way to copy the label of the assigned peak without knowing which one is already assigned
+ id2[j] <- id1[i] # syncronize the labels
+ }
+
+ }
+
+ # advance in the list whose current peak ends first
+ if(rep1$stop[i] < rep2$stop[j]){
+ i <- i+1
+ } else {
+ j <- j+1
+ }
+
+ }
+
+ invisible(list(id1=id1, id2=id2))
+
+}
+
+# Impute the missing significance value for peaks that were called on only one
+# replicate.
+# INPUT:
+# rep1, rep2: the two peak calling output
+# id1, id2: the IDs assigned by function find.overlap, vectors
+# If id1[i]==id2[j], peak i on rep1 overlaps with peak j on rep2
+# p.value.impute: the significant value to impute for the missing peaks
+# OUTPUT:
+# rep1, rep2: peaks ordered by the start locations with imputed peaks
+# id1, id2: the IDs with imputed peaks
+fill.missing.peaks <- function(rep1, rep2, id1, id2, p.value.impute){
+
+  # Give both replicates the same set of peak IDs.
+  # A peak whose ID occurs in only one replicate is copied to the end of the
+  # other replicate with sig.value, signal.value, p.value and q.value all set
+  # to p.value.impute. Both tables are then sorted by ID.
+  # Returns list(rep1, rep2, id1, id2) invisibly.
+
+  # id1/id2 refer to the replicates sorted by start
+  rep1 <- rep1[order(rep1$start),]
+  rep2 <- rep2[order(rep2$start),]
+
+  # copy of some peaks with every significance column overwritten
+  as.imputed <- function(found.peaks){
+    found.peaks$sig.value <- p.value.impute
+    found.peaks$signal.value <- p.value.impute
+    found.peaks$p.value <- p.value.impute
+    found.peaks$q.value <- p.value.impute
+    found.peaks
+  }
+
+  only.in.1 <- !(id1 %in% id2)
+  only.in.2 <- !(id2 %in% id1)
+
+  # peaks seen only on replicate 1 are appended to replicate 2 ...
+  rep2.filled <- rep2
+  id2.filled <- id2
+  if(sum(only.in.1) > 0){
+    rep2.filled <- rbind(rep2, as.imputed(rep1[only.in.1, ]))
+    id2.filled <- c(id2, id1[only.in.1])
+  }
+
+  # ... and the other way round
+  rep1.filled <- rep1
+  id1.filled <- id1
+  if(sum(only.in.2) > 0){
+    rep1.filled <- rbind(rep1, as.imputed(rep2[only.in.2, ]))
+    id1.filled <- c(id1, id2[only.in.2])
+  }
+
+  # bring both replicates into ID order
+  by.id.1 <- order(id1.filled)
+  by.id.2 <- order(id2.filled)
+
+  invisible(list(rep1=rep1.filled[by.id.1, ], rep2=rep2.filled[by.id.2, ],
+                 id1=id1.filled[by.id.1], id2=id2.filled[by.id.2]))
+}
+
+# Merge peaks with same ID on the same replicates
+# (They are generated if two peaks on rep1 map to the same peak on rep2)
+# need peak.list have 3 columns: start, stop and sig.value
+# Shared implementation of merge.peaks.best() and merge.peaks().
+#   peak.list: data frame with columns chr, start, stop, start.ori, stop.ori,
+#              sig.value, signal.value, p.value, q.value, already sorted so
+#              that peaks with the same ID are adjacent
+#   id:        ID of every row of peak.list
+#   combine:   function reducing the scores of one group to a single number
+# Every run of consecutive identical IDs becomes one output row:
+#   start, chr and start.ori come from the first peak of the run,
+#   stop, stop.ori and id from the last one,
+#   the four score columns are reduced with combine().
+# The run that ends at the last row is emitted like any other; the earlier
+# element-by-element loop stopped one row early and silently lost it.
+.merge.peaks.by.id <- function(peak.list, id, combine){
+
+  if(length(id) == 0){
+    return(invisible(data.frame()))
+  }
+
+  # first and last row of every run of identical IDs
+  run.length <- rle(as.vector(id))$lengths
+  run.last <- cumsum(run.length)
+  run.first <- run.last - run.length + 1
+
+  # reduce one score column run by run
+  combine.runs <- function(score){
+    vapply(seq_along(run.first),
+           function(k) combine(score[run.first[k]:run.last[k]]),
+           numeric(1))
+  }
+
+  data.new <- data.frame(id=id[run.last],
+                         sig.value=combine.runs(peak.list$sig.value),
+                         start=peak.list$start[run.first],
+                         stop=peak.list$stop[run.last],
+                         signal.value=combine.runs(peak.list$signal.value),
+                         p.value=combine.runs(peak.list$p.value),
+                         q.value=combine.runs(peak.list$q.value),
+                         chr=as.character(peak.list$chr[run.first]),
+                         start.ori=peak.list$start.ori[run.first],
+                         stop.ori=peak.list$stop.ori[run.last])
+  invisible(data.new)
+}
+
+# Merge peaks with same ID on the same replicates
+# (They are generated if two peaks on rep1 map to the same peak on rep2)
+# The merged peak carries the largest score of its members.
+# See .merge.peaks.by.id() for the columns peak.list needs.
+merge.peaks.best <- function(peak.list, id){
+  .merge.peaks.by.id(peak.list, id, max)
+}
+
+# Merge peaks with same ID on the same replicates
+# (They are generated if two peaks on rep1 map to the same peak on rep2)
+# The merged peak carries the mean score of its members.
+# See .merge.peaks.by.id() for the columns peak.list needs.
+merge.peaks <- function(peak.list, id){
+  .merge.peaks.by.id(peak.list, id, mean)
+}
+
+
+
+
+
+# a wrap function to fill in missing peaks, merge peaks and impute significant values
+# out1 and out2 are two peak calling outputs
+pair.peaks <- function(out1, out2, p.value.impute=0){
+
+  # Pair the peaks of two peak-calling outputs.
+  #   out1, out2:     peak tables of the two replicates
+  #   p.value.impute: score given to a peak that is missing on one replicate
+  # Steps: label overlapping peaks with common IDs, add the peaks missing on
+  # either side, then merge peaks sharing an ID within a replicate.
+  # Returns list(merge1, merge2) invisibly.
+
+  aa <- find.overlap(out1, out2)
+
+  # pass the caller's value on; it used to be replaced by a literal 0
+  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=p.value.impute)
+
+  cc1 <- merge.peaks(bb$rep1, bb$id1)
+  cc2 <- merge.peaks(bb$rep2, bb$id2)
+
+  invisible(list(merge1=cc1, merge2=cc2))
+}
+
+
+
+# overlap.ratio is a parameter to define the percentage of overlap
+# if overlap.ratio =0, 1 basepair overlap is counted as overlap
+# if overlap.ratio between 0 and 1, it is the minimum proportion of
+# overlap required to be called as a match
+# it is computed as the overlap part/min(peak1.length, peak2.length)
+pair.peaks.filter <- function(out1, out2, p.value.impute=0, overlap.ratio=0){
+
+  # Pair the peaks of two peak-calling outputs and keep only the pairs that
+  # overlap enough.
+  #   out1, out2:     peak tables of the two replicates
+  #   p.value.impute: score given to a peak that is missing on one replicate
+  #   overlap.ratio:  smallest overlap ratio (see overlap.middle) a pair needs
+  #                   to be kept
+  # Returns list(merge1, merge2) invisibly; both tables carry the overlap
+  # ratio of each pair in column frag.ratio.
+
+  aa <- find.overlap(out1, out2)
+
+  # Pass the caller's value on. It used to be replaced by a literal 0 here
+  # while the test further down compared against p.value.impute, so imputed
+  # peaks went unrecognised for any non-zero value.
+  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=p.value.impute)
+
+  cc1 <- merge.peaks(bb$rep1, bb$id1)
+  cc2 <- merge.peaks(bb$rep2, bb$id2)
+
+  # one row per pair: start/stop on replicate 1, start/stop on replicate 2
+  frag12 <- cbind(cc1$start, cc1$stop, cc2$start, cc2$stop)
+
+  frag.ratio <- apply(frag12, 1, overlap.middle)
+
+  # a pair with an imputed member has no real overlap
+  frag.ratio[cc1$sig.value==p.value.impute | cc2$sig.value==p.value.impute] <- 0
+
+  cc1$frag.ratio <- frag.ratio
+  cc2$frag.ratio <- frag.ratio
+
+  merge1 <- cc1[cc1$frag.ratio >= overlap.ratio,]
+  merge2 <- cc2[cc2$frag.ratio >= overlap.ratio,]
+
+  invisible(list(merge1=merge1, merge2=merge2))
+}
+
+# x[1], x[2] are the start and end of the first fragment
+# and x[3] and x[4] are the start and end of the 2nd fragment
+# If there are two fragments, we can find the overlap by ordering the
+# start and stop of all the ends and find the difference between the middle two
+overlap.middle <- function(x){
+
+  # x = c(start1, stop1, start2, stop2) of two fragments.
+  # Returns the distance between the two middle values of the sorted end
+  # points, divided by the length (stop - start) of the shorter fragment.
+
+  len.first <- x[2] - x[1]
+  len.second <- x[4] - x[3]
+
+  ends.sorted <- x[order(x)]
+  middle.span <- abs(ends.sorted[3] - ends.sorted[2])
+
+  middle.span/min(len.first, len.second)
+}
+
+
+
+#######
+####### compute correspondence profile
+#######
+
+# compute upper rank intersection for one t
+# tv: the upper percentile
+# x is sorted by the order of paired variable
+comp.uri <- function(tv, x){
+
+  # Upper rank intersection for one (t, v) pair.
+  #   tv: c(t, v), both fractions
+  #   x:  scores, already arranged in the order of the paired variable
+  # Counts how many of the first ceiling(n*v) entries of x reach the upper-t
+  # quantile of x and returns that count divided by n = length(x).
+
+  n.obs <- length(x)
+  upper.cut <- quantile(x, probs=1-tv[1])
+  n.leading <- ceiling(n.obs*tv[2])
+
+  sum(x[1:n.leading] >= upper.cut)/n.obs
+}
+
+# compute the correspondence profile
+# tt, vv: vector between (0, 1) for percentages
+get.uri.2d <- function(x1, x2, tt, vv, spline.df=NULL){
+
+  # Correspondence profile of two paired score vectors.
+  #   x1, x2:    paired scores
+  #   tt, vv:    vectors of fractions at which the URI is evaluated
+  #   spline.df: df for the smoothing spline (6.4 when NULL)
+  # Returns (invisibly) a list with the (t, v) grid, the URI values, a
+  # finite-difference slope on every 4th grid point, the smoothing spline of
+  # URI against t, its first derivative, the last grid index used for the fit
+  # (jump.left) and the number of peaks.
+
+  # x2 arranged so that x1 is decreasing (ties broken by x2, also decreasing)
+  x2.ordered <- x2[order(x1, x2, decreasing=TRUE)]
+
+  ntotal <- length(x1) # number of peaks
+  tv <- cbind(tt, vv)
+
+  # one URI value per row of the grid
+  uri <- apply(tv, 1, comp.uri, x=x2.ordered)
+
+  # slope of URI against t between neighbouring bins (every 4th grid point)
+  bin.at <- seq(1, length(uri), by=4)
+  uri.binned <- uri[bin.at]
+  tt.binned <- tt[bin.at]
+  n.bin <- length(uri.binned)
+  upper <- 2:n.bin
+  lower <- 1:(n.bin-1)
+  uri.slope <- (uri.binned[upper] - uri.binned[lower])/(tt.binned[upper] - tt.binned[lower])
+
+  # The spline is fitted only up to the jump of the curve. The jump is placed
+  # at the fraction of positive scores in the list that has fewer of them.
+  short.list.length <- min(sum(x1>0)/length(x1), sum(x2>0)/length(x2))
+
+  jump.left <- if(short.list.length < max(tt)){
+    which(tt>short.list.length)[1]-1 # grid index just left of the jump
+  } else {
+    which.max(tt)
+  }
+
+  # fewer than 6 points on the left of the jump: fit the whole curve instead
+  if(jump.left < 6){
+    jump.left <- length(tt)
+  }
+
+  if(is.null(spline.df)){
+    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=6.4)
+  } else {
+    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=spline.df)
+  }
+
+  # predict the first derivative
+  uri.der <- predict(uri.spl, tt[1:jump.left], deriv=1)
+
+  invisible(list(tv=tv, uri=uri,
+                 uri.slope=uri.slope, t.binned=tt.binned[upper],
+                 uri.spl=uri.spl, uri.der=uri.der, jump.left=jump.left,
+                 ntotal=ntotal))
+}
+
+
+# change the scale of uri from based on t (percentage) to n (number of peaks or basepairs)
+# this is for plotting multiple pairwise URI's on the same plot
+scale.t2n <- function(uri){
+
+  # Re-express a correspondence profile (as returned by get.uri.2d) in
+  # numbers of peaks instead of fractions: the grid, the URI values, the
+  # binned t values and the x and y of the spline are multiplied by
+  # uri$ntotal. For the derivative only x is multiplied; its y values, like
+  # uri.slope, are passed through unchanged.
+
+  n.peaks <- uri$ntotal
+
+  spl.scaled <- uri$uri.spl
+  spl.scaled$x <- uri$uri.spl$x*n.peaks
+  spl.scaled$y <- uri$uri.spl$y*n.peaks
+
+  der.scaled <- uri$uri.der
+  der.scaled$x <- uri$uri.der$x*n.peaks
+
+  list(tv=uri$tv*n.peaks,
+       uri=uri$uri*n.peaks,
+       t.binned=uri$t.binned*n.peaks,
+       uri.slope=uri$uri.slope,
+       uri.spl=spl.scaled,
+       uri.der=der.scaled,
+       ntotal=n.peaks,
+       jump.left=uri$jump.left)
+}
+
+
+
+
+# a wrapper for running URI for peaks from peak calling results
+# both data1 and data2 are calling results in narrowpeak format
+compute.pair.uri <- function(data.1, data.2, sig.value1="signal.value", sig.value2="signal.value", spline.df=NULL, overlap.ratio=0){
+
+  # Correspondence profile (URI) for two peak-calling results.
+  #   data.1, data.2:         peak tables with columns chr, start, stop,
+  #                           start.ori, stop.ori, signal.value, p.value, q.value
+  #   sig.value1, sig.value2: which column ranks the peaks of each replicate:
+  #                           "signal.value", "p.value" or "q.value"
+  #   spline.df:              df of the smoothing spline, passed to get.uri.2d
+  #   overlap.ratio:          minimum overlap for two peaks to be paired,
+  #                           passed to pair.peaks.filter
+  # Returns list(uri, uri.n, data12.enrich, sig.value1, sig.value2).
+
+  tt <- seq(0.01, 1, by=0.01)
+  vv <- tt
+
+  # Copy of a peak table in which the chosen measure is available as
+  # "sig.value". An unsupported name is rejected right here instead of
+  # surfacing later as a missing object.
+  add.sig.value <- function(peaks, measure){
+    chosen <- switch(measure,
+                     signal.value=peaks$signal.value,
+                     p.value=peaks$p.value,
+                     q.value=peaks$q.value,
+                     stop("significant measure must be \"signal.value\", \"p.value\" or \"q.value\", got: ", measure))
+    data.frame(chr=peaks$chr, start.ori=peaks$start.ori, stop.ori=peaks$stop.ori,
+               start=peaks$start, stop=peaks$stop, sig.value=chosen,
+               signal.value=peaks$signal.value, p.value=peaks$p.value, q.value=peaks$q.value)
+  }
+
+  data.1.enrich <- add.sig.value(data.1, sig.value1)
+  data.2.enrich <- add.sig.value(data.2, sig.value2)
+
+  ### by peaks
+  # pair the peaks of the two replicates; unmatched peaks are imputed with 0
+  data12.enrich <- pair.peaks.filter(data.1.enrich, data.2.enrich, p.value.impute=0, overlap.ratio)
+
+  # as.numeric(as.character()) also copes with scores stored as factor or text
+  uri <- get.uri.2d(as.numeric(as.character(data12.enrich$merge1$sig.value)), as.numeric(as.character(data12.enrich$merge2$sig.value)), tt, vv, spline.df=spline.df)
+  uri.n <- scale.t2n(uri)
+
+  return(list(uri=uri, uri.n=uri.n, data12.enrich=data12.enrich, sig.value1=sig.value1, sig.value2=sig.value2))
+}
+
+
+
+# Compute the correspondence profile (URI) for an already matched sample.
+#
+# data12: list whose sample1 and sample2 elements each carry a sig.value field
+# df: spline degrees of freedom handed to get.uri.2d
+# Returns list(uri=<profile on the t scale>, uri.n=<profile on the count scale>).
+get.uri.matched <- function(data12, df=10){
+
+  # the same grid of cutoffs (1% to 100%) is used for both replicates
+  grid <- seq(0.01, 1, by=0.01)
+
+  profile.t <- get.uri.2d(data12$sample1$sig.value, data12$sample2$sig.value, grid, grid, spline.df=df)
+
+  # change scale from t to n
+  profile.n <- scale.t2n(profile.t)
+
+  return(list(uri=profile.t, uri.n=profile.n))
+}
+
+# Translate significance cutoffs into another (nominal) score.
+#
+# map.uv: pairs of significance values (columns "sig.value1" and "sig.value2")
+#   corresponding to specified consistency FDR levels
+# data.set: the original data set, a list with merge1 and merge2, each having
+#   a sig.value column and the nominal column
+# nominal.value: name of the column to map to, say "q.value"; the column
+#   position is looked up in merge1 and reused for merge2
+#
+# For every row of map.uv the peak whose sig.value is closest to the cutoff is
+# located in merge1 (resp. merge2) and its nominal value is reported.
+# Returns (invisibly) an object shaped like map.uv[, c("sig.value1", "sig.value2")]
+# holding the nominal values.
+map.sig.value <- function(data.set, map.uv, nominal.value){
+
+  index.nominal <- which(names(data.set$merge1)==nominal.value)
+  if(length(index.nominal) == 0)
+    stop("nominal.value is not a column of data.set$merge1")
+
+  map.nominal <- rbind(map.uv[, c("sig.value1", "sig.value2")])
+
+  # seq_len() keeps the loop empty when map.uv has no rows
+  for(i in seq_len(nrow(map.uv))){
+
+    nearest1 <- which.min(abs(data.set$merge1$sig.value-map.uv[i, "sig.value1"]))
+    nearest2 <- which.min(abs(data.set$merge2$sig.value-map.uv[i, "sig.value2"]))
+
+    map.nominal[i, "sig.value1"] <- data.set$merge1[nearest1, index.nominal]
+    map.nominal[i, "sig.value2"] <- data.set$merge2[nearest2, index.nominal]
+  }
+
+  invisible(map.nominal)
+}
+
+
+############### plot correspondence profile
+
+# Plot several correspondence profiles (and their derivatives) on shared axes.
+#
+# uri.n.list: list of profiles on the count scale (as returned by scale.t2n);
+#   each must provide tv, uri, uri.spl, uri.der, t.binned, uri.slope, ntotal
+#   and jump.left
+# plot.dir, file.name: if file.name is not NULL, two postscript files are
+#   written, paste(plot.dir, "uri.", file.name) and
+#   paste(plot.dir, "duri.", file.name); otherwise the current device is used
+# legend.txt: legend labels
+# xlab.txt, ylab.txt: axis labels of the first plot (the second plot uses
+#   xlab.txt and a fixed "slope" y label)
+# col.start: offset into col.txt for the first curve
+# col.txt: colours; a default palette of 9 colours is used when NULL
+# plot.missing=F: do not plot the missing points on the right, i.e. only the
+#   first jump.left points of each profile are drawn and the axes stop at
+#   1.05 times the largest tv[jump.left, 1]
+# title.txt: optional title for both plots
+plot.uri.group <- function(uri.n.list, plot.dir, file.name=NULL, legend.txt, xlab.txt="num of significant peaks", ylab.txt="num of peaks in common", col.start=0, col.txt=NULL, plot.missing=F, title.txt=NULL){
+
+  if(is.null(col.txt))
+    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")
+
+  n <- length(uri.n.list)
+
+  ntotal <- c()
+  jump.left <- c()
+  ncommon <- c()
+  for(i in seq_len(n)){
+    ntotal[i] <- uri.n.list[[i]]$ntotal
+    jump.left[i] <- uri.n.list[[i]]$jump.left
+    # x coordinate of the last point before the jump
+    ncommon[i] <- uri.n.list[[i]]$tv[jump.left[i],1]
+  }
+
+  if(plot.missing){
+    max.peak <- max(ntotal)
+  } else {
+    max.peak <- max(ncommon)*1.05
+  }
+
+  ## first plot: the correspondence curves
+  if(!is.null(file.name)){
+    postscript(paste(plot.dir, "uri.", file.name, sep=""))
+    par(mfrow=c(1,1), mar=c(5,5,4,2))
+  }
+
+  plot(uri.n.list[[1]]$tv[,1], uri.n.list[[1]]$uri, type="n", xlab=xlab.txt, ylab=ylab.txt, xlim=c(0, max.peak), ylim=c(0, max.peak), cex.lab=2)
+
+  for(i in seq_len(n)){
+
+    if(plot.missing){
+      points(uri.n.list[[i]]$tv[,1], uri.n.list[[i]]$uri, col=col.txt[i+col.start], cex=0.5 )
+    } else {
+      points(uri.n.list[[i]]$tv[1:jump.left[i],1], uri.n.list[[i]]$uri[1:jump.left[i]], col=col.txt[i+col.start], cex=0.5)
+    }
+    lines(uri.n.list[[i]]$uri.spl, col=col.txt[i+col.start], lwd=4)
+  }
+  abline(coef=c(0,1), lty=3)
+  legend(0, max.peak, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2)
+  # test the argument, not the base function title()
+  if(!is.null(title.txt))
+    title(title.txt)
+
+  if(!is.null(file.name)){
+    dev.off()
+  }
+
+  ## second plot: the derivative (slope) curves
+  if(!is.null(file.name)){
+    postscript(paste(plot.dir, "duri.", file.name, sep=""))
+    par(mfrow=c(1,1), mar=c(5,5,4,2))
+  }
+  plot(uri.n.list[[1]]$t.binned, uri.n.list[[1]]$uri.slope, type="n", xlab=xlab.txt, ylab="slope", xlim=c(0, max.peak), ylim=c(0, 1.5), cex.lab=2)
+
+  for(i in seq_len(n)){
+    lines(uri.n.list[[i]]$uri.der, col=col.txt[i+col.start], lwd=4)
+  }
+  abline(h=1, lty=3)
+  legend(0.5*max.peak, 1.5, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2)
+
+  if(!is.null(title.txt))
+    title(title.txt)
+
+  if(!is.null(file.name)){
+    dev.off()
+  }
+
+}
+
+
+
+#######################
+####################### copula fitting for matched peaks
+#######################
+
+# estimation from mixed copula model
+
+# 4-5-09
+# A nonparametric estimation of mixed copula model
+
+
+# updated
+
+# Posterior probability of belonging to the first (consistent) component.
+#
+# p: mixing proportion of the first component
+# c1, c2: copula terms of the two components (vectors)
+# xd1, yd1: values of the marginals for the first component
+# xd2, yd2: values of the marginals for the 2nd component
+# c1*xd1*yd1 and c2*xd2*yd2 are the densities of the two components.
+#
+# Returns p*c1*xd1*yd1 / (p*c1*xd1*yd1 + (1-p)*c2*xd2*yd2), element-wise.
+get.ez <- function(p, c1, c2, xd1, yd1, xd2, yd2){
+
+  comp1 <- p*c1*xd1*yd1
+  comp2 <- (1-p)*c2*xd2*yd2
+
+  return(comp1/(comp1 + comp2))
+}
+
+# Bivariate Gaussian copula term.
+#
+# This is C_12, not the full density c = C_12 * f1 * f2: the marginals f1 and
+# f2 are estimated nonparametrically and do not depend on rho, so they are
+# not needed when optimizing over rho.
+#
+# t and s are vectors of the same length, both are percentiles
+# rho is the correlation parameter
+# Returns a vector: exp(-log(1-rho^2)/2 - rho/(2*(1-rho^2))*(rho*A - 2*B))
+# with A = qnorm(t)^2 + qnorm(s)^2 and B = qnorm(t)*qnorm(s).
+gaussian.cop.den <- function(t, s, rho){
+
+  z.t <- qnorm(t)
+  z.s <- qnorm(s)
+
+  sum.sq <- z.t^2 + z.s^2
+  cross <- z.t*z.s
+
+  log.den <- -log(1-rho^2)/2 - rho/(2*(1-rho^2))*(rho*sum.sq-2*cross)
+
+  return(exp(log.den))
+}
+
+# Clayton copula density.
+#
+# t and s are vectors of percentiles, rho is the (scalar) Clayton coefficient
+# rho > 0: returns (rho+1) * (t*s)^(-(rho+1)) * (t^(-rho) + s^(-rho) - 1)^(-(2+1/rho)),
+#          evaluated on the log scale
+# rho == 0: returns the scalar 1 (independence)
+# rho < 0: stops with an error
+clayton.cop.den <- function(t, s, rho){
+
+  if(rho < 0)
+    stop("Incorrect Clayton copula coefficient")
+
+  if(rho == 0)
+    return(1)
+
+  log.den <- log(rho+1)-(rho+1)*(log(t)+log(s))-(2+1/rho)*log(t^(-rho) + s^(-rho)-1)
+
+  return(exp(log.den))
+}
+
+
+# Estimate rho of the Gaussian copula by weighted maximum likelihood.
+#
+# t, s: percentiles of the two coordinates
+# e.z: weight of every observation
+# Returns the rho in (-0.998, 0.998) that maximizes
+# sum(e.z*log(gaussian.cop.den(t, s, rho))), found with optimize().
+mle.gaussian.copula <- function(t, s, e.z){
+
+  # weighted log-likelihood as a function of rho only
+  weighted.loglik <- function(rho)
+    sum(e.z*log(gaussian.cop.den(t, s, rho)))
+
+  # the search interval stays away from the boundary rho=+-1
+  fit <- optimize(f=weighted.loglik, interval=c(-0.998, 0.998), maximum=TRUE, tol=0.00001)
+
+  return(fit$maximum)
+}
+
+
+# Estimate the coefficient of the Clayton copula by weighted maximum likelihood.
+#
+# t, s: percentiles of the two coordinates
+# e.z: weight of every observation
+# Returns the rho in (0.1, 20) that maximizes
+# sum(e.z*log(clayton.cop.den(t, s, rho))), found with optimize().
+mle.clayton.copula <- function(t, s, e.z){
+
+  # weighted log-likelihood as a function of rho only
+  weighted.loglik <- function(rho)
+    sum(e.z*log(clayton.cop.den(t, s, rho)))
+
+  fit <- optimize(f=weighted.loglik, interval=c(0.1, 20), maximum=TRUE, tol=0.00001)
+
+  return(fit$maximum)
+}
+
+
+
+# Log-likelihood of a mixture of two Gaussian copulas with histogram marginals.
+#
+# x, y: observations (or ranks) on the two coordinates
+# p: mixing proportion of the first component
+# rho1, rho2: Gaussian copula parameters of the two components
+# x.mar, y.mar: marginal estimates; their f1 and f2 fields are the histograms
+#   of component 1 and 2 (as produced by est.mar.hist)
+# Returns sum(log(p*c1*f1(x)*f1(y) + (1-p)*c2*f2(x)*f2(y))).
+loglik.2gaussian.copula <- function(x, y, p, rho1, rho2, x.mar, y.mar){
+
+  # pdf and cdf of every observation under each component
+  fx1 <- get.pdf.cdf(x, x.mar$f1)
+  fx2 <- get.pdf.cdf(x, x.mar$f2)
+  fy1 <- get.pdf.cdf(y, y.mar$f1)
+  fy2 <- get.pdf.cdf(y, y.mar$f2)
+
+  cop1 <- gaussian.cop.den(fx1$cdf, fy1$cdf, rho1)
+  cop2 <- gaussian.cop.den(fx2$cdf, fy2$cdf, rho2)
+
+  mixture <- p*cop1*fx1$pdf*fy1$pdf + (1-p)*cop2*fx2$pdf*fy2$pdf
+
+  sum(log(mixture))
+}
+
+# Log-likelihood of a mixture of two copulas (both Gaussian or both Clayton)
+# with histogram marginals.
+#
+# x, y: observations (or ranks) on the two coordinates
+# p: mixing proportion of the first component
+# rho1, rho2: copula parameters of the two components
+# x.mar, y.mar: marginal estimates; their f1 and f2 fields are the histograms
+#   of component 1 and 2 (as produced by est.mar.hist)
+# copula.txt: "gaussian" or "clayton"; anything else is an error
+# Returns sum(log(p*c1*f1(x)*f1(y) + (1-p)*c2*f2(x)*f2(y))).
+loglik.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){
+
+  # evaluate the marginals from the arguments; the previous body read an
+  # object named pdf.cdf that is neither an argument nor a local
+  px.1 <- get.pdf.cdf(x, x.mar$f1)
+  px.2 <- get.pdf.cdf(x, x.mar$f2)
+  py.1 <- get.pdf.cdf(y, y.mar$f1)
+  py.2 <- get.pdf.cdf(y, y.mar$f2)
+
+  if(copula.txt=="gaussian"){
+    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
+    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
+  } else if(copula.txt=="clayton"){
+    c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
+    c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
+  } else {
+    stop("copula.txt must be \"gaussian\" or \"clayton\"")
+  }
+
+  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
+}
+
+
+# Estimate the marginals of each mixture component using a weighted histogram
+# estimator (used inside the EM iterations).
+#
+# x: observations
+# e.z: weight of each observation for component 1; component 2 uses 1-e.z
+# breaks: bin boundaries; length(breaks)-1 bins are formed
+#
+# Returns a list with
+#   f1, f2: per-bin histograms of component 1 and 2, each a list with
+#           breaks, density (one value per bin) and cdf (the cumulative
+#           mass before each bin)
+#   f1.value, f2.value: pdf and cdf evaluated at every element of x; entries
+#           for observations that fall in no bin stay NA
+est.mar.hist <- function(x, e.z, breaks){
+
+ binwidth <- c()
+ nbin <- length(breaks)-1
+ nx <- length(x)
+
+ # the histogram
+ x1.pdf <- c()
+ x2.pdf <- c()
+ x1.cdf <- c()
+ x2.cdf <- c()
+
+ # the pdf for each point
+ x1.pdf.value <- rep(NA, nx)
+ x2.pdf.value <- rep(NA, nx)
+
+ # the cdf for each point
+ x1.cdf.value <- rep(NA, nx)
+ x2.cdf.value <- rep(NA, nx)
+
+ for(i in 1:nbin){
+
+ binwidth[i] <- breaks[i+1] - breaks[i]
+ # bins are closed on the left and open on the right, except the last
+ # bin, which also includes its right boundary
+ if(i < nbin)
+ in.bin <- x>= breaks[i] & x < breaks[i+1]
+ else # last bin
+ in.bin <- x>= breaks[i] & x <=breaks[i+1]
+
+ # each bin add one observation to avoid empty bins
+ # multiple (nx+nbin)/(nx+nbin+1) to avoid blowup when looking up for
+ # quantiles
+ x1.pdf[i] <- (sum(e.z[in.bin])+1)/(sum(e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1)
+ x2.pdf[i] <- (sum(1-e.z[in.bin])+1)/(sum(1-e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1)
+
+
+# x1.pdf[i] <- sum(e.z[in.bin])/sum(e.z)/binwidth[i]*nx/(nx+1)
+# x2.pdf[i] <- sum(1-e.z[in.bin])/sum(1-e.z)/binwidth[i]*nx/(nx+1)
+
+# treat each bin as a value for a discrete variable
+# x1.cdf[i] <- sum(x1.pdf[1:i]*binwidth[1:i])
+# x2.cdf[i] <- sum(x2.pdf[1:i]*binwidth[1:i])
+
+
+ # cumulative density before reaching i
+ if(i>1){
+ x1.cdf[i] <- sum(x1.pdf[1:(i-1)]*binwidth[1:(i-1)])
+ x2.cdf[i] <- sum(x2.pdf[1:(i-1)]*binwidth[1:(i-1)])
+ } else{
+ x1.cdf[i] <- 0
+ x2.cdf[i] <- 0
+ }
+
+ # make a vector of nx to store the values of pdf and cdf for each x
+ # this will speed up the computation dramatically
+ x1.pdf.value[in.bin] <- x1.pdf[i]
+ x2.pdf.value[in.bin] <- x2.pdf[i]
+
+ # cdf at a point = mass before the bin + density * distance into the bin
+ x1.cdf.value[in.bin] <- x1.cdf[i] + x1.pdf[i]*(x[in.bin]-breaks[i])
+ x2.cdf.value[in.bin] <- x2.cdf[i] + x2.pdf[i]*(x[in.bin]-breaks[i])
+ }
+
+# x1.cdf <- cumsum(x1.pdf*binwidth)
+# x2.cdf <- cumsum(x2.pdf*binwidth)
+
+ f1 <-list(breaks=breaks, density=x1.pdf, cdf=x1.cdf)
+ f2 <-list(breaks=breaks, density=x2.pdf, cdf=x2.cdf)
+
+ f1.value <- list(pdf=x1.pdf.value, cdf=x1.cdf.value)
+ f2.value <- list(pdf=x2.pdf.value, cdf=x2.cdf.value)
+
+ return(list(f1=f1, f2=f2, f1.value=f1.value, f2.value=f2.value))
+}
+
+# Estimate the marginal cdf of each component from ranks.
+#
+# x: observations
+# conf.z: component indicator; entries equal to 1 form component 1, entries
+#   equal to 0 form component 2
+# Returns list(cdf1, cdf2): within each group, rank/(group size + 1).
+est.cdf.rank <- function(x, conf.z){
+
+  group1 <- x[conf.z==1]
+  group2 <- x[conf.z==0]
+
+  # dividing by size+1 (not size) keeps the largest value below 1
+  return(list(cdf1=rank(group1)/(length(group1)+1),
+              cdf2=rank(group2)/(length(group2)+1)))
+}
+
+# Look up the histogram density at a single value.
+#
+# df is a density function with fields: density, cdf and breaks, x is a scalar
+# The bin is found from the first break that is >= x, so a value equal to an
+# interior break is assigned to the bin on its left; a value at (or below)
+# the first break uses the first bin. A message is printed when x lies below
+# the first break.
+get.pdf <- function(x, df){
+
+  if(x < df$breaks[1])
+    cat("x is out of the range of df\n")
+
+  # position of the first break at or above x
+  first.above <- match(TRUE, df$breaks >= x)
+
+  bin <- if(first.above==1) 1 else first.above-1
+
+  return(df$density[bin])
+}
+
+# Get the cdf from the histogram estimator for a single value.
+#
+# df has fields density, cdf and breaks; x is a scalar.
+# Returns the df$cdf entry of the bin containing x (the bin is located from
+# the first break that is >= x); no interpolation inside the bin is done.
+get.cdf <- function(x, df){
+
+  # position of the first break at or above x
+  first.above <- match(TRUE, df$breaks >= x)
+
+  bin <- if(first.above==1) 1 else first.above-1
+
+  return(df$cdf[bin])
+}
+
+# Evaluate a histogram estimate at every element of a vector.
+#
+# x.vec: values to look up
+# df is a density function with fields: density, cdf and breaks
+# Returns list(cdf=..., pdf=...) obtained by applying get.cdf and get.pdf to
+# each element of x.vec.
+get.pdf.cdf <- function(x.vec, df){
+
+  pdf.at.x <- sapply(x.vec, function(xi) get.pdf(xi, df=df))
+  cdf.at.x <- sapply(x.vec, function(xi) get.cdf(xi, df=df))
+
+  return(list(cdf=cdf.at.x, pdf=pdf.at.x))
+}
+
+# E-step for the mixture of two Gaussian copulas.
+#
+# x and y are the original observations or ranks
+# p is the mixing proportion of the first component
+# rho1 and rho2 are the parameters of each copula
+# x.mar and y.mar hold the marginals; their f1 and f2 fields are the
+#   histograms of component 1 and 2
+# Returns the posterior probability of the first component for every point.
+e.step.2gaussian <- function(x, y, p, rho1, rho2, x.mar, y.mar){
+
+  # pdf and cdf of every observation under each component's histogram
+  fx1 <- get.pdf.cdf(x, x.mar$f1)
+  fx2 <- get.pdf.cdf(x, x.mar$f2)
+  fy1 <- get.pdf.cdf(y, y.mar$f1)
+  fy2 <- get.pdf.cdf(y, y.mar$f2)
+
+  cop1 <- gaussian.cop.den(fx1$cdf, fy1$cdf, rho1)
+  cop2 <- gaussian.cop.den(fx2$cdf, fy2$cdf, rho2)
+
+  return(get.ez(p, c1=cop1, c2=cop2, xd1=fx1$pdf, yd1=fy1$pdf, xd2=fx2$pdf, yd2=fy2$pdf))
+}
+
+# E-step for a mixture of two copulas of the same family.
+#
+# x and y are the original observations or ranks
+# p is the mixing proportion of the first component
+# rho1 and rho2 are the parameters of each copula
+# x.mar and y.mar hold the marginals (fields f1 and f2 per component)
+# copula.txt selects the family: "gaussian" or "clayton"
+# Returns the posterior probability of the first component for every point.
+e.step.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){
+
+  # pdf and cdf of every observation under each component's histogram
+  fx1 <- get.pdf.cdf(x, x.mar$f1)
+  fx2 <- get.pdf.cdf(x, x.mar$f2)
+  fy1 <- get.pdf.cdf(y, y.mar$f1)
+  fy2 <- get.pdf.cdf(y, y.mar$f2)
+
+  if(copula.txt=="gaussian"){
+    cop1 <- gaussian.cop.den(fx1$cdf, fy1$cdf, rho1)
+    cop2 <- gaussian.cop.den(fx2$cdf, fy2$cdf, rho2)
+  } else if(copula.txt=="clayton"){
+    cop1 <- clayton.cop.den(fx1$cdf, fy1$cdf, rho1)
+    cop2 <- clayton.cop.den(fx2$cdf, fy2$cdf, rho2)
+  }
+
+  return(get.ez(p, c1=cop1, c2=cop2, xd1=fx1$pdf, yd1=fy1$pdf, xd2=fx2$pdf, yd2=fy2$pdf))
+}
+
+
+
+
+# M-step for the mixture of two Gaussian copulas.
+#
+# x, y: observations or ranks
+# e.z: posterior probability of the first component for every point
+# breaks: bin boundaries of the histogram marginals
+# Returns list(p, rho1, rho2, x.mar, y.mar): the updated mixing proportion,
+# the two copula parameters and the re-estimated marginals.
+m.step.2gaussian <- function(x, y, e.z, breaks){
+
+  # re-estimate the histogram marginals of both components
+  x.mar <- est.mar.hist(x, e.z, breaks)
+  y.mar <- est.mar.hist(y, e.z, breaks)
+
+  fx1 <- get.pdf.cdf(x, x.mar$f1)
+  fx2 <- get.pdf.cdf(x, x.mar$f2)
+  fy1 <- get.pdf.cdf(y, y.mar$f1)
+  fy2 <- get.pdf.cdf(y, y.mar$f2)
+
+  # component 1 is weighted by e.z, component 2 by 1-e.z
+  rho1.hat <- mle.gaussian.copula(fx1$cdf, fy1$cdf, e.z)
+  rho2.hat <- mle.gaussian.copula(fx2$cdf, fy2$cdf, 1-e.z)
+
+  return(list(p=sum(e.z)/length(e.z), rho1=rho1.hat, rho2=rho2.hat, x.mar=x.mar, y.mar=y.mar))
+}
+
+# M-step for a mixture of two copulas of the same family.
+#
+# x, y: observations or ranks
+# e.z: posterior probability of the first component for every point
+# breaks: bin boundaries of the histogram marginals
+# copula.txt: "gaussian" or "clayton"
+# Returns list(p, rho1, rho2, x.mar, y.mar).
+m.step.2copula <- function(x, y, e.z, breaks, copula.txt){
+
+  # re-estimate the histogram marginals of both components
+  x.mar <- est.mar.hist(x, e.z, breaks)
+  y.mar <- est.mar.hist(y, e.z, breaks)
+
+  fx1 <- get.pdf.cdf(x, x.mar$f1)
+  fx2 <- get.pdf.cdf(x, x.mar$f2)
+  fy1 <- get.pdf.cdf(y, y.mar$f1)
+  fy2 <- get.pdf.cdf(y, y.mar$f2)
+
+  # component 1 is weighted by e.z, component 2 by 1-e.z
+  if(copula.txt=="gaussian"){
+    rho1 <- mle.gaussian.copula(fx1$cdf, fy1$cdf, e.z)
+    rho2 <- mle.gaussian.copula(fx2$cdf, fy2$cdf, 1-e.z)
+  } else if(copula.txt=="clayton"){
+    rho1 <- mle.clayton.copula(fx1$cdf, fy1$cdf, e.z)
+    rho2 <- mle.clayton.copula(fx2$cdf, fy2$cdf, 1-e.z)
+  }
+
+  return(list(p=sum(e.z)/length(e.z), rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar))
+}
+
+
+
+# E-step (Gaussian copulas) working on precomputed pdf/cdf values.
+#
+# x and y are accepted for symmetry with the other E-steps but are not read
+# p is the mixing proportion of the first component
+# rho1 and rho2 are the parameters of each copula
+# pdf.cdf is a list with px.1, px.2, py.1 and py.2, each holding the pdf and
+#   cdf of every observation under component 1 / 2
+# Returns the posterior probability of the first component for every point.
+e.step.2gaussian.value <- function(x, y, p, rho1, rho2, pdf.cdf){
+
+  fx1 <- pdf.cdf$px.1
+  fx2 <- pdf.cdf$px.2
+  fy1 <- pdf.cdf$py.1
+  fy2 <- pdf.cdf$py.2
+
+  cop1 <- gaussian.cop.den(fx1$cdf, fy1$cdf, rho1)
+  cop2 <- gaussian.cop.den(fx2$cdf, fy2$cdf, rho2)
+
+  return(get.ez(p, cop1, cop2, fx1$pdf, fy1$pdf, fx2$pdf, fy2$pdf))
+}
+
+
+# E-step (Gaussian or Clayton copulas) working on precomputed pdf/cdf values.
+#
+# x and y are accepted for symmetry with the other E-steps but are not read
+# p is the mixing proportion of the first component
+# rho1 and rho2 are the parameters of each copula
+# pdf.cdf is a list with px.1, px.2, py.1 and py.2 (pdf and cdf per point)
+# copula.txt selects the family: "gaussian" or "clayton"
+# Returns the posterior probability of the first component for every point.
+e.step.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){
+
+  fx1 <- pdf.cdf$px.1
+  fx2 <- pdf.cdf$px.2
+  fy1 <- pdf.cdf$py.1
+  fy2 <- pdf.cdf$py.2
+
+  if(copula.txt =="gaussian"){
+    cop1 <- gaussian.cop.den(fx1$cdf, fy1$cdf, rho1)
+    cop2 <- gaussian.cop.den(fx2$cdf, fy2$cdf, rho2)
+  } else if(copula.txt =="clayton"){
+    cop1 <- clayton.cop.den(fx1$cdf, fy1$cdf, rho1)
+    cop2 <- clayton.cop.den(fx2$cdf, fy2$cdf, rho2)
+  }
+
+  return(get.ez(p, cop1, cop2, fx1$pdf, fy1$pdf, fx2$pdf, fy2$pdf))
+}
+
+
+# M-step (Gaussian copulas) that also returns the per-point pdf/cdf values.
+#
+# x, y: observations or ranks
+# e.z: posterior probability of the first component for every point
+# breaks: bin boundaries of the histogram marginals
+# fix.rho2: if TRUE, rho2 is not estimated and 0 is returned for it
+# Returns list(p, rho1, rho2, x.mar, y.mar, pdf.cdf), where pdf.cdf holds
+# px.1, px.2, py.1, py.2 for reuse in the next E-step.
+m.step.2gaussian.value <- function(x, y, e.z, breaks, fix.rho2){
+
+  # re-estimate the histogram marginals of both components
+  x.mar <- est.mar.hist(x, e.z, breaks)
+  y.mar <- est.mar.hist(y, e.z, breaks)
+
+  # est.mar.hist already evaluated pdf and cdf at every point, so no
+  # per-point lookup is needed
+  pdf.cdf <- list(px.1=x.mar$f1.value, px.2=x.mar$f2.value, py.1=y.mar$f1.value, py.2=y.mar$f2.value)
+
+  rho1 <- mle.gaussian.copula(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, e.z)
+
+  rho2 <- if(!fix.rho2) mle.gaussian.copula(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, 1-e.z) else 0
+
+  p <- sum(e.z)/length(e.z)
+
+  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
+              pdf.cdf=pdf.cdf))
+}
+
+# Variant of m.step.2gaussian.value that keeps the marginals fixed.
+#
+# x, y and breaks are accepted but not read: the marginals are taken from
+# the x.mar and y.mar arguments instead of being re-estimated
+# e.z: posterior probability of the first component for every point
+# fix.rho2: if TRUE, rho2 is not estimated and 0 is returned for it
+# x.mar, y.mar: marginals with f1.value and f2.value fields (pdf and cdf per point)
+# Returns list(p, rho1, rho2, x.mar, y.mar, pdf.cdf).
+m.step.2gaussian.value2 <- function(x, y, e.z, breaks, fix.rho2, x.mar, y.mar){
+
+  pdf.cdf <- list(px.1=x.mar$f1.value, px.2=x.mar$f2.value, py.1=y.mar$f1.value, py.2=y.mar$f2.value)
+
+  rho1 <- mle.gaussian.copula(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, e.z)
+
+  rho2 <- if(!fix.rho2) mle.gaussian.copula(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, 1-e.z) else 0
+
+  p <- sum(e.z)/length(e.z)
+
+  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
+              pdf.cdf=pdf.cdf))
+}
+
+
+
+# M-step (Gaussian or Clayton copulas) that also returns the per-point
+# pdf/cdf values.
+#
+# x, y: observations or ranks
+# e.z: posterior probability of the first component for every point
+# breaks: bin boundaries of the histogram marginals
+# fix.rho2: if TRUE, rho2 is not estimated and 0 is returned for it
+# copula.txt: "gaussian" or "clayton"
+# Returns list(p, rho1, rho2, x.mar, y.mar, pdf.cdf).
+m.step.2copula.value <- function(x, y, e.z, breaks, fix.rho2, copula.txt){
+
+  # re-estimate the histogram marginals of both components
+  x.mar <- est.mar.hist(x, e.z, breaks)
+  y.mar <- est.mar.hist(y, e.z, breaks)
+
+  # pdf and cdf at every point come straight from est.mar.hist
+  pdf.cdf <- list(px.1=x.mar$f1.value, px.2=x.mar$f2.value, py.1=y.mar$f1.value, py.2=y.mar$f2.value)
+
+  if(copula.txt=="gaussian"){
+    rho1 <- mle.gaussian.copula(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, e.z)
+    rho2 <- if(!fix.rho2) mle.gaussian.copula(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, 1-e.z) else 0
+  } else if(copula.txt=="clayton"){
+    rho1 <- mle.clayton.copula(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, e.z)
+    rho2 <- if(!fix.rho2) mle.clayton.copula(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, 1-e.z) else 0
+  }
+
+  p <- sum(e.z)/length(e.z)
+
+  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
+              pdf.cdf=pdf.cdf))
+}
+
+
+
+
+# Mixture log-likelihood of two Gaussian copulas from precomputed pdf/cdf values.
+#
+# x and y are accepted but not read
+# p: mixing proportion of the first component
+# rho1, rho2: Gaussian copula parameters of the two components
+# pdf.cdf: list with px.1, px.2, py.1, py.2 (pdf and cdf per point)
+# Returns sum(log(p*c1*f1(x)*f1(y) + (1-p)*c2*f2(x)*f2(y))).
+loglik.2gaussian.copula.value <- function(x, y, p, rho1, rho2, pdf.cdf){
+
+  fx1 <- pdf.cdf$px.1
+  fx2 <- pdf.cdf$px.2
+  fy1 <- pdf.cdf$py.1
+  fy2 <- pdf.cdf$py.2
+
+  cop1 <- gaussian.cop.den(fx1$cdf, fy1$cdf, rho1)
+  cop2 <- gaussian.cop.den(fx2$cdf, fy2$cdf, rho2)
+
+  mixture <- p*cop1*fx1$pdf*fy1$pdf + (1-p)*cop2*fx2$pdf*fy2$pdf
+
+  sum(log(mixture))
+}
+
+
+
+# Mixture log-likelihood of two copulas (both Gaussian or both Clayton) from
+# precomputed pdf/cdf values.
+#
+# x and y are accepted but not read
+# p: mixing proportion of the first component
+# rho1, rho2: copula parameters of the two components
+# pdf.cdf: list with px.1, px.2, py.1, py.2 (pdf and cdf per point)
+# copula.txt: "gaussian" or "clayton"
+# Returns sum(log(p*c1*f1(x)*f1(y) + (1-p)*c2*f2(x)*f2(y))).
+loglik.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){
+
+  fx1 <- pdf.cdf$px.1
+  fx2 <- pdf.cdf$px.2
+  fy1 <- pdf.cdf$py.1
+  fy2 <- pdf.cdf$py.2
+
+  if(copula.txt=="gaussian"){
+    cop1 <- gaussian.cop.den(fx1$cdf, fy1$cdf, rho1)
+    cop2 <- gaussian.cop.den(fx2$cdf, fy2$cdf, rho2)
+  } else if(copula.txt=="clayton"){
+    cop1 <- clayton.cop.den(fx1$cdf, fy1$cdf, rho1)
+    cop2 <- clayton.cop.den(fx2$cdf, fy2$cdf, rho2)
+  }
+
+  mixture <- p*cop1*fx1$pdf*fy1$pdf + (1-p)*cop2*fx2$pdf*fy2$pdf
+
+  sum(log(mixture))
+}
+
+
+
+# EM for 2 Gaussian, speed up computation, unfinished
+#
+# Fits a two-component Gaussian-copula mixture with histogram marginals.
+#
+# x, y: scores on the two replicates; they are replaced by their ranks
+# p0: initial mixing proportion (also the value used throughout if fix.p)
+# rho1.0: not read by the current body (rho1 always comes from the M-step)
+# rho2.0: value used for rho2 when fix.rho2 is TRUE
+# eps: tolerance of the Aitken stopping rule
+# fix.p: keep p at p0
+# stoc: if TRUE the M-step is m.step.2gaussian.stoc.value, which is not
+#   defined in this part of the file -- NOTE(review): confirm it exists
+#   before relying on the default stoc=T
+# fix.rho2: keep rho2 at rho2.0
+#
+# Returns a list with para (p, rho1, rho2), loglik, bic, e.z, conf.z,
+# loglik.trace, x.mar, y.mar and breaks.
+# NOTE(review): para$p in the result is the M-step estimate even when fix.p
+# is TRUE, and conf.z is NULL for the non-stochastic M-step, which does not
+# return it.
+
+em.2gaussian.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T){
+
+ # rank transform; ties are broken at random, so results can vary between runs
+ x <- rank(x, tie="random")
+ y <- rank(y, tie="random")
+
+# x <- rank(x, tie="average")
+# y <- rank(y, tie="average")
+
+ # 50 equal-width bins covering slightly more than the range of the ranks
+ xy.min <- min(x, y)
+ xy.max <- max(x, y)
+ binwidth <- (xy.max-xy.min)/50
+ breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/50)
+# breaks <- seq(xy.min, xy.max, by=binwidth)
+
+
+ # initiate marginals
+ # initialization: the first round(n*p0) entries get weight 0.9, the rest 0.1
+# e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes two components are overlapped
+
+ e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0)))
+
+ # first M-step from the initial weights
+ if(!stoc)
+ para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2)
+ else
+ para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)
+
+
+ if(fix.p){
+ p <- p0
+ } else {
+ p <- para$p
+ }
+
+ if(fix.rho2){
+ rho2 <- rho2.0
+ } else {
+ rho2 <- para$rho2
+ }
+
+# rho1 <- 0.8
+ rho1 <- para$rho1
+
+ # log-likelihood at the starting point
+ l0 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf)
+
+ loglik.trace <- c()
+ loglik.trace[1] <- l0
+# loglik.trace[2] <- l1
+ to.run <- T
+
+ # i indexes the next entry of loglik.trace
+ i <- 2
+
+ # this two lines to remove
+# x.mar <- est.mar.hist(x, e.z, breaks)
+# y.mar <- est.mar.hist(y, e.z, breaks)
+
+ while(to.run){
+
+ # E-step with the current parameters, then M-step with the new weights
+ e.z <- e.step.2gaussian.value(x, y, p, rho1, rho2, para$pdf.cdf)
+ if(!stoc)
+ para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2)
+ else
+ para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)
+
+ # fix x.mar and y.mar : to remove
+# if(!stoc)
+# para <- m.step.2gaussian.value2(x, y, e.z, breaks, fix.rho2, x.mar, y.mar)
+# else
+# para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)
+
+
+ if(fix.p){
+ p <- p0
+ } else {
+ p <- para$p
+ }
+
+ if(fix.rho2){
+ rho2 <- rho2.0
+ } else {
+ rho2 <- para$rho2
+ }
+
+# rho1 <- 0.8
+ rho1 <- para$rho1
+
+ # l0 <- l1
+ l1 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf)
+ loglik.trace[i] <- l1
+
+#cat("l1=", l1, "\n")
+
+ # Aitken acceleration criterion: extrapolate the limit of the
+ # log-likelihood from the last three values and stop once the current
+ # value is within eps of it (so at least two iterations are run)
+ # NOTE(review): if two consecutive log-likelihoods are equal the ratio
+ # below is 0/0, to.run becomes NA and while() fails -- confirm
+ if(i > 2){
+ l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2]))
+ to.run <- abs(l.inf - loglik.trace[i]) > eps
+#cat("para=", "p=", para$p, " rho1=", rho1, " rho2=", rho2, "\n")
+#cat("l.inf=", l.inf, "\n")
+#cat(l.inf-loglik.trace[i], "\n")
+ }
+
+ i <- i+1
+ }
+
+ # BIC = -2*loglik + (number of parameters)*log(n); fixed p or rho2 each
+ # reduce the parameter count by one
+ bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # parameters
+ return(list(para=list(p=para$p, rho1=rho1, rho2=rho2),
+ loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z,
+ loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar,
+ breaks=breaks))
+}
+
+
+
+# EM for a two-component copula mixture (both components Gaussian or both
+# Clayton) with histogram marginals.
+#
+# x, y: scores on the two replicates; they are replaced by their ranks
+# p0: initial mixing proportion (also the value used throughout if fix.p)
+# rho1.0: not read by the current body (rho1 always comes from the M-step)
+# rho2.0: value used for rho2 when fix.rho2 is TRUE
+# eps: tolerance of the Aitken stopping rule
+# fix.p: keep p at p0
+# stoc: if TRUE the M-step is m.step.2copula.stoc.value, which is not
+#   defined in this part of the file -- NOTE(review): confirm it exists
+#   before relying on the default stoc=T
+# fix.rho2: keep rho2 at rho2.0
+# copula.txt: "gaussian" or "clayton"
+# nbin: divisor of the step between breaks
+#
+# The log-likelihood and the parameters are printed with cat() while iterating.
+# Returns a list with para (p, rho1, rho2), loglik, bic, e.z, conf.z,
+# loglik.trace, x.mar, y.mar and breaks.
+em.2copula.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T, copula.txt, nbin=50){
+
+ # rank transform; ties are broken at random, so results can vary between runs
+ x <- rank(x, tie="random")
+ y <- rank(y, tie="random")
+
+# x <- rank(x, tie="first")
+# y <- rank(y, tie="first")
+
+ # equal-width bins covering slightly more than the range of the ranks
+ # NOTE(review): binwidth (the margin) is always computed with 50,
+ # only the step between breaks uses nbin
+ xy.min <- min(x, y)
+ xy.max <- max(x, y)
+ binwidth <- (xy.max-xy.min)/50
+ breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/nbin)
+# breaks <- seq(xy.min, xy.max, by=binwidth)
+
+ # initiate marginals
+ # initialization: the first round(n*p0) entries get weight 0.9, the rest 0.1
+# e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes two components are overlapped
+
+ e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0)))
+
+
+ # first M-step from the initial weights
+ if(!stoc)
+ para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt)
+ else
+ para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt)
+
+ if(fix.p){
+ p <- p0
+ } else {
+ p <- para$p
+ }
+
+ if(fix.rho2){
+ rho2 <- rho2.0
+ } else {
+ rho2 <- para$rho2
+ }
+
+ # log-likelihood at the starting point
+ l0 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)
+
+ loglik.trace <- c()
+ loglik.trace[1] <- l0
+# loglik.trace[2] <- l1
+ to.run <- T
+
+ # i indexes the next entry of loglik.trace
+ i <- 2
+
+ while(to.run){
+
+ # E-step with the current parameters, then M-step with the new weights
+ e.z <- e.step.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)
+ if(!stoc)
+ para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt)
+ else
+ para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt)
+
+ if(fix.p){
+ p <- p0
+ } else {
+ p <- para$p
+ }
+
+ if(fix.rho2){
+ rho2 <- rho2.0
+ } else {
+ rho2 <- para$rho2
+ }
+
+
+ # l0 <- l1
+ l1 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)
+ loglik.trace[i] <- l1
+
+cat("l1=", l1, "\n")
+
+ # Aitken acceleration criterion: extrapolate the limit of the
+ # log-likelihood from the last three values and stop once the current
+ # value is within eps of it (so at least two iterations are run)
+ # NOTE(review): if two consecutive log-likelihoods are equal the ratio
+ # below is 0/0, to.run becomes NA and while() fails -- confirm
+ if(i > 2){
+ l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2]))
+ to.run <- abs(l.inf - loglik.trace[i]) > eps
+cat("para=", "p=", para$p, " rho1=", para$rho1, " rho2=", rho2, "\n")
+#cat("l.inf=", l.inf, "\n")
+#cat(l.inf-loglik.trace[i], "\n")
+ }
+
+ i <- i+1
+ }
+
+ # BIC = -2*loglik + (number of parameters)*log(n); fixed p or rho2 each
+ # reduce the parameter count by one
+ bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # parameters
+ return(list(para=list(p=para$p, rho1=para$rho1, rho2=rho2),
+ loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z,
+ loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar,
+ breaks=breaks))
+}
+
+
+#######################
+####################### fit EM procedure for the matched peaks
+#######################
+
+# remove the unmatched ones
+#rm.unmatch <- function(sample1, sample2, p.value.impute=0){
+#
+# sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
+# sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
+#
+# invisible(list(sample1=sample1.prune$sig.value, sample2=sample2.prune$sig.value))
+#}
+
+
+# fit 2-component model
+#fit.em <- function(sample12, fix.rho2=T){
+#
+# prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)
+#
+# em.fit <- em.2gaussian.quick(-prune.sample$sample1, -prune.sample$sample2,
+# p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2)
+#
+# invisible(list(em.fit=em.fit, data.pruned=prune.sample))
+#}
+
+
+# Remove the unmatched peaks.
+#
+# sample1, sample2: row-aligned data frames with a sig.value column
+# p.value.impute: value marking an unmatched (imputed) peak
+# A row is kept only if sig.value exceeds p.value.impute in BOTH samples.
+# Returns (invisibly) list(sample1, sample2) with the kept rows.
+rm.unmatch <- function(sample1, sample2, p.value.impute=0){
+
+  matched <- sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute
+
+  invisible(list(sample1=sample1[matched,], sample2=sample2[matched,]))
+}
+
+
+# Fit the 2-component Gaussian copula mixture to a pair of replicates.
+#
+# sample12: list with merge1 and merge2 (row-aligned, each with sig.value)
+# fix.rho2: passed on to em.2gaussian.quick
+# Unmatched peaks are dropped first; the EM is run on the negated scores with
+# p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, free p and the non-stochastic M-step.
+# Returns (invisibly) list(em.fit=<EM result>, data.pruned=<matched peaks>).
+fit.em <- function(sample12, fix.rho2=T){
+
+  matched <- rm.unmatch(sample12$merge1, sample12$merge2)
+
+  fit <- em.2gaussian.quick(x=-matched$sample1$sig.value, y=-matched$sample2$sig.value,
+                            p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2=fix.rho2)
+
+  invisible(list(em.fit=fit, data.pruned=matched))
+}
+
+
+
+# Fit the mixture model to a pair of replicates with em.transform.
+#
+# sample12: list with merge1 and merge2 (row-aligned, each with sig.value)
+# fix.rho2, copula.txt: accepted but not read by the current body
+# Unmatched peaks are dropped first; em.transform is started from fixed
+# values mu=2.5, sigma=1, rho=0.6, p=0.3 with eps=0.01.
+# Returns (invisibly) list(em.fit=<EM result>, data.pruned=<matched peaks>).
+fit.2copula.em <- function(sample12, fix.rho2=T, copula.txt){
+
+  matched <- rm.unmatch(sample12$merge1, sample12$merge2)
+
+  # fixed starting values of the EM
+  start <- list(mu=2.5, sigma=1, rho=0.6, p=0.3)
+
+  cat("EM is running")
+  fit <- em.transform(matched$sample1$sig.value, matched$sample2$sig.value, start$mu, start$sigma, start$rho, start$p, eps=0.01)
+
+  invisible(list(em.fit=fit, data.pruned=matched))
+}
+
+
+
+
+# Fit the 1-component models (a single Gaussian and a single Clayton copula).
+#
+# data.pruned: list whose sample1 and sample2 elements are negated and passed
+#   directly as the scores, so they are expected to be numeric here
+# breaks: bin boundaries handed to both fits
+# Returns list(gaussian.1=..., clayton.1=...).
+fit.1.component <- function(data.pruned, breaks){
+
+  neg1 <- -data.pruned$sample1
+  neg2 <- -data.pruned$sample2
+
+  gaussian.fit <- fit.gaussian.1(neg1, neg2, breaks)
+  clayton.fit <- fit.clayton.1(neg1, neg2, breaks)
+
+  return(list(gaussian.1=gaussian.fit, clayton.1=clayton.fit))
+}
+
+
+
+#################
+# Fit a single component
+#################
+
+# Fit a single Gaussian copula.
+#
+# x, y: scores on the two replicates
+# if breaks=NULL, use empirical pdf, otherwise use histogram estimate
+# The sums of log copula density and log marginal pdfs are printed with cat().
+# Returns list(rho=<MLE of rho>, loglik, bic), with
+# bic = -2*loglik + log(n)*(1+length(breaks)-1).
+fit.gaussian.1 <- function(x, y, breaks=NULL){
+
+  # rank transformed and compute the empirical cdf
+  u <- emp.mar.cdf.rank(x)
+  v <- emp.mar.cdf.rank(y)
+
+  # every observation gets weight 1: there is only one component
+  rho.hat <- mle.gaussian.copula(u, v, rep(1, length(u)))
+
+  cop <- gaussian.cop.den(u, v, rho.hat)
+  cat("c1", sum(log(cop)), "\n")
+
+  if(is.null(breaks)){
+    fx <- emp.mar.pdf.rank(u)
+    fy <- emp.mar.pdf.rank(v)
+  } else {
+    x.hist <- est.mar.hist(rank(x), rep(1, length(x)), breaks)
+    y.hist <- est.mar.hist(rank(y), rep(1, length(y)), breaks)
+
+    # only one component, so only the f1 part is used
+    fx <- x.hist$f1.value$pdf
+    fy <- y.hist$f1.value$pdf
+  }
+
+  cat("f1", sum(log(fx)), "\n")
+  cat("f2", sum(log(fy)), "\n")
+
+  loglik <- sum(log(cop)+log(fx)+log(fy))
+
+  bic <- -2*loglik + log(length(u))*(1+length(breaks)-1)
+
+  return(list(rho=rho.hat, loglik=loglik, bic=bic))
+}
+
+
+# Fit a single Clayton copula.
+#
+# x, y: scores on the two replicates
+# if breaks=NULL, use empirical pdf, otherwise use histogram estimate
+# Returns list(rho=<MLE of the Clayton coefficient>, tau=rho/(rho+2),
+# loglik, bic), with bic = -2*loglik + log(n)*(1+length(breaks)-1).
+fit.clayton.1 <- function(x, y, breaks=NULL){
+
+  # rank transformed and compute the empirical cdf
+  t <- emp.mar.cdf.rank(x)
+  s <- emp.mar.cdf.rank(y)
+
+  # every observation gets weight 1: there is only one component
+  mle.rho <- mle.clayton.copula(t, s, rep(1, length(t)))
+
+  c1 <- clayton.cop.den(t, s, mle.rho)
+
+  if(is.null(breaks)){
+    f1 <- emp.mar.pdf.rank(t)
+    f2 <- emp.mar.pdf.rank(s)
+  } else {
+    x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks)
+    y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks)
+
+    f1 <- x.mar$f1.value$pdf # only one component
+    f2 <- y.mar$f1.value$pdf
+  }
+
+  loglik <- sum(log(c1)+log(f1)+log(f2))
+
+  bic <- -2*loglik + log(length(t))*(1+length(breaks)-1)
+
+  # tau is derived from the fitted coefficient; the previous code referenced
+  # an undefined object named rho here
+  return(list(rho=mle.rho, tau=mle.rho/(mle.rho+2), loglik=loglik, bic=bic))
+}
+
+## obsolete function (01-06-2010)
+## Average of e.z over the peaks selected at a cutoff.
+##
+## tt: proportion of top peaks to select on each coordinate (scalar)
+## u, v: scores on the two replicates
+## e.z: per-peak probability to average
+## A peak is selected when u is at least the (1-tt) quantile of u AND v is at
+## least the (1-tt) quantile of v. Returns the mean of e.z over those peaks.
+comp.uri.ez <- function(tt, u, v, e.z){
+
+  u.cut <- quantile(u, probs=(1-tt))
+  v.cut <- quantile(v, probs=(1-tt))
+
+  selected <- u >= u.cut & v >= v.cut
+
+  return(mean(e.z[selected]))
+}
+
+## obsolete function (01-06-2010)
+# Compute the extreme value of e.z among the peaks in the top tt fraction on
+# both coordinates (so the consistent low rank ones are excluded).
+#
+# tt: proportion of top peaks to select on each coordinate (scalar)
+# u, v: scores on the two replicates
+# e.z: per-peak probability
+# boundary.txt: "max" returns the maximum (use it if e.z is an error
+#   probability); any other value returns the minimum
+comp.ez.cutoff <- function(tt, u, v, e.z, boundary.txt){
+
+  u.cut <- quantile(u, probs=(1-tt))
+  v.cut <- quantile(v, probs=(1-tt))
+
+  selected <- u >= u.cut & v >= v.cut
+
+  extreme <- if(boundary.txt == "max") max else min
+
+  return(extreme(e.z[selected]))
+}
+
+# obsolete functions: 01-06-2010
+# compute the error rate
+# This one was used for the plots generated initially in the brief writeup
+# and for processing merged data in July, before the IDR definition was
+# formalized. It does not implement the current definition of IDR.
+#
+# em.fit: list with em.fit$e.z and data.pruned$sample1 / data.pruned$sample2
+# reverse: if TRUE, work with 1-e.z (the error probability) and use the
+#   maximum as boundary; otherwise use e.z and the minimum
+# fdr.level: target levels
+#
+# For cutoffs tt = 1%, ..., 99% the mean of e.z among the selected peaks
+# (uri.ez) and its boundary value are computed; for each level the cutoff
+# whose uri.ez is closest is reported.
+# Returns list(n, uri.ez, u.t, v.t, tt, fdr.level, map.uv, e.z,
+# error.prob.cutoff); map.uv has one row per level with the columns
+# error.prob.cutoff, sig.value1, sig.value2 and n.selected.match.
+get.ez.tt.old <- function(em.fit, reverse=T, fdr.level=c(0.01, 0.05, 0.1)){
+
+  u <- em.fit$data.pruned$sample1
+  v <- em.fit$data.pruned$sample2
+
+  tt <- seq(0.01, 0.99, by=0.01)
+
+  if(reverse){
+    e.z <- 1-em.fit$em.fit$e.z # this is the error prob
+    boundary <- "max"
+  } else {
+    e.z <- em.fit$em.fit$e.z
+    boundary <- "min"
+  }
+  uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+  ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt=boundary)
+
+  # u.t and v.t are the quantiles
+  u.t <- quantile(u, probs=(1-tt))
+  v.t <- quantile(v, probs=(1-tt))
+
+  # find the levels on the two replicates
+  sig.value1 <- c()
+  sig.value2 <- c()
+  error.prob.cutoff <- c()
+  n.selected.match <- c()
+
+  for(i in 1:length(fdr.level)){
+
+    # find which uri.ez is closest to fdr.level
+    closest <- which.min(abs(uri.ez - fdr.level[i]))
+    sig.value1[i] <- u.t[closest]
+    sig.value2[i] <- v.t[closest]
+    error.prob.cutoff[i] <- ez.bound[closest]
+    n.selected.match[i] <- if(reverse) sum(e.z<=ez.bound[closest]) else sum(e.z>=ez.bound[closest])
+  }
+
+  # output the cutoff of posterior probability, signal values on two replicates
+  map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match)
+
+  return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+}
+
+# created: 01-06-2010
+# Output IDR at various numbers of selected peaks and find, for each specified
+# IDR level, the cutoff on the error probability and the number of peaks that
+# pass it. The IDR definition is similar to FDR: the IDR of the top n peaks is
+# the mean error probability of those n peaks.
+#
+# em.fit: list; em.fit$em.fit$e.z is read and 1-e.z is used as the error prob
+# idr.level: IDR levels for which cutoffs are reported
+# returns list(n, IDR, idr.level, map.uv); map.uv has one row per level with
+#   columns ez.cutoff and n.selected
+get.ez.tt <- function(em.fit, idr.level=c(0.01, 0.05, 0.1)){
+
+  e.z <- 1-em.fit$em.fit$e.z # this is the error prob
+
+  # rank peaks by increasing error prob; IDR[n] is the mean over the top n
+  e.z.ordered <- e.z[order(e.z)]
+  n.select <- c(1:length(e.z))
+  IDR <- cumsum(e.z.ordered)/n.select
+
+  n.level <- length(idr.level)
+  ez.cutoff <- rep(NA, n.level)
+  n.selected <- rep(NA, n.level)
+
+  for(i in seq_len(n.level)){
+
+    # find which IDR is closest to idr.level
+    index <- which.min(abs(IDR - idr.level[i]))
+
+    # index is a position in the ordered list, so the cutoff has to be read
+    # from e.z.ordered (reading e.z[index] picked an unrelated peak)
+    ez.cutoff[i] <- e.z.ordered[index]
+    n.selected[i] <- sum(e.z<=ez.cutoff[i])
+  }
+
+  # output the cutoff of posterior probability, number of selected overlapped peaks
+  map.uv <- cbind(ez.cutoff, n.selected)
+
+  return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv))
+}
+
+# return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+
+
+
+
+
+### compute the mean (and sd) of the marginals
+# em.out: list; reads the histogram-like entries x.mar$f1, x.mar$f2, y.mar$f1
+#   and y.mar$f2 and summarizes each with get.hist.mean
+# returns a list with x.mean1, x.mean2, y.mean1, y.mean2, x.sd1, x.sd2,
+#   y.sd1 and y.sd2
+get.mar.mean <- function(em.out){
+
+  x.stat1 <- get.hist.mean(em.out$x.mar$f1)
+  x.stat2 <- get.hist.mean(em.out$x.mar$f2)
+  y.stat1 <- get.hist.mean(em.out$y.mar$f1)
+  y.stat2 <- get.hist.mean(em.out$y.mar$f2)
+
+  summary.list <- list(x.mean1=x.stat1$mean, x.mean2=x.stat2$mean,
+                       y.mean1=y.stat1$mean, y.mean2=y.stat2$mean,
+                       x.sd1=x.stat1$sd, x.sd2=x.stat2$sd,
+                       y.sd1=y.stat1$sd, y.sd2=y.stat2$sd)
+
+  return(summary.list)
+}
+
+
+# compute the mean and sd of a marginal given as a histogram
+# x.f: list with breaks (bin boundaries) and density (one value per bin)
+# each bin is represented by its midpoint and weighted by density * bin width
+# returns list(mean, sd)
+get.hist.mean <- function(x.f){
+
+  n.brk <- length(x.f$breaks)
+  left <- x.f$breaks[-n.brk]
+  right <- x.f$breaks[-1]
+
+  bin.width <- right-left
+  bin.mid <- (left+right)/2
+
+  first.moment <- sum(bin.mid*x.f$density*bin.width)
+  second.moment <- sum(bin.mid*bin.mid*x.f$density*bin.width)
+
+  return(list(mean=first.moment, sd=sqrt(second.moment-first.moment^2)))
+}
+
+# x.f: list with breaks (bin boundaries) and density (one value per bin)
+# NOTE: despite its name this function returns the histogram mean (midpoints
+# weighted by density * bin width); no variance is computed
+get.hist.var <- function(x.f){
+
+  n.brk <- length(x.f$breaks)
+  left <- x.f$breaks[-n.brk]
+  right <- x.f$breaks[-1]
+
+  bin.width <- right-left
+  bin.mid <- (left+right)/2
+
+  hist.mean <- sum(bin.mid*x.f$density*bin.width)
+
+  return(hist.mean)
+}
+
+# obsolete function (01-06-2010)
+# plot uri.ez against the number of selected peaks, one curve per entry
+#
+# ez.list: list whose entries provide $n (x values) and $uri.ez (y values)
+# plot.dir, file.name: if file.name is not NULL the plot is written to the
+#   postscript file paste(plot.dir, "ez.", file.name, sep=""), otherwise it
+#   is drawn on the current device
+# legend.txt: legend labels
+# y.lim: y range; defaults to c(0, largest uri.ez)
+# xlab.txt, ylab.txt, title.txt: axis labels and optional title
+# col.txt: curve colors, one per entry of ez.list
+plot.ez.group.old <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="avg posterior prob of being random", col.txt=NULL, title.txt=NULL){
+
+  if(is.null(col.txt))
+    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")
+
+  # pool all points so the empty frame covers every curve
+  x <- c()
+  y <- c()
+
+  for(i in 1:length(ez.list)){
+    x <- c(x, ez.list[[i]]$n)
+    y <- c(y, ez.list[[i]]$uri.ez)
+  }
+
+  if(is.null(y.lim))
+    y.lim <- c(0, max(y))
+
+  if(!is.null(file.name)){
+    postscript(paste(plot.dir, "ez.", file.name, sep=""))
+    par(mfrow=c(1,1), mar=c(5,5,4,2))
+  }
+
+  plot(x, y, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+
+  for(i in 1:length(ez.list)){
+    lines(ez.list[[i]]$n, ez.list[[i]]$uri.ez, col=col.txt[i], cex=2, lwd=5)
+  }
+
+  legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2)
+
+  # test the argument; `title` alone is the graphics function and never NULL
+  if(!is.null(title.txt))
+    title(title.txt)
+
+  if(!is.null(file.name)){
+    dev.off()
+  }
+
+}
+
+
+# plot IDR against the number of selected peaks, one curve per entry
+#
+# ez.list: list whose entries provide $n (numbers of selected peaks) and $IDR
+# plot.dir, file.name: if file.name is not NULL the plot is written to the
+#   postscript file paste(plot.dir, "ez.", file.name, sep=""), otherwise it
+#   is drawn on the current device
+# legend.txt: legend labels
+# y.lim: y range; defaults to c(0, largest IDR)
+# xlab.txt, ylab.txt, title.txt: axis labels and optional title
+# col.txt: curve colors, one per entry of ez.list
+plot.ez.group <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="IDR", col.txt=NULL, title.txt=NULL){
+
+  if(is.null(col.txt))
+    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")
+
+  # largest x and y of each entry define the plotting frame
+  n.entry <- length(ez.list)
+  x <- rep(NA, n.entry)
+  y.max <- rep(NA, n.entry)
+
+  for(i in 1:n.entry){
+    x[i] <- max(ez.list[[i]]$n)
+    y.max[i] <- max(ez.list[[i]]$IDR)
+  }
+
+  if(is.null(y.lim))
+    y.lim <- c(0, max(y.max))
+
+  if(!is.null(file.name)){
+    postscript(paste(plot.dir, "ez.", file.name, sep=""))
+    par(mfrow=c(1,1), mar=c(5,5,4,2))
+  }
+
+  plot(c(0, max(x)), y.lim, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+
+  # each curve is drawn at 99 positions taken at the 1%..99% quantiles of n
+  q <- seq(0.01, 0.99, by=0.01)
+
+  for(i in 1:length(ez.list)){
+
+    n.plot <- round(quantile(ez.list[[i]]$n, prob=q))
+    IDR.plot <- ez.list[[i]]$IDR[n.plot]
+    lines(n.plot, IDR.plot, col=col.txt[i], cex=2, lwd=5)
+  }
+
+  legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2)
+
+  # test the argument; `title` alone is the graphics function and never NULL
+  if(!is.null(title.txt))
+    title(title.txt)
+
+  if(!is.null(file.name)){
+    dev.off()
+  }
+
+}
+
+
+
+#############################################################################
+#############################################################################
+# statistics about peaks selected on the individual replicates
+#
+# idr.level: the consistency cutoff, say 0.05
+# uri.output: a list of uri.output from consistency analysis generated by batch-consistency-analysis.r
+# ez.list : a list of IDRs computed from get.ez.tt using the same idr.level
+#
+##################
+
+
+# obsolete?
+# compute the error rate on a grid of top proportions tt, then for each
+# idr.level map the matching cutoffs back to all peaks and report the number
+# of peaks selected
+#
+# em.fit: list; reads data.pruned$sample1, data.pruned$sample2 and em.fit$e.z
+# all.data1, all.data2: all peaks of each replicate, with a "sig.value" column
+# idr.level: levels matched against uri.ez (coerced with as.numeric)
+# returns a list; map.uv has one row per level with the cutoff, the signal
+#   values on the two replicates, the number of matched pairs selected and
+#   the number of peaks selected on each replicate
+get.ez.tt.all.old <- function(em.fit, all.data1, all.data2, idr.level){
+
+  u <- em.fit$data.pruned$sample1
+  v <- em.fit$data.pruned$sample2
+
+  tt <- seq(0.01, 0.99, by=0.01)
+
+  e.z <- 1-em.fit$em.fit$e.z # this is the error prob
+  uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+  ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max")
+
+  # u.t and v.t are the quantiles
+  u.t <- quantile(u, prob=(1-tt))
+  v.t <- quantile(v, prob=(1-tt))
+
+  # find the levels on the two replicates
+  sig.value1 <- c()
+  sig.value2 <- c()
+  error.prob.cutoff <- c()
+  n.selected.match <- c()
+  npeak.rep1 <- c()
+  npeak.rep2 <- c()
+
+  for(k in 1:length(idr.level)){
+
+    # grid point whose uri.ez is closest to idr.level
+    best <- which.min(abs(uri.ez - as.numeric(idr.level[k])))
+
+    sig.value1[k] <- u.t[best]
+    sig.value2[k] <- v.t[best]
+    error.prob.cutoff[k] <- ez.bound[best]
+
+    # matched pairs above both thresholds
+    n.selected.match[k] <- sum(u>= u.t[best] & v>=v.t[best])
+
+    # peaks of each full replicate above its own threshold
+    npeak.rep1[k] <- sum(all.data1["sig.value"] >= sig.value1[k])
+    npeak.rep2[k] <- sum(all.data2["sig.value"] >= sig.value2[k])
+  }
+
+  # output the cutoff of posterior probability, signal values on two replicates
+  map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match, npeak.rep1, npeak.rep2)
+
+  return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, idr.level=idr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+}
+
+
+# Output IDR at various numbers of selected peaks and, for each specified IDR
+# level, the cutoff on the error probability and the number of peaks passing
+# it. IDR[n] is the mean error probability of the n most reliable peaks.
+#
+# em.fit: list; reads data.pruned$sample1$sig.value,
+#   data.pruned$sample2$sig.value and em.fit$e.z
+# all.data1, all.data2: not used by the current code
+# idr.level: IDR levels for which cutoffs are reported
+# returns list(n, IDR, idr.level, map.uv); map.uv has one row per level with
+#   columns ez.cutoff and n.selected
+get.ez.tt.all <- function(em.fit, all.data1, all.data2, idr.level=c(0.01, 0.05, 0.1)){
+
+  # the significance values are read but do not enter the computation below
+  u <- em.fit$data.pruned$sample1$sig.value
+  v <- em.fit$data.pruned$sample2$sig.value
+
+  e.z <- 1-em.fit$em.fit$e.z # this is the error prob
+
+  # rank peaks by increasing error prob and average over the top n
+  e.z.ordered <- e.z[order(e.z)]
+  n.select <- c(1:length(e.z))
+  IDR <- cumsum(e.z.ordered)/n.select
+
+  n.level <- length(idr.level)
+  ez.cutoff <- rep(NA, n.level)
+  n.selected <- rep(NA, n.level)
+
+  for(k in 1:length(idr.level)){
+
+    # position in the ordered list whose IDR is closest to idr.level
+    closest <- which.min(abs(IDR - idr.level[k]))
+    ez.cutoff[k] <- e.z.ordered[closest] # fixed on 02/20/10
+    n.selected[k] <- sum(e.z<=ez.cutoff[k])
+  }
+
+  # output the cutoff of posterior probability, number of selected overlapped peaks
+  map.uv <- cbind(ez.cutoff, n.selected)
+
+  return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv))
+}
+
+# return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+
+
+
+
+
+
+####### the following is for determining thresholds for merged dataset
+
+############# select peaks above a given threshold
+#
+# pass.threshold: a simple method that passes the thresholds found on the individual replicates to the pooled sample
+#
+# sig.map.list: a list of matrix to include all the cutoff values, each row corresponds to a cutoff. The first column is idr.level
+# the 2nd column is the cutoff of ez, the rest of columns are consistency analysis for other replicates
+# sig.value.name: the name of the sig.value column
+# combined: combined dataset
+# nrep: number of pairs of comparisons
+#
+# Procedure:
+# 1. Find the significant threshold corresponding to the idr cutoff on the matched peaks.
+# 2. Each time we will get two or more (if >2 replicates) cutoffs and will report the most stringent and the least stringent
+# cutoff and the number of peaks selected at those two cutoffs
+#############
+
+# Select peaks of the combined dataset using the significance thresholds that
+# the consistency analysis found at idr.level (arguments are described in the
+# header above).
+# Returns -1 (after printing a message) when no row of the first cutoff table
+# matches idr.level; otherwise returns invisibly a list with npeak.stat and
+# the peaks selected at the tightest (largest) and loosest (smallest) cutoff.
+pass.threshold <- function(sig.map.list, sig.value.name, combined, idr.level, nrep, chr.size){
+
+  # choose idr.level
+  idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level)
+  # the guard has to look at idr.index; `i` does not exist at this point
+  if(length(idr.index) == 0){
+    print("no level matches specified idr.level")
+    return(-1)
+  }
+
+  # collect the cutoffs on both replicates from every pairwise comparison
+  sig.map <- c()
+  for(i in 1:length(sig.map.list))
+    sig.map <- c(sig.map, rbind(sig.map.list[[i]])[idr.index, c("sig.value1", "sig.value2")])
+
+  # most and least stringent cutoff
+  max.sig <- max(sig.map)
+  min.sig <- min(sig.map)
+  selected.sig.tight <- combined[combined[,sig.value.name]>=max.sig, ]
+  selected.sig.loose <- combined[combined[,sig.value.name]>=min.sig, ]
+
+  selected.sig.tight <- deconcatenate.chr(selected.sig.tight, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+  selected.sig.loose <- deconcatenate.chr(selected.sig.loose, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+
+  npeak.tight <- nrow(selected.sig.tight)
+  npeak.loose <- nrow(selected.sig.loose)
+
+  npeak.stat <- list(idr.level=idr.level, max.sig=max.sig, min.sig=min.sig, npeak.tight=npeak.tight, npeak.loose=npeak.loose)
+
+  invisible(list(npeak.stat=npeak.stat, combined.selected.tight=selected.sig.tight, combined.selected.loose=selected.sig.loose))
+}
+
+#################
+# pass the regions selected from consistency analysis to combined data
+# Threshold is determined on the replicates, the regions above the threshold are selected
+# then peaks on the combined data are selected from the selected regions
+#
+# To avoid being too stringent, regions satisfying the following conditions are selected
+# 1. regions above the significant threshold determined by consistency analysis on either replicate
+# 2. regions that have consistent low peaks, i.e. posterior prob > threshold but not passing the significant threshold
+#
+# This method doesn't make a difference when using different thresholds
+#################
+
+# Select the peaks of the combined data that overlap peaks passing the
+# consistency thresholds on either replicate.
+#
+# sig.map.list: list of cutoff tables; the first column of the first table is
+#   matched against idr.level to pick the row of cutoffs
+# uri.output: list (one entry per pair) with data12.enrich$merge1 and
+#   data12.enrich$merge2, the paired peaks of the two replicates
+# ez.list: list (one entry per pair) whose map.uv has columns "sig.value1"
+#   and "sig.value2"
+# em.output: not used by the current code
+# combined: peaks of the combined dataset
+# idr.level: consistency level at which the cutoffs are read
+# sig.value.impute: value marking non-overlapping peaks after pairing
+# chr.size: passed on to deconcatenate.chr
+#
+# Returns -1 (after printing a message) if idr.level matches no row;
+# otherwise returns invisibly list(npeak.stat, combined.selected, sig.combined).
+# Only condition 1 of the header (regions above the threshold on either
+# replicate) is implemented; the code for condition 2 was disabled.
+pass.region <- function(sig.map.list, uri.output, ez.list, em.output, combined, idr.level, sig.value.impute=0, chr.size){
+
+  peak.cols <- c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")
+
+  combined <- combined[, peak.cols]
+  npair <- length(uri.output) # number of pairs of consistency analysis
+  combined.region <- c()
+
+  # choose idr.level
+  idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level)
+  if(length(idr.index) ==0){
+    print("no level matches specified idr.level")
+    return(-1)
+  }
+
+  for(j in 1:npair){
+
+    merge1 <- uri.output[[j]]$data12.enrich$merge1
+    merge2 <- uri.output[[j]]$data12.enrich$merge2
+
+    # select peaks from individual replicates using individual cutoff;
+    # replicate 2 is compared on its own values (merge2), not on merge1
+    above.1 <- merge1["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value1"]
+    above.2 <- merge2["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value2"]
+    selected.sig.rep1 <- merge1[above.1, peak.cols]
+    selected.sig.rep2 <- merge2[above.2, peak.cols]
+
+    # find the peaks that are overlapped with reliable peaks in the individual replicates
+    overlap.1 <- pair.peaks(selected.sig.rep1, combined)$merge2
+    overlap.2 <- pair.peaks(selected.sig.rep2, combined)$merge2
+
+    # choose the ones with significant value > sig.value.impute, which are the overlapped ones
+    combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, peak.cols]
+    combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, peak.cols]
+
+    combined.region <- rbind(combined.region, combined.in1, combined.in2)
+
+    # a peak is identified by its start position; keep the first occurrence
+    is.repeated <- duplicated(combined.region$start)
+    combined.region <- combined.region[!is.repeated, peak.cols]
+
+  }
+  npeak <- nrow(combined.region)
+
+  sig.combined <- c(min(combined.region[,"sig.value"], na.rm=T), max(combined.region[,"sig.value"], na.rm=T))
+
+  npeak.stat <- list(idr.level=idr.level, npeak=npeak)
+
+  combined.region <- deconcatenate.chr(combined.region, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+
+  invisible(list(npeak.stat=npeak.stat, combined.selected=combined.region, sig.combined=sig.combined))
+}
+
+################
+# pass structure: this method does another round of inference on the combined data
+#
+# To make the mixture structure comparable on the replicates and the combined data, the 2nd inference is done on the peaks
+# at the reliable regions on the combined data, using rank transformed significant values. The mixture structure is estimated using my consistency analysis, which
+# estimates marginal distributions of ranks using nonparametric ways. Then the significant values are found out.
+# There are several advantages to do it this way:
+# 1. The premise of passing structure is that the means and variance (i.e. distribution) of two replicates should be the same
+# The significant values on the two replicates clearly have different distributions. The structure estimated from consistency
+# analysis will generate similar rank distribution on two replicates by its setup (i.e. same number of peaks are paired up).
+# 2. Because pooled sample is a black box, the structure is more likely to be followed in the matched regions than other locations,
+# after all, we don't know what other things are. If even the structure doesn't hold on the matched regions,
+# which is possible, let alone the other regions. Focusing on the reliable regions helps to get rid of those unknown noises.
+#
+#
+# modified on 2-20-10: reverse rank.combined, make big sig.value with small
+# ranks, to be consistent with f1 and f2
+################
+
+# Second round of inference on the combined data (see the header above for
+# the rationale).
+#
+# uri.output: list (one entry per pair) with data12.enrich$merge1 and
+#   data12.enrich$merge2, the paired peaks of the two replicates
+# em.output: list (one entry per pair) with data.pruned$sample1 and the fit
+#   em.fit (x.mar, y.mar, para$p)
+# combined: peaks of the combined dataset, including the columns "chr",
+#   "start.ori" and "stop.ori"
+# idr.level: IDR level at which peaks are selected
+# sig.value.impute: value marking unmatched peaks
+# chr.size: not used by the current code
+# overlap.ratio: passed on to pair.peaks.filter
+#
+# Returns invisibly list(npeak.stat, combined.selected, sig.combined).
+pass.structure <- function(uri.output, em.output, combined, idr.level, sig.value.impute, chr.size, overlap.ratio=0){
+
+  columns.keep <- c("sig.value", "start", "stop", "signal.value", "p.value", "q.value", "chr", "start.ori", "stop.ori")
+  combined <- combined[, columns.keep]
+  combined.selected.all <- c()
+
+  # number of pairs of consistency analysis (was used without being defined)
+  npair <- length(uri.output)
+
+  for(j in seq_len(npair)){
+
+    sample1 <- uri.output[[j]]$data12.enrich$merge1[, columns.keep]
+    sample2 <- uri.output[[j]]$data12.enrich$merge2[, columns.keep]
+
+    # find peaks on the matched region on the combined one
+    data.matched <- keep.match(sample1, sample2, sig.value.impute=sig.value.impute)
+
+    data.matched$sample1 <- data.matched$sample1[, columns.keep]
+    data.matched$sample2 <- data.matched$sample2[, columns.keep]
+
+    overlap.1 <- pair.peaks.filter(data.matched$sample1, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2
+    overlap.2 <- pair.peaks.filter(data.matched$sample2, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2
+
+    # choose the ones with significant value > sig.value.impute, which are the overlapped ones
+    combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, ]
+    combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, ]
+
+    combined.region <- rbind(combined.in1, combined.in2)
+
+    is.repeated <- duplicated(combined.region$start)
+    combined.region <- combined.region[!is.repeated,]
+
+    # now rank the peaks in matched region (big sig.value gets a small rank)
+    rank.combined <- rank(-combined.region$sig.value)
+
+    # now transform the parameters estimated into the new scale
+    npeaks.overlap <- nrow(combined.region)
+    npeaks.consistent <- nrow(cbind(em.output[[j]]$data.pruned$sample1))
+
+    # the breaks are the same for x and y
+    f1 <- list(breaks=em.output[[j]]$em.fit$x.mar$f1$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f1$density+em.output[[j]]$em.fit$y.mar$f1$density)/2)
+    # the first break boundary goes up when changing scale, need set it back to be a bit smaller than 1
+    f1$breaks[1] <- min(f1$breaks[1], 0.95)
+
+    f2 <- list(breaks=em.output[[j]]$em.fit$x.mar$f2$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f2$density+em.output[[j]]$em.fit$y.mar$f2$density)/2)
+    # the first break boundary goes up when changing scale, need set it back to be a bit smaller than 1
+    f2$breaks[1] <- min(f2$breaks[1], 0.95)
+
+    p <- em.output[[j]]$em.fit$para$p
+
+    # find the posterior probability
+    errorprob.combined <- get.comp2.prob(rank.combined, p, f1, f2)
+
+    # compute the IDR and find the first position where it exceeds idr.level
+    o <- order(errorprob.combined)
+    idr <- cumsum(errorprob.combined[o])/c(1:length(o))
+    idr.index <- which(idr > idr.level)[1]
+    # if the IDR never exceeds idr.level every peak in the region is selected
+    if(is.na(idr.index))
+      idr.index <- length(o)
+
+    # find the minimum significant measure among selected peaks
+    sig.value <- min(combined.region$sig.value[o][seq_len(idr.index)])
+
+    # apply the significant value on the whole pooled list
+    combined.selected <- combined[combined$sig.value >= sig.value,]
+
+    combined.selected.all <- rbind(combined.selected.all, combined.selected)
+  }
+
+  is.repeated <- duplicated(combined.selected.all$start)
+  combined.selected.all <- combined.selected.all[!is.repeated,]
+
+  npeak <- nrow(combined.selected.all)
+
+  npeak.stat <- list(idr.level=idr.level, npeak=npeak)
+
+  sig.combined <- c(min(combined.selected.all[,"sig.value"], na.rm=T), max(combined.selected.all[,"sig.value"], na.rm=T))
+
+  # report the original coordinates under the usual column names
+  combined.selected.all <- combined.selected.all[, c("chr", "start.ori", "stop.ori", "signal.value", "p.value", "q.value")]
+  colnames(combined.selected.all) <- c("chr", "start", "stop", "signal.value", "p.value", "q.value")
+
+  invisible(list(npeak.stat=npeak.stat, combined.selected=combined.selected.all, sig.combined=sig.combined))
+}
+
+
+
+# get the posterior probability of the 2nd component
+# x: values at which the two component densities are evaluated
+# p: mixing proportion of the 1st component
+# f1, f2: density descriptions of the two components, passed to get.pdf as df
+get.comp2.prob <- function(x, p, f1, f2){
+
+  # density of every x under each component
+  dens.1 <- sapply(x, get.pdf, df=f1)
+  dens.2 <- sapply(x, get.pdf, df=f2)
+
+  # posterior of the 1st component; the 2nd one is its complement
+  post.1 <- p*dens.1/(p*dens.1+(1-p)*dens.2)
+
+  return(1 - post.1)
+}
+
+# keep the rows where both samples have sig.value above sig.value.impute
+# sample1, sample2: row-aligned tables with a sig.value column
+# returns invisibly list(sample1, sample2) holding the retained rows
+keep.match <- function(sample1, sample2, sig.value.impute=0){
+
+  both.above <- sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute
+
+  invisible(list(sample1=sample1[both.above,], sample2=sample2[both.above,]))
+}
+
+
+##############################################
+#
+# The following is for simulation
+#
+##############################################
+
+
+# simulate gaussian copula
+# u is the uniform random variable and rho is correlation coefficient
+# returns (invisibly) v, one value per element of u
+simu.gaussian.copula <- function(u, rho){
+
+  n.obs <- length(u)
+
+  # simulate y given x=qnorm(u): conditional mean rho*x, sd sqrt(1-rho^2)
+  y.cond <- qnorm(u)*rho + rnorm(n.obs)*sqrt(1-rho^2)
+
+  # back to the uniform scale
+  invisible(pnorm(y.cond))
+}
+
+## simulate Clayton copula from its generating function
+## Genest and MacKay (1986)
+
+# generating function: phi(t) = (t^(-s) - 1)/s
+phi.ori <- function(t, s){
+
+  numer <- t^(-s) -1
+  return(numer/s)
+}
+
+
+# inverse of the generating function: (s*y + 1)^(-1/s), computed via exp/log
+phi.inv <- function(y, s){
+
+  log.base <- log(s*y+1)
+  return(exp(-log.base/s))
+}
+
+# derivative of the generating function with respect to t: -t^(-s-1)
+phi.der <- function(t, s){
+
+  power.term <- t^(-s-1)
+  return(-power.term)
+}
+
+# inverse of the derivative: (-y)^(-1/(s+1)), computed via exp/log
+phi.der.inv <- function(y, s){
+
+  log.neg <- log(-y)
+  return(exp(log.neg/(-s-1)))
+}
+
+# intermediate value w: the point whose derivative equals phi.der(u, s)/t
+# u, t: values on the uniform scale; s: association parameter
+get.w <- function(u, t, s){
+
+  scaled.slope <- phi.der(u, s)/t
+  return(phi.der.inv(scaled.slope, s))
+}
+
+# second coordinate v: phi.inv applied to phi.ori(w, s) - phi.ori(u, s)
+get.v <- function(w, u, s){
+
+  phi.gap <- phi.ori(w, s) - phi.ori(u, s)
+  return(phi.inv(phi.gap, s))
+}
+
+# u is a uniform random variable, s is the association parameter
+# s > 0: returns v obtained from u and a fresh uniform draw via get.w/get.v
+# s == 0: returns the fresh uniform draw itself
+# s < 0: prints a message
+simu.clayton.copula <- function(u, s){
+
+  # the uniform draw is made first, whatever the value of s
+  unif.draw <- runif(length(u))
+
+  if(s>0)
+    return(get.v(get.w(u, unif.draw, s), u, s))
+
+  if(s==0)
+    return(unif.draw)
+
+  if(s <0)
+    print("Invalid association parameters for clayton copula")
+}
+
+
+
+###### 09-09-09
+
+# simulate a two-component copula mixture:
+# - marginal distributions for the two variables in each component are both
+#   normal and with the same parameters
+# s1, s2: association parameters of component 1 and 2
+# p is the mixing proportion of component 1
+# n is the total sample size
+# mu1, sd1 / mu2, sd2: normal parameters of component 1 / component 2
+# copula.txt: "clayton" or "gaussian"
+# returns list(u, v, u.inv, v.inv, label); component 1 comes first
+simu.copula.2mix <- function(s1, s2, p, n, mu1, mu2, sd1, sd2, copula.txt){
+
+  n.comp1 <- round(n*p)
+  n.comp2 <- n-n.comp1
+
+  # component 1: uniform draw, then its copula partner
+  u1 <- runif(n.comp1)
+  if(copula.txt =="clayton"){
+    v1 <- simu.clayton.copula(u1, s1)
+  } else if(copula.txt =="gaussian"){
+    v1 <- simu.gaussian.copula(u1, s1)
+  }
+
+  # component 2
+  u2 <- runif(n.comp2)
+  if(copula.txt =="clayton"){
+    v2 <- simu.clayton.copula(u2, s2)
+  } else if(copula.txt =="gaussian"){
+    v2 <- simu.gaussian.copula(u2, s2)
+  }
+
+  # generate test statistics through the normal quantile function
+  u.inv <- c(qnorm(u1, mu1, sd1), qnorm(u2, mu2, sd2))
+  v.inv <- c(qnorm(v1, mu1, sd1), qnorm(v2, mu2, sd2))
+
+  return(list(u=c(u1, u2), v=c(v1, v2),
+              u.inv=u.inv, v.inv=v.inv,
+              label=c(rep(1, n.comp1), rep(2, n.comp2))))
+}
+
+# simulate a two-component copula mixture, using inverse of the cdf to
+# generate original observations
+# s1, s2: association parameters of component 1 and 2
+# p is the mixing proportion of component 1, n is the total sample size
+# cdf1.x, cdf1.y, cdf2.x, cdf2.y: cdfs handed to inv.cdf.vec for the x and y
+#   coordinate of component 1 and 2
+# copula.txt: "clayton" or "gaussian"
+# returns list(u, v, u.inv, v.inv, label); component 1 comes first
+simu.copula.2mix.inv <- function(s1, s2, p, n, cdf1.x, cdf1.y, cdf2.x, cdf2.y, copula.txt){
+
+  n.comp1 <- round(n*p)
+  n.comp2 <- n-n.comp1
+
+  # component 1: uniform draw, then its copula partner
+  u1 <- runif(n.comp1)
+  if(copula.txt =="clayton"){
+    v1 <- simu.clayton.copula(u1, s1)
+  } else if(copula.txt =="gaussian"){
+    v1 <- simu.gaussian.copula(u1, s1)
+  }
+
+  # component 2
+  u2 <- runif(n.comp2)
+  if(copula.txt =="clayton"){
+    v2 <- simu.clayton.copula(u2, s2)
+  } else if(copula.txt =="gaussian"){
+    v2 <- simu.gaussian.copula(u2, s2)
+  }
+
+  # map the uniforms back to observations through the supplied cdfs
+  obs1.x <- inv.cdf.vec(u1, cdf1.x)
+  obs1.y <- inv.cdf.vec(v1, cdf1.y)
+
+  obs2.x <- inv.cdf.vec(u2, cdf2.x)
+  obs2.y <- inv.cdf.vec(v2, cdf2.y)
+
+  return(list(u=c(u1, u2), v=c(v1, v2),
+              u.inv=c(obs1.x, obs2.x), v.inv=c(obs1.y, obs2.y),
+              label=c(rep(1, n.comp1), rep(2, n.comp2))))
+}
+
+# obtain original observation by converting cdf into quantiles
+# u is one cdf value
+# u.cdf is a cdf (assuming it is a histogram) and has the break points (u.cdf$cdf and u.cdf$breaks)
+# the smallest value of cdf=0 and the largest =1
+# the quantile is interpolated linearly inside the bin that contains u
+inv.cdf <- function(u, u.cdf){
+
+  # which bin it falls into: first break whose cdf exceeds u
+  hi <- which(u.cdf$cdf> u)[1]
+  lo <- hi-1
+
+  # fraction of the bin's probability mass lying below u
+  frac <- (u - u.cdf$cdf[lo])/(u.cdf$cdf[hi] - u.cdf$cdf[lo])
+
+  return(frac* (u.cdf$breaks[hi]-u.cdf$breaks[lo]) + u.cdf$breaks[lo])
+}
+
+# vectorized inv.cdf: convert every cdf value in u into a quantile
+# u: vector of cdf values
+# u.cdf: list with the cdf values (u.cdf$cdf) and break points (u.cdf$breaks)
+inv.cdf.vec <- function(u, u.cdf){
+
+  # check if cdf has the right range (0, 1)
+  ncdf <- length(u.cdf$cdf)
+  nbreaks <- length(u.cdf$breaks)
+
+  # force the last cdf value up to 1; the assignment has to go into the cdf
+  # vector (u.cdf[ncdf] <- 1 would overwrite a whole list component instead)
+  if(ncdf == nbreaks-1 && u.cdf$cdf[ncdf]< 1)
+    u.cdf$cdf[ncdf] <- 1
+
+  q.u <- sapply(u, inv.cdf, u.cdf)
+
+  return(q.u)
+}
+
+# here we simulate a likely real situation
+# the test statistics come from two normal distributions according to their
+# labels, and are then converted into p-values w.r.t. H0 using a one-sided test.
+# The test statistics are correlated for the signal component and independent
+# for the noise component
+# For the signal component both replicates share the same underlying mean and
+# each adds its own eps ~ N(0, sd.e^2)
+#
+# p: proportion of signal, n: total sample size
+# mu1, sd1: mean and sd of the signal means; mu0, sd0: mean and sd under H0
+# sd.e: sd of the replicate noise
+# returns list(p.x, p.y, x, y, labels); labels are 1 for signal, 0 for noise
+simu.test.stat <- function(p, n, mu1, sd1, mu0, sd0, sd.e){
+
+  # first component - signal
+  n.signal <- round(n*p)
+  n.noise <- n - n.signal
+
+  # labels
+  labels <- c(rep(1, n.signal), rep(0, n.noise))
+
+  # the draws below are made in a fixed order: shared signal mean, then x, then y
+  shared.mean <- rnorm(n.signal, mu1, sd1)
+
+  x <- c(shared.mean + rnorm(n.signal, 0, sd.e),
+         rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e))
+
+  # sd.e can be dependent on signal
+  y <- c(shared.mean + rnorm(n.signal, 0, sd.e),
+         rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e))
+
+  # convert to p-values based on H0; under H0 the sd combines sd0 and sd.e
+  null.sd <- sqrt(sd0^2+sd.e^2)
+  p.x <- 1-pnorm(x, mu0, null.sd)
+  p.y <- 1-pnorm(y, mu0, null.sd)
+
+  return(list(p.x=p.x, p.y=p.y, x=x, y=y, labels=labels))
+
+}
+
+# compute the tradeoff and calibration
+# xx: scores (rounded to 5 digits before use); larger means more confident
+# labels: 1 for a forward (consistent) observation, 0 for a decoy
+# ndecoy: cutoffs are examined down to the score at which ndecoy decoys have
+#   been admitted (or all decoys if there are not more than ndecoy)
+# returns, per cutoff: decoy and forward counts, the cutoff, the estimated
+#   FDR (1 - mean score above the cutoff), the empirical FDR, sensitivity and
+#   specificity
+forward.decoy.tradeoff.ndecoy <- function(xx, labels, ndecoy){
+
+  xx <- round(xx, 5)
+  o <- order(xx, decreasing=T)
+  xx.sorted <- xx[o]
+
+  rand <- 1-labels # if rand==0, consistent
+  # running number of decoys in decreasing score order
+  decoy.running <- cumsum(rand[o])
+  decoy.total <- sum(rand[o])
+
+  decoy.target <- if(decoy.total > ndecoy) ndecoy else decoy.total
+  cutoff.decoy <- xx.sorted[which(decoy.running==decoy.target)]
+
+  # only consider the unique ones, down to the last score at the decoy target
+  cutoff.unique <- unique(xx.sorted)
+  cutoff <- cutoff.unique[cutoff.unique >= cutoff.decoy[length(cutoff.decoy)]]
+
+  # indicator helpers; a comparison that is NA counts as not selected
+  is.above <- function(cut.off) (xx >= cut.off) %in% TRUE
+  is.below <- function(cut.off) (xx < cut.off) %in% TRUE
+
+  decoy <- sapply(cutoff, function(cut.off) sum(is.above(cut.off) & rand==1))
+  forward <- sapply(cutoff, function(cut.off) sum(is.above(cut.off) & rand==0))
+
+  est.fdr <- sapply(cutoff, function(cut.off) 1-mean(xx[is.above(cut.off)]))
+  emp.fdr <- decoy/(decoy+forward)
+
+  # compute specificity and sensitivity
+  # assuming rand=1 is wrong and rand=0 is right
+  false.neg <- sapply(cutoff, function(cut.off) sum(is.below(cut.off) & rand==0))
+  # a false positive is a decoy above the cutoff, i.e. the decoy count
+  false.pos <- decoy
+
+  true.pos <- sum(rand==0)-false.neg
+  true.neg <- sum(rand==1)-false.pos
+
+  sensitivity <- true.pos/(true.pos+false.neg)
+  specificity <- true.neg/(true.neg+false.pos)
+
+  return(list(decoy=decoy, forward=forward, cutoff=cutoff, est.fdr=est.fdr, emp.fdr=emp.fdr, sensitivity=sensitivity, specificity=specificity))
+}
+
+
+# compute the em for jackknife and all data, and find FDR
+# a: dataset with labels, p.x and p.y; the fit is run on -p.x and -p.y
+# p0: not used by the current code
+# returns list(est.jack, est.all): est.all holds p, rho1 and FDR from the fit
+#   on all observations, est.jack the same quantities from the fits that
+#   leave one observation out
+get.emp.jack <- function(a, p0){
+
+  nobs <- length(a$labels)
+
+  # fit on all observations
+  fit.all <- em.transform(-a$p.x, -a$p.y, mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)
+
+  est.all <- list()
+  est.all$p <- fit.all$para$p
+  est.all$rho1 <- fit.all$para$rho1
+  est.all$FDR <- get.FDR(fit.all$e.z)
+
+  FDR <- list()
+  p <- c()
+  rho1 <- c()
+
+  # refit with the k-th observation left out
+  for(k in 1:nobs){
+
+    fit.k <- em.transform(-a$p.x[-k], -a$p.y[-k], mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)
+
+    FDR[[k]] <- get.FDR(fit.k$e.z) # this is the FDR for top n peaks
+    p[k] <- fit.k$para$p
+    rho1[k] <- fit.k$para$rho1
+  }
+
+  return(list(est.jack=list(FDR=FDR, p=p, rho1=rho1), est.all=est.all))
+}
+
+
+# get the npeaks corresponding to the nominal FDR estimated from the sample
+# and find the corresponding FDR from the entire data
+# est: list with est.jack$FDR (one FDR curve per leave-one-out fit) and
+#   est.all$FDR (FDR curve from all observations)
+# FDR.nominal: nominal FDR level
+# returns invisibly list(FDR.all, top.n)
+get.FDR.jack <- function(est, FDR.nominal){
+
+  jack.curves <- est$est.jack$FDR
+  nobs <- length(jack.curves)
+
+  top.n <- c()
+  FDR.all <- c()
+
+  for(k in 1:nobs){
+    # last position of the k-th curve that is still within the nominal level
+    top.n[k] <- max(which(jack.curves[[k]] <= FDR.nominal))
+    # FDR at that position on the curve from all observations
+    FDR.all[k] <- est$est.all$FDR[top.n[k]]
+  }
+
+  invisible(list(FDR.all=FDR.all, top.n=top.n))
+}
+
+# compute jackknife pseudo-values (sample influence function)
+# a is the dataset, with labels, p.x and p.y
+# p0: starting value of the mixing proportion handed to em.2copula.quick
+# returns invisibly list(emp.IF, est.all, est): est.all is the fit on all
+#   observations, est the leave-one-out fits and emp.IF the values
+#   (nobs-1)*(estimate from all data - leave-one-out estimate) for FDR, p
+#   and rho1
+get.emp.IF <- function(a, p0){
+
+  nobs <- length(a$labels)
+  est <- list()
+
+  # fit on all observations
+  fit.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7,
+                              rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+
+  est.all <- list()
+  est.all$p <- fit.all$para$p
+  est.all$rho1 <- fit.all$para$rho1
+  est.all$FDR <- get.FDR(fit.all$e.z)
+
+  IF.FDR <- list()
+  IF.p <- c()
+  IF.rho1 <- c()
+
+  # refit with the k-th observation left out
+  for(k in 1:nobs){
+
+    fit.k <- em.2copula.quick(a$p.x[-k], a$p.y[-k], p0=p0, rho1.0=0.7,
+                              rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+
+    est[[k]] <- list(p=fit.k$para$p, rho1=fit.k$para$rho1, FDR=get.FDR(fit.k$e.z))
+
+    # the last entry of the full-data FDR is dropped so both curves have nobs-1 entries
+    IF.FDR[[k]] <- (nobs-1)*(est.all$FDR[-nobs] - est[[k]]$FDR) # this is the FDR for top n peaks
+    IF.p[k] <- (nobs-1)*(est.all$p - est[[k]]$p)
+    IF.rho1[k] <- (nobs-1)*(est.all$rho1 - est[[k]]$rho1)
+  }
+
+  emp.IF <- list(FDR=IF.FDR, p=IF.p, rho1=IF.rho1)
+
+  invisible(list(emp.IF=emp.IF, est.all=est.all, est=est))
+}
+
+# e.z: posterior probability of belonging to the signal component
+get.FDR <- function(e.z){
+# running mean of the local error rates 1-e.z, taken in ascending order
+ err.sorted <- (1-e.z)[order(1-e.z)]
+ fdr.curve <- cumsum(err.sorted)/seq(1, length(err.sorted))
+
+ invisible(fdr.curve)
+}
+
+# for every leave-one-out fit, extract the FDR influence value at rank top.n
+# IF.est is the sample influence function (reads IF.est$emp.IF$FDR, a list of vectors)
+# top.n is the index taken from each of those vectors
+get.IF.FDR <- function(IF.est, top.n){
+
+ nobs <- length(IF.est$emp.IF$FDR)
+ FDR <- c()
+
+ # influence value of FDR at top.n, one per fit
+ for(i in 1:nobs)
+ FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n]
+# returned invisibly as a plain vector
+ invisible(FDR)
+}
+
+# get the sample influence function for FDR at a given FDR size
+# 1. find the number of peaks selected at a given FDR computed from all obs
+# 2. use the number to find the sample influence function for FDR
+# IF.est$est.all$FDR is the FDR with all peaks
+get.IF.FDR.all <- function(IF.est, FDR.size){
+# top.n is the index whose full-data FDR is closest to FDR.size
+ top.n <- which.min(abs(IF.est$est.all$FDR -FDR.size))
+ nobs <- length(IF.est$est.all$FDR)
+ FDR <- c()
+
+ # influence value of FDR at top.n, one per fit
+ for(i in 1:nobs)
+ FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n]
+# returned invisibly: the extracted values and the index that was used
+ invisible(list(FDR=FDR, top.n=top.n))
+}
+
+plot.simu.uri <- function(x, y){
+# 2x2 diagnostic plot; n0, n, rho1, rho2 and p are NOT arguments and must exist in an enclosing environment
+ tt <- seq(0.01, 0.99, by=0.01)
+ uri <- sapply(tt, comp.uri.prob, u=x, v=y)
+ uri.thin <- uri[seq(1, length(tt), by=3)]
+ tt.thin <- tt[seq(1, length(tt), by=3)]
+ duri <- (uri.thin[-1]-uri.thin[-length(uri.thin)])/(tt.thin[-1]-tt.thin[-length(tt.thin)])
+ uri.spl <- smooth.spline(tt, uri, df=6.4)
+ uri.der <- predict(uri.spl, tt, deriv=1)
+# duri: finite-difference slope on every third grid point; uri.der: derivative of the spline fit
+ par(mfrow=c(2,2))
+ plot(x[1:n0], y[1:n0])
+ points(x[(n0+1):n], y[(n0+1):n], col=2)
+ plot(rank(-x)[1:n0], rank(-y)[1:n0])
+ points(rank(-x)[(1+n0):n], rank(-y)[(1+n0):n])
+ plot(tt, uri)
+ lines(c(0,1), c(0,1), lty=2)
+ title(paste("rho1=", rho1, " rho2=", rho2, "p=", p, sep=""))
+ plot(tt.thin[-1], duri)
+ lines(uri.der)
+ abline(h=1)
+ invisible(list(x=x, y=y, uri=uri, tt=tt, duri=duri, tt.thin=tt.thin, uri.der=uri.der))
+
+}
+
+
+###### new fitting procedure
+
+
+
+
+# 1. rank pairs
+
+# 2. initialization
+# 3. convert to pseudo-number
+
+# 4. EM
+
+# need plugin and test
+# find the middle point between the bins
+get.pseudo.mix <- function(x, mu, sigma, rho, p){
+# Map cdf values x to quantiles of the mixture p*N(mu, sigma^2) + (1-p)*N(0,1)
+# by linear interpolation on a grid; rho is accepted but not used here
+ # first compute cdf for points on the grid
+ # generate nw points covering [-3, 3] and [mu-3*sigma, mu+3*sigma]
+ nw <- 1000
+ w <- seq(min(-3, mu-3*sigma), max(mu+3*sigma, 3), length=nw)
+ w.cdf <- p*pnorm(w, mean=mu, sd=sigma) + (1-p)*pnorm(w, mean=0, sd=1)
+
+# entries of x that are NA are never matched below and stay NA
+
+ quan.x <- rep(NA, length(x))
+# nw grid points define nw-1 bins, so stop at nw-1 and never read w.cdf[nw+1]
+ for(i in c(1:(nw-1))){
+ index <- which(x >= w.cdf[i] & x < w.cdf[i+1])
+ quan.x[index] <- (x[index]-w.cdf[i])*(w[i+1]-w[i])/(w.cdf[i+1]-w.cdf[i]) +w[i]
+ }
+# values below the grid are clamped to its first point
+ index <- which(x < w.cdf[1])
+ if(length(index)>0)
+ quan.x[index] <- w[1]
+# values at or above the last grid cdf are clamped (>= so an exact tie is not left NA)
+ index <- which(x >= w.cdf[nw])
+ if(length(index)>0)
+ quan.x[index] <- w[nw]
+
+# linear.ext <- function(x, w, w.cdf){
+ # linear interpolation
+# index.up <- which(w.cdf>= x)[1]
+# left.index <- which(w.cdf <=x)
+# index.down <- left.index[length(left.index)]
+# quan.x <- (w[index.up] + w[index.down])/2
+# }
+
+# x.pseudo <- sapply(x, linear.ext, w=w, w.cdf=w.cdf)
+
+# invisible(x.pseudo)
+ invisible(quan.x)
+}
+
+
+# EM to compute the latent structure
+# steps:
+# 1. raw values are first transformed into pseudovalues
+# 2. EM is used to compute the underlying structure, which is a mixture
+# of two normals
+em.transform <- function(x, y, mu, sigma, rho, p, eps){
+# mu, sigma, rho, p are starting values; a loop stops once its log-likelihood gain is <= eps
+ x.cdf.func <- ecdf(x)
+ y.cdf.func <- ecdf(y)
+ afactor <- length(x)/(length(x)+1)
+ x.cdf <- x.cdf.func(x)*afactor
+ y.cdf <- y.cdf.func(y)*afactor
+# afactor = n/(n+1) keeps the rescaled empirical cdf values strictly below 1
+ # initialization
+ para <- list()
+ para$mu <- mu
+ para$sigma <- sigma
+ para$rho <- rho
+ para$p <- p
+
+ j <- 1
+ to.run <- T
+ loglik.trace <- c()
+ loglik.inner.trace <- c()
+# pseudo values under the starting parameters
+ #to.run.inner <- T
+ z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
+ z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)
+
+# cat("length(z1)", length(z.1), "\n")
+ while(to.run){
+# outer loop: run the inner EM on fixed pseudo values, then recompute the pseudo values
+ # get pseudo value in each cycle
+# z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
+# z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)
+
+ i <- 1
+ while(to.run){
+# the inner loop reuses the outer to.run flag; the outer loop assigns it again further down
+ # EM for latent structure
+ e.z <- e.step.2normal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
+ para <- m.step.2normal(z.1, z.2, e.z)
+#para$rho <- rho
+#para$p <- p
+#para$mu <- mu
+#para$sigma <- sigma
+ if(i > 1)
+ l.old <- l.new
+
+ # this is just the mixture likelihood of two-component Gaussian
+ l.new <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
+
+ loglik.inner.trace[i] <- l.new
+# the inner loop has no iteration cap; it ends only when the gain is <= eps
+ if(i > 1){
+ to.run <- loglik.inner.trace[i]-loglik.inner.trace[i-1]>eps
+ }
+
+
+# if(i > 2){
+# l.inf <- loglik.inner.trace[i-2] + (loglik.inner.trace[i-1] - loglik.inner.trace[i-2])/(1-(loglik.inner.trace[i]-loglik.inner.trace[i-1])/(loglik.inner.trace[i-1]-loglik.inner.trace[i-2]))
+
+# if(loglik.inner.trace[i-1]!=loglik.inner.trace[i-2])
+# to.run <- abs(l.inf - loglik.inner.trace[i]) > eps
+# else
+# to.run <- F
+
+# }
+
+ cat("loglik.inner.trace[", i, "]=", loglik.inner.trace[i], "\n")
+ cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n\n")
+
+ i <- i+1
+ }
+
+
+ # get pseudo value in each cycle
+ z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
+ z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)
+
+ if(j > 1)
+ l.old.outer <- l.new.outer
+
+ l.new.outer <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
+
+ loglik.trace[j] <- l.new.outer
+# the first outer pass always continues; later passes stop on small gain or once j > 100
+ if(j == 1)
+ to.run <- T
+ else{ # stop when iteration>100
+ if(j > 100)
+ to.run <- F
+ else
+ to.run <- l.new.outer - l.old.outer > eps
+ }
+
+# if(j %% 10==0)
+ cat("loglik.trace[", j, "]=", loglik.trace[j], "\n")
+ cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n")
+
+ j <- j+1
+ }
+# bic and loglik use l.new, the last inner-loop log-likelihood; 4 is the penalty multiplier
+ bic <- -2*l.new + 4*log(length(z.1))
+# NOTE(review): the returned para names the correlation rho; it has no rho1 element
+ return(list(para=list(p=para$p, rho=para$rho, mu=para$mu, sigma=para$sigma),
+ loglik=l.new, bic=bic, e.z=e.z, loglik.trace=loglik.trace))
+}
+
+
+
+
+# compute log-likelihood for mixture of two bivariate normals
+loglik.2binormal <- function(z.1, z.2, mu, sigma, rho, p){
+# d.binormal returns log densities, so each term is log f0 + log(p*exp(log f1 - log f0) + (1-p))
+ l.m <- sum(d.binormal(z.1, z.2, 0, 1, 0)+log(p*exp(d.binormal(z.1, z.2, mu, sigma, rho)-d.binormal(z.1, z.2, 0, 1, 0))+(1-p)))
+# f1 uses (mu, sigma, rho); f0 is the component with mean 0, sd 1, correlation 0
+# l.m <- sum((p*d.binormal(z.1, z.2, mu, sigma, rho) + (1-p)*d.binormal(z.1, z.2, 0, 1, 0)))
+ return(l.m)
+}
+
+# check this when rho=1 (log(1-rho^2) and the division by (1-rho^2) are not finite there)
+
+# log density of the bivariate normal distribution with equal mean and sigma on both dimensions
+d.binormal <- function(z.1, z.2, mu, sigma, rho){
+# despite the local name, this is the per-observation log density, not a summed log-likelihood
+ loglik <- (-log(2)-log(pi)-2*log(sigma) - log(1-rho^2)/2 - (0.5/(1-rho^2)/sigma^2)*((z.1-mu)^2 -2*rho*(z.1-mu)*(z.2-mu) + (z.2-mu)^2))
+
+ return(loglik)
+}
+
+# E-step for computing the latent structure
+# e.z is the prob to be in the consistent group
+# e.step for estimating posterior prob
+# z.1 and z.2 can be vectors or scalars
+e.step.2normal <- function(z.1, z.2, mu, sigma, rho, p){
+# equals p*f1/(p*f1 + (1-p)*f0), written with the log-density difference log f0 - log f1
+ e.z <- p/((1-p)*exp(d.binormal(z.1, z.2, 0, 1, 0)-d.binormal(z.1, z.2, mu, sigma, rho))+ p)
+# f1 uses (mu, sigma, rho); f0 has mean 0, sd 1, correlation 0
+ invisible(e.z)
+}
+
+# M-step: weighted updates for one mixture component, using weights e.z
+# returns list(p, mu, sigma, rho): proportion, common mean, common sd, correlation
+m.step.2normal <- function(z.1, z.2, e.z){
+ w.total <- sum(e.z)
+ mu <- sum((z.1+z.2)*e.z)/2/w.total
+ d.1 <- z.1-mu
+ d.2 <- z.2-mu
+ ss <- sum(e.z*(d.1^2+d.2^2))
+
+ return(list(p=mean(e.z), mu=mu, sigma=sqrt(ss/2/w.total), rho=2*sum(e.z*d.1*d.2)/ss))
+}
+
+
+# assume top p percent of observations are true
+# x and y are ranks; estimate starting values mu, sigma, rho and p
+init <- function(x, y, x.label){
+# x.label is summed to get p, so it is presumably 0/1 (find.mu.sigma below selects on 0 and 1)
+ x.o <- order(x)
+
+ x.ordered <- x[x.o]
+ y.ordered <- y[x.o]
+ x.label.ordered <- x.label[x.o]
+
+ n <- length(x)
+ p <- sum(x.label)/n
+# rho: correlation over the first ceiling(p*n) pairs after sorting by x
+ rho <- cor(x.ordered[1:ceiling(p*n)], y.ordered[1:ceiling(p*n)])
+
+ temp <- find.mu.sigma(x.ordered, x.label.ordered)
+ mu <- temp$mu
+ sigma <- temp$sigma
+
+ invisible(list(mu=mu, sigma=sigma, rho=rho, p=p))
+
+}
+
+# find mu and sigma if the distributions of marginal ranks are known
+# take the medians of the two dist and map back to the original
+init.dist <- function(f0, f1){
+# f0$cdf and f1$cdf are read at the same indices, i.e. both are tabulated on the same bins
+ # take the median in f0
+ index.median.0 <- which(f0$cdf>0.5)[1]
+ q.0.small <- f0$cdf[index.median.0] # because f0 and f1 have the same bins
+ q.1.small <- f1$cdf[index.median.0]
+
+ # take the median in f1
+ index.median.1 <- which(f1$cdf>0.5)[1]
+ q.0.big <- f0$cdf[index.median.1] # because f0 and f1 have the same bins
+ q.1.big <- f1$cdf[index.median.1]
+
+ # find pseudo value for the first point (median of f0) on normal(0,1)
+ pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)
+ pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)
+
+ # find pseudo value for the second point (median of f1) on normal(0,1)
+ pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)
+ pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)
+# mu and sigma solve pseudo.*.0 = mu + sigma*pseudo.*.1 at both points
+ mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1)
+
+ sigma <- (pseudo.small.0-mu)/pseudo.small.1
+
+ return(list(mu=mu, sigma=sigma))
+}
+
+# generate labels
+
+# find the part of data with overlap
+
+# find the percentile on noise and signal
+
+# Suppose there are signal and noise components, with mean=0 and sd=1 for noise
+# x and x.label are the rank of the observations and their labels,
+# find the mean and sd of the other component
+# x.label takes values of 0 and 1
+find.mu.sigma <- function(x, x.label){
+# split the observations by label
+ x.0 <- x[x.label==0]
+ x.1 <- x[x.label==1]
+
+ n.x0 <- length(x.0)
+ n.x1 <- length(x.1)
+# the 2nd and 3rd of the four sorted end points bound the range where both groups overlap
+ x.end <- c(min(x.0), min(x.1), max(x.0), max(x.1))
+ o <- order(x.end)
+ x.middle <- x.end[o][c(2,3)]
+
+ # the smaller end of the overlap (empirical cdf scaled by n/(n+1), so it stays below 1)
+ q.1.small <- mean(x.1 <= x.middle[1])*n.x1/(n.x1+1)
+ q.0.small <- mean(x.0 <= x.middle[1])*n.x0/(n.x0+1)
+
+ # the bigger end of the overlap
+ q.1.big <- mean(x.1 <= x.middle[2])*n.x1/(n.x1+1)
+ q.0.big <- mean(x.0 <= x.middle[2])*n.x0/(n.x0+1)
+
+ # find pseudo value for x.middle[1] on normal(0,1)
+ pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)
+ pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)
+
+ # find pseudo value for x.middle[2] on normal(0,1)
+ pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)
+ pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)
+# mu and sigma solve pseudo.*.0 = mu + sigma*pseudo.*.1 at both overlap ends
+ mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1)
+
+ sigma <- (pseudo.small.0-mu)/pseudo.small.1
+
+ return(list(mu=mu, sigma=sigma))
+}
diff --git a/pRSEM/idrCode/idrOverlap2npk.sh b/pRSEM/idrCode/idrOverlap2npk.sh
new file mode 100644
index 0000000..7d70280
--- /dev/null
+++ b/pRSEM/idrCode/idrOverlap2npk.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Converts pairwise IDR peak overlap output to narrowPeak
+
+if [[ "$#" -lt 1 ]]
+ then
+ echo 'Converts pairwise IDR peak overlap output to narrowPeak' 1>&2
+ echo "USAGE: $(basename $0) [idrOverlapFile] [oDir]" 1>&2
+ echo '[idrOverlapFile]: overlap output file from pairwise IDR analysis' 1>&2
+ echo '[oDir]: output directory' 1>&2
+ exit 1
+fi
+
+# overlap file
+ovFile=$1
+if [[ ! -e ${ovFile} ]]
+ then
+ echo "ERROR:${ovFile} does not exist" 1>&2
+ exit 1
+fi
+
+# Output directory
+oDir=$(dirname ${ovFile})
+[[ $# -gt 1 ]] && oDir=$2
+if [[ ! -d ${oDir} ]]
+ then
+ mkdir ${oDir}
+fi
+oDir=$(echo ${oDir} | sed -r 's:/$::g')
+
+# Create output file
+oFile="${oDir}/$(basename ${ovFile} .gz).npk.gz"
+if grep -q -E '\.gz$' ${ovFile}
+ then
+ zcat ${ovFile} | sed 1d | sed -r 's/"//g' | sort -k11g,11g | awk '{if ($3 <=$7) st=$3 ; else st=$7 ; if ($4 >= $8) sto=$4 ; else sto=$8 ; printf "%s\t%d\t%d\t%d\t%s\t.\t%s\t%f\t%f\n",$2,st,sto,NR,$5,$9,-log($10)/log(10),-log($11)/log(10)}' | gzip -c > ${oFile}
+else
+ sed 1d ${ovFile} | sed -r 's/"//g' | sort -k11g,11g | awk '{if ($3 <=$7) st=$3 ; else st=$7 ; if ($4 >= $8) sto=$4 ; else sto=$8 ; printf "%s\t%d\t%d\t%d\t%s\t.\t%s\t%f\t%f\n",$2,st,sto,NR,$5,$9,-log($10)/log(10),-log($11)/log(10)}' | gzip -c > ${oFile}
+fi \ No newline at end of file
diff --git a/pRSEM/idrCode/peakCallingPipelineForIdr.txt b/pRSEM/idrCode/peakCallingPipelineForIdr.txt
new file mode 100644
index 0000000..6338bd2
--- /dev/null
+++ b/pRSEM/idrCode/peakCallingPipelineForIdr.txt
@@ -0,0 +1,11 @@
+IDR Peak calling Pipeline:
+Feb 9 2012
+Anshul Kundaje
+akundaje _at_ stanford _dot_ edu
+
+==================
+Mailing list
+==================
+Please join the IDR mailing list https://groups.google.com/group/idr-discuss for FAQs, discussions and updates on software.
+
+See https://sites.google.com/site/anshulkundaje/projects/idr for the latest pipeline.
diff --git a/pRSEM/idrCode/submit.idrmerge.lsf.sh b/pRSEM/idrCode/submit.idrmerge.lsf.sh
new file mode 100644
index 0000000..5c3673a
--- /dev/null
+++ b/pRSEM/idrCode/submit.idrmerge.lsf.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# $1: file containing arguments for batch-consistency-plot-merged2.r
+# File has 4 tab-delimited fields
+# [nPairs]\t[combOutFilePrefix]\t[pairOutFilePrefix,...]\t[combPeakFile]
+# $2: idrThreshold (OPTIONAL: default is 0.1)
+# $3: fdrThreshold (OPTIONAL: default is 0.7)
+# Rscript batch-consistency-plot-merged2.r [npairs] [output.prefix] [input.file.prefix 1, 2, 3 ...] -1 [idr.level] signal.value [pooled.filename] T 0 F
+# One LSF job is submitted with bsub for every line of $1
+if [[ "$#" -lt 1 ]]
+then
+ echo 'submit.idrmerge.lsf.sh:' 1>&2
+ echo 'Submits jobs to run IDR code on pooled peak calls' 1>&2
+ echo 'USAGE:' 1>&2
+ echo 'submit.idrmerge.lsf.sh <idrMergeArgFile> <OPTIONAL:idrThresh> <OPTIONAL:fdrThresh>' 1>&2
+ echo '<idrMergeArgFile>: File has 4 tab-delimited fields' 1>&2
+ echo ' [nPairs]\t[combOutFilePrefix]\t[pairOutFilePrefix,...]\t[combPeakFile]' 1>&2
+ echo '<idrThresh>: OPTIONAL: Default of 0.1' 1>&2
+ echo '<fdrThresh>: OPTIONAL: Default of 0.7' 1>&2
+ exit 1
+fi
+
+idrlevel='0.1'
+if [[ "$#" -ge 2 ]]; then idrlevel=$2 ; fi
+fdrthresh='0.7'
+if [[ "$#" -ge 3 ]]; then fdrthresh=$3 ; fi
+fdrthresh=$(echo ${fdrthresh} | awk '{print -log($1)/log(10)}') # convert fdrthresh to -log10 scale; NOTE(review): the value is not used again in this script
+
+rpath=`which Rscript`
+# TEMPORARY DIRECTORY
+#TMP='/scratch/users/akundaje/temp'
+# NOTE(review): with the line above commented out, TMP must come from the environment
+while read inputline
+do
+# split the line into its four whitespace-separated fields
+ npairs="$(echo $inputline | awk '{print $1}')" # extract npairs
+ ofPrefix="$(echo $inputline | awk '{print $2}')" # extract output.prefix
+ ifPrefix="$(echo $inputline | awk '{print $3}' | sed -r 's/,/ /g')" # extract input.prefixes
+ combFname="$(echo $inputline | awk '{print $4}')" # extract merged peak file name
+ echo "${ofPrefix}"
+ logfile="${ofPrefix}.log"
+ randseed="$RANDOM"
+
+ # Create submitScript
+ submitScript="tempMerge_${randseed}.sh"
+ echo '#!/bin/bash' > "${submitScript}"
+
+ if [[ `echo ${combFname} | grep -E '\.gz$'` ]]
+ then
+ isgz='1'
+ combStub="$(echo ${combFname} | sed -r -e 's:\.gz$::g' -e 's:^.*/::g')" # remove .gz and remove the directory name
+ combPeakFile="${TMP}/${combStub}_${randseed}"
+ # echo "gunzip -c ${combFname} > ${combPeakFile}" >> "${submitScript}"
+ else
+ isgz='0'
+ combPeakFile="${combFname}"
+ fi
+# NOTE(review): the next assignment discards the combPeakFile chosen above; confirm it is intended
+ combPeakFile='random.txt'
+ echo "${rpath} batch-consistency-plot-merged2.r ${npairs} ${ofPrefix} ${ifPrefix} -1 ${idrlevel} signal.value ${combPeakFile} T 0 F" >> "${submitScript}"
+
+ if [[ "${isgz}" == '1' ]]
+ then
+ echo "rm -f ${combPeakFile}" >> "${submitScript}"
+ fi
+# bsub reads the job script from stdin, so the script file is removed right after submission
+ chmod 755 "${submitScript}"
+ bsub -W 24:00 -M 4096 -R "rusage[mem=4096]" -o ${logfile} -e ${logfile} < "${submitScript}"
+ rm "${submitScript}"
+
+done < $1
+
+exit 0
diff --git a/pRSEM/idrCode/submit.idrpair.lsf.sh b/pRSEM/idrCode/submit.idrpair.lsf.sh
new file mode 100644
index 0000000..ae03ed5
--- /dev/null
+++ b/pRSEM/idrCode/submit.idrpair.lsf.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# $1: file containing arguments for batch-consistency-analysis.r
+# File has 3 tab-delimited fields
+# [peakFile1]\t[peakFile2]\t[pairOutFilePrefix]
+# Rscript batch-consistency-analysis.r [peakfile1] [peakfile2] 500 [outfile.prefix] 0 F p.value
+# One LSF job is submitted with bsub for every usable line of $1
+if [[ "$#" -lt 1 ]]
+then
+ echo 'submit.idrpair.lsf.sh:' 1>&2
+ echo 'Submits jobs to run pairwise IDR code' 1>&2
+ echo 'USAGE:' 1>&2
+ echo 'submit.idrpair.lsf.sh <idrPairArgFile>' 1>&2
+ echo '<idrPairArgFile>: File has 3 tab-delimited fields' 1>&2
+ echo ' [peakFile1]\t[peakFile2]\t[pairOutFilePrefix]' 1>&2
+ exit 1
+fi
+
+rpath=`which Rscript`
+# all jobs from this run share one LSF job group
+JOBGROUPID="/idrPair${RANDOM}"
+# counter counts submitted jobs
+counter=1
+
+while read inputline
+do
+# pause for 10 seconds whenever counter is a multiple of 50
+ [[ $(( counter % 50 )) -eq 0 ]] && sleep 10s
+
+ pf1="$(echo $inputline | awk '{print $1}')" # extract peak file 1
+ pf2="$(echo $inputline | awk '{print $2}')" # extract peak file 2
+ ofPrefix="$(echo $inputline | awk '{print $3}')" # extract outfile.prefix
+
+ if [[ ! -e ${pf1} || ! -e ${pf2} || ! -d $( dirname ${ofPrefix} ) ]]
+ then
+ echo "Some file is not found for $( dirname ${ofPrefix} ): $(basename ${pf1}) $(basename ${pf2})"
+ continue
+ fi
+
+ logfile="${ofPrefix}.log"
+ randseed="${RANDOM}${RANDOM}"
+
+ # If file exists then skip
+ if [[ -e "${ofPrefix}-npeaks-aboveIDR.txt" ]]
+ then
+ continue
+ fi
+
+ # Create submit script
+ submitScript="temp_${randseed}.sh"
+ echo '#!/bin/bash' > ${submitScript}
+# gzipped inputs are decompressed inside the job; TMP is not set in this script and must come from the environment
+ if echo ${pf1} | grep -q -E '\.gz$'
+ then
+ pf1gz=1
+ pf1stub="$( basename ${pf1} | sed -r -e 's:\.gz$::g' )" # remove .gz and remove the directory name
+ peakfile1="${TMP}/idr_${randseed}/${pf1stub}_${randseed}"
+ echo "[[ ! -d ${TMP}/idr_${randseed} ]] && mkdir ${TMP}/idr_${randseed}" >> ${submitScript}
+ echo "zcat ${pf1} > ${peakfile1}" >> ${submitScript}
+ else
+ pf1gz=0
+ peakfile1="${pf1}"
+ fi
+
+ if echo ${pf2} | grep -q -E '\.gz$'
+ then
+ pf2gz=1
+ pf2stub="$( basename ${pf2} | sed -r -e 's:\.gz$::g' )" # remove .gz and remove the directory name
+ peakfile2="${TMP}/idr_${randseed}/${pf2stub}_${randseed}"
+ echo "[[ ! -d ${TMP}/idr_${randseed} ]] && mkdir ${TMP}/idr_${randseed}" >> ${submitScript}
+ echo "zcat ${pf2} > ${peakfile2}" >> ${submitScript}
+ else
+ pf2gz=0
+ peakfile2="${pf2}"
+ fi
+
+ echo "${rpath} batch-consistency-analysis.r ${peakfile1} ${peakfile2} 500 ${ofPrefix} 0 F p.value" >> "${submitScript}"
+# the job removes its own temporary directory when either input was gzipped
+ if [[ ${pf1gz} -eq 1 || ${pf2gz} -eq 1 ]]
+ then
+ echo "[[ -d ${TMP}/idr_${randseed} ]] && rm -rf ${TMP}/idr_${randseed}" >> ${submitScript}
+ fi
+# bsub reads the job script from stdin, so the script file is removed right after submission
+ bsub -q research-rh6 -g ${JOBGROUPID} -W 48:00 -o ${logfile} -e ${logfile} < ${submitScript}
+ (( counter = counter + 1 ))
+ # bsub -g {JOBGROUPID} -W 48:00 -M 4096 -R "rusage[mem=4096]" -o ${logfile} -e ${logfile} < "${submitScript}"
+ rm "${submitScript}"
+
+done < $1
+
+exit 0
diff --git a/pRSEM/installRLib.R b/pRSEM/installRLib.R
new file mode 100644
index 0000000..605dded
--- /dev/null
+++ b/pRSEM/installRLib.R
@@ -0,0 +1,71 @@
+#
+# pliu 20160911
+#
+# install Bioconductor and pRSEM-required libraries to this directory
+#
+# require R-3.3.1
+#
+# CRAN: data.table, caTools
+# Local: spp
+# BioC v3.3: ShortRead, GenomicRanges
+#
+# install devtools first and use its install_version to install packages at
+# a particular version.
+#
+# packages repos are obtained from
+# source("http://bioconductor.org/biocLite.R")
+# biocinstallRepos()
+#
+
+main <- function() { # entry point: build the install parameters, then call installRLib
+ param <- list(
+ lib_loc = './',
+ repos = list(
+ BioCsoft = "http://bioconductor.org/packages/3.3/bioc/",
+ BioCann = "http://bioconductor.org/packages/3.3/data/annotation/",
+ BioCexp = "http://bioconductor.org/packages/3.3/data/experiment/",
+ BioCextra = "http://bioconductor.org/packages/3.3/extra/",
+ CRAN = "http://cran.us.r-project.org"
+ ),
+# local source directory of the spp package
+ pkg_spp = '../phantompeakqualtools/spp_1.10.1_on_R3.3/',
+
+ pkg2ver = list(
+ ## name version
+ caTools = '1.17.1', ## for spp
+ data.table = '1.9.6',
+ GenomicRanges = '1.24.3',
+ ShortRead = '1.30.0'
+ )
+ )
+# set the session's default repos option to the CRAN mirror only
+ options(repos=structure(c(CRAN=param$repos$CRAN)))
+ installRLib(param)
+}
+
+
+installRLib <- function(param) { # install devtools, the pinned packages and spp into param$lib_loc
+ prsem_installed_pkgs <- rownames(installed.packages(lib.loc=param$lib_loc))
+# the list above is taken once, before anything is installed, and only covers param$lib_loc
+ if ( ! 'devtools' %in% prsem_installed_pkgs ) {
+ install.packages('devtools', lib=param$lib_loc, type='source')
+ }
+# put param$lib_loc first on the library search path so library(devtools) can find it there
+ .libPaths(c(param$lib_loc, .libPaths()))
+ library(devtools)
+# install each missing package at the version pinned in param$pkg2ver
+ for ( pkg_name in names(param$pkg2ver) ) {
+ pkg_version <- param$pkg2ver[[pkg_name]]
+ if ( ! pkg_name %in% prsem_installed_pkgs ) {
+ install_version(pkg_name, version=pkg_version, repos=param$repos,
+ lib=param$lib_loc, type='source')
+ }
+ }
+# spp is installed from the local path param$pkg_spp (repos=NULL), not from a repository
+ if ( ! 'spp' %in% prsem_installed_pkgs ) {
+ install.packages(param$pkg_spp, lib=param$lib_loc, repos=NULL,
+ type='source')
+ }
+}
+
+main()
diff --git a/pRSEM/phantompeakqualtools/README.txt b/pRSEM/phantompeakqualtools/README.txt
new file mode 100644
index 0000000..9b3b51f
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/README.txt
@@ -0,0 +1,203 @@
+===========================
+Anshul Kundaje
+Date: Feb 13 2012
+Email: anshul@kundaje.net
+Version: 2.0
+===========================
+This set of programs operates on mapped Illumina single-end read datasets in tagAlign or BAM format.
+They can be used to
+(1) Compute the predominant fragment length based on strand cross-correlation peak
+(2) Compute Data quality measures based on strand cross-correlation analysis and relative phantom peak
+(3) Call Peaks and regions for punctate binding ChIP-seq datasets
+
+===========================
+CITATIONS:
+===========================
+If you are using the code or results in any formal publication please cite
+[1] Anshul Kundaje, Computer Science Dept., MIT, ENCODE Consortium, http://code.google.com/p/phantompeakqualtools, Feb 2013
+[2] Kharchenko PK, Tolstorukov MY, Park PJ, Design and analysis of ChIP-seq experiments for DNA-binding proteins Nat Biotechnol. 2008 Dec;26(12):1351-9
+
+===========================
+DEPENDENCIES:
+===========================
+unix,bash,R-2.10 and above,awk,samtools,boost C++ libraries
+R packages: SPP, caTools, snow
+NOTE: The current package does not run on a MAC or WINDOWS.
+
+===========================
+FILES:
+===========================
+(1) spp_1.10.1.tar.gz : modified SPP peak-caller package (The original SPP-peak caller package was written by Peter Kharchenko[2])
+(2) run_spp.R : The script to compute the frag length, data quality characteristics based on cross-correlation analysis and/or peak calling
+(3) run_spp_nodups.R : (FOR DATASETS WHERE DUPLICATES ARE REMOVED i.e. MAX 1 READ STARTING AT ANY GENOMIC LOCATION) The script to compute the frag length, data quality characteristics based on cross-correlation analysis and/or peak calling
+(4) README.txt : This README
+
+============================
+INSTALLATION:
+============================
+(1) First make sure that you have installed R (version 2.10 or higher)
+
+(2) Also, you must have the Boost C++ libraries installed. Most linux distributions have these preinstalled.
+If not, you can easily get these from your standard package manager for your linux distribution.
+e.g. the Synaptic package manager (apt-get) for Ubuntu or emerge for Gentoo.
+
+(3) Install the following R packages
+ - caTools
+ - snow (if you want parallel processing)
+from within R
+install.packages([packageName],dependencies=TRUE)
+
+(4) You can then install the SPP package spp_1.10.X.tar.gz
+<From your bash shell>
+ R CMD INSTALL spp_1.10.X.tar.gz
+
+<From within R>
+ install.packages('spp_1.10.X.tar.gz',dependencies=TRUE)
+
+(5) If your alignment files are BAM, you must have the samtools executable in your path so that the R script run_spp.R can call it using the system() command
+You can get samtools from here http://samtools.sourceforge.net/
+You can add the following line to your .bashrc file
+ export PATH="<path_to_samtools_executable>:${PATH}"
+
+(6) Run run_spp.R
+ Rscript run_spp.R <options>
+
+===========================
+GENERAL USAGE
+===========================
+Usage: Rscript run_spp.R <options>
+
+MANDATORY ARGUMENTS
+-c=<ChIP_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
+
+MANDATORY ARGUMENTS FOR PEAK CALLING
+-i=<Input_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
+
+OPTIONAL ARGUMENTS
+-s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-100:5:600
+-speak=<strPeak>, user-defined cross-correlation peak strandshift
+-x=<min>:<max>, strand shifts to exclude (This is mainly to avoid region around phantom peak) default=10:(readlen+10)
+-p=<nodes> , number of parallel processing nodes, default=0
+-fdr=<falseDiscoveryRate> , false discovery rate threshold for peak calling
+-npeak=<numPeaks>, threshold on number of peaks to call
+-tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)
+-filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name
+
+OUTPUT ARGUMENTS
+-odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)
+-savn=<narrowpeakfilename> OR -savn NarrowPeak file name (fixed width peaks)
+-savr=<regionpeakfilename> OR -savr RegionPeak file name (variable width peaks with regions of enrichment around peak summits)
+-savd=<rdatafile> OR -savd, save Rdata file
+-savp=<plotdatafile> OR -savp, save cross-correlation plot
+-out=<resultfile>, append peakshift/phantomPeak results to a file
+-rf, if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists
+-clean, if used it will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files
+
+===========================
+TYPICAL USAGE
+===========================
+(1) Determine strand cross-correlation peak / predominant fragment length OR print out quality measures
+
+ Rscript run_spp.R -c=<tagAlign/BAMfile> -savp -out=<outFile>
+
+-savp will create a pdf showing the cross-correlation plot
+-out=<outFile> will create and/or append to a file named <outFile> several important characteristics of the dataset.
+The file contains 11 tab delimited columns
+
+COL1: Filename: tagAlign/BAM filename
+COL2: numReads: effective sequencing depth i.e. total number of mapped reads in input file
+COL3: estFragLen: comma separated strand cross-correlation peak(s) in decreasing order of correlation.
+ The top 3 local maxima locations that are within 90% of the maximum cross-correlation value are output.
+ In almost all cases, the top (first) value in the list represents the predominant fragment length.
+ If you want to keep only the top value simply run
+ sed -r 's/,[^\t]+//g' <outFile> > <newOutFile>
+COL4: corr_estFragLen: comma separated strand cross-correlation value(s) in decreasing order (COL3 follows the same order)
+COL5: phantomPeak: Read length/phantom peak strand shift
+COL6: corr_phantomPeak: Correlation value at phantom peak
+COL7: argmin_corr: strand shift at which cross-correlation is lowest
+COL8: min_corr: minimum value of cross-correlation
+COL9: Normalized strand cross-correlation coefficient (NSC) = COL4 / COL8
+COL10: Relative strand cross-correlation coefficient (RSC) = (COL4 - COL8) / (COL6 - COL8)
+COL11: QualityTag: Quality tag based on thresholded RSC (codes: -2:veryLow,-1:Low,0:Medium,1:High,2:veryHigh)
+
+You can run the program on multiple datasets in parallel and append all the quality information to the same <outFile> for a summary analysis.
+
+NSC values range from a minimum of 1 to larger positive numbers. 1.1 is the critical threshold.
+Datasets with NSC values much less than 1.1 (< 1.05) tend to have low signal to noise or few peaks (this could be biological, e.g. a factor that truly binds only a few sites in a particular tissue type, OR it could be due to poor quality)
+
+RSC values range from 0 to larger positive values. 1 is the critical threshold.
+RSC values significantly lower than 1 (< 0.8) tend to have low signal to noise. The low scores can be due to failed and poor quality ChIP, low read sequence quality and hence lots of mismappings, shallow sequencing depth (significantly below saturation) or a combination of these. Like the NSC, datasets with few binding sites (< 200) which is biologically justifiable also show low RSC scores.
+
+Qtag is a thresholded version of RSC.
+
+(2) Peak calling
+
+Rscript run_spp.R -c=<ChIP_tagalign/BAM_file> -i=<control_tagalign/BAM_file> -fdr=<fdr> -odir=<peak_call_output_dir> -savr -savp -savd -rf
+Rscript run_spp.R -c=<ChIP_tagalign/BAM_file> -i=<control_tagalign/BAM_file> -npeak=<npeaks> -odir=<peak_call_output_dir> -savr -savp -savd -rf
+
+(3) For IDR analysis you want to call a large number of peaks (relaxed threshold) so that the IDR model has access to a sufficient noise component.
+
+Rscript run_spp.R -c=<ChIP_tagalign/BAM_file> -i=<control_tagalign/BAM_file> -npeak=300000 -odir=<peak_call_output_dir> -savr -savp -rf -out=<resultFile>
+
+===========================
+NOTES:
+===========================
+- It is EXTREMELY important to filter out multi-mapping reads from the BAM/tagAlign files. A large number of multi-mapping reads can severely affect the phantom peak coefficient and peak calling results.
+
+- If a dataset seems to have high PCR bottlenecking, then you might want to actually clamp the number of unique mapping reads per position to 1 or up to 5. If not, the phantom peak coefficient can be artificially good.
+
+- For the IDR rescue strategy, one needs to pool reads from replicates and then shuffle and subsample the mapped reads to create two balanced pseudoReplicates. This is much easier to implement on tagAlign/BED read-mapping files using the unix 'shuf' command. So it is recommended to use the tagAlign format.
+
+- In most cases, you can simply use the maximum reported strand correlation peak as the predominant fragment length.
+However, it is useful to manually take a look at the cross-correlation plot to make sure the selected max peak is not an artifact.
+
+- Also, if there are problems with library size-selection, a dataset's cross-correlation profile can have multiple strong cross-correlation peaks. This is currently not autodetected.
+
+===========================
+INPUT FILE FORMATS:
+===========================
+(1) BAM format
+This is a binary alignment format specified in http://samtools.sourceforge.net/SAM-1.3.pdf
+You MUST have samtools installed to use run_spp.R with BAM files
+
+(2) TagAlign files
+This is a text-based BED3+3 alignment format that is easier to manipulate. It contains 6 tab delimited columns.
+
+chrom string Name of the chromosome
+chromStart int The starting position of the feature in the chromosome. The first base in a chromosome is numbered 0.
+chromEnd int The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as
+ chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+sequence string Sequence of this read
+score int Indicates uniqueness or quality (preferably 1000/alignmentCount).
+strand char Orientation of this read (+ or -)
+
+NOTE: You don't have to store the sequence of reads in the sequence field as the peak caller never really uses that field. You can just put the letter 'N' in that field. This saves space significantly.
+
+For the IDR rescue strategy, one needs to use shuffled and subsampled version of the alignment files. This is much easier to implement on tagAlign text files using the unix 'shuf' command.
+So it is recommended to preferably use the tagAlign format.
+
+----------------------------------
+CONVERTING BAM TO TAGALIGN FILES
+----------------------------------
+It is very quick to convert BAM files to gzipped tagAlign files using
+
+samtools view -F 0x0204 -o - <bamFile> | awk 'BEGIN{OFS="\t"}{if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"} }' | gzip -c > <gzip_TagAlignFileName>
+
+===========================
+OUTPUT FILE FORMATS:
+===========================
+(1) NarrowPeak/RegionPeak format
+
+The output peak file is in BED6+4 format known as narrowPeak. It consists of 10 tab-delimited columns
+
+chrom string Name of the chromosome
+chromStart int The starting position of the feature in the chromosome. The first base in a chromosome is numbered 0.
+chromEnd int The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature.
+ For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+name string Name given to a region (preferably unique). Use '.' if no name is assigned.
+score int Indicates how dark the peak will be displayed in the browser (1-1000). If '0', the DCC will assign this based on signal value. Ideally average signalValue per base spread between 100-1000.
+strand char +/- to denote strand or orientation (whenever applicable). Use '.' if no orientation is assigned.
+signalValue float Measurement of overall (usually, average) enrichment for the region.
+pValue float Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
+qValue float Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
+peak int Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
diff --git a/pRSEM/phantompeakqualtools/peakCallingPipelineForIdr.txt b/pRSEM/phantompeakqualtools/peakCallingPipelineForIdr.txt
new file mode 100644
index 0000000..89a7325
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/peakCallingPipelineForIdr.txt
@@ -0,0 +1 @@
+See https://sites.google.com/site/anshulkundaje/projects/idr for the ENCODE peak calling pipeline using SPP + IDR \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/run_spp.R b/pRSEM/phantompeakqualtools/run_spp.R
new file mode 100644
index 0000000..bef94fe
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/run_spp.R
@@ -0,0 +1,885 @@
+# run_spp.R
+# =============
+# Author: Anshul Kundaje, Computer Science Dept., MIT
+# Email: anshul@kundaje.net
+# Last updated: Feb 12, 2012
+# =============
+# MANDATORY ARGUMENTS
+# -c=<ChIP_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
+# MANDATORY ARGUMENT FOR PEAK CALLING
+# -i=<Input_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
+# OPTIONAL ARGUMENTS
+# -s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-500:5:1500
+# -speak=<strPeak>, user-defined cross-correlation peak strandshift
+# -x=<min>:<max>, strand shifts to exclude (This is mainly to avoid phantom peaks) default=10:(readlen+10)
+# -p=<nodes> , number of parallel processing nodes, default=NULL
+# -fdr=<falseDiscoveryRate> , false discovery rate threshold for peak calling
+# -npeak=<numPeaks>, threshold on number of peaks to call
+# -tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)
+# -filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name
+# OUTPUT PARAMETERS
+# -odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)
+# -savn=<narrowpeakfilename> OR -savn NarrowPeak file name
+# -savr=<regionpeakfilename> OR -savr RegionPeak file name
+# -savd=<rdatafile> OR -savd , save Rdata file
+# -savp=<plotdatafile> OR -savp , save cross-correlation plot
+# -out=<resultfile>, append peakshift result to a file
+# format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>Normalized SCC (NSC)<tab>Relative SCC (RSC)<tab>QualityTag
+# -rf , if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists
+# -clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files
+
+args <- commandArgs(trailingOnly = TRUE)  # options supplied by the user on the command line
+nargs <- length(args)                      # number of options supplied
+
+# ###########################################################################
+# AUXILIARY FUNCTIONS
+# ###########################################################################
+
+print.usage <- function() {
+# Write the command-line help text to stderr, one line per entry of the vector below
+  cat(c(
+    'Usage: Rscript run_spp.R <options>\n',
+    'MANDATORY ARGUMENTS\n',
+    '-c=<ChIP_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped)(FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',
+    'MANDATORY ARGUMENTS FOR PEAK CALLING\n',
+    '-i=<Input_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',
+    'OPTIONAL ARGUMENTS\n',
+    '-s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-500:5:1500\n',
+    '-speak=<strPeak>, user-defined cross-correlation peak strandshift\n',
+    '-x=<min>:<max>, strand shifts to exclude (This is mainly to avoid region around phantom peak) default=10:(readlen+10)\n',
+    '-p=<nodes> , number of parallel processing nodes, default=0\n',
+    '-fdr=<falseDisoveryRate> , false discovery rate threshold for peak calling\n',
+    '-npeak=<numPeaks>, threshold on number of peaks to call\n',
+    '-tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)\n',
+    '-filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name\n',
+    'OUTPUT ARGUMENTS\n',
+    '-odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)\n',
+    '-savn=<narrowpeakfilename> OR -savn NarrowPeak file name (fixed width peaks)\n',
+    '-savr=<regionpeakfilename> OR -savr RegionPeak file name (variable width peaks with regions of enrichment)\n',
+    '-savd=<rdatafile> OR -savd, save Rdata file\n',
+    '-savp=<plotdatafile> OR -savp, save cross-correlation plot\n',
+    '-out=<resultfile>, append peakshift/phantomPeak results to a file\n',
+    ' format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>Normalized SCC (NSC)<tab>Relative SCC (RSC)<tab>QualityTag)\n',
+    '-rf, if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists\n',
+    '-clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files\n'
+  ), sep='', file=stderr())
+} # end: print.usage()
+
+get.file.parts <- function(file.fullpath) {
+# ===================================
+# Break a file name (optionally prefixed by a directory) into its components.
+# Returns list(path, fullname, name, ext); ext keeps its leading '.', path is '.' when no directory is given
+# ===================================
+  if (! is.character(file.fullpath)) {
+    stop('File name must be a string')
+  }
+
+  # Tokenise on the platform's directory separator; only the first element of file.fullpath is used
+  path.tokens <- strsplit(as.character(file.fullpath), .Platform$file.sep, fixed=TRUE)[[1]]
+  n.path.tokens <- length(path.tokens)
+
+  # Guard: an empty file name has no components at all
+  if (n.path.tokens == 0) {
+    return(list(path='', fullname='', name='', ext=''))
+  }
+
+  # The last token is the file name; whatever precedes it is the directory
+  base.name <- path.tokens[n.path.tokens]
+  if (n.path.tokens > 1) {
+    dir.name <- paste(path.tokens[1:(n.path.tokens-1)], collapse=.Platform$file.sep)
+  } else {
+    dir.name <- '.' # bare file name: report the current directory
+  }
+
+  # Tokenise the file name on '.'; only the final token counts as the extension
+  name.tokens <- strsplit(base.name, '.', fixed=TRUE)[[1]]
+  n.name.tokens <- length(name.tokens)
+  if (n.name.tokens == 1) { # a single token means there is no extension
+    extension <- ''
+    stem <- name.tokens
+  } else {
+    extension <- paste('.', name.tokens[n.name.tokens], sep="") # put the '.' back on the last token
+    stem <- paste(name.tokens[1:(n.name.tokens-1)], collapse=".")
+  }
+
+  list(path=dir.name, fullname=base.name, name=stem, ext=extension)
+} # end: get.file.parts()
+
+parse.arguments <- function(args) {
+# ===================================
+# Parse the command-line options in args into a named list of run parameters (defaults apply to omitted options); stops on any invalid option
+# ===================================
+  # Set arguments to default values
+  chip.file <- NA # main ChIP tagAlign/BAM file name
+  isurl.chip.file <- FALSE # flag indicating whether ChIP file is a URL
+  control.file <- NA # control tagAlign/BAM file name
+  isurl.control.file <- FALSE # flag indicating whether control file is a URL
+  sep.min <- -500 # min strand shift
+  sep.max <- 1500 # max strand shift
+  sep.bin <- 5 # increment for strand shift
+  sep.peak <- NA # user-defined peak shift
+  exclude.min <- 10 # lowerbound of strand shift exclusion region
+  exclude.max <- NaN # upperbound of strand shift exclusion region
+  n.nodes <- NA # number of parallel processing nodes
+  fdr <- 0.01 # false discovery rate threshold for peak calling
+  npeak <- NA # threshold on number of peaks to call
+  temp.dir <- tempdir() # temporary directory
+  chrname.rm.pattern <- NA # chromosome name pattern used to remove tags
+  output.odir <- NA # Output directory name
+  output.npeak.file <- NA # Output narrowPeak file name
+  output.rpeak.file <- NA # Output regionPeak file name
+  output.rdata.file <- NA # Rdata file
+  output.plot.file <- NA # cross correlation plot file
+  output.result.file <- NA # result file
+  replace.flag <- FALSE # replace file flag
+  clean.files.flag <- FALSE # file deletion flag
+
+  # Parse arguments
+  for (each.arg in args) {
+
+    if (grepl('^-c=',each.arg)) { #-c=<chip.file>
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        chip.file <- arg.split[2] # second part is chip.file
+      } else {
+        stop('No tagAlign/BAM file name provided for parameter -c=')
+      }
+
+    } else if (grepl('^-i=',each.arg)) { #-i=<control.file>
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        control.file <- arg.split[2] # second part is control.file
+      } else {
+        stop('No tagAlign/BAM file name provided for parameter -i=')
+      }
+
+    } else if (grepl('^-s=',each.arg)) { #-s=<sep.min>:<sep.bin>:<sep.max>
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        sep.vals <- arg.split[2] # second part is sepmin:sepbin:sepmax
+        sep.vals.split <- strsplit(sep.vals,':',fixed=TRUE)[[1]] # split on :
+        if (length(sep.vals.split) != 3) { # must have 3 parts
+          stop('Strand shift limits must be specified as -s=sepmin:sepbin:sepmax')
+        } else {
+          if (any(is.na(as.numeric(sep.vals.split)))) { # check that sep vals are numeric
+            stop('Strand shift limits must be numeric values')
+          }
+          sep.min <- round(as.numeric(sep.vals.split[1]))
+          sep.bin <- round(as.numeric(sep.vals.split[2]))
+          sep.max <- round(as.numeric(sep.vals.split[3]))
+          if ((sep.min > sep.max) || (sep.bin > (sep.max - sep.min)) || (sep.bin < 0)) {
+            stop('Illegal separation values -s=sepmin:sepbin:sepmax')
+          }
+        }
+      } else {
+        stop('Strand shift limits must be specified as -s=sepmin:sepbin:sepmax')
+      }
+
+    } else if (grepl('^-speak=',each.arg)) { #-speak=<sep.peak> , user-defined cross-correlation peak strandshift
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        sep.peak <- arg.split[2] # second part is <sep.peak>
+        if (is.na(as.numeric(sep.peak))) { # check that sep.peak is numeric
+          stop('-speak=<sep.peak>: User defined peak shift must be numeric')
+        }
+        sep.peak <- as.numeric(sep.peak)
+      } else {
+        stop('User defined peak shift must be provided as -speak=<sep.peak>')
+      }
+
+    } else if (grepl('^-x=',each.arg)) { #-x=<exclude.min>:<exclude.max>
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        exclude.vals <- arg.split[2] # second part is excludemin:excludemax
+        exclude.vals.split <- strsplit(exclude.vals,':',fixed=TRUE)[[1]] # split on :
+        if (length(exclude.vals.split) != 2) { # must have 2 parts
+          stop('Exclusion limits must be specified as -x=excludemin:excludemax')
+        } else {
+          if (any(is.na(as.numeric(exclude.vals.split)))) { # check that exclude vals are numeric
+            stop('Exclusion limits must be numeric values')
+          }
+          exclude.min <- round(as.numeric(exclude.vals.split[1]))
+          exclude.max <- round(as.numeric(exclude.vals.split[2]))
+          if (exclude.min > exclude.max) {
+            stop('Illegal exclusion limits -x=excludemin:excludemax')
+          }
+        }
+      } else {
+        stop('Exclusion limits must be specified as -x=excludemin:excludemax')
+      }
+
+    } else if (grepl('^-p=',each.arg)) { #-p=<n.nodes> , number of parallel processing nodes, default=NULL
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        n.nodes <- arg.split[2] # second part is numnodes
+        if (is.na(as.numeric(n.nodes))) { # check that n.nodes is numeric
+          stop('-p=<numnodes>: numnodes must be numeric')
+        }
+        n.nodes <- round(as.numeric(n.nodes))
+      } else {
+        stop('Number of parallel nodes must be provided as -p=<numnodes>')
+      }
+
+    } else if (grepl('^-fdr=',each.arg)) { #-fdr=<fdr> , false discovery rate, default=0.01
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        fdr <- arg.split[2] # second part is fdr
+        if (is.na(as.numeric(fdr))) { # check that fdr is numeric
+          stop('-fdr=<falseDiscoveryRate>: false discovery rate must be numeric')
+        }
+        fdr <- as.numeric(fdr)
+      } else {
+        stop('False discovery rate must be provided as -fdr=<fdr>')
+      }
+
+    } else if (grepl('^-npeak=',each.arg)) { #-npeak=<numPeaks> , number of peaks threshold, default=NA
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        npeak <- arg.split[2] # second part is npeak
+        if (is.na(as.numeric(npeak))) { # check that npeak is numeric
+          stop('-npeak=<numPeaks>: threshold on number of peaks must be numeric')
+        }
+        npeak <- round(as.numeric(npeak))
+      } else {
+        stop('Threshold on number of peaks must be provided as -npeak=<numPeaks>')
+      }
+
+    } else if (grepl('^-tmpdir=',each.arg)) { #-tmpdir=<temp.dir>
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        temp.dir <- arg.split[2] # second part is temp.dir
+      } else {
+        stop('No temporary directory provided for parameter -tmpdir=')
+      }
+
+    } else if (grepl('^-filtchr=',each.arg)) { #-filtchr=<chrname.rm.pattern>
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        chrname.rm.pattern <- arg.split[2] # second part is chrname.rm.pattern
+      } else {
+        stop('No pattern provided for parameter -filtchr=')
+      }
+
+    } else if (grepl('^-odir=',each.arg)) { #-odir=<output.odir>
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        output.odir <- arg.split[2] # second part is output.odir
+      } else {
+        stop('No output directory provided for parameter -odir=')
+      }
+
+    } else if (grepl('^-savn',each.arg)) { # -savn=<output.npeak.file> OR -savn , save narrowpeak
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2])) {
+        output.npeak.file <- arg.split[2] #-savn=
+      } else if (each.arg=='-savn') {
+        output.npeak.file <- NULL # NULL indicates get the name from the main file name
+      } else {
+        stop('Argument for saving narrowPeak file must be -savn or -savn=<filename>')
+      }
+
+    } else if (grepl('^-savr',each.arg)) { # -savr=<output.rpeak.file> OR -savr , save regionpeak
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2])) {
+        output.rpeak.file <- arg.split[2] #-savr=
+      } else if (each.arg=='-savr') {
+        output.rpeak.file <- NULL # NULL indicates get the name from the main file name
+      } else {
+        stop('Argument for saving regionPeak file must be -savr or -savr=<filename>')
+      }
+
+    } else if (grepl('^-savd',each.arg)) { # -savd=<output.rdata.file> OR -savd , save Rdata file
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2])) {
+        output.rdata.file <- arg.split[2] #-savd=
+      } else if (each.arg=='-savd') {
+        output.rdata.file <- NULL # NULL indicates get the name from the main file name
+      } else {
+        stop('Argument for saving Rdata file must be -savd or -savd=<filename>')
+      }
+
+    } else if (grepl('^-savp',each.arg)) { # -savp=<output.plot.file> OR -savp , save cross-correlation plot
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2])) {
+        output.plot.file <- arg.split[2] #-savp=
+      } else if (each.arg=='-savp') {
+        output.plot.file <- NULL # NULL indicates get the name from the main file name
+      } else {
+        stop('Argument for saving plot file must be -savp or -savp=<filename>')
+      }
+
+    } else if (grepl('^-out=',each.arg)) { #-out=<output.result.file>
+
+      arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
+      if (! is.na(arg.split[2]) ) {
+        output.result.file <- arg.split[2] # second part is output.result.file
+      } else {
+        stop('No result file provided for parameter -out=')
+      }
+
+    } else if (each.arg == '-rf') {
+
+      replace.flag <- TRUE
+
+    } else if (each.arg == '-clean') {
+
+      clean.files.flag <- TRUE
+
+    } else {
+
+      stop('Illegal argument ',each.arg)
+    }
+  }
+  # End: for loop
+
+  # Check mandatory arguments
+  if (is.na(chip.file)) {
+    stop('-c=<tagAlign/BAMFileName> is a mandatory argument')
+  }
+
+  if (is.na(control.file) && (is.null(output.npeak.file) || is.null(output.rpeak.file) || ! is.na(output.npeak.file))) { # bare -savn/-savr leave NULL, which is.na() cannot test
+    stop('-i=<tagAlign/BAMFileName> is required for peak calling')
+  }
+
+  # Check if ChIP and control files are URLs
+  if (grepl('^(http|https|ftp)://',chip.file)) {
+    isurl.chip.file <- TRUE
+  }
+  if (grepl('^(http|https|ftp)://',control.file)) {
+    isurl.control.file <- TRUE
+  }
+
+  # If ChIP file is a URL output.odir MUST be specified
+  if (isurl.chip.file && is.na(output.odir)) {
+    stop('If ChIP file is a URL, then output directory MUST be specified')
+  }
+
+  # Check that ChIP and control files exist
+  if (isurl.chip.file) {
+    if (system(paste('wget -q --spider',shQuote(chip.file))) != 0) { # quoted so '&', '?' or spaces in the URL are not interpreted by the shell
+      stop('ChIP file URL not valid: ',chip.file)
+    }
+  } else if (!file.exists(chip.file)) {
+    stop('ChIP File:',chip.file,' does not exist')
+  }
+
+  if (!is.na(control.file)) {
+    if (isurl.control.file) {
+      if (system(paste('wget -q --spider',shQuote(control.file))) != 0) { # quoted for the same reason as the ChIP URL
+        stop('Control file URL not valid: ',control.file)
+      }
+    } else if (!file.exists(control.file)) {
+      stop('Control File:',control.file,' does not exist')
+    }
+  }
+
+  # Correct other arguments
+  if (is.na(output.odir)) { # Reconstruct output.odir if not provided
+    output.odir <- get.file.parts(chip.file)$path
+  }
+
+  if (is.null(output.npeak.file)) { # Reconstruct output.npeak.file if NULL
+    output.npeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.narrowPeak', sep=""))
+  }
+
+  if (is.null(output.rpeak.file)) { # Reconstruct output.rpeak.file if NULL
+    output.rpeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.regionPeak', sep=""))
+  }
+
+  if (is.null(output.rdata.file)) { # Reconstruct output.rdata.file if NULL
+    output.rdata.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.Rdata', sep=""))
+  }
+
+  if (is.null(output.plot.file)) { # Reconstruct output.plot.file if NULL
+    output.plot.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.pdf', sep=""))
+  }
+
+  return(list(chip.file=chip.file,
+              isurl.chip.file=isurl.chip.file,
+              control.file=control.file,
+              isurl.control.file=isurl.control.file,
+              sep.range=c(sep.min,sep.bin,sep.max),
+              sep.peak=sep.peak,
+              ex.range=c(exclude.min,exclude.max),
+              n.nodes=n.nodes,
+              fdr=fdr,
+              npeak=npeak,
+              temp.dir=temp.dir,
+              chrname.rm.pattern=chrname.rm.pattern,
+              output.odir=output.odir,
+              output.npeak.file=output.npeak.file,
+              output.rpeak.file=output.rpeak.file,
+              output.rdata.file=output.rdata.file,
+              output.plot.file=output.plot.file,
+              output.result.file=output.result.file,
+              replace.flag=replace.flag,
+              clean.files.flag=clean.files.flag))
+} # end: parse.arguments()
+
+read.align <- function(align.filename) {
+# ===================================
+# Read a tagAlign or BAM file (BAM is first converted to a temporary tagAlign file with samtools + awk); returns the tag data with $read.length added
+# ===================================
+  if (grepl('(\\.bam)?.*(\\.tagAlign)',align.filename)) { # if tagalign file
+    chip.data <- read.tagalign.tags(align.filename)
+    # get readlength info
+    tmpDataRows <- read.table(align.filename,nrows=500)
+    chip.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2))
+  } else if (grepl('(\\.tagAlign)?.*(\\.bam)',align.filename)) { # if bam file
+    # create BAM file name
+    bam2align.filename <- sub('\\.bam','.tagAlign',align.filename)
+    # generate command to convert bam to tagalign
+    command <- vector(length=2)
+    command[1] <- sprintf("samtools view -F 0x0204 -o - %s",shQuote(align.filename))
+    command[2] <- paste("awk 'BEGIN{FS=" , '"\t"' , ";OFS=", '"\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}', "' 1> ", shQuote(bam2align.filename), sep="")
+    # file names are shell-quoted above so that spaces or shell metacharacters in a path cannot break the pipeline
+    command <- paste(command,collapse=" | ")
+    # Run command
+    status <- system(command,intern=FALSE,ignore.stderr=FALSE)
+    if ((status != 0) || !file.exists(bam2align.filename)) {
+      cat(sprintf("Error converting BAM to tagalign file: %s\n",align.filename),file=stderr())
+      q(save="no",status=1)
+    }
+    # read converted BAM file
+    chip.data <- read.tagalign.tags(bam2align.filename)
+    # get readlength info
+    tmpDataRows <- read.table(bam2align.filename,nrows=500)
+    chip.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2))
+    # delete temporary tagalign file
+    file.remove(bam2align.filename)
+  } else {
+    cat(sprintf("Error:Unknown file format for file:%s\n",align.filename),file=stderr()) # was align.fname, an undefined variable
+    q(save="no",status=1)
+  }
+  return(chip.data)
+} # end: read.align()
+
+print.run.params <- function(params){
+# ===================================
+# Print the run parameters held in params (the list returned by parse.arguments) to stdout
+# ===================================
+  cat('################\n',file=stdout())
+  cat(params$chip.file,
+      params$control.file,
+      params$sep.range,
+      params$sep.peak,
+      params$ex.range,
+      params$n.nodes,
+      params$fdr,
+      params$npeak,
+      params$output.odir,
+      params$output.npeak.file,
+      params$output.rpeak.file,
+      params$output.rdata.file,
+      params$output.plot.file,
+      params$output.result.file,
+      params$replace.flag,
+      labels=c('ChIP data:','Control data:', 'strandshift(min):','strandshift(step):','strandshift(max)','user-defined peak shift',
+               'exclusion(min):','exclusion(max):','num parallel nodes:','FDR threshold:','NumPeaks Threshold:','Output Directory:',
+               'narrowPeak output file name:', 'regionPeak output file name:', 'Rdata filename:',
+               'plot pdf filename:','result filename:','Overwrite files?:'),
+      fill=18,
+      file=stdout())
+  cat('\n',file=stdout())
+} # end: print.run.params()
+
+check.replace.flag <- function(params){
+# ===================================
+# Check if files exist; params is the list returned by parse.arguments
+# ===================================
+# If replace.flag is NOT set, check if output files exist and abort (quit with status 1) if necessary
+  if (! params$replace.flag) {
+    if (! is.na(params$output.npeak.file)) {
+      if (file.exists(params$output.npeak.file)) {
+        cat('narrowPeak file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
+        q(save="no",status=1)
+      }
+    }
+    if (! is.na(params$output.rpeak.file)) {
+      if (file.exists(params$output.rpeak.file)) {
+        cat('regionPeak file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
+        q(save="no",status=1)
+      }
+    }
+    if (! is.na(params$output.plot.file)) {
+      if (file.exists(params$output.plot.file)) {
+        cat('Plot file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
+        q(save="no",status=1)
+      }
+    }
+    if (! is.na(params$output.rdata.file)) {
+      if (file.exists(params$output.rdata.file)) {
+        cat('Rdata file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
+        q(save="no",status=1)
+      }
+    }
+  }
+}
+
+# #############################################################################
+# MAIN FUNCTION
+# #############################################################################
+
+# Check number of arguments
+minargs = 1;
+maxargs = 18; # parse.arguments() accepts 18 distinct options, each of which may be given once
+if (nargs < minargs || nargs > maxargs) {
+  print.usage()
+  q(save="no",status=1)
+}
+
+# Parse arguments; parse.arguments() returns a list with the fields below
+# iparams$chip.file
+# iparams$isurl.chip.file
+# iparams$control.file
+# iparams$isurl.control.file
+# iparams$sep.range
+# iparams$sep.peak
+# iparams$ex.range
+# iparams$n.nodes
+# iparams$fdr
+# iparams$npeak
+# iparams$temp.dir
+# iparams$chrname.rm.pattern
+# iparams$output.odir
+# iparams$output.npeak.file
+# iparams$output.rpeak.file
+# iparams$output.rdata.file
+# iparams$output.plot.file
+# iparams$output.result.file
+# iparams$replace.flag, iparams$clean.files.flag
+iparams <- parse.arguments(args)
+
+# Print run parameters
+print.run.params(iparams)
+
+# Check if output files exist
+check.replace.flag(iparams)
+
+# curr.chip.file and curr.control.file always point to the original ChIP and control files on disk
+# ta.chip.filename & ta.control.filename always point to the final but temporary versions of the ChIP and control files that will be passed to read.align
+
+# Download ChIP and control files if necessary to temp.dir (directory and URL are shell-quoted before being handed to wget)
+if (iparams$isurl.chip.file) {
+  curr.chip.file <- file.path(iparams$temp.dir, get.file.parts(iparams$chip.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix
+  cat('Downloading ChIP file:',iparams$chip.file,"\n",file=stdout())
+  if (system(paste('wget -N -q -P',shQuote(iparams$temp.dir),shQuote(iparams$chip.file))) != 0) {
+    stop('Error downloading ChIP file:',iparams$chip.file)
+  }
+} else {
+  curr.chip.file <- iparams$chip.file # file is in original directory
+}
+
+if (iparams$isurl.control.file) {
+  curr.control.file <- file.path(iparams$temp.dir, get.file.parts(iparams$control.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix
+  cat('Downloading control file:',iparams$control.file,"\n",file=stdout())
+  if (system(paste('wget -N -q -P',shQuote(iparams$temp.dir),shQuote(iparams$control.file))) != 0) {
+    stop('Error downloading Control file:',iparams$control.file)
+  }
+} else {
+  curr.control.file <- iparams$control.file # file is in original directory
+}
+
+# unzip ChIP and input files if required AND copy to temp directory (paths are shell-quoted before being handed to gunzip)
+if (get.file.parts(curr.chip.file)$ext == '.gz') {
+  ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix]
+  cat('Decompressing ChIP file\n',file=stdout())
+  if (system(paste("gunzip -c",shQuote(curr.chip.file),">",shQuote(ta.chip.filename))) != 0) {
+    stop('Unable to decompress file:', iparams$chip.file)
+  }
+  if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set
+    file.remove(curr.chip.file)
+  }
+} else {
+  ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$fullname, tmpdir=iparams$temp.dir)
+  if (iparams$clean.files.flag) {
+    file.rename(curr.chip.file,ta.chip.filename) # move file to temp.dir/[filename][randsuffix]
+  } else {
+    file.copy(curr.chip.file,ta.chip.filename) # copy file to temp.dir/[filename][randsuffix]
+  }
+}
+
+if (! is.na(iparams$control.file)) {
+  if (get.file.parts(curr.control.file)$ext == '.gz') {
+    ta.control.filename <- tempfile(get.file.parts(curr.control.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix]
+    cat('Decompressing control file\n',file=stdout())
+    if (system(paste("gunzip -c",shQuote(curr.control.file),">",shQuote(ta.control.filename))) != 0) {
+      stop('Unable to decompress file:', iparams$control.file)
+    }
+    if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set
+      file.remove(curr.control.file)
+    }
+  } else {
+    ta.control.filename <- tempfile(get.file.parts(curr.control.file)$fullname, tmpdir=iparams$temp.dir) # copy file to temp.dir/[filename][randsuffix]
+
+    if (iparams$clean.files.flag) {
+      file.rename(curr.control.file,ta.control.filename) # move file to temp.dir/[filename][randsuffix]
+    } else {
+      file.copy(curr.control.file,ta.control.filename) # copy file to temp.dir/[filename][randsuffix]
+    }
+  }
+}
+
+# Remove downloaded files
+# Only a copy fetched from a URL is deleted, and only if it is still on disk
+if (iparams$isurl.chip.file && file.exists(curr.chip.file)) {
+  file.remove(curr.chip.file)
+}
+
+# The control copy is considered only when a control file was supplied at all
+if (! is.na(iparams$control.file) && iparams$isurl.control.file && file.exists(curr.control.file)) {
+  file.remove(curr.control.file)
+}
+
+# Load SPP library
+library(spp)
+
+# Load the staged ChIP tagAlign/BAM file and report its read length
+cat("Reading ChIP tagAlign/BAM file",iparams$chip.file,"\n",file=stdout())
+chip.data <- read.align(ta.chip.filename)
+cat("ChIP data read length",chip.data$read.length,"\n",file=stdout())
+file.remove(ta.chip.filename) # the staged temporary copy is no longer needed
+if (length(chip.data$tags) == 0) {
+  stop('Error in ChIP file format:', iparams$chip.file)
+}
+# Drop chromosomes whose names match the -filtchr pattern
+if (! is.na(iparams$chrname.rm.pattern)) {
+  selectidx <- which(!grepl(iparams$chrname.rm.pattern, names(chip.data$tags)))
+  chip.data$tags <- chip.data$tags[selectidx]
+  chip.data$quality <- chip.data$quality[selectidx]
+}
+chip.data$num.tags <- sum(vapply(chip.data$tags, length, integer(1))) # total tags over the remaining chromosomes
+
+# Read Control tagAlign/BAM files (skipped entirely when no control file was supplied)
+if (! is.na(iparams$control.file)) {
+  cat("Reading Control tagAlign/BAM file",iparams$control.file,"\n",file=stdout())
+  control.data <- read.align(ta.control.filename)
+  file.remove(ta.control.filename) # Delete temporary file
+  if (length(control.data$tags)==0) {
+    stop('Error in control file format:', iparams$control.file) # report the control file, not the ChIP file
+  }
+  cat("Control data read length",control.data$read.length,"\n",file=stdout())
+  # Remove illegal chromosome names
+  if (! is.na(iparams$chrname.rm.pattern)) {
+    selectidx <- which(grepl(iparams$chrname.rm.pattern,names(control.data$tags))==FALSE)
+    control.data$tags <- control.data$tags[selectidx]
+    control.data$quality <- control.data$quality[selectidx]
+  }
+  control.data$num.tags <- sum(unlist(lapply(control.data$tags,function(d) length(d))))
+}
+
+# Start a snow cluster only when -p was given; otherwise run without one
+if (! is.na(iparams$n.nodes)) {
+  library(snow)
+  cluster.nodes <- makeCluster(iparams$n.nodes)
+} else {
+  cluster.nodes <- NULL
+}
+
+# #################################
+# Calculate cross-correlation for various strand shifts
+# #################################
+cat("Calculating peak characteristics\n",file=stdout())
+# crosscorr
+# $cross.correlation : Cross-correlation profile as an $x/$y data.frame
+# $peak : Position ($x) and height ($y) of automatically detected cross-correlation peak.
+# $whs: Optimized window half-size for binding detection (based on the width of the cross-correlation peak)
+crosscorr <- get.binding.characteristics(chip.data,
+                                         srange=c(iparams$sep.range[1], iparams$sep.range[3]),
+                                         bin=iparams$sep.range[2],
+                                         cluster=cluster.nodes,
+                                         accept.all.tags=TRUE)
+if (! is.null(cluster.nodes)) { # a cluster exists exactly when -p was given
+  stopCluster(cluster.nodes)
+}
+
+# Record the reference ("minimum") cross-correlation point, then smooth a working copy of the curve
+cc <- crosscorr$cross.correlation # working copy; the unsmoothed profile stays in crosscorr$cross.correlation
+crosscorr$min.cc <- crosscorr$cross.correlation[ length(crosscorr$cross.correlation$y) , ] # minimum value and shift of cross-correlation -- NOTE(review): this takes the LAST row of the profile, not an explicit min(); confirm intent
+cat("Minimum cross-correlation value", crosscorr$min.cc$y,"\n",file=stdout())
+cat("Minimum cross-correlation shift", crosscorr$min.cc$x,"\n",file=stdout())
+sbw <- 2*floor(ceiling(5/iparams$sep.range[2]) / 2) + 1 # smoothing bandwidth in points (always odd); derived from the strand-shift step
+cc$y <- runmean(cc$y,sbw,alg="fast") # runmean is not defined in this file; presumably provided by a package loaded with spp -- confirm
+
+# Compute cross-correlation peak: find indices where the smoothed curve switches from non-decreasing to decreasing
+bw <- ceiling(2/iparams$sep.range[2]) # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks
+peakidx <- (diff(cc$y,bw)>=0) # TRUE where cc$y[i+bw] >= cc$y[i]
+peakidx <- diff(peakidx,bw) # -1 where that flag goes from TRUE to FALSE over a lag of bw
+peakidx <- which(peakidx==-1) + bw # shift by bw to index into cc
+
+# exclude peaks from the excluded region
+if ( is.nan(iparams$ex.range[2]) ) { # NaN means no upper bound was given with -x
+ iparams$ex.range[2] <- chip.data$read.length+10 # default upper bound: read length + 10
+}
+peakidx <- peakidx[(cc$x[peakidx] < iparams$ex.range[1]) | (cc$x[peakidx] > iparams$ex.range[2]) | (cc$x[peakidx] < 0) ] # keep peaks outside [ex.min, ex.max], plus any at negative shifts
+cc <- cc[peakidx,] # cc now holds only the candidate peaks
+
+# Find max peak position and other peaks within 0.9*max_peakvalue that are further away from maxpeakposition
+maxpeakidx <- which.max(cc$y) # index of the tallest candidate peak
+maxpeakshift <- cc$x[maxpeakidx] # its strand shift
+maxpeakval <- cc$y[maxpeakidx] # its (smoothed) cross-correlation value
+peakidx <-which((cc$y >= 0.9*maxpeakval) & (cc$x >= maxpeakshift)) # includes the tallest peak itself
+cc <- cc[peakidx,]
+
+# sort the peaks and get the top 3 (ordered by decreasing cross-correlation value)
+sortidx <- order(cc$y,decreasing=TRUE)
+sortidx <- sortidx[c(1:min(3,length(sortidx)))] # at most three entries
+cc.peak <- cc[sortidx,]
+
+# Override peak shift if user supplies peak shift
+if (! is.na(iparams$sep.peak)) {
+ cc.peak <- approx(crosscorr$cross.correlation$x,crosscorr$cross.correlation$y,iparams$sep.peak,rule=2)
+}
+cat("Top 3 cross-correlation values", paste(cc.peak$y,collapse=","),"\n",file=stdout())
+cat("Top 3 estimates for fragment length",paste(cc.peak$x,collapse=","),"\n",file=stdout())
+
+# Reset values in crosscorr
+crosscorr$peak$x <- cc.peak$x[1]
+crosscorr$peak$y <- cc.peak$y[1]
+
+# Compute window half size
+# The threshold sits one third of the way from the "minimum" correlation up to
+# the peak correlation; the half size is the largest strand shift whose raw
+# correlation still reaches that threshold.
+whs.thresh <- crosscorr$min.cc$y + (crosscorr$peak$y - crosscorr$min.cc$y)/3
+crosscorr$whs <- with(crosscorr$cross.correlation, max(x[y >= whs.thresh]))
+cat("Window half size",crosscorr$whs,"\n",file=stdout())
+
+# Compute phantom peak coefficient
+# The phantom peak is the highest raw correlation within +/- round(2*step) of
+# the read length.
+ph.peakidx <- which( ( crosscorr$cross.correlation$x >= ( chip.data$read.length - round(2*iparams$sep.range[2]) ) ) &
+                     ( crosscorr$cross.correlation$x <= ( chip.data$read.length + round(2*iparams$sep.range[2]) ) ) )
+if (length(ph.peakidx) == 0) {
+    # Previously this surfaced later as "argument is of length zero".
+    stop('No evaluated strand shift lies near the read length (', chip.data$read.length,
+         '); the phantom peak cannot be located. Adjust the strand shift range')
+}
+ph.peakidx <- ph.peakidx[ which.max(crosscorr$cross.correlation$y[ph.peakidx]) ]
+crosscorr$phantom.cc <- crosscorr$cross.correlation[ph.peakidx,]
+cat("Phantom peak location",crosscorr$phantom.cc$x,"\n",file=stdout())
+cat("Phantom peak Correlation",crosscorr$phantom.cc$y,"\n",file=stdout())
+# NSC: peak correlation relative to the "minimum" correlation.
+# (An earlier peak/phantom assignment to the same field was dead code and is removed.)
+crosscorr$phantom.coeff <- crosscorr$peak$y / crosscorr$min.cc$y
+cat("Normalized Strand cross-correlation coefficient (NSC)",crosscorr$phantom.coeff,"\n",file=stdout())
+# RSC: background-subtracted peak correlation over background-subtracted phantom correlation.
+crosscorr$rel.phantom.coeff <- (crosscorr$peak$y - crosscorr$min.cc$y) / (crosscorr$phantom.cc$y - crosscorr$min.cc$y)
+cat("Relative Strand cross-correlation Coefficient (RSC)",crosscorr$rel.phantom.coeff,"\n",file=stdout())
+# Quality tag from RSC: [0,0.25) -> -2, [0.25,0.5) -> -1, [0.5,1) -> 0,
+# [1,1.5) -> 1, >= 1.5 -> 2. Negative, NA or NaN RSC gives NA instead of
+# aborting inside an if() condition.
+crosscorr$phantom.quality.tag <- c(NA, -2, -1, 0, 1, 2)[findInterval(crosscorr$rel.phantom.coeff, c(0, 0.25, 0.5, 1, 1.5)) + 1]
+cat("Phantom Peak Quality Tag",crosscorr$phantom.quality.tag,"\n",file=stdout())
+
+# Output result to result file if required
+# One tab-separated row is appended per run, in this column order:
+# Filename\tnumReads\tPeak_shift(s)\tPeak_Correlation(s)\tPhantomPeak_shift\tPhantomPeak_Correlation\tMin_Correlation_Shift\tMin_Correlation\tNormalized_CrossCorrelation_Coefficient\tRelative_CrossCorrelation_Coefficient\tQualityTag
+# The peak shift and peak correlation columns are comma-joined lists of the values held in cc.peak.
+if (! is.na(iparams$output.result.file)) {
+ cat(get.file.parts(iparams$chip.file)$fullname,
+ chip.data$num.tags,
+ paste(cc.peak$x,collapse=","),
+ paste(cc.peak$y,collapse=","),
+ crosscorr$phantom.cc$x,
+ crosscorr$phantom.cc$y,
+ crosscorr$min.cc$x,
+ crosscorr$min.cc$y,
+ crosscorr$phantom.coeff,
+ crosscorr$rel.phantom.coeff,
+ crosscorr$phantom.quality.tag,
+ sep="\t",
+ file=iparams$output.result.file,
+ append=TRUE)
+ # The row terminator is written separately so that it is not preceded by a tab separator.
+ cat("\n",
+ file=iparams$output.result.file,
+ append=TRUE)
+}
+
+# Save figure if required
+if (! is.na(iparams$output.plot.file)) {
+    # 5 x 5 PDF holding the raw (unsmoothed) cross-correlation curve
+    pdf(file=iparams$output.plot.file, width=5, height=5)
+    par(mar=c(4,3.5,2,0.5), mgp=c(1.5,0.5,0), cex=0.8)
+    plot(crosscorr$cross.correlation,
+         type='l',
+         ylab="cross-correlation",
+         xlab=paste("strand-shift (", paste(cc.peak$x,collapse=","), ")", sep=""))
+    # Dashed vertical markers: colour 2 for the fragment-length estimate(s),
+    # colour 4 for the phantom peak
+    abline(v=cc.peak$x, lty=2, col=2)
+    abline(v=crosscorr$phantom.cc$x, lty=2, col=4)
+    title(main=get.file.parts(iparams$chip.file)$fullname,
+          sub=sprintf("NSC=%g,RSC=%g,Qtag=%d",crosscorr$phantom.coeff,crosscorr$rel.phantom.coeff,crosscorr$phantom.quality.tag))
+    dev.off()
+}
+
+# Save RData file if required
+# Stores the run parameters, the cross-correlation results and the selected peaks.
+if (! is.na(iparams$output.rdata.file)) {
+    save(list=c("iparams", "crosscorr", "cc.peak"), file=iparams$output.rdata.file)
+}
+
+# #################################
+# Call peaks
+# #################################
+
+# Peak calling only runs when a narrowPeak or regionPeak output was requested.
+if ( !is.na(iparams$output.npeak.file) || !is.na(iparams$output.rpeak.file) ) {
+
+    # Remove local tag anomalies
+    # From here on chip.data and control.data hold the filtered tag sets
+    # returned by remove.local.tag.anomalies(), not the original objects.
+    cat('Removing read stacks\n',file=stdout())
+    chip.data <- remove.local.tag.anomalies(chip.data$tags)
+    control.data <- remove.local.tag.anomalies(control.data$tags)
+
+    # Open multiple processes if required
+    if (is.na(iparams$n.nodes)) {
+        cluster.nodes <- NULL
+    } else {
+        cluster.nodes <- makeCluster(iparams$n.nodes)
+    }
+
+    # Find peaks
+    cat('Finding peaks\n',file=stdout())
+    # When a peak-count threshold is given the FDR is relaxed to 0.99; the
+    # count is then passed to the writers below as npeaks.
+    if (!is.na(iparams$npeak)) {
+        iparams$fdr <- 0.99
+    }
+    narrow.peaks <- find.binding.positions(signal.data=chip.data,control.data=control.data,fdr=iparams$fdr,method=tag.lwcc,whs=crosscorr$whs,cluster=cluster.nodes)
+    if (!is.na(iparams$n.nodes)) {
+        stopCluster(cluster.nodes)
+    }
+    cat(paste("Detected",sum(unlist(lapply(narrow.peaks$npl,function(d) length(d$x)))),"peaks"),"\n",file=stdout())
+
+    # Write to narrowPeak file
+    # The file name is shell-quoted so that paths containing spaces or shell
+    # metacharacters are compressed correctly.
+    if (!is.na(iparams$output.npeak.file)) {
+        write.narrowpeak.binding(narrow.peaks,iparams$output.npeak.file,margin=round(crosscorr$whs/2),npeaks=iparams$npeak)
+        system(paste('gzip -f',shQuote(iparams$output.npeak.file)))
+    }
+
+    # Compute and write regionPeak file
+    if (!is.na(iparams$output.rpeak.file)) {
+        region.peaks <- add.broad.peak.regions(chip.data,control.data,narrow.peaks,window.size=max(50,round(crosscorr$whs/4)),z.thr=10)
+        write.narrowpeak.binding(region.peaks,iparams$output.rpeak.file,margin=round(crosscorr$whs/2),npeaks=iparams$npeak)
+        system(paste('gzip -f',shQuote(iparams$output.rpeak.file)))
+    }
+
+    # Save Rdata file
+    # region.peaks only exists when a regionPeak file was requested; naming it
+    # unconditionally made save() fail with "object 'region.peaks' not found".
+    if (! is.na(iparams$output.rdata.file)) {
+        rdata.objects <- c("iparams", "crosscorr", "cc.peak", "narrow.peaks")
+        if (!is.na(iparams$output.rpeak.file)) {
+            rdata.objects <- c(rdata.objects, "region.peaks")
+        }
+        save(list=rdata.objects, file=iparams$output.rdata.file)
+    }
+
+}
+
+
diff --git a/pRSEM/phantompeakqualtools/run_spp_nodups.R b/pRSEM/phantompeakqualtools/run_spp_nodups.R
new file mode 100644
index 0000000..4fdbb47
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/run_spp_nodups.R
@@ -0,0 +1,886 @@
+# run_spp_nodups.R
+# =============
+# Author: Anshul Kundaje, Computer Science Dept., MIT
+# Email: anshul@kundaje.net
+# Last updated: Feb 13, 2012
+# =============
+# MANDATORY ARGUMENTS
+# -c=<ChIP_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
+# MANDATORY ARGUMENT FOR PEAK CALLING
+# -i=<Input_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
+# OPTIONAL ARGUMENTS
+# -s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-500:5:1500
+# -speak=<strPeak>, user-defined cross-correlation peak strandshift
+# -x=<min>:<max>, strand shifts to exclude (This is mainly to avoid phantom peaks) default=10:(readlen+10)
+# -p=<nodes> , number of parallel processing nodes, default=NULL
+# -fdr=<falseDiscoveryRate> , false discovery rate threshold for peak calling
+# -npeak=<numPeaks>, threshold on number of peaks to call
+# -tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)
+# -filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name
+# OUTPUT PARAMETERS
+# -odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)
+# -savn=<narrowpeakfilename> OR -savn NarrowPeak file name
+# -savr=<regionpeakfilename> OR -savr RegionPeak file name
+# -savd=<rdatafile> OR -savd , save Rdata file
+# -savp=<plotdatafile> OR -savp , save cross-correlation plot
+# -out=<resultfile>, append peakshift result to a file
+# format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>Normalized SCC (NSC)<tab>Relative SCC (RSC)<tab>QualityTag
+# -rf , if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists
+# -clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files
+
+# Arguments supplied on the command line (trailingOnly=TRUE)
+args <- commandArgs(trailingOnly=TRUE)
+# How many arguments were supplied
+nargs <- length(args)
+
+# ###########################################################################
+# AUXILIARY FUNCTIONS
+# ###########################################################################
+
+print.usage <- function() {
+# ===================================
+# Function will print function usage
+# Takes no arguments. Writes the help text to stderr, one option per line.
+# ===================================
+    usage.lines <- c(
+        'Usage: Rscript run_spp_nodups.R <options>',
+        'MANDATORY ARGUMENTS',
+        '-c=<ChIP_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) ',
+        'MANDATORY ARGUMENTS FOR PEAK CALLING',
+        '-i=<Input_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)',
+        'OPTIONAL ARGUMENTS',
+        '-s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-500:5:1500',
+        '-speak=<strPeak>, user-defined cross-correlation peak strandshift',
+        '-x=<min>:<max>, strand shifts to exclude (This is mainly to avoid region around phantom peak) default=10:(readlen+10)',
+        '-p=<nodes> , number of parallel processing nodes, default=0',
+        # Spelling matches the placeholder used in the -fdr parse error message
+        '-fdr=<falseDiscoveryRate> , false discovery rate threshold for peak calling',
+        '-npeak=<numPeaks>, threshold on number of peaks to call',
+        '-tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)',
+        '-filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name',
+        'OUTPUT ARGUMENTS',
+        '-odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)',
+        '-savn=<narrowpeakfilename> OR -savn NarrowPeak file name (fixed width peaks)',
+        '-savr=<regionpeakfilename> OR -savr RegionPeak file name (variable width peaks with regions of enrichment)',
+        '-savd=<rdatafile> OR -savd, save Rdata file',
+        '-savp=<plotdatafile> OR -savp, save cross-correlation plot',
+        '-out=<resultfile>, append peakshift/phantomPeak results to a file',
+        # The unmatched closing parenthesis that used to end this line is dropped
+        ' format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>Normalized SCC (NSC)<tab>Relative SCC (RSC)<tab>QualityTag',
+        '-rf, if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists',
+        '-clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files'
+    )
+    # Every entry becomes exactly one newline-terminated line on stderr
+    cat(paste(usage.lines, '\n', sep=''), sep='', file=stderr())
+} # end: print.usage()
+
+get.file.parts <- function(file.fullpath) {
+# ===================================
+# Splits a file name (optionally with a leading path) into its components.
+# Argument:
+#   file.fullpath : character string holding the file name
+# Returns a list with
+#   path     : everything before the last file separator ('.' if there is none)
+#   fullname : last path component
+#   name     : fullname without its final extension
+#   ext      : final extension including the leading '.', or '' if there is none
+# An empty string yields four empty components. Stops if the argument is not
+# a character value.
+# ===================================
+    if (! is.character(file.fullpath)) {
+        stop('File name must be a string')
+    }
+
+    path.tokens <- strsplit(as.character(file.fullpath), .Platform$file.sep, fixed=TRUE)[[1]] # split on file separator
+    n.tokens <- length(path.tokens)
+
+    # Empty file name: nothing to split
+    if (n.tokens == 0) {
+        return(list(path='', fullname='', name='', ext=''))
+    }
+
+    if (n.tokens == 1) {
+        # No separator present: the file is taken to live in the current directory
+        dir.part <- '.'
+        base.part <- path.tokens
+    } else {
+        # All tokens but the last form the path, the last one is the file name
+        dir.part <- paste(path.tokens[1:(n.tokens-1)], collapse=.Platform$file.sep)
+        base.part <- path.tokens[n.tokens]
+    }
+
+    name.tokens <- strsplit(base.part, '.', fixed=TRUE)[[1]] # split on .
+    n.name.tokens <- length(name.tokens)
+    if (n.name.tokens == 1) {
+        # No '.' in the file name: no extension
+        ext.part <- ''
+        stem.part <- name.tokens
+    } else {
+        # Only the text after the last '.' is the extension; earlier dots stay in the name
+        ext.part <- paste('.', name.tokens[n.name.tokens], sep="")
+        stem.part <- paste(name.tokens[1:(n.name.tokens-1)], collapse=".")
+    }
+
+    return(list(path=dir.part,
+                fullname=base.part,
+                name=stem.part,
+                ext=ext.part))
+} # end: get.file.parts()
+
+parse.arguments <- function(args) {
+# ===================================
+# Function will parse arguments
+# Argument:
+#   args : character vector of command-line options (see print.usage())
+# Returns a list with the fields chip.file, isurl.chip.file, control.file,
+# isurl.control.file, sep.range (min, step, max), sep.peak, ex.range (min, max),
+# n.nodes, fdr, npeak, temp.dir, chrname.rm.pattern, output.odir,
+# output.npeak.file, output.rpeak.file, output.rdata.file, output.plot.file,
+# output.result.file, replace.flag and clean.files.flag.
+# Stops with an error message on any malformed, unknown or missing option and
+# when an input file (or URL) cannot be found.
+# ===================================
+    # Set arguments to default values
+    chip.file <- NA # main ChIP tagAlign/BAM file name
+    isurl.chip.file <- FALSE # flag indicating whether ChIP file is a URL
+    control.file <- NA # control tagAlign/BAM file name
+    isurl.control.file <- FALSE # flag indicating whether control file is a URL
+    sep.min <- -500 # min strand shift
+    sep.max <- 1500 # max strand shift
+    sep.bin <- 5 # increment for strand shift
+    sep.peak <- NA # user-defined peak shift
+    exclude.min <- 10 # lowerbound of strand shift exclusion region
+    exclude.max <- NaN # upperbound of strand shift exclusion region
+    n.nodes <- NA # number of parallel processing nodes
+    fdr <- 0.01 # false discovery rate threshold for peak calling
+    npeak <- NA # threshold on number of peaks to call
+    temp.dir <- tempdir() # temporary directory
+    chrname.rm.pattern <- NA # chromosome name pattern used to remove tags
+    output.odir <- NA # Output directory name
+    output.npeak.file <- NA # Output narrowPeak file name
+    output.rpeak.file <- NA # Output regionPeak file name
+    output.rdata.file <- NA # Rdata file
+    output.plot.file <- NA # cross correlation plot file
+    output.result.file <- NA # result file
+    replace.flag <- FALSE # replace file flag
+    clean.files.flag <- FALSE # file deletion flag
+
+    # ---- local helpers ----
+
+    # Text after the FIRST '=' of an option. The whole remainder is kept, so
+    # values that themselves contain '=' are not truncated. Stops with
+    # missing.msg when nothing follows the '='.
+    value.of <- function(arg, missing.msg) {
+        val <- sub('^[^=]*=', '', arg)
+        if (nchar(val) == 0) {
+            stop(missing.msg, call.=FALSE)
+        }
+        return(val)
+    }
+
+    # Converts text to numbers; stops with bad.msg if any element is not numeric.
+    numeric.of <- function(text, bad.msg) {
+        num <- suppressWarnings(as.numeric(text))
+        if (any(is.na(num))) {
+            stop(bad.msg, call.=FALSE)
+        }
+        return(num)
+    }
+
+    # Handles the "-savX" / "-savX=<file>" options: the bare flag yields NULL,
+    # which means "derive the name from the ChIP file name" further below.
+    save.target <- function(arg, flag, bad.msg) {
+        if (arg == flag) {
+            return(NULL)
+        }
+        return(value.of(arg, bad.msg))
+    }
+
+    # Parse arguments
+    for (each.arg in args) {
+
+        if (grepl('^-c=',each.arg)) { #-c=<chip.file>
+
+            chip.file <- value.of(each.arg, 'No tagAlign/BAM file name provided for parameter -c=')
+
+        } else if (grepl('^-i=',each.arg)) { #-i=<control.file>
+
+            control.file <- value.of(each.arg, 'No tagAlign/BAM file name provided for parameter -i=')
+
+        } else if (grepl('^-s=',each.arg)) { #-s=<sep.min>:<sep.bin>:<sep.max>
+
+            sep.usage <- 'Strand shift limits must be specified as -s=sepmin:sepbin:sepmax'
+            sep.fields <- strsplit(value.of(each.arg, sep.usage), ':', fixed=TRUE)[[1]] # split on :
+            if (length(sep.fields) != 3) { # must have 3 parts
+                stop(sep.usage)
+            }
+            sep.vals <- round(numeric.of(sep.fields, 'Strand shift limits must be numeric values'))
+            sep.min <- sep.vals[1]
+            sep.bin <- sep.vals[2]
+            sep.max <- sep.vals[3]
+            # The step must be strictly positive: other code divides by it
+            if ((sep.min > sep.max) || (sep.bin > (sep.max - sep.min)) || (sep.bin <= 0)) {
+                stop('Illegal separation values -s=sepmin:sepbin:sepmax')
+            }
+
+        } else if (grepl('^-speak=',each.arg)) { #-speak=<sep.peak> , user-defined cross-correlation peak strandshift
+
+            sep.peak <- numeric.of(value.of(each.arg, 'User defined peak shift must be provided as -speak=<sep.peak>'),
+                                   '-speak=<sep.peak>: User defined peak shift must be numeric')
+
+        } else if (grepl('^-x=',each.arg)) { #-x=<exclude.min>:<exclude.max>
+
+            exclude.usage <- 'Exclusion limits must be specified as -x=excludemin:excludemax'
+            exclude.fields <- strsplit(value.of(each.arg, exclude.usage), ':', fixed=TRUE)[[1]] # split on :
+            if (length(exclude.fields) != 2) { # must have 2 parts
+                stop(exclude.usage)
+            }
+            exclude.vals <- round(numeric.of(exclude.fields, 'Exclusion limits must be numeric values'))
+            exclude.min <- exclude.vals[1]
+            exclude.max <- exclude.vals[2]
+            if (exclude.min > exclude.max) {
+                stop('Illegal exclusion limits -x=excludemin:excludemax')
+            }
+
+        } else if (grepl('^-p=',each.arg)) { #-p=<n.nodes> , number of parallel processing nodes, default=NULL
+
+            n.nodes <- round(numeric.of(value.of(each.arg, 'Number of parallel nodes must be provided as -p=<numnodes>'),
+                                        '-p=<numnodes>: numnodes must be numeric'))
+
+        } else if (grepl('^-fdr=',each.arg)) { #-fdr=<fdr> , false discovery rate, default=0.01
+
+            fdr <- numeric.of(value.of(each.arg, 'False discovery rate must be provided as -fdr=<fdr>'),
+                              '-fdr=<falseDiscoveryRate>: false discovery rate must be numeric')
+
+        } else if (grepl('^-npeak=',each.arg)) { #-npeak=<numPeaks> , number of peaks threshold, default=NA
+
+            npeak <- round(numeric.of(value.of(each.arg, 'Threshold on number of peaks must be provided as -npeak=<numPeaks>'),
+                                      '-npeak=<numPeaks>: threshold on number of peaks must be numeric'))
+
+        } else if (grepl('^-tmpdir=',each.arg)) { #-tmpdir=<temp.dir>
+
+            temp.dir <- value.of(each.arg, 'No temporary directory provided for parameter -tmpdir=')
+
+        } else if (grepl('^-filtchr=',each.arg)) { #-filtchr=<chrname.rm.pattern>
+
+            chrname.rm.pattern <- value.of(each.arg, 'No pattern provided for parameter -filtchr=')
+
+        } else if (grepl('^-odir=',each.arg)) { #-odir=<output.odir>
+
+            output.odir <- value.of(each.arg, 'No output directory provided for parameter -odir=')
+
+        } else if (grepl('^-savn(=|$)',each.arg)) { # -savn=<output.npeak.file> OR -savn , save narrowpeak
+
+            output.npeak.file <- save.target(each.arg, '-savn', 'Argument for saving narrowPeak file must be -savn or -savn=<filename>')
+
+        } else if (grepl('^-savr(=|$)',each.arg)) { # -savr=<output.rpeak.file> OR -savr , save regionpeak
+
+            output.rpeak.file <- save.target(each.arg, '-savr', 'Argument for saving regionPeak file must be -savr or -savr=<filename>')
+
+        } else if (grepl('^-savd(=|$)',each.arg)) { # -savd=<output.rdata.file> OR -savd , save Rdata file
+
+            output.rdata.file <- save.target(each.arg, '-savd', 'Argument for saving Rdata file must be -savd or -savd=<filename>')
+
+        } else if (grepl('^-savp(=|$)',each.arg)) { # -savp=<output.plot.file> OR -savp , save cross-correlation plot
+
+            output.plot.file <- save.target(each.arg, '-savp', 'Argument for saving plot file must be -savp or -savp=<filename>')
+
+        } else if (grepl('^-out=',each.arg)) { #-out=<output.result.file>
+
+            output.result.file <- value.of(each.arg, 'No result file provided for parameter -out=')
+
+        } else if (each.arg == '-rf') {
+
+            replace.flag <- TRUE
+
+        } else if (each.arg == '-clean') {
+
+            clean.files.flag <- TRUE
+
+        } else {
+
+            stop('Illegal argument ',each.arg)
+        }
+    }
+    # End: for loop
+
+    # Check mandatory arguments
+    if (is.na(chip.file)) {
+        stop('-c=<tagAlign/BAMFileName> is a mandatory argument')
+    }
+
+    # Both peak outputs need the control file. NULL (bare -savn / -savr) also
+    # counts as "requested" and must be tested before is.na(), which returns a
+    # zero-length result for NULL.
+    peaks.requested <- is.null(output.npeak.file) || !is.na(output.npeak.file) ||
+                       is.null(output.rpeak.file) || !is.na(output.rpeak.file)
+    if (is.na(control.file) && peaks.requested) {
+        stop('-i=<tagAlign/BAMFileName> is required for peak calling')
+    }
+
+    # Check if ChIP and control files are URLs (grepl() gives FALSE for an unset NA control file)
+    isurl.chip.file <- grepl('^https?://',chip.file)
+    isurl.control.file <- grepl('^https?://',control.file)
+
+    # If ChIP file is a URL output.odir MUST be specified
+    if (isurl.chip.file && is.na(output.odir)) {
+        stop('If ChIP file is a URL, then output directory MUST be specified')
+    }
+
+    # Check that ChIP and control files exist
+    # URLs are shell-quoted because they are user input handed to a shell command
+    if (isurl.chip.file) {
+        if (system(paste('wget -q --spider',shQuote(chip.file))) != 0) {
+            stop('ChIP file URL not valid: ',chip.file)
+        }
+    } else if (!file.exists(chip.file)) {
+        stop('ChIP File:',chip.file,' does not exist')
+    }
+
+    if (!is.na(control.file)) {
+        if (isurl.control.file) {
+            if (system(paste('wget -q --spider',shQuote(control.file))) != 0) {
+                stop('Control file URL not valid: ',control.file)
+            }
+        } else if (!file.exists(control.file)) {
+            stop('Control File:',control.file,' does not exist')
+        }
+    }
+
+    # Correct other arguments
+    if (is.na(output.odir)) { # Reconstruct output.odir if not provided
+        output.odir <- get.file.parts(chip.file)$path
+    }
+
+    if (is.null(output.npeak.file)) { # Reconstruct output.npeak.file if NULL
+        output.npeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.narrowPeak', sep=""))
+    }
+
+    if (is.null(output.rpeak.file)) { # Reconstruct output.rpeak.file if NULL
+        output.rpeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.regionPeak', sep=""))
+    }
+
+    if (is.null(output.rdata.file)) { # Reconstruct output.rdata.file if NULL
+        output.rdata.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.Rdata', sep=""))
+    }
+
+    if (is.null(output.plot.file)) { # Reconstruct output.plot.file if NULL
+        output.plot.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.pdf', sep=""))
+    }
+
+    return(list(chip.file=chip.file,
+                isurl.chip.file=isurl.chip.file,
+                control.file=control.file,
+                isurl.control.file=isurl.control.file,
+                sep.range=c(sep.min,sep.bin,sep.max),
+                sep.peak=sep.peak,
+                ex.range=c(exclude.min,exclude.max),
+                n.nodes=n.nodes,
+                fdr=fdr,
+                npeak=npeak,
+                temp.dir=temp.dir,
+                chrname.rm.pattern=chrname.rm.pattern,
+                output.odir=output.odir,
+                output.npeak.file=output.npeak.file,
+                output.rpeak.file=output.rpeak.file,
+                output.rdata.file=output.rdata.file,
+                output.plot.file=output.plot.file,
+                output.result.file=output.result.file,
+                replace.flag=replace.flag,
+                clean.files.flag=clean.files.flag))
+} # end: parse.arguments()
+
+read.align <- function(align.filename) {
+# ===================================
+# Function will read a tagAlign or BAM file
+# Argument:
+#   align.filename : name of an uncompressed tagAlign or BAM file; the format
+#                    is recognised from '.tagAlign' / '.bam' occurring in the name
+# Returns the object produced by read.tagalign.tags() with an extra
+# $read.length field (rounded median of end - start over the first 500 rows).
+# BAM input is first converted to a tagAlign file with samtools and awk; that
+# intermediate file is deleted after reading.
+# Terminates the R session with status 1 if the conversion fails or the
+# format is not recognised.
+# ===================================
+
+    # Read length estimate from the first 500 rows of a tagAlign file
+    # (column 2 = start, column 3 = end)
+    estimate.read.length <- function(tagalign.filename) {
+        head.rows <- read.table(tagalign.filename,nrows=500)
+        return(round(median(head.rows$V3 - head.rows$V2)))
+    }
+
+    if (grepl('(\\.bam)?.*(\\.tagAlign)',align.filename)) { # if tagalign file
+        chip.data <- read.tagalign.tags(align.filename)
+        # get readlength info
+        chip.data$read.length <- estimate.read.length(align.filename)
+    } else if (grepl('(\\.tagAlign)?.*(\\.bam)',align.filename)) { # if bam file
+        # create BAM file name
+        bam2align.filename <- sub('\\.bam','.tagAlign',align.filename)
+        # generate command to convert bam to tagalign
+        # File names are shell-quoted so that paths with spaces or shell
+        # metacharacters survive the pipeline.
+        # NOTE(review): the status of a shell pipeline is that of its last
+        # command, so a samtools failure may not show up in the status check.
+        command <- c(
+            sprintf("samtools view -F 0x0204 -o - %s",shQuote(align.filename)),
+            paste("awk 'BEGIN{FS=" , '"\t"' , ";OFS=", '"\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}', "' 1> ", shQuote(bam2align.filename), sep="")
+        )
+        command <- paste(command,collapse=" | ")
+        # Run command
+        status <- system(command,intern=FALSE,ignore.stderr=FALSE)
+        if ((status != 0) || !file.exists(bam2align.filename)) {
+            cat(sprintf("Error converting BAM to tagalign file: %s\n",align.filename),file=stderr())
+            q(save="no",status=1)
+        }
+        # read converted BAM file
+        chip.data <- read.tagalign.tags(bam2align.filename)
+        # get readlength info
+        chip.data$read.length <- estimate.read.length(bam2align.filename)
+        # delete temporary tagalign file
+        file.remove(bam2align.filename)
+    } else {
+        # The message now uses the real argument name; the former reference to
+        # the undefined 'align.fname' raised an error before it could be printed.
+        cat(sprintf("Error:Unknown file format for file:%s\n",align.filename),file=stderr())
+        q(save="no",status=1)
+    }
+    return(chip.data)
+} # end: read.align()
+
+print.run.params <- function(params){
+# ===================================
+# Output run parameters
+# Argument:
+#   params : parameter list as returned by parse.arguments()
+# Writes the labelled parameter values to stdout.
+# The values are taken from the params argument; the function used to read
+# the global variable iparams and ignore its argument.
+# ===================================
+    cat('################\n',file=stdout())
+    # sep.range contributes three values and ex.range two, matching the
+    # three strandshift labels and two exclusion labels below.
+    cat(params$chip.file,
+        params$control.file,
+        params$sep.range,
+        params$sep.peak,
+        params$ex.range,
+        params$n.nodes,
+        params$fdr,
+        params$npeak,
+        params$output.odir,
+        params$output.npeak.file,
+        params$output.rpeak.file,
+        params$output.rdata.file,
+        params$output.plot.file,
+        params$output.result.file,
+        params$replace.flag,
+        labels=c('ChIP data:','Control data:', 'strandshift(min):','strandshift(step):','strandshift(max)','user-defined peak shift',
+                 'exclusion(min):','exclusion(max):','num parallel nodes:','FDR threshold:','NumPeaks Threshold:','Output Directory:',
+                 'narrowPeak output file name:', 'regionPeak output file name:', 'Rdata filename:',
+                 'plot pdf filename:','result filename:','Overwrite files?:'),
+        fill=18,
+        file=stdout())
+    cat('\n',file=stdout())
+} # end: print.run.params()
+
+check.replace.flag <- function(params){
+# ===================================
+# Check if files exist
+# Argument:
+#   params : parameter list as returned by parse.arguments()
+# If replace.flag is NOT set, terminates the R session with status 1 when a
+# requested output file already exists. Returns NULL invisibly otherwise.
+# The values are taken from the params argument; the function used to read
+# the global variable iparams and ignore its argument.
+# ===================================
+    if (params$replace.flag) {
+        return(invisible(NULL))
+    }
+
+    fields <- c('output.npeak.file', 'output.rpeak.file', 'output.plot.file', 'output.rdata.file')
+    labels <- c('narrowPeak', 'regionPeak', 'Plot', 'Rdata')
+    # The peak files are gzip-compressed after they are written, so an earlier
+    # run leaves '<name>.gz' on disk rather than '<name>'.
+    gzipped <- c(TRUE, TRUE, FALSE, FALSE)
+
+    for (i in seq_along(fields)) {
+        target <- params[[fields[i]]]
+        if (is.null(target) || is.na(target)) {
+            next # this output was not requested
+        }
+        candidates <- target
+        if (gzipped[i]) {
+            candidates <- c(target, paste(target, '.gz', sep=''))
+        }
+        if (any(file.exists(candidates))) {
+            cat(labels[i], ' file already exists. Aborting Run. Use -rf if you want to overwrite\n', sep='', file=stderr())
+            q(save="no",status=1)
+        }
+    }
+    return(invisible(NULL))
+}
+
+# #############################################################################
+# MAIN FUNCTION
+# #############################################################################
+
+# Check number of arguments
+# At least the mandatory -c option must be present. The upper limit is one
+# slot for each of the 18 options parse.arguments() understands (-c -i -s
+# -speak -x -p -fdr -npeak -tmpdir -filtchr -odir -savn -savr -savd -savp
+# -out -rf -clean); the former limit of 17 rejected a call using all of them.
+minargs <- 1
+maxargs <- 18
+if (nargs < minargs || nargs > maxargs) {
+    print.usage()
+    q(save="no",status=1)
+}
+
+# Parse arguments
+# iparams is the list returned by parse.arguments(); it holds the fields
+# iparams$chip.file
+# iparams$isurl.chip.file
+# iparams$control.file
+# iparams$isurl.control.file
+# iparams$sep.range
+# iparams$sep.peak
+# iparams$ex.range
+# iparams$n.nodes
+# iparams$fdr
+# iparams$npeak
+# iparams$temp.dir
+# iparams$chrname.rm.pattern
+# iparams$output.odir
+# iparams$output.npeak.file
+# iparams$output.rpeak.file
+# iparams$output.rdata.file
+# iparams$output.plot.file
+# iparams$output.result.file
+# iparams$replace.flag
+# iparams$clean.files.flag
+iparams <- parse.arguments(args)
+
+# Print run parameters
+print.run.params(iparams)
+
+# Check if output files exist
+# (quits the session if an output file exists and -rf was not given)
+check.replace.flag(iparams)
+
+# curr.chip.file and curr.control.file always point to the original ChIP and control files on disk
+# ta.chip.filename & ta.control.filename always point to the final but temporary versions of the ChIP and control files that will be passed to read.align
+
+# Download ChIP and control files if necessary to temp.dir
+# The directory and URL are shell-quoted because both come from the command
+# line and are handed to a shell.
+if (iparams$isurl.chip.file) {
+    curr.chip.file <- file.path(iparams$temp.dir, get.file.parts(iparams$chip.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix
+    cat('Downloading ChIP file:',iparams$chip.file,"\n",file=stdout())
+    if (system(paste('wget -N -q -P',shQuote(iparams$temp.dir),shQuote(iparams$chip.file))) != 0) {
+        stop('Error downloading ChIP file:',iparams$chip.file)
+    }
+} else {
+    curr.chip.file <- iparams$chip.file # file is in original directory
+}
+
+if (iparams$isurl.control.file) {
+    curr.control.file <- file.path(iparams$temp.dir, get.file.parts(iparams$control.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix
+    cat('Downloading control file:',iparams$control.file,"\n",file=stdout())
+    if (system(paste('wget -N -q -P',shQuote(iparams$temp.dir),shQuote(iparams$control.file))) != 0) {
+        stop('Error downloading Control file:',iparams$control.file)
+    }
+} else {
+    curr.control.file <- iparams$control.file # file is in original directory
+}
+
+# unzip ChIP and input files if required AND copy to temp directory
+# Changes from the earlier version of this section:
+#  - file names are shell-quoted for gunzip;
+#  - a failed copy/move now stops the run here instead of surfacing later as
+#    a read error;
+#  - a move that file.rename() cannot perform (e.g. across filesystems) falls
+#    back to copy + delete.
+if (get.file.parts(curr.chip.file)$ext == '.gz') {
+    ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix]
+    cat('Decompressing ChIP file\n',file=stdout())
+    if (system(paste("gunzip -c",shQuote(curr.chip.file),">",shQuote(ta.chip.filename))) != 0) {
+        stop('Unable to decompress file:', iparams$chip.file)
+    }
+    if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set
+        file.remove(curr.chip.file)
+    }
+} else {
+    ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$fullname, tmpdir=iparams$temp.dir)
+    if (iparams$clean.files.flag) {
+        # move file to temp.dir/[filename][randsuffix]
+        if (!suppressWarnings(file.rename(curr.chip.file,ta.chip.filename))) {
+            if (!file.copy(curr.chip.file,ta.chip.filename)) {
+                stop('Unable to copy file:', iparams$chip.file)
+            }
+            file.remove(curr.chip.file)
+        }
+    } else if (!file.copy(curr.chip.file,ta.chip.filename)) { # copy file to temp.dir/[filename][randsuffix]
+        stop('Unable to copy file:', iparams$chip.file)
+    }
+}
+
+if (! is.na(iparams$control.file)) {
+    if (get.file.parts(curr.control.file)$ext == '.gz') {
+        ta.control.filename <- tempfile(get.file.parts(curr.control.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix]
+        cat('Decompressing control file\n',file=stdout())
+        if (system(paste("gunzip -c",shQuote(curr.control.file),">",shQuote(ta.control.filename))) != 0) {
+            stop('Unable to decompress file:', iparams$control.file)
+        }
+        if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set
+            file.remove(curr.control.file)
+        }
+    } else {
+        ta.control.filename <- tempfile(get.file.parts(curr.control.file)$fullname, tmpdir=iparams$temp.dir)
+
+        if (iparams$clean.files.flag) {
+            # move file to temp.dir/[filename][randsuffix]
+            if (!suppressWarnings(file.rename(curr.control.file,ta.control.filename))) {
+                if (!file.copy(curr.control.file,ta.control.filename)) {
+                    stop('Unable to copy file:', iparams$control.file)
+                }
+                file.remove(curr.control.file)
+            }
+        } else if (!file.copy(curr.control.file,ta.control.filename)) { # copy file to temp.dir/[filename][randsuffix]
+            stop('Unable to copy file:', iparams$control.file)
+        }
+    }
+}
+
+# Remove downloaded files
+# Only files that were fetched from a URL are deleted here, and only if they still exist
+# (the staging step above may already have removed or moved them).
+if (iparams$isurl.chip.file & file.exists(curr.chip.file)) {
+ file.remove(curr.chip.file)
+}
+
+# The control copy is considered only when a control file was supplied
+if (! is.na(iparams$control.file)) {
+ if (iparams$isurl.control.file & file.exists(curr.control.file)) {
+ file.remove(curr.control.file)
+ }
+}
+
+# Load SPP library
+library(spp)
+
+# Read ChIP tagAlign/BAM files
+# read.align (defined outside this section) loads the staged temporary file, which is deleted right after loading.
+cat("Reading ChIP tagAlign/BAM file",iparams$chip.file,"\n",file=stdout())
+chip.data <- read.align(ta.chip.filename)
+cat("ChIP data read length",chip.data$read.length,"\n",file=stdout())
+file.remove(ta.chip.filename) # Delete temporary file
+# An empty tag list is treated as a file-format error
+if (length(chip.data$tags)==0) {
+ stop('Error in ChIP file format:', iparams$chip.file)
+}
+# Remove illegal chromosome names
+# Chromosomes whose name matches chrname.rm.pattern are dropped from both $tags and $quality
+if (! is.na(iparams$chrname.rm.pattern)) {
+ selectidx <- which(grepl(iparams$chrname.rm.pattern,names(chip.data$tags))==FALSE)
+ chip.data$tags <- chip.data$tags[selectidx]
+ chip.data$quality <- chip.data$quality[selectidx]
+}
+# Total number of tags over the remaining chromosomes
+chip.data$num.tags <- sum(unlist(lapply(chip.data$tags,function(d) length(d))))
+
+# Read Control tagAlign/BAM files
+# Mirrors the ChIP loading step: load the staged temporary file, delete it, reject an empty
+# tag list, drop chromosomes matching chrname.rm.pattern and count the remaining tags.
+if (! is.na(iparams$control.file)) {
+  cat("Reading Control tagAlign/BAM file",iparams$control.file,"\n",file=stdout())
+  control.data <- read.align(ta.control.filename)
+  file.remove(ta.control.filename) # Delete temporary file
+  if (length(control.data$tags)==0) {
+    # report the control file (not the ChIP file) as the offending input
+    stop('Error in control file format:', iparams$control.file)
+  }
+  cat("Control data read length",control.data$read.length,"\n",file=stdout())
+  # Remove illegal chromosome names
+  if (! is.na(iparams$chrname.rm.pattern)) {
+    selectidx <- which(grepl(iparams$chrname.rm.pattern,names(control.data$tags))==FALSE)
+    control.data$tags <- control.data$tags[selectidx]
+    control.data$quality <- control.data$quality[selectidx]
+  }
+  control.data$num.tags <- sum(unlist(lapply(control.data$tags,function(d) length(d))))
+}
+
+# Open multiple processes if required
+# cluster.nodes stays NULL for a serial run; otherwise a snow cluster with n.nodes workers is created
+if (is.na(iparams$n.nodes)) {
+ cluster.nodes <- NULL
+} else {
+ library(snow)
+ cluster.nodes <- makeCluster(iparams$n.nodes)
+}
+
+# #################################
+# Calculate cross-correlation for various strand shifts
+# #################################
+cat("Calculating peak characteristics\n",file=stdout())
+# crosscorr
+# $cross.correlation : Cross-correlation profile as an $x/$y data.frame
+# $peak : Position ($x) and height ($y) of automatically detected cross-correlation peak.
+# $whs: Optimized window half-size for binding detection (based on the width of the cross-correlation peak)
+# sep.range is used as c(start, step, end): elements 1 and 3 give the shift range, element 2 the bin size.
+# Tag-anomaly removal is switched off and all tags are accepted regardless of quality.
+crosscorr <- get.binding.characteristics(chip.data,
+ remove.tag.anomalies = F,
+ srange=iparams$sep.range[c(1,3)],
+ bin=iparams$sep.range[2],
+ accept.all.tags=T,
+ cluster=cluster.nodes)
+# Release the workers as soon as the cross-correlation is done
+if (!is.na(iparams$n.nodes)) {
+ stopCluster(cluster.nodes)
+}
+
+# Smooth the cross-correlation curve if required
+# cc is a working copy; crosscorr$cross.correlation itself stays unsmoothed.
+cc <- crosscorr$cross.correlation
+# NOTE(review): min.cc is simply the LAST row of the profile (largest index); no minimum is actually searched for.
+crosscorr$min.cc <- crosscorr$cross.correlation[ length(crosscorr$cross.correlation$y) , ] # minimum value and shift of cross-correlation
+cat("Minimum cross-correlation value", crosscorr$min.cc$y,"\n",file=stdout())
+cat("Minimum cross-correlation shift", crosscorr$min.cc$x,"\n",file=stdout())
+# The bandwidth is forced to an odd number of bins
+sbw <- 2*floor(ceiling(5/iparams$sep.range[2]) / 2) + 1 # smoothing bandwidth
+cc$y <- runmean(cc$y,sbw,alg="fast")
+
+# Compute cross-correlation peak
+# A peak is a position where the lag-bw difference switches from non-negative to negative.
+bw <- ceiling(2/iparams$sep.range[2]) # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks
+peakidx <- (diff(cc$y,bw)>=0) # cc[i] >= cc[i-bw]
+peakidx <- diff(peakidx,bw)
+peakidx <- which(peakidx==-1) + bw
+
+# exclude peaks from the excluded region
+# If no upper bound was given (NaN), it defaults to read length + 10
+if ( is.nan(iparams$ex.range[2]) ) {
+ iparams$ex.range[2] <- chip.data$read.length+10
+}
+# Keep peaks below ex.range[1], above ex.range[2], or at a negative shift
+peakidx <- peakidx[(cc$x[peakidx] < iparams$ex.range[1]) | (cc$x[peakidx] > iparams$ex.range[2]) | (cc$x[peakidx] < 0) ]
+cc <- cc[peakidx,]
+
+# Find max peak position and other peaks within 0.9*max_peakvalue that are further away from maxpeakposition
+maxpeakidx <- which.max(cc$y)
+maxpeakshift <- cc$x[maxpeakidx]
+maxpeakval <- cc$y[maxpeakidx]
+peakidx <-which((cc$y >= 0.9*maxpeakval) & (cc$x >= maxpeakshift))
+cc <- cc[peakidx,]
+
+# sort the peaks and get the top 3
+sortidx <- order(cc$y,decreasing=TRUE)
+sortidx <- sortidx[c(1:min(3,length(sortidx)))]
+cc.peak <- cc[sortidx,]
+
+# Override peak shift if user supplies peak shift
+# The correlation at the user-supplied shift is linearly interpolated from the unsmoothed profile
+# (rule=2: values outside the profile range take the nearest end value).
+if (! is.na(iparams$sep.peak)) {
+ cc.peak <- approx(crosscorr$cross.correlation$x,crosscorr$cross.correlation$y,iparams$sep.peak,rule=2)
+}
+cat("Top 3 cross-correlation values", paste(cc.peak$y,collapse=","),"\n",file=stdout())
+cat("Top 3 estimates for fragment length",paste(cc.peak$x,collapse=","),"\n",file=stdout())
+
+# Reset values in crosscorr
+# The first entry of cc.peak replaces the peak reported by get.binding.characteristics
+crosscorr$peak$x <- cc.peak$x[1]
+crosscorr$peak$y <- cc.peak$y[1]
+
+# Compute window half size
+# whs is the largest shift whose correlation is at least one third of the way up from min.cc to the peak
+whs.thresh <- crosscorr$min.cc$y + (crosscorr$peak$y - crosscorr$min.cc$y)/3
+crosscorr$whs <- max(crosscorr$cross.correlation$x[crosscorr$cross.correlation$y >= whs.thresh])
+cat("Window half size",crosscorr$whs,"\n",file=stdout())
+
+# Compute phantom peak coefficient
+# The phantom peak is the highest correlation among shifts in
+# [read.length - round(2*bin), read.length + round(1.5*bin)], where bin is sep.range[2]
+ph.peakidx <- which( ( crosscorr$cross.correlation$x >= ( chip.data$read.length - round(2*iparams$sep.range[2]) ) ) &
+  ( crosscorr$cross.correlation$x <= ( chip.data$read.length + round(1.5*iparams$sep.range[2]) ) ) )
+ph.peakidx <- ph.peakidx[ which.max(crosscorr$cross.correlation$y[ph.peakidx]) ]
+crosscorr$phantom.cc <- crosscorr$cross.correlation[ph.peakidx,]
+cat("Phantom peak location",crosscorr$phantom.cc$x,"\n",file=stdout())
+cat("Phantom peak Correlation",crosscorr$phantom.cc$y,"\n",file=stdout())
+# NSC: peak correlation divided by the correlation stored in min.cc
+crosscorr$phantom.coeff <- crosscorr$peak$y / crosscorr$min.cc$y
+cat("Normalized Strand cross-correlation coefficient (NSC)",crosscorr$phantom.coeff,"\n",file=stdout())
+# RSC: (peak - min) / (phantom peak - min)
+crosscorr$rel.phantom.coeff <- (crosscorr$peak$y - crosscorr$min.cc$y) / (crosscorr$phantom.cc$y - crosscorr$min.cc$y)
+cat("Relative Strand Cross correlation Coefficient (RSC)",crosscorr$rel.phantom.coeff,"\n",file=stdout())
+# Quality tag from RSC: [0,0.25) -> -2, [0.25,0.5) -> -1, [0.5,1) -> 0, [1,1.5) -> 1, >=1.5 -> 2.
+# A negative RSC matches no branch and leaves the tag NA.
+crosscorr$phantom.quality.tag <- NA
+if ( (crosscorr$rel.phantom.coeff >= 0) & (crosscorr$rel.phantom.coeff < 0.25) ) {
+  crosscorr$phantom.quality.tag <- -2
+} else if ( (crosscorr$rel.phantom.coeff >= 0.25) & (crosscorr$rel.phantom.coeff < 0.5) ) {
+  crosscorr$phantom.quality.tag <- -1
+} else if ( (crosscorr$rel.phantom.coeff >= 0.5) & (crosscorr$rel.phantom.coeff < 1) ) {
+  crosscorr$phantom.quality.tag <- 0
+} else if ( (crosscorr$rel.phantom.coeff >= 1) & (crosscorr$rel.phantom.coeff < 1.5) ) {
+  crosscorr$phantom.quality.tag <- 1
+} else if ( (crosscorr$rel.phantom.coeff >= 1.5) ) {
+  crosscorr$phantom.quality.tag <- 2
+}
+cat("Phantom Peak Quality Tag",crosscorr$phantom.quality.tag,"\n",file=stdout())
+
+# Output result to result file if required
+# One tab-separated line is appended per run, with the columns in this order:
+# Filename, numReads, Peak_shift(s), Peak_Correlation(s), PhantomPeak_shift (the original header called this Read_length),
+# PhantomPeak_Correlation, Min_Correlation_Shift, Min_Correlation, Normalized_CrossCorrelation_Coefficient,
+# Relative_CrossCorrelation_Coefficient, QualityTag
+if (! is.na(iparams$output.result.file)) {
+ cat(get.file.parts(iparams$chip.file)$fullname,
+ chip.data$num.tags,
+ paste(cc.peak$x,collapse=","),
+ paste(cc.peak$y,collapse=","),
+ crosscorr$phantom.cc$x,
+ crosscorr$phantom.cc$y,
+ crosscorr$min.cc$x,
+ crosscorr$min.cc$y,
+ crosscorr$phantom.coeff,
+ crosscorr$rel.phantom.coeff,
+ crosscorr$phantom.quality.tag,
+ sep="\t",
+ file=iparams$output.result.file,
+ append=TRUE)
+ # the line terminator is written separately so that it is not preceded by a tab
+ cat("\n",
+ file=iparams$output.result.file,
+ append=TRUE)
+}
+
+# Save figure if required
+# Plots the unsmoothed profile; dashed lines (lty=2) mark the candidate peak shifts (col=2) and the phantom peak (col=4)
+if (! is.na(iparams$output.plot.file)) {
+ pdf(file=iparams$output.plot.file,width=5,height=5)
+ par(mar = c(4,3.5,2,0.5), mgp = c(1.5,0.5,0), cex = 0.8);
+ plot(crosscorr$cross.correlation,
+ type='l',
+ xlab=sprintf("strand-shift (%s)",paste(cc.peak$x,collapse=",")),
+ ylab="cross-correlation")
+ abline(v=cc.peak$x,lty=2,col=2)
+ abline(v=crosscorr$phantom.cc$x,lty=2,col=4)
+ title(main=get.file.parts(iparams$chip.file)$fullname,
+ sub=sprintf("NSC=%g,RSC=%g,Qtag=%d",crosscorr$phantom.coeff,crosscorr$rel.phantom.coeff,crosscorr$phantom.quality.tag))
+ dev.off();
+}
+
+# Save RData file if required
+# Stores the parameters, the cross-correlation results and the selected peaks
+if (! is.na(iparams$output.rdata.file)) {
+ save(iparams,
+ crosscorr,
+ cc.peak,
+ file=iparams$output.rdata.file);
+}
+
+# #################################
+# Call peaks
+# #################################
+# Runs only when a narrowPeak and/or regionPeak output file was requested.
+# NOTE(review): control.data is used unconditionally below; presumably argument parsing
+# requires a control file whenever peak calling is requested - confirm.
+
+if ( !is.na(iparams$output.npeak.file) || !is.na(iparams$output.rpeak.file) ) {
+
+  # Remove local tag anomalies
+  # From here on chip.data / control.data hold only the filtered per-chromosome tag lists
+  cat('Removing read stacks\n',file=stdout())
+  chip.data <- remove.local.tag.anomalies(chip.data$tags)
+  control.data <- remove.local.tag.anomalies(control.data$tags)
+
+  # Open multiple processes if required
+  if (is.na(iparams$n.nodes)) {
+    cluster.nodes <- NULL
+  } else {
+    cluster.nodes <- makeCluster(iparams$n.nodes)
+  }
+
+  # Find peaks
+  # When a fixed number of peaks is requested, the FDR cutoff is set to 0.99
+  cat('Finding peaks\n',file=stdout())
+  if (!is.na(iparams$npeak)) {
+    iparams$fdr <- 0.99
+  }
+  narrow.peaks <- find.binding.positions(signal.data=chip.data,control.data=control.data,fdr=iparams$fdr,method=tag.lwcc,whs=crosscorr$whs,cluster=cluster.nodes)
+  if (!is.na(iparams$n.nodes)) {
+    stopCluster(cluster.nodes)
+  }
+  cat(paste("Detected",sum(unlist(lapply(narrow.peaks$npl,function(d) length(d$x)))),"peaks"),"\n",file=stdout())
+
+  # Write to narrowPeak file
+  if (!is.na(iparams$output.npeak.file)) {
+    write.narrowpeak.binding(narrow.peaks,iparams$output.npeak.file,margin=round(crosscorr$whs/2),npeaks=iparams$npeak)
+    system(paste('gzip -f ',iparams$output.npeak.file))
+  }
+
+  # Compute and write regionPeak file
+  # region.peaks is defined even when no regionPeak file is requested, so that the
+  # save() call below does not fail with "object 'region.peaks' not found"
+  region.peaks <- NULL
+  if (!is.na(iparams$output.rpeak.file)) {
+    region.peaks <- add.broad.peak.regions(chip.data,control.data,narrow.peaks,window.size=max(50,round(crosscorr$whs/4)),z.thr=9)
+    write.narrowpeak.binding(region.peaks,iparams$output.rpeak.file,margin=round(crosscorr$whs/2),npeaks=iparams$npeak)
+    system(paste('gzip -f ',iparams$output.rpeak.file))
+  }
+
+  # Save Rdata file
+  # Rewrites the RData file, now including the peak calls
+  if (! is.na(iparams$output.rdata.file)) {
+    save(iparams,
+         crosscorr,
+         cc.peak,
+         narrow.peaks,
+         region.peaks,
+         file=iparams$output.rdata.file);
+  }
+
+}
+
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/DESCRIPTION b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/DESCRIPTION
new file mode 100755
index 0000000..59eeb71
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/DESCRIPTION
@@ -0,0 +1,12 @@
+Package: spp
+Type: Package
+Title: some description
+Version: 1.0
+Date: 2008-11-10
+Author: Peter K
+Depends: caTools
+Maintainer: peterK<peterk@compbio.med.harvard.edu>
+Description: Describe the package
+License: GPL-2
+LazyLoad: yes
+Packaged: Wed Nov 12 10:42:54 2008; vidhuch
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/NAMESPACE b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/NAMESPACE
new file mode 100755
index 0000000..caf30e6
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/NAMESPACE
@@ -0,0 +1,3 @@
+useDynLib(spp)
+
+exportPattern("^[^\\.]")
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/R/zroutines.R b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/R/zroutines.R
new file mode 100755
index 0000000..ece76f3
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/R/zroutines.R
@@ -0,0 +1,2501 @@
+#library(caTools)
+#dyn.load("src/bed2vector.so");
+#dyn.load("src/wdl.so");
+#dyn.load("src/peaks.so");
+#dyn.load("src/cdensum.so");
+
+
+# -------- ROUTINES FOR READING IN THE DATA FILES ------------
+# fix.chromosome.names : remove ".fa" suffix from match sequence names
+# Read ELAND alignments through the package's native reader.
+#   filename             - path handed to the native routine
+#   read.tag.names       - if TRUE, the result also carries a "names" element
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+#   max.eland.tag.length - coerced to integer storage and passed to the native routine
+#   extended, multi      - select "read_eland_extended" / "read_eland_multi" instead of
+#                          "read_eland"; multi takes precedence when both are TRUE
+# Returns list(tags=, quality=[, names=]); each element is a list keyed by chromosome whose
+# entries are ordered by abs(tag value).
+read.eland.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T,max.eland.tag.length=-1,extended=F,multi=F) {
+  name.flag <- if(read.tag.names) 1L else 0L
+  storage.mode(max.eland.tag.length) <- "integer"
+
+  reader <- "read_eland"
+  if(extended) reader <- "read_eland_extended"
+  if(multi) reader <- "read_eland_multi"
+
+  # reorder one chromosome's record by absolute tag value
+  reorder <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    if(read.tag.names) {
+      rec$s <- rec$s[ord]
+    }
+    rec
+  }
+  by.chr <- lapply(.Call(reader,filename,name.flag,max.eland.tag.length),reorder)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  # split the records into parallel tag / quality (/ name) lists
+  out <- list(tags=lapply(by.chr,function(rec) rec$t),
+              quality=lapply(by.chr,function(rec) rec$n))
+  if(read.tag.names) {
+    out <- c(out,list(names=lapply(by.chr,function(rec) rec$s)))
+  }
+  out
+}
+
+# Read tagAlign alignments through the native "read_tagalign" routine.
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+#   fix.quality          - if TRUE, the quality column is converted into integer bin ranks
+#                          (Anshul's revised scheme, see below)
+# Returns list(tags=, quality=), each keyed by chromosome and ordered by abs(tag value).
+read.tagalign.tags <- function(filename,fix.chromosome.names=T,fix.quality=T) {
+  normalize <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    if(fix.quality) { # Anshul: changed the way the quality field is processed
+      # when the smallest value is below 0.5 the column is first transformed to ceiling(1000/4^n)
+      if(min(rec$n) < 0.5) {
+        rec$n <- ceiling(1000/4^rec$n)
+      }
+      # bin edges are 0 plus the distinct values; larger values end up with smaller ranks
+      edges <- unique(sort(c(0,unique(rec$n))))
+      rec$n <- length(edges)-1-cut(rec$n,breaks=edges,labels=F)
+    }
+    rec
+  }
+  by.chr <- lapply(.Call("read_tagalign",filename),normalize)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  list(tags=lapply(by.chr,function(rec) rec$t),
+       quality=lapply(by.chr,function(rec) rec$n))
+}
+
+
+# Read short-format Arachne alignments through the native "read_arachne" routine.
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+# Returns list(tags=, quality=), each keyed by chromosome and ordered by abs(tag value).
+read.short.arachne.tags <- function(filename,fix.chromosome.names=F) {
+  reorder <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    rec
+  }
+  by.chr <- lapply(.Call("read_arachne",filename),reorder)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  list(tags=lapply(by.chr,function(rec) rec$t),
+       quality=lapply(by.chr,function(rec) rec$n))
+}
+
+
+# Read long-format Arachne alignments through the native "read_arachne_long" routine.
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+# Returns list(tags=, quality=, length=), each keyed by chromosome and ordered by abs(tag value).
+read.arachne.tags <- function(filename,fix.chromosome.names=F) {
+  reorder <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    rec$l <- rec$l[ord]
+    rec
+  }
+  by.chr <- lapply(.Call("read_arachne_long",filename),reorder)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  list(tags=lapply(by.chr,function(rec) rec$t),
+       quality=lapply(by.chr,function(rec) rec$n),
+       length=lapply(by.chr,function(rec) rec$l))
+}
+
+# Read Bowtie alignments through the native "read_bowtie" routine.
+#   read.tag.names       - if TRUE, the result also carries a "names" element
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+# Returns list(tags=, quality=[, names=]), each keyed by chromosome and ordered by abs(tag value).
+read.bowtie.tags <- function(filename,read.tag.names=F,fix.chromosome.names=F) {
+  name.flag <- if(read.tag.names) 1L else 0L
+
+  reorder <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    if(read.tag.names) {
+      rec$s <- rec$s[ord]
+    }
+    rec
+  }
+  by.chr <- lapply(.Call("read_bowtie",filename,name.flag),reorder)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  out <- list(tags=lapply(by.chr,function(rec) rec$t),
+              quality=lapply(by.chr,function(rec) rec$n))
+  if(read.tag.names) {
+    out <- c(out,list(names=lapply(by.chr,function(rec) rec$s)))
+  }
+  out
+}
+
+# Read BAM alignments through the native "read_bam" routine.
+#   read.tag.names       - if TRUE, the result also carries a "names" element
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+# Returns list(tags=, quality=[, names=]), each keyed by chromosome and ordered by abs(tag value).
+read.bam.tags <- function(filename,read.tag.names=F,fix.chromosome.names=F) {
+  name.flag <- if(read.tag.names) 1L else 0L
+
+  reorder <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    if(read.tag.names) {
+      rec$s <- rec$s[ord]
+    }
+    rec
+  }
+  by.chr <- lapply(.Call("read_bam",filename,name.flag),reorder)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  out <- list(tags=lapply(by.chr,function(rec) rec$t),
+              quality=lapply(by.chr,function(rec) rec$n))
+  if(read.tag.names) {
+    out <- c(out,list(names=lapply(by.chr,function(rec) rec$s)))
+  }
+  out
+}
+
+
+# Read Helicos alignments through the native "read_helicostabf" routine.
+#   read.tag.names       - if TRUE, the result also carries a "names" element
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+#   include.length.info  - accepted but not used; the "length" element is always returned
+# Returns list(tags=, quality=, length=[, names=]), each keyed by chromosome and ordered by
+# abs(tag value).
+read.helicos.tags <- function(filename,read.tag.names=F,fix.chromosome.names=F,include.length.info=T) {
+  name.flag <- if(read.tag.names) 1L else 0L
+
+  reorder <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    rec$l <- rec$l[ord]
+    if(read.tag.names) {
+      rec$s <- rec$s[ord]
+    }
+    rec
+  }
+  by.chr <- lapply(.Call("read_helicostabf",filename,name.flag),reorder)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  out <- list(tags=lapply(by.chr,function(rec) rec$t),
+              quality=lapply(by.chr,function(rec) rec$n),
+              length=lapply(by.chr,function(rec) rec$l))
+  if(read.tag.names) {
+    out <- c(out,list(names=lapply(by.chr,function(rec) rec$s)))
+  }
+  out
+}
+
+# Read MAQ map alignments through the native "read_maqmap" routine.
+#   read.tag.names       - if TRUE, the result also carries a "names" element
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+# Returns list(tags=, quality=[, names=]), each keyed by chromosome and ordered by abs(tag value).
+read.maqmap.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) {
+  name.flag <- if(read.tag.names) 1L else 0L
+
+  reorder <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    if(read.tag.names) {
+      rec$s <- rec$s[ord]
+    }
+    rec
+  }
+  by.chr <- lapply(.Call("read_maqmap",filename,name.flag),reorder)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  out <- list(tags=lapply(by.chr,function(rec) rec$t),
+              quality=lapply(by.chr,function(rec) rec$n))
+  if(read.tag.names) {
+    out <- c(out,list(names=lapply(by.chr,function(rec) rec$s)))
+  }
+  out
+}
+
+
+# Read binary MAQ map alignments through the native "read_binmaqmap" routine.
+#   read.tag.names       - if TRUE, the result also carries a "names" element
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+# Returns list(tags=, quality=[, names=]), each keyed by chromosome and ordered by abs(tag value).
+read.bin.maqmap.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) {
+  name.flag <- if(read.tag.names) 1L else 0L
+
+  reorder <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    if(read.tag.names) {
+      rec$s <- rec$s[ord]
+    }
+    rec
+  }
+  by.chr <- lapply(.Call("read_binmaqmap",filename,name.flag),reorder)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  out <- list(tags=lapply(by.chr,function(rec) rec$t),
+              quality=lapply(by.chr,function(rec) rec$n))
+  if(read.tag.names) {
+    out <- c(out,list(names=lapply(by.chr,function(rec) rec$s)))
+  }
+  out
+}
+
+
+# read in tags from an extended eland format with match length information
+#   read.tag.names       - if TRUE, the result also carries a "names" element
+#   fix.chromosome.names - if TRUE, every ".fa" occurrence is removed from the chromosome names
+# Returns list(tags=, quality=[, names=]) keyed by chromosome. Quality is rebuilt per tag as
+# (largest l over all chromosomes - l) + n/10, so the l term dominates and n/10 refines it.
+read.meland.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) {
+  name.flag <- if(read.tag.names) 1L else 0L
+
+  reorder <- function(rec) {
+    ord <- order(abs(rec$t))
+    rec$t <- rec$t[ord]
+    rec$n <- rec$n[ord]
+    rec$l <- rec$l[ord]
+    if(read.tag.names) {
+      rec$s <- rec$s[ord]
+    }
+    rec
+  }
+  by.chr <- lapply(.Call("read_meland",filename,name.flag),reorder)
+
+  if(fix.chromosome.names) {
+    names(by.chr) <- gsub("\\.fa","",names(by.chr))
+  }
+
+  # named vector of chromosome names, so that lapply() yields a chromosome-keyed list
+  chr.names <- names(by.chr)
+  names(chr.names) <- chr.names
+
+  # reformulate quality scores into monotonic integers
+  longest <- max(unlist(lapply(by.chr,function(rec) max(rec$l))))
+  qual <- lapply(chr.names,function(chr) (longest-by.chr[[chr]]$l)+by.chr[[chr]]$n/10)
+
+  out <- list(tags=lapply(by.chr,function(rec) rec$t),quality=qual)
+  if(read.tag.names) {
+    out <- c(out,list(names=lapply(by.chr,function(rec) rec$s)))
+  }
+  out
+}
+
+# -------- ROUTINES FOR ASSESSING BINDING PATTERN AND SELECTING INFORMATIVE TAGS ------------
+
+# removes tag positions that have anomalously high counts on both strands
+# data - list with $tags (per-chromosome tag vectors) and optionally $quality / $names, which are filtered in parallel
+# bin - width of the position bins in which tags are counted
+# z - z-score used to determine anomalous bins
+# zo - z used to filter out one-strand matches
+# trim.fraction - fraction of top bins to discard when calculating overall background density
+# Returns data with the flagged tags removed and chromosomes left without tags dropped.
+remove.tag.anomalies <- function(data, bin=1,trim.fraction=1e-3,z=5,zo=3*z) {
+
+ # per-chromosome worker: returns either the filtered tag vector or, with return.indecies=T,
+ # a logical keep-mask of the same length as tv
+ t.remove.tag.anomalies <- function(tv,bin=1,trim.fraction=1e-3,z=5,zo=3*z,return.indecies=F) {
+ # tag counts per signed bin
+ tt <- table(floor(tv/bin));
+
+ # trim value
+ # the highest counts are left out before estimating mean and standard deviation
+ stt <- sort(as.numeric(tt));
+ stt <- stt[1:(length(stt)*(1-trim.fraction))];
+ mtc <- mean(stt); tcd <- sqrt(var(stt));
+
+ # count thresholds (at least 1) for two-strand (thr) and one-strand (thr.o) anomalies
+ thr <- max(1,ceiling(mtc+z*tcd));
+ thr.o <- max(1,ceiling(mtc+zo*tcd));
+ # filter tt
+ tt <- tt[tt>=thr]
+ # get + and - tags
+ tp <- as.numeric(names(tt));
+ pti <- tp>0;
+ # bins that exceed thr with both signs
+ it <- intersect(tp[pti],(-1)*tp[!pti]);
+ # add one-strand matches
+ it <- unique(c(it,tp[tt>=thr.o]));
+ # flag each selected bin with both signs
+ sit <- c(it,(-1)*it);
+
+ # convert bin indices back into the positions they cover
+ if(bin>1) {
+ sit <- sit*bin;
+ sit <- c(sit,unlist(lapply(1:bin,function(i) sit+i)))
+ }
+ if(return.indecies) {
+ return(!tv %in% sit);
+ } else {
+ return(tv[!tv %in% sit]);
+ }
+ }
+
+ # vil: per-chromosome logical keep-masks
+ vil <- lapply(data$tags,t.remove.tag.anomalies,return.indecies=T,bin=bin,trim.fraction=trim.fraction,z=z,zo=zo);
+ chrl <- names(data$tags); names(chrl) <- chrl;
+ data$tags <- lapply(chrl,function(chr) data$tags[[chr]][vil[[chr]]]);
+ # count tags to remove empty chromosomes
+ nt <- unlist(lapply(data$tags,length));
+ if(any(nt==0)) {
+ data$tags <- data$tags[nt!=0]
+ }
+
+ # apply the same masks and the same empty-chromosome filter to the parallel lists
+ if(!is.null(data$quality)) {
+ data$quality <- lapply(chrl,function(chr) data$quality[[chr]][vil[[chr]]]);
+ data$quality <- data$quality[nt!=0];
+ }
+ if(!is.null(data$names)) {
+ data$names <- lapply(chrl,function(chr) data$names[[chr]][vil[[chr]]]);
+ data$names <- data$names[nt!=0];
+ }
+
+ return(data);
+}
+
+# caps or removes tag positions that are significantly higher than local background
+#   tags - list of per-chromosome tag vectors
+#   window.size, eliminate.fold, cap.fold, z.threshold - forwarded unchanged to
+#     filter.singular.positions.by.local.density, which is applied to every chromosome
+# Returns the list of filtered tag vectors, keyed like the input.
+remove.local.tag.anomalies <- function(tags,window.size=200,eliminate.fold=10,cap.fold=4,z.threshold=3) {
+  # pass the caller's settings through instead of fixed literals, so that non-default values take effect
+  lapply(tags,filter.singular.positions.by.local.density,window.size=window.size,eliminate.fold=eliminate.fold,cap.fold=cap.fold,z.threshold=z.threshold);
+}
+
+
+
+# assess strand cross-correlation, determine peak position, determine appropriate window size
+# for binding detection.
+#   data                 - list with $tags (per-chromosome tag vectors) and optionally $quality
+#   srange, bin          - shift range and step, passed to tag.scc
+#   cluster              - optional cluster handle; when given, per-chromosome work goes through clusterApplyLB
+#   debug                - accepted but not used in this function
+#   min.tag.count        - quality bins contributing fewer tags than this are not evaluated (reported as not informative)
+#   acceptance.z.score   - converted with pnorm() into the p-value cutoff of the per-bin t-test
+#   remove.tag.anomalies - if TRUE, remove.tag.anomalies(data,z=anomalies.z) is applied first
+#   accept.all.tags      - if TRUE, all tags are used and the quality-bin acceptance step is skipped
+# Returns list(cross.correlation=data.frame(x,y), peak=list(x,y), whs=...), plus
+# quality.bin.acceptance when quality information is present and accept.all.tags is FALSE.
+get.binding.characteristics <- function(data,srange=c(50,500),bin=5,cluster=NULL,debug=F,min.tag.count=1e3,acceptance.z.score=3,remove.tag.anomalies=T,anomalies.z=5,accept.all.tags=F) {
+  if(remove.tag.anomalies) {
+    # the logical argument shadows the function name; R still resolves the call to the function
+    data <- remove.tag.anomalies(data,z=anomalies.z);
+  }
+
+  # take highest quality tag bin
+  if(!is.null(data$quality) & !accept.all.tags) {
+    min.bin <- min(unlist(lapply(data$quality,min)))
+    chrl <- names(data$tags); names(chrl) <- chrl;
+    otl <- lapply(chrl,function(chr) data$tags[[chr]][data$quality[[chr]]==min.bin]);
+  } else {
+    otl <- data$tags;
+  }
+  # remove empty chromosomes
+  otl <- otl[unlist(lapply(otl,length))!=0];
+
+
+  # calculate strand scc
+  if(!is.null(cluster)) {
+    cc <- clusterApplyLB(cluster,otl,tag.scc,srange=srange,bin=bin);
+    names(cc) <- names(otl);
+  } else {
+    cc <- lapply(otl,tag.scc,srange=srange,bin=bin);
+  }
+  ccl<-list(sample=cc);
+  ccl.av <- lapply(names(ccl),t.plotavcc,type='l',ccl=ccl,return.ac=T,ttl=list(sample=otl),plot=F)[[1]]
+  ccl.av <- data.frame(x=as.numeric(names(ccl.av)),y=as.numeric(ccl.av));
+
+  # find peak
+  pi <- which.max(ccl.av$y);
+
+  # determine width at third-height
+  th <- (ccl.av$y[pi]-ccl.av$y[length(ccl.av$y)])/3+ccl.av$y[length(ccl.av$y)]
+  whs <- max(ccl.av$x[ccl.av$y>=th]);
+
+  # Anshul: added this to avoid situations where whs ends up being -Inf
+  # whs is a double, so the test must be for finiteness: an is.integer() test is never TRUE
+  # for it and would replace every valid third-height width with the fallback below.
+  if (! is.finite(whs)) {
+    whs <- ccl.av$x[ min(c(2*pi,length(ccl.av$y))) ]
+  }
+
+  # determine acceptance of different quality bins
+
+  # calculates tag scc for the best tags, and combinations of best tag category with every other category
+  # for subsequent selection of acceptable categories
+  scc.acceptance.calc <- function() {
+
+    qr <- range(unlist(lapply(data$quality,range)))
+
+    # start with best tags
+
+    # determine half-width for scc calculations
+    pi <- which.max(ccl.av$y);
+
+    # determine width at half-height
+    th <- (ccl.av$y[pi]-ccl.av$y[length(ccl.av$y)])/2+ccl.av$y[length(ccl.av$y)]
+    lwhs <- max(ccl.av$x[ccl.av$y>=th])-ccl.av$x[pi];
+    lwhs <- max(c(20,bin*10,lwhs));
+    # local shift range centred on the peak; shadows the outer srange inside this helper
+    srange <- ccl.av$x[pi]+c(-lwhs,lwhs)
+
+    # calculate chromosome-average scc
+    t.scc <- function(tags) {
+      if(is.null(cluster)) {
+        cc <- lapply(tags,tag.scc,srange=srange,bin=bin);
+      } else {
+        cc <- clusterApplyLB(cluster,tags,tag.scc,srange=srange,bin=bin); names(cc) <- names(tags);
+      }
+      return(t.plotavcc(1,type='l',ccl=list(cc),ttl=list(tags),plot=F,return.ac=T))
+    }
+
+
+    # returns the scc profile for the best tags combined with the tags of quality bin "qual"
+    # (NULL when that bin holds fewer than min.tag.count tags)
+    t.cat <- function(qual) {
+      # construct tag set
+      if(qual==qr[1]) {
+        ts <- otl;
+      } else {
+        nts <- names(otl); names(nts) <- nts;
+        # select tags
+        at <- lapply(nts,function(chr) data$tags[[chr]][data$quality[[chr]]==qual]);
+        ntags <- sum(unlist(lapply(at,length)));
+        if(ntags<min.tag.count) { return(NULL); }
+
+        # append to otl
+        ts <- lapply(nts,function(nam) c(otl[[nam]],at[[nam]]));
+      }
+
+      return(t.scc(ts));
+    }
+
+
+    # calculate cross-correlation values for each quality bin
+    ql <- sort(unique(unlist(lapply(data$quality,unique)))); names(ql) <- ql;
+
+    qccl <- lapply(ql,t.cat);
+
+    # acceptance tests
+    # the best bin is always accepted; every other bin must raise the scc over the best-bin profile (one-sided t-test)
+    ac <- c(T,unlist(lapply(qccl[-1],function(d) if(is.null(d)) { return(F) } else { t.test(d-qccl[[as.character(min.bin)]],alternative="greater")$p.value<pnorm(acceptance.z.score,lower.tail=F) }))); names(ac) <- names(qccl);
+    return(list(informative.bins=ac,quality.cc=qccl))
+  }
+
+  if(accept.all.tags | is.null(data$quality)) {
+    return(list(cross.correlation=ccl.av,peak=list(x=ccl.av$x[pi],y=ccl.av$y[pi]),whs=whs))
+  } else {
+    acc <- scc.acceptance.calc();
+    return(list(cross.correlation=ccl.av,peak=list(x=ccl.av$x[pi],y=ccl.av$y[pi]),whs=whs,quality.bin.acceptance=acc));
+  }
+
+}
+
+
+# select a set of informative tags based on the pre-calculated binding characteristics
+#   data                    - list with $tags and $quality, both keyed by chromosome
+#   binding.characteristics - result of get.binding.characteristics, or NULL
+# Returns data$tags unchanged when no characteristics or no quality-bin acceptance info is
+# available; otherwise a chromosome-keyed list holding only the tags whose quality value
+# (compared as a string) names an accepted bin.
+select.informative.tags <- function(data,binding.characteristics=NULL) {
+  if(is.null(binding.characteristics)) {
+    return(data$tags)
+  }
+  acceptance <- binding.characteristics$quality.bin.acceptance
+  if(is.null(acceptance)) {
+    cat("binding characteristics doesn't contain quality selection info, accepting all tags\n")
+    return(data$tags)
+  }
+
+  # names of the quality bins flagged as informative
+  bin.flags <- acceptance$informative.bins
+  accepted.bins <- names(bin.flags)[bin.flags]
+
+  chr.names <- names(data$tags)
+  names(chr.names) <- chr.names
+  lapply(chr.names,function(chr) {
+    keep <- as.character(data$quality[[chr]]) %in% accepted.bins
+    data$tags[[chr]][keep]
+  })
+}
+
+# -------- ROUTINES FOR CALLING BINDING POSITIONS ------------
+
+# determine binding positions
+# signal.data - IP tag lists
+# control.data - input tag lists
+# e.value - desired E-value threshold (either E-value or FDR threshold must be provided)
+# fdr - desired FDR threshold
+# min.dist - minimal distance between detected positions
+# tag.count.whs - size of the window to be used to estimate confidence interval of the peak fold enrichment ratios
+# enrichment.z - Z-score defining the desired confidence level for enrichment interval estimates
+# enrichment.background.scales - define how many times larger should be the window for estimating background
+# tag density when evaluating peak enrichment confidence intervals.
+# If multiple values are given, multiple independent interval estimates will be
+# calculated.
+# tec.filter - whether to mask out the regions that exhibit significant background enrichment
+# tec.window.size, tec.z - window size and Z-score for masking out significant background enrichment regions
+#
+# If the control.data is not provided, the method will assess significance of the determined binding positions
+# based on the randomizations of the original data. The following parameters control such randomizations:
+# n.randomizations - number of randomizations to be performed
+# shuffle.window - size of the bin that defines the tags that are kept together during randomization.
+# value of 0 means that all tags are shuffled independently
+#
+# Binding detection methods:
+# tag.wtd - default method.
+# must specify parameter "whs", which is the half-size of the window used to calculate binding scores
+# tag.lwcc - LWCC method;
+# must specify whs - a size of the window used to calculate binding scores
+# can specify isize (default=15bp) - size of the internal window that is masked out
+# NOTE(review): the arguments tec and n.control.samples are accepted but never referenced in this body.
+find.binding.positions <- function(signal.data,f=1,e.value=NULL,fdr=NULL, masked.data=NULL,control.data=NULL,whs=200,min.dist=200,window.size=4e7,cluster=NULL,debug=T,n.randomizations=3,shuffle.window=1,min.thr=2,topN=NULL, tag.count.whs=100, enrichment.z=2, method=tag.wtd, tec.filter=T,tec.window.size=1e4,tec.z=5,tec.masking.window.size=tec.window.size, tec.poisson.z=5,tec.poisson.ratio=5, tec=NULL, n.control.samples=1, enrichment.scale.down.control=F, enrichment.background.scales=c(1,5,10), use.randomized.controls=F, background.density.scaling=T, mle.filter=F, min.mle.threshold=1, ...) {
+
+ # f<1 requests a random subsample: each chromosome keeps a fraction f of its tags
+ if(f<1) {
+ if(debug) { cat("subsampling signal ... "); }
+ signal.data <- lapply(signal.data,function(x) sample(x,length(x)*f))
+ if(debug) { cat("done\n"); }
+ }
+
+
+ if(!is.null(control.data) & !use.randomized.controls) {
+ # limit both control and signal data to a common set of chromosomes
+ chrl <- intersect(names(signal.data),names(control.data));
+ signal.data <- signal.data[chrl];
+ control.data <- control.data[chrl];
+ control <- list(control.data);
+ } else {
+ # no explicit control set is passed on (control=NULL)
+ control <- NULL;
+ }
+
+ # the actual detection is delegated to lwcc.prediction; control.data is also passed as the background tag list (bg.tl)
+ prd <- lwcc.prediction(signal.data,min.dist=min.dist,whs=whs,window.size=window.size,e.value=e.value,fdr=fdr,debug=debug,n.randomizations=n.randomizations,shuffle.window=shuffle.window,min.thr=min.thr,cluster=cluster,method=method,bg.tl=control.data,mask.tl=masked.data, topN=topN, control=control,tec.filter=tec.filter,tec.z=tec.z,tec.window.size=tec.window.size, tec.masking.window.size=tec.masking.window.size, tec.poisson.z=tec.poisson.z,tec.poisson.ratio=tec.poisson.ratio, background.density.scaling=background.density.scaling, ...);
+
+ # add tag counts
+ # nt: number of signal tags within tag.count.whs of each detected position
+ chrl <- names(prd$npl); names(chrl) <- chrl;
+ prd$npl <- lapply(chrl,function(chr) {
+ pd <- prd$npl[[chr]];
+ pd$nt <- points.within(abs(signal.data[[chr]]),pd$x-tag.count.whs,pd$x+tag.count.whs,return.point.counts=T);
+ return(pd);
+ });
+ # record the subsampling fraction and the dataset sizes alongside the predictions
+ prd$f <- f;
+ prd$n <- sum(unlist(lapply(signal.data,length)));
+ if(!is.null(control.data)) {
+ prd$n.bg <- sum(unlist(lapply(control.data,length)));
+ }
+
+ # calculate enrichment ratios
+ prd <- calculate.enrichment.estimates(prd,signal.data,control.data=control.data,fraction=1,tag.count.whs=tag.count.whs,z=enrichment.z,scale.down.control=enrichment.scale.down.control,background.scales=enrichment.background.scales);
+
+ # optional filter: keep positions whose every "enr.mle" column exceeds min.mle.threshold
+ # NOTE(review): the filter is skipped when npl holds a single chromosome or only one enr.mle column exists
+ if(mle.filter) {
+ if(!is.null(prd$npl)) {
+ if(length(prd$npl)>1) {
+ mle.columns <- grep("enr.mle",colnames(prd$npl[[1]]));
+ if(length(mle.columns)>1) {
+ prd$npl <- lapply(prd$npl,function(d) d[apply(d[,mle.columns],1,function(x) all(x>min.mle.threshold)),])
+ }
+ }
+ }
+ }
+
+ prd$whs <- whs;
+
+ return(prd);
+}
+
+
+
+# -------- ROUTINES FOR WRITING OUT TAG DENSITY AND ENRICHMENT PROFILES ------------
+# calculate smoothed tag density, optionally subtracting the background
+#   signal.tags, control.tags  - per-chromosome tag vectors; control.tags may be NULL
+#   bandwidth, step            - passed to densum as bw and step
+#   bg.weight                  - weight of the control density; recomputed with
+#                                dataset.density.ratio whenever control.tags is given
+#   tag.shift                  - added to the tag values before abs() is taken
+#   background.density.scaling - passed to dataset.density.ratio / dataset.density.size
+#   rngl                       - optional per-chromosome c(from,to) ranges; chromosomes without
+#                                an entry use the range of their own shifted tags
+#   scale.by.dataset.size      - if TRUE, densities are multiplied by dataset.density.size(...)/1e6
+# Returns a chromosome-keyed list of data.frame(x,y).
+get.smoothed.tag.density <- function(signal.tags,control.tags=NULL,bandwidth=150,bg.weight=NULL,tag.shift=146/2,step=round(bandwidth/3),background.density.scaling=T,rngl=NULL,scale.by.dataset.size=F) {
+  chr.names <- names(signal.tags)
+  names(chr.names) <- chr.names
+
+  have.control <- !is.null(control.tags)
+  if(have.control) {
+    bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling)
+  }
+
+  den.scaling <- if(scale.by.dataset.size) {
+    dataset.density.size(signal.tags,background.density.scaling=background.density.scaling)/1e6
+  } else {
+    1
+  }
+
+  lapply(chr.names,function(chr) {
+    shifted <- abs(signal.tags[[chr]]+tag.shift)
+
+    # use the supplied range for this chromosome when there is one, else the data range
+    rng <- if(is.null(rngl)) NULL else rngl[[chr]]
+    if(is.null(rng)) {
+      rng <- range(shifted)
+    }
+
+    ds <- densum(shifted,bw=bandwidth,from=rng[1],to=rng[2],return.x=T,step=step)
+    # subtract the weighted control density where the control has this chromosome
+    if(have.control && !is.null(control.tags[[chr]])) {
+      bsd <- densum(abs(control.tags[[chr]]+tag.shift),bw=bandwidth,from=rng[1],to=rng[2],return.x=F,step=step)
+      ds$y <- ds$y-bsd*bg.weight
+    }
+    data.frame(x=seq(ds$x[1],ds$x[2],by=step),y=den.scaling*ds$y)
+  })
+}
+
+# Smoothed maximum likelihood estimate of the log2 signal-to-control enrichment ratio.
+# Densities for both datasets are evaluated over a shared per-chromosome range
+# (extra arguments in ... go to get.smoothed.tag.density()), and the result is
+#   log2(signal+pseudocount) - log2(control+pseudocount) - log2(bg.weight)
+# for every chromosome present in both tag lists. The list is returned invisibly.
+get.smoothed.enrichment.mle <- function(signal.tags, control.tags, tag.shift=146/2, background.density.scaling=F, pseudocount=1,bg.weight=NULL, ... ) {
+  # chromosomes available in both datasets
+  shared.chrs <- intersect(names(signal.tags),names(control.tags));
+  names(shared.chrs) <- shared.chrs;
+
+  # common coordinate range of the shifted signal and control tags
+  rngl <- lapply(shared.chrs,function(chr) {
+    signal.range <- range(abs(signal.tags[[chr]]+tag.shift));
+    control.range <- range(abs(control.tags[[chr]]+tag.shift));
+    range(c(signal.range,control.range))
+  });
+
+  signal.density <- get.smoothed.tag.density(signal.tags, rngl=rngl, ..., scale.by.dataset.size=F);
+  control.density <- get.smoothed.tag.density(control.tags, rngl=rngl, ..., scale.by.dataset.size=F);
+
+  if(is.null(bg.weight)) {
+    bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling);
+  }
+
+  invisible(lapply(shared.chrs,function(chr) {
+    profile <- signal.density[[chr]];
+    profile$y <- log2(profile$y+pseudocount) - log2(control.density[[chr]]$y+pseudocount) - log2(bg.weight);
+    profile
+  }))
+}
+
+
+# Conservative log2 fold-enrichment profile for a signal tag list (ftl) against a
+# background tag list (btl), using mbs.enrichment.bounds() with window scales fws/bwsl.
+# The y column holds log2 of the lower bound where it exceeds 1, of the upper bound
+# where it is below 1, and 0 elsewhere. With return.mle=T the mle, lb and ub
+# columns (log2) are included too. If posl is given, bounds are evaluated at
+# posl[[chr]] and only chromosomes named in posl are processed.
+get.conservative.fold.enrichment.profile <- function(ftl,btl,fws,bwsl=c(1,5,25,50)*fws,step=50,tag.shift=146/2,alpha=0.05,use.most.informative.scale=F,quick.calculation=T,background.density.scaling=T,bg.weight=NULL,posl=NULL,return.mle=F) {
+  # keep only chromosomes with more than 2 reads
+  ftl <- ftl[unlist(lapply(ftl,length))>2]
+  chrl <- names(ftl); names(chrl) <- chrl;
+  fixed.positions <- !is.null(posl);
+  if(fixed.positions) {
+    chrl <- chrl[chrl %in% names(posl)];
+  }
+  # background tag ratio
+  if(is.null(bg.weight)) {
+    bg.weight <- dataset.density.ratio(ftl,btl,background.density.scaling=background.density.scaling);
+  }
+
+  lapply(chrl,function(chr) {
+    signal.pos <- abs(ftl[[chr]]+tag.shift);
+    if(is.null(btl[[chr]])) {
+      background.pos <- c();
+    } else {
+      background.pos <- abs(btl[[chr]]+tag.shift);
+    }
+
+    if(fixed.positions) {
+      bounds <- mbs.enrichment.bounds(signal.pos,background.pos,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha,pos=posl[[chr]]);
+      xpos <- posl[[chr]];
+    } else {
+      bounds <- mbs.enrichment.bounds(signal.pos,background.pos,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha);
+      xpos <- seq(bounds$x$s,bounds$x$e,by=bounds$x$step);
+    }
+
+    # lower bound for enriched regions, upper bound for depleted ones, 1 otherwise
+    profile <- rep(1,length(bounds$mle));
+    enriched <- which(!is.na(bounds$lb) & bounds$lb>1);
+    profile[enriched] <- bounds$lb[enriched];
+    depleted <- which(!is.na(bounds$ub) & bounds$ub<1);
+    profile[depleted] <- bounds$ub[depleted];
+    profile <- log2(profile);
+
+    if(!return.mle) {
+      return(data.frame(x=xpos,y=profile));
+    }
+    return(data.frame(x=xpos,y=profile,mle=log2(bounds$mle),lb=log2(bounds$lb),ub=log2(bounds$ub)));
+  })
+}
+
+
+# Write a per-chromosome $x/$y data structure into a wig file.
+#   dat       - named list (one entry per chromosome) with $x positions and $y values
+#   fname     - output file; the first chromosome writes the header, the rest append
+#   feature   - passed on to write.probe.wig()
+#   threshold - not used by this function (kept for interface compatibility)
+#   zip       - if TRUE, compress the result with the external `zip` tool
+# Returns the name of the file that was produced (the .zip file when zipping succeeded).
+writewig <- function(dat,fname,feature,threshold=5,zip=F) {
+  chrl <- names(dat); names(chrl) <- chrl;
+  invisible(lapply(chrl,function(chr) {
+    bdiff <- dat[[chr]];
+    # seq_along() stays empty for empty input (seq(1,0) would count down)
+    ind <- seq_along(bdiff$x);
+    ind <- ind[!is.na(bdiff$y[ind])];
+    header <- chr==chrl[1];
+    write.probe.wig(chr,bdiff$x[ind],bdiff$y[ind],fname,append=!header,feature=feature,header=header);
+  }))
+  if(zip) {
+    zf <- paste(fname,"zip",sep=".");
+    # shQuote() protects file names containing quotes, spaces or shell metacharacters
+    status <- system(paste("zip",shQuote(zf),shQuote(fname)));
+    if(status!=0) {
+      # keep the uncompressed file rather than deleting the only copy
+      warning(paste("zip failed with status",status,"- keeping uncompressed file",fname));
+      return(fname);
+    }
+    file.remove(fname);
+    return(zf);
+  } else {
+    return(fname);
+  }
+}
+
+
+
+# -------- ROUTINES FOR ANALYZING SATURATION PROPERTIES ------------
+
+# PUBLIC
+# Calculate minimal saturation enrichment ratios (MSER).
+# Unless precomputed chains are supplied, n.chains subsampling chains are generated
+# with get.subsample.chain.calls() (through clusterApplyLB() when a cluster is given)
+# and then summarised by mser.chain.interpolation().
+# Returns the MSER values, or list(mser=,chains=) when return.chains=T.
+get.mser <- function(signal.data,control.data,n.chains=5,step.size=1e5, chains=NULL, cluster=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), n.steps=1, ...) {
+  if(is.null(chains)) {
+    chain.ids <- c(1:n.chains); names(chain.ids) <- chain.ids;
+    if(!is.null(cluster)) {
+      chains <- clusterApplyLB(cluster,chain.ids,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...);
+      names(chains) <- chain.ids;
+    } else {
+      chains <- lapply(chain.ids,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...);
+    }
+  }
+
+  interpolated <- mser.chain.interpolation(chains=chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=F);
+
+  # with a single step, reduce each entry to its $me value
+  msers <- if(n.steps>1) interpolated else unlist(lapply(interpolated,function(d) d$me));
+
+  if(!return.chains) {
+    return(msers);
+  }
+  return(list(mser=msers,chains=chains));
+}
+
+# PUBLIC
+# Interpolate the MSER dependency on tag counts.
+# Runs get.mser() to obtain the subsampling chains, fits log10(MSER-1) against
+# log10(tag count) for each background scale and predicts the number of tags at
+# which target.fold.enrichment would be reached.
+# Returns a list with one element per entry of mser.chain.interpolation()'s result,
+# each list(prediction=,log10.fit=) (prediction is NA if any MSER equals 1);
+# with return.chains=T returns list(interpolation=,chains=).
+get.mser.interpolation <- function(signal.data,control.data,target.fold.enrichment=5,n.chains=10,n.steps=6,step.size=1e5, chains=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), excluded.steps=c(seq(2,n.steps-2)), ...) {
+  # the argument was previously misspelled 'test.agrement', so it fell into '...'
+  # and get.mser() always used its default test.agreement
+  msers <- get.mser(signal.data,control.data,n.chains=n.chains,n.steps=n.steps,step.size=step.size,chains=chains,test.agreement=test.agreement,return.chains=T,enrichment.background.scales=enrichment.background.scales,excluded.steps=excluded.steps, ...);
+
+  # adjust sizes in case a subset of chromosomes was used
+  mser <- mser.chain.interpolation(chains=msers$chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=T);
+  sr <- sum(unlist(lapply(signal.data,length)))/mser[[1]][[1]]$n[1];
+
+  # Subsampling each chain requires removing a fraction of each chromosome's
+  # tag list. To get the exact step.size, this often leaves chromosomes with
+  # a non-integer number of tags. The non-integer values are floored, so each
+  # chr can contribute at most 0.999.. <= 1 error to the step.size.
+  floor.error <- length(msers$chains[[1]][[1]]$npl)
+  intpn <- lapply(mser,function(ms) {
+    lmvo <- do.call(rbind,ms)
+    lmvo$n <- lmvo$n*sr;
+    # Don't select rows corresponding to excluded.steps
+    # Keep in mind that nd values are negative.
+    lmvo <- lmvo[lmvo$nd <= (lmvo$nd[1] + floor.error) & lmvo$nd >= (lmvo$nd[1] - floor.error),];
+    lmvo <- na.omit(lmvo);
+    if(any(lmvo$me==1)) {
+      # log10(me-1) is undefined for me==1
+      return(list(prediction=NA));
+    }
+    lmvo$n <- log10(lmvo$n); lmvo$me <- log10(lmvo$me-1)
+    # linear fit in log-log space, solved for the target enrichment
+    emvf <- lm(me ~ n,data=lmvo);
+    tfe <- (log10(target.fold.enrichment-1)-coef(emvf)[[1]])/coef(emvf)[[2]];
+    tfen <- 10^tfe;
+    return(list(prediction=tfen,log10.fit=emvf));
+  })
+
+  if(return.chains) {
+    return(list(interpolation=intpn,chains=msers$chains))
+  } else {
+    return(intpn);
+  }
+}
+
+
+# output binding detection results to a text file
+# the file will contain a table with each row corresponding
+# to a detected position, with the following columns:
+# chr - chromosome or target sequence
+# pos - position of detected binding site on the chromosome/sequence
+# score - a score reflecting magnitude of the binding
+# Evalue - E-value corresponding to the peak magnitude
+# FDR - FDR corresponding to the peak magnitude
+# enrichment.lb - lower bound of the fold-enrichment ratio
+# enrichment.mle - maximum likelihood estimate of the fold-enrichment ratio
+# The Evalue and FDR columns are omitted (from the header as well as the rows)
+# for "topN" results and for results whose tables carry no evalue/fdr columns,
+# so the header always matches the rows that follow.
+output.binding.results <- function(results,filename) {
+  chrl <- names(results$npl); names(chrl) <- chrl;
+
+  # decide once whether statistical columns can be written
+  nonempty <- Filter(function(d) dim(d)[1]>0,results$npl);
+  has.stats <- !identical(results$thr$type,"topN") &&
+    all(unlist(lapply(nonempty,function(d) all(c("evalue","fdr") %in% colnames(d)))));
+
+  if(has.stats) {
+    header <- "chr\tpos\tscore\tEvalue\tFDR\tenrichment.lb\tenrichment.mle";
+    columns <- c("x","y","evalue","fdr","enr","enr.mle");
+  } else {
+    header <- "chr\tpos\tscore\tenrichment.lb\tenrichment.mle";
+    columns <- c("x","y","enr","enr.mle");
+  }
+  write(file=filename,header,append=F);
+
+  invisible(lapply(chrl,function(chr) {
+    d <- results$npl[[chr]];
+    if(dim(d)[1]>0) {
+      od <- cbind(rep(chr,dim(d)[1]),d[,columns,drop=FALSE]);
+      write.table(od,file=filename,col.names=F,row.names=F,sep="\t",append=T,quote=F)
+    }
+  }))
+}
+
+
+# -------- LOW-LEVEL ROUTINES ------------
+
+# Calculates tag strand cross-correlation for a range of shifts (on positive strand).
+#   tags   - signed tag positions (sign encodes the strand)
+#   srange - c(min,max) shift range, evaluated in increments of bin
+#   bin    - bin size used to round positions and shifts
+#   tt     - optional precomputed table of binned signed positions
+#   llim   - positions holding llim times the mean count or more are discarded (NULL disables)
+# Returns a numeric vector of correlation values named by shift.
+tag.scc <- function(tags,srange=c(50,250),bin=1,tt=NULL,llim=10) {
+  if(is.null(tt)) {
+    tt <- table(sign(tags)*as.integer(floor(abs(tags)/bin+0.5)));
+  }
+  if(!is.null(llim)) { l <- mean(tt); tt <- tt[tt<llim*l] }
+  tc <- as.integer(names(tt));
+  tt <- as.numeric(tt);
+
+  pti <- which(tc>0)
+  nti <- which(tc<0);
+
+  # occupied positions and their counts on each strand
+  ptc <- tc[pti];
+  ntc <- (-1)*tc[nti];
+
+  ptv <- tt[pti];
+  ntv <- tt[nti];
+
+  trng <- range(c(range(ptc),range(ntc)))
+  l <- diff(trng)+1;
+  rm(tc,tt);
+
+  # centre the counts; unoccupied positions implicitly hold -mp / -mn
+  mp <- sum(ptv)*bin/l; mn <- sum(ntv)*bin/l;
+  ptv <- ptv-mp; ntv <- ntv-mn;
+  ss <- sqrt((sum(ptv*ptv)+(l-length(ptv))*mp^2) * (sum(ntv*ntv)+(l-length(ntv))*mn^2));
+
+  t.cor <- function(s) {
+    smi <- match(ptc+s,ntc);
+    hit <- !is.na(smi);
+    # a logical mask is used instead of ntv[-matched]: negative indexing with an
+    # empty index selects nothing, which dropped the unmatched negative-strand
+    # term whenever a shift produced no overlaps
+    n.matched <- rep(FALSE,length(ntv));
+    n.matched[smi[hit]] <- TRUE;
+    return((sum(ptv[hit]*ntv[smi[hit]]) -
+            mn*sum(ptv[!hit]) -
+            mp*sum(ntv[!n.matched]) +
+            mp*mn*(l-length(ptv)-length(ntv)+sum(hit)))/ss);
+  }
+  shifts <- floor(seq(srange[1],srange[2],by=bin)/bin+0.5);
+  scc <- unlist(lapply(shifts,t.cor)); names(scc) <- shifts*bin;
+  return(scc);
+}
+
+
+# Plot tag cross-correlation.
+#   ac          - a correlation vector named by lag, or a list of such vectors
+#                 (the first is drawn with plot(), the rest are added with lines())
+#   rmw         - window passed to runmean() for smoothing
+#   rescale     - for list input, rescale additional profiles to the range of the first
+#   min.peak.x  - if given, only lags greater than this are considered for the maximum
+#   xlim        - lag range to display (defaults to the full range of the first profile)
+#   plot.147, plot.grid, plot.max - optional guide lines at 147, around the maximum, and at the maximum
+# Returns the lag of the maximum when plot.max is TRUE.
+t.plotcc <- function(ac, lab=c(10,5,7), ylab="correlation", xlab="lag", pch=19, grid.i=c(-5:5), grid.s=10, type='b', plot.grid=F, cols=c(1,2,4,"orange",8,"pink"), min.peak.x=NULL, xlim=NULL, plot.147=F, plot.max=T, rmw=1, rescale=F, legendx="right", ltys=rep(1,length(ac)), ...) {
+  if(is.list(ac)) {
+    cols <- cols[1:length(ac)];
+
+    if(!is.null(xlim)) {
+      vx <- as.numeric(names(ac[[1]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]);
+      ac[[1]] <- (ac[[1]])[vx];
+    } else {
+      xlim <- range(as.numeric(names(ac[[1]])));
+    }
+
+    plot(as.numeric(names(ac[[1]])),runmean(ac[[1]],rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, col=cols[1], xlim=xlim, lty=ltys[1], ...);
+    if(length(ac)>1) {
+      for(i in seq(2,length(ac))) {
+        irng <- range(ac[[i]]);
+        vx <- as.numeric(names(ac[[i]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]);
+        if(rescale) {
+          lines(as.numeric(names(ac[[i]])[vx]),runmean((ac[[i]][vx]-irng[1])/diff(irng)*diff(range(ac[[1]]))+min(ac[[1]]),rmw),col=cols[i],lty=ltys[i]);
+        } else {
+          lines(as.numeric(names(ac[[i]]))[vx],runmean(ac[[i]][vx],rmw),col=cols[i],lty=ltys[i]);
+        }
+      }
+    }
+    if(is.null(min.peak.x)) {
+      m <- as.numeric(names(ac[[1]])[which.max(ac[[1]])]);
+    } else {
+      sac <- (ac[[1]])[which(as.numeric(names(ac[[1]]))>min.peak.x)]
+      m <- as.numeric(names(sac)[which.max(sac)]);
+    }
+    legend(x="topright",bty="n",legend=c(names(ac)),col=cols,lty=ltys)
+  } else {
+    if(!is.null(xlim)) {
+      vx <- as.numeric(names(ac));
+      vx <- which(vx>=xlim[1] & vx<=xlim[2]);
+      ac <- ac[vx];
+    } else {
+      xlim <- range(as.numeric(names(ac)));
+    }
+
+    # lags are stored as character names; convert before plotting or comparing
+    lags <- as.numeric(names(ac));
+    plot(lags,runmean(ac,rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, xlim=xlim, ...);
+    if(is.null(min.peak.x)) {
+      m <- as.numeric(names(ac)[which.max(ac)]);
+    } else {
+      # comparing the raw names with a number is a string comparison
+      # (e.g. "90" > "100"), so the numeric lags are used, as in the list branch
+      sac <- ac[which(lags>min.peak.x)]
+      m <- as.numeric(names(sac)[which.max(sac)]);
+    }
+  }
+  if(plot.147) {
+    abline(v=147,lty=2,col=8);
+  }
+  if(plot.grid) {
+    abline(v=m+grid.i*grid.s,lty=3,col="pink");
+  }
+  if(plot.max) {
+    abline(v=m,lty=2,col=2);
+    legend(x=legendx,bty="n",legend=c(paste("max at ",m,"bp",sep="")));
+    return(m);
+  }
+ }
+
+ # Plot chromosome-average cross-correlation.
+ #   ci        - name of the dataset within ccl / ttl
+ #   ccl       - list (per dataset) of per-chromosome cross-correlation vectors
+ #   ttl       - list (per dataset) of per-chromosome tag vectors; their lengths weight the average
+ #   plot      - draw the averaged profile with t.plotcc()
+ #   return.ac - return the averaged profile instead of the t.plotcc() result
+ # A single chromosome is returned as is; chromosomes containing NA values are left out.
+ # NOTE(review): weights are normalised before NA chromosomes are dropped and are
+ # not renormalised afterwards - confirm whether that is intended.
+ t.plotavcc <- function(ci, main=paste(ci,"chromosome average"), ccl=tl.cc, return.ac=F, ttl=tl, plot=T, ... ) {
+   cc <- ccl[[ci]];
+   if(length(cc)==1) { return(cc[[1]]) };
+   if(length(cc)==0) { return(c()) };
+   ac <- do.call(rbind,cc);
+   # omit NA chromosomes
+   ina <- apply(ac,1,function(d) any(is.na(d)));
+
+   tags <- ttl[[ci]];
+   avw <- unlist(lapply(tags,length)); avw <- avw/sum(avw);
+   # drop=FALSE keeps the matrix shape when only one chromosome survives;
+   # otherwise apply() below fails on a plain vector
+   ac <- ac[!ina,,drop=FALSE]; avw <- avw[!ina];
+   ac <- apply(ac,2,function(x) sum(x*avw));
+   if(plot) {
+     m <- t.plotcc(ac, main=main, ...);
+     if(!return.ac) { return(m) }
+   }
+   if(return.ac) { return(ac) }
+ }
+
+ # Plot the cross-correlation profile of every chromosome of dataset ci
+ # (taken from ccl) in a grid of ncol columns; extra arguments go to t.plotcc().
+ t.plotchrcc <- function(ci,ncol=4, ccl=tl.cc, ... ) {
+   cc <- ccl[[ci]];
+   # round the row count up so that all panels fit on one page
+   par(mfrow = c(ceiling(length(cc)/ncol),ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
+   lapply(names(cc),function(ch) { t.plotcc(cc[[ch]],main=paste(ci,": chr",ch,sep=""), ...) })
+ }
+
+ # Plot tag-count-weighted chromosome averages for several cross-correlation
+ # profiles of dataset ci on one figure. ccl[[ci]] is a list of profiles, each a
+ # per-chromosome list of correlation vectors; rtl[[ci]] supplies the tags whose
+ # counts weight the chromosomes.
+ t.plotavccl <- function(ci, ccl=tl.ccl, main=paste(ci,"chromosome average"), rtl=tl, ... ) {
+   profiles <- ccl[[ci]];
+   chrs <- names(profiles[[1]]); names(chrs) <- chrs;
+   # one matrix (chromosomes x lags) per profile
+   profile.matrices <- lapply(profiles,function(p) do.call(rbind,p));
+   # weights proportional to the number of tags on each chromosome
+   tag.counts <- unlist(lapply(rtl[[ci]][chrs],length));
+   avw <- tag.counts/sum(tag.counts);
+   averaged <- lapply(profile.matrices,function(mat) {
+     apply(mat,2,function(column) sum(column*avw))
+   });
+   t.plotcc(averaged, main=main, ...);
+ }
+
+ # Plot, for every chromosome of dataset ci, all cross-correlation profiles held in
+ # ccl[[ci]] together in one panel; panels are laid out in ncol columns.
+ t.plotchrccl <- function(ci,ccl=tl.ccl,ncol=4, ... ) {
+   # 'cc' was previously never assigned here, so the body relied on a global of that name
+   cc <- ccl[[ci]];
+   # round the row count up so that all panels fit on one page
+   par(mfrow = c(ceiling(length(cc[[1]])/ncol),ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
+   lapply(names(cc[[1]]),function(ch) { t.plotcc(lapply(cc,function(x) x[[ch]]),main=paste(ci,": chr",ch,sep=""), ...) })
+ }
+
+
+
+# Compute the strand cross-correlation of every chromosome in tl over the shift
+# range srange (in parallel when a cluster is given) and plot the
+# chromosome-average profile. The value of t.plotavcc() is returned invisibly.
+show.scc <- function(tl,srange,cluster=NULL) {
+  if(is.null(cluster)) {
+    chr.cc <- lapply(tl,tag.scc,srange=srange);
+  } else {
+    chr.cc <- clusterApplyLB(cluster,tl,tag.scc,srange=srange);
+    names(chr.cc) <- names(tl);
+  }
+  par(mfrow = c(1,1), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8);
+  # wrap the single dataset under the name "sample", as t.plotavcc() expects lists of datasets
+  ccl <- list(sample=chr.cc);
+  invisible(t.plotavcc("sample",type='l',ccl=ccl,xlim=srange,return.ac=F,ttl=list(sample=tl),main=""))
+}
+
+# Find regions of significant tag enrichment of signal.data over control.data.
+# Each chromosome of signal.data is processed with tag.enrichment.clusters(); the
+# resulting region starts ($s) and ends ($e) are widened by masking.window.size/2
+# on both sides. The per-chromosome list is returned invisibly.
+find.significantly.enriched.regions <- function(signal.data,control.data,window.size=500,multiplier=1,z.thr=3,mcs=0,debug=F,background.density.scaling=T,masking.window.size=window.size,poisson.z=0,poisson.ratio=4,either=F,tag.shift=146/2,bg.weight=NULL) {
+  if(is.null(bg.weight)) {
+    bg.weight <- dataset.density.ratio(signal.data,control.data,background.density.scaling=background.density.scaling);
+  }
+  if(debug) {
+    cat("bg.weight=",bg.weight,"\n");
+  }
+
+  half.mask <- masking.window.size/2;
+  chrl <- names(signal.data); names(chrl) <- chrl;
+  invisible(lapply(chrl,function(chr) {
+    regions <- tag.enrichment.clusters(signal.data[[chr]],control.data[[chr]],bg.weight=bg.weight*multiplier,thr=z.thr,wsize=window.size,mcs=mcs,min.tag.count.z=poisson.z,min.tag.count.ratio=poisson.ratio,either=either,tag.shift=tag.shift);
+    regions$s <- regions$s-half.mask;
+    regions$e <- regions$e+half.mask;
+    regions
+  }))
+}
+
+
+# given tag position vectors, find contigs of significant enrichment of signal over background
+# thr - z score threshold
+# mcs - minimal cluster size
+# bg.weight - fraction by which background counts should be multiplied
+# min.tag.count.z will impose a poisson constraint based on randomized signal in parallel of background constraint (0 - no constraint)
+# min.tag.count.thr - explicit tag count threshold, used when min.tag.count.z is 0
+#                     (it is replaced by the poisson-derived value when min.tag.count.z>0)
+# tag.av.den - average tag density; defaults to length(signal)/diff(range(abs(signal)))
+# The combined, position-sorted signal/background vectors are handed to the
+# compiled routine "find_poisson_enrichment_clusters", whose result is returned.
+tag.enrichment.clusters <- function(signal,background,wsize=200,thr=3,mcs=1,bg.weight=1,min.tag.count.z=0,tag.av.den=NULL,min.tag.count.thr=0,min.tag.count.ratio=4,either=F,tag.shift=146/2) {
+  if(is.null(tag.av.den)) {
+    tag.av.den <- length(signal)/diff(range(abs(signal)));
+  }
+  # the supplied min.tag.count.thr is kept unless a z-score based threshold is
+  # requested; it used to be reset to 0 unconditionally, making the argument dead
+  if(min.tag.count.z>0) {
+    min.tag.count.thr <- qpois(pnorm(min.tag.count.z,lower.tail=F),min.tag.count.ratio*tag.av.den*wsize,lower.tail=F)
+  }
+
+  # make up combined position, flag vectors (flag 1 = signal, 0 = background)
+  pv <- abs(c(signal,background)+tag.shift);
+  fv <- c(rep(1,length(signal)),rep(0,length(background)));
+  po <- order(pv);
+  pv <- pv[po];
+  fv <- fv[po];
+
+  # coerce to the storage modes passed to the compiled routine
+  storage.mode(wsize) <- storage.mode(mcs) <- storage.mode(fv) <- "integer";
+  storage.mode(thr) <- storage.mode(pv) <- "double";
+  storage.mode(bg.weight) <- "double";
+  storage.mode(min.tag.count.thr) <- "double";
+  either <- as.integer(either);
+
+  z <- .Call("find_poisson_enrichment_clusters",pv,fv,wsize,thr,mcs,bg.weight,min.tag.count.thr,either)
+  return(z);
+}
+
+
+
+
+
+# estimates threshold, calculates predictions on complete data and randomized data
+# input: tvl - per-chromosome list of signed tag positions
+# control - a list of control tag datasets
+# no randomization is done if control is supplied
+# return.rtp - return randomized tag peaks - do not fit thresholds or do actual predictions
+# topN - use min threshold to do a run, return topN peaks from entire genome
+# threshold - specify a user-defined threshold
+# bg.tl - background tag list; with tec.filter=T it is also used to find and exclude
+#         regions of significant background enrichment
+# return.core.data / return.control.predictions - include intermediate results in the returned list
+# Returns list(npl=<per-chromosome peak tables>, thr=<threshold description>, ...).
+lwcc.prediction <- function(tvl,e.value=NULL, fdr=0.01, chrl=names(tvl), min.thr=0, n.randomizations=1, shuffle.window=1, debug=T, predict.on.random=F, shuffle.both.strands=T,strand.shuffle.only=F, return.rtp=F, control=NULL, print.level=0, threshold=NULL, topN=NULL, bg.tl=NULL, tec.filter=T, tec.window.size=1e3,tec.z=3, tec.masking.window.size=tec.window.size, tec.poisson.z=3,tec.poisson.ratio=4, bg.reverse=T, return.control.predictions=F, return.core.data=F, background.density.scaling=T, ... ) {
+
+  control.predictions <- NULL;
+  core.data <- list();
+
+  # background exclusion regions exist only when a background is available
+  use.tec <- !is.null(bg.tl) && tec.filter;
+
+  if(use.tec) {
+    if(debug) { cat("finding background exclusion regions ... "); }
+    tec <- find.significantly.enriched.regions(bg.tl,tvl,window.size=tec.window.size,z.thr=tec.z,masking.window.size=tec.masking.window.size,poisson.z=tec.poisson.z,poisson.ratio=tec.poisson.ratio,background.density.scaling=background.density.scaling,either=T);
+    if(return.core.data) {
+      core.data <- c(core.data,list(tec=tec));
+    }
+    if(debug) { cat("done\n"); }
+  }
+
+
+  if(is.null(threshold) & is.null(topN)) { # threshold determination is needed
+    # generate control predictions
+    if(!is.null(control)) {
+      if(debug) { cat("determining peaks on provided",length(control),"control datasets:\n"); }
+      if(!is.null(bg.tl)) {
+        if(bg.reverse) {
+          if(debug) { cat("using reversed signal for FDR calculations\n"); }
+          rbg.tl <- tvl;
+        } else {
+          if(debug) { cat("generating randomized (within chromosome) background ... "); }
+          rbg.tl <- lapply(bg.tl,function(d) {
+            if(length(d)<2) { return(d); }
+            rng <- range(abs(d));
+            rd <- round(runif(length(d),rng[1],rng[2]));
+            # keep the original number of negative-strand tags
+            nrd <- sample(1:length(rd),length(which(d<0)));
+            rd[nrd] <- rd[nrd]*(-1);
+            return(rd);
+          })
+          if(debug) { cat("done\n"); }
+        }
+      } else {
+        rbg.tl <- NULL;
+      }
+      n.randomizations <- length(control);
+      rtp <- lapply(control,function(d) {
+        # calculate tag.weight
+        tag.weight <- dataset.density.ratio(tvl,d,background.density.scaling=background.density.scaling);
+        return(window.call.mirror.binding(d,min.thr=min.thr, tag.weight=tag.weight,bg.tl=rbg.tl, debug=debug, round.up=T,background.density.scaling=background.density.scaling, ...));
+      });
+      if(return.core.data) {
+        core.data <- c(core.data,list(rtp.unfiltered=rtp));
+      }
+      # 'tec' is only defined when a background was supplied; testing tec.filter
+      # alone failed with "object 'tec' not found" for bg.tl=NULL
+      if(use.tec) {
+        if(debug) { cat("excluding systematic background anomalies ... "); }
+        rtp <- lapply(rtp,filter.binding.sites,tec,exclude=T);
+        if(debug) { cat("done\n"); }
+      }
+    } else {
+      if(debug) { cat("determining peaks on ",n.randomizations,"randomized datasets:\n"); }
+      rtp <- lapply(1:n.randomizations,function(i) {
+        rd <- generate.randomized.data(tvl,shuffle.window=shuffle.window,shuffle.both.strands=shuffle.both.strands,strand.shuffle.only=strand.shuffle.only);
+        return(window.call.mirror.binding(rd,min.thr=min.thr,bg.tl=bg.tl, debug=debug, ...));
+      });
+    }
+    if(return.control.predictions) {
+      control.predictions <- rtp;
+    }
+    rtp <- do.call(rbind,lapply(rtp,function(d) do.call(rbind,d))); # merge tables
+
+    # generate real data predictions
+    if(debug) { cat("determining peaks on real data:\n"); }
+    npl <- window.call.mirror.binding(tvl,min.thr=min.thr,bg.tl=bg.tl, debug=debug, background.density.scaling=background.density.scaling, ...);
+    if(return.core.data) {
+      core.data <- c(core.data,list(npl.unfiltered=npl));
+    }
+
+    if(use.tec) {
+      if(debug) { cat("excluding systematic background anomalies ... "); }
+      npl <- filter.binding.sites(npl,tec,exclude=T);
+      if(debug) { cat("done\n"); }
+    }
+
+    # calculate E-value and FDRs for all of the peaks
+    if(debug) { cat("calculating statistical thresholds\n"); }
+    chrl <- names(npl); names(chrl) <- chrl;
+    # '&&' short-circuits, so a NULL table no longer yields a zero-length if() condition
+    npld <- do.call(rbind,lapply(names(npl),function(chr) { k <- npl[[chr]]; if(!is.null(k) && dim(k)[1]>0) { k$chr <- rep(chr,dim(k)[1]) }; return(k) }))
+    npld <- cbind(npld,get.eval.fdr.vectors(npld$y,rtp$y));
+    # correct for n.randomizations
+    npld$fdr <- npld$fdr/n.randomizations;
+    npld$evalue <- npld$evalue/n.randomizations;
+
+    if(return.core.data) {
+      core.data <- c(core.data,list(npld=npld));
+    }
+
+    # determine actual thresholds
+    if(is.null(e.value)) {
+      if(is.null(fdr)) { fdr <- 0.01; }
+      thr <- list(root=min(npld$y[npld$fdr<=fdr]),type="FDR",fdr=fdr)
+      if(debug) { cat("FDR",fdr,"threshold=",thr$root,"\n"); }
+    } else {
+      # determine threshold based on e-value
+      thr <- list(root=min(npld$y[npld$evalue<=e.value]),type="Evalue",e.value=e.value)
+      if(debug) { cat("E-value",e.value,"threshold=",thr$root,"\n"); }
+    }
+
+
+    npld <- npld[npld$y>=thr$root,];
+    if(dim(npld)[1]>0) {
+      # split the merged table back into per-chromosome tables
+      npl <- tapply(c(1:dim(npld)[1]),as.factor(npld$chr),function(ii) {df <- npld[ii,]; df$chr <- NULL; return(df) });
+    } else {
+      npl <- list();
+    }
+  } else {
+    if(is.null(threshold)) {
+      thr <- list(root=min.thr,type="minimal");
+    } else {
+      thr <- list(root=threshold,type="user specified");
+    }
+
+    cat("calling binding positions using",thr$type,"threshold (",thr$root,") :\n");
+    npl <- window.call.mirror.binding(tvl=tvl,min.thr=thr$root,bg.tl=bg.tl, debug=debug, ...);
+    if(use.tec) {
+      if(debug) { cat("excluding systematic background anomalies ... "); }
+      npl <- filter.binding.sites(npl,tec,exclude=T);
+      if(debug) { cat("done\n"); }
+    }
+
+    if(!is.null(topN)) {
+      # determine threshold based on topN peaks
+      ay <- unlist(lapply(npl,function(d) d$y));
+      if(length(ay)>topN) {
+        thr <- list(root=sort(ay,decreasing=T)[topN],type="topN",topN=topN);
+        cat(paste("determined topN threshold :",thr$root,"\n"));
+        # '>=' keeps the N-th peak itself (a strict '>' returned only N-1 peaks) and
+        # matches the 'y>=thr$root' filter of the FDR/E-value branch
+        npl <- lapply(npl,function(d) d[d$y>=thr$root,]);
+      }
+    }
+  }
+
+  if(return.core.data) {
+    return(c(list(npl=npl,thr=thr),core.data));
+  }
+  if(return.control.predictions & !is.null(control.predictions)) {
+    return(list(npl=npl,thr=thr,control.predictions=control.predictions));
+  }
+  return(list(npl=npl,thr=thr));
+}
+
+# window tag difference method
+# Prepares per-position tag count vectors and calls the compiled routine "wtd".
+#   x, y           - positive- and negative-strand tag positions
+#   s, e           - start and end of the region; count vectors cover s-whs .. e+whs
+#   whs, bg.whs    - window half-sizes handed to the compiled routine for signal and background
+#   step           - if >1, all coordinates and sizes are divided by step and rounded
+#   bg.x, bg.y     - optional background tag positions (enable background subtraction)
+#   mask.x, mask.y - optional masked positions; flagged with -1 where no tag is present
+#   tag.weight, bg.weight, min.thr, min.dist, direct.count, round.up - passed to the compiled routine
+# Returns data.frame(x,y) of peaks when return.peaks is TRUE, otherwise
+# list(x=<covered range>,y=<raw result of the compiled routine>).
+# NOTE(review): if bg.weight arrives as NULL, bg.weight*tag.weight is a zero-length
+# vector that is still passed to .Call - confirm the C side tolerates this.
+wtd <- function(x,y,s,e,whs=200,return.peaks=T,min.thr=5,min.dist=200,step=1,direct.count=F,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=1,mask.x=NULL,mask.y=NULL,ignore.masking=F, bg.whs=whs, round.up=F, ...) {
+ # masking is skipped when no mask positions were supplied at all
+ ignore.masking <- ignore.masking | (is.null(mask.x) & is.null(mask.y));
+ # rescale everything to units of 'step' (rounding to the nearest integer)
+ if(step>1) {
+ x <- floor(x/step+0.5); y <- floor(y/step+0.5)
+
+ if(!is.null(bg.x)) {
+ bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5)
+ }
+
+ if(!is.null(mask.x)) {
+ mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5)
+ }
+
+
+ whs <- floor(whs/step+0.5);
+ bg.whs <- floor(bg.whs/step+0.5);
+ min.dist <- floor(min.dist/step +0.5);
+ s <- floor(s/step+0.5)
+ e <- floor(e/step+0.5)
+ }
+
+ # scale bg.weight, since within calculation they are considered independent
+ bg.weight <- bg.weight*tag.weight;
+
+ # covered coordinate range: the region padded by one window half-size on each side
+ rx <- c(s-whs,e+whs);
+
+ # compile tag vectors
+ # (index i of xh/yh corresponds to coordinate rx[1]+i-1)
+ xt <- table(x);
+ xh <- integer(diff(rx)+1);
+ xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt);
+
+ yt <- table(y);
+ yh <- integer(diff(rx)+1);
+ yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt);
+
+ # compile background vectors
+ if(!is.null(bg.x) & length(bg.x)>0) {
+ bg.subtract <- 1;
+
+ bg.xt <- table(bg.x);
+ bg.xh <- integer(diff(rx)+1);
+ bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- as.integer(bg.xt);
+ rm(bg.xt);
+
+ bg.yt <- table(bg.y);
+ bg.yh <- integer(diff(rx)+1);
+ bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt);
+ rm(bg.yt);
+
+ # adjust bg.weight according to bg.whs
+ if(bg.whs!=whs) {
+ bg.weight <- bg.weight*whs/bg.whs;
+ }
+ } else {
+ # no background: flag it off and pass empty vectors
+ bg.subtract <- 0;
+ bg.xh <- bg.yh <- c();
+ }
+
+ # record masked positions
+ # (only positions inside rx that carry no tag are set to -1)
+ if(!ignore.masking) {
+ if(!is.null(mask.x) & length(mask.x)>0) {
+ mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt)));
+ mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]];
+ xh[mvx-rx[1]+1] <- -1;
+ }
+
+ if(!is.null(mask.y) & length(mask.y)>0) {
+ mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt)));
+ mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]];
+ yh[mvy-rx[1]+1] <- -1;
+ }
+ }
+
+ rm(xt,yt);
+
+ # convert the logical flag to 0/1 for the compiled routine
+ if(round.up) { round.up <- 1; } else { round.up <- 0; }
+
+ # coerce every argument to the storage mode passed to .Call
+ storage.mode(xh) <- storage.mode(yh) <- "integer";
+ storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer";
+ nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(bg.whs) <- "integer";
+ rp <- as.integer(return.peaks);
+ dcon <- as.integer(direct.count);
+ storage.mode(rp) <- storage.mode(min.dist) <- "integer";
+ storage.mode(min.thr) <- "double";
+ storage.mode(dcon) <- "integer";
+ storage.mode(tag.weight) <- "double";
+ storage.mode(bg.weight) <- "double";
+ storage.mode(bg.subtract) <- "integer";
+ storage.mode(round.up) <- "integer";
+ im <- as.integer(ignore.masking);
+ storage.mode(im) <- "integer";
+ z <- .Call("wtd",xh,yh,whs,rp,min.dist,min.thr,dcon,tag.weight,im,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up);
+ if(return.peaks) {
+ # translate vector offsets back to coordinates in the original units
+ return(data.frame(x=(z$x+rx[1])*step,y=z$v));
+ } else {
+ return(list(x=rx*step,y=z));
+ }
+}
+
+
+# Apply wtd() to the tags of a signed tag vector (ctv) that fall within [s,e].
+# Positive values are taken as positive-strand positions, negative values (sign
+# flipped) as negative-strand positions; background (bg.ctv) and mask (mask.ctv)
+# vectors are split the same way. If either strand has no tags in the window an
+# empty result is returned without calling wtd().
+tag.wtd <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) {
+  # strand-specific selection of positions inside the window
+  plus.strand <- function(v) v[v>=s & v<=e];
+  minus.strand <- function(v) (-1)*v[v<=-s & v>=-e];
+
+  x <- plus.strand(ctv);
+  y <- minus.strand(ctv);
+
+  bg.x <- bg.y <- NULL;
+  if(!is.null(bg.ctv)) {
+    bg.x <- plus.strand(bg.ctv);
+    bg.y <- minus.strand(bg.ctv);
+  }
+
+  mask.x <- mask.y <- NULL;
+  if(!is.null(mask.ctv)) {
+    mask.x <- plus.strand(mask.ctv);
+    mask.y <- minus.strand(mask.ctv);
+  }
+
+  if(length(x)>0 & length(y)>0) {
+    return(wtd(x,y,s,e,return.peaks=return.peaks, bg.x=bg.x,bg.y=bg.y, mask.x=mask.x,mask.y=mask.y, ...))
+  }
+
+  # nothing to score on at least one strand
+  if(return.peaks) {
+    return(data.frame(x=c(),y=c()));
+  }
+  rx <- range(c(x,y));
+  return(list(x=rx,y=numeric(diff(rx)+1)));
+}
+
+# shuffles tags in chromosome blocks of a specified size
+# note: all coordinates should be positive
+#   window.size==0 - every tag is placed independently and uniformly within the tag range
+#   window.size==1 - every distinct position is moved as a unit (tags sharing a position stay together)
+#   window.size>1  - blocks of window.size are relocated, keeping within-block offsets
+# Inputs with fewer than 3 tags, or a range not exceeding window.size, are returned
+# unchanged with a warning.
+tag.block.shuffle <- function(tags,window.size=100) {
+  if(length(tags)<3) {
+    warning("too few tags for shuffling");
+    return(tags);
+  }
+  rng <- range(tags);
+  if(diff(rng)<=window.size) {
+    warning(paste("tag range (",diff(rng),") is smaller than shuffle window size"));
+    return(tags);
+  }
+
+  if(window.size==0) {
+    return(as.integer(runif(length(tags),min=rng[1],max=rng[2])))
+  } else if(window.size==1) {
+    tt <- table(tags);
+    # truncate to integer coordinates, as the other branches do; this branch used
+    # to return fractional positions
+    return(rep(as.integer(runif(length(tt),min=rng[1],max=rng[2])),as.integer(tt)))
+  } else {
+    # block positions
+    bp <- tags %/% window.size;
+    # block-relative tag positions
+    rp <- tags %% window.size;
+
+    # shuffle block positions
+    bpu <- unique(bp);
+    rbp <- range(bpu);
+    bps <- as.integer(runif(length(bpu),min=rbp[1],max=rbp[2]));
+    bpi <- match(bp,bpu);
+    sbp <- bps[bpi];
+    return(sbp*window.size+rp);
+  }
+}
+
+
+# calculate window cross-correlation
+# Prepares per-position tag count vectors and calls the compiled routine "lwcc".
+#   x, y           - positive- and negative-strand tag positions
+#   s, e           - start and end of the region; count vectors cover s-whs .. e+whs
+#   whs, bg.whs    - window half-sizes handed to the compiled routine for signal and background
+#   isize          - passed to the compiled routine; also enters the background weight as (whs-isize)/bg.whs
+#   step           - if >1, all coordinates and sizes are divided by step and rounded
+#   bg.x, bg.y     - optional background tag positions (enable background subtraction)
+#   mask.x, mask.y - optional masked positions; flagged with -1 where no tag is present
+#   tag.weight, bg.weight, min.thr, min.dist, round.up - passed to the compiled routine
+# Returns data.frame(x,y) of peaks when return.peaks is TRUE, otherwise
+# list(x=<covered range>,y=<raw result of the compiled routine>).
+# NOTE(review): with the default bg.weight=NULL, bg.weight*tag.weight is a zero-length
+# vector that is still passed to .Call - confirm the C side tolerates this.
+lwcc <- function(x,y,s,e,whs=100,isize=20,return.peaks=T,min.thr=1,min.dist=100,step=1,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=NULL,mask.x=NULL,mask.y=NULL,bg.whs=whs,round.up=F) {
+ # rescale everything to units of 'step' (rounding to the nearest integer)
+ if(step>1) {
+ x <- floor(x/step+0.5); y <- floor(y/step+0.5)
+
+ if(!is.null(bg.x)) {
+ bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5)
+ }
+
+ if(!is.null(mask.x)) {
+ mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5)
+ }
+
+ whs <- floor(whs/step+0.5);
+ bg.whs <- floor(bg.whs/step+0.5);
+ isize <- floor(isize/step+0.5);
+ min.dist <- floor(min.dist/step +0.5);
+ s <- floor(s/step+0.5)
+ e <- floor(e/step+0.5)
+ }
+
+ # scale bg.weight, since within calculation they are considered independent
+ bg.weight <- bg.weight*tag.weight;
+
+
+ # covered coordinate range; index i of xh/yh corresponds to coordinate rx[1]+i-1
+ rx <- c(s-whs,e+whs);
+ xt <- table(x);
+ xh <- integer(diff(rx)+1);
+ xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt);
+
+ yt <- table(y);
+
+ yh <- integer(diff(rx)+1);
+ yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt);
+
+ # compile background vectors
+ if(!is.null(bg.x) & length(bg.x)>0) {
+ bg.subtract <- 1;
+
+ bg.xt <- table(bg.x);
+ bg.xh <- integer(diff(rx)+1);
+ bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- as.integer(bg.xt);
+ rm(bg.xt);
+
+ bg.yt <- table(bg.y);
+ bg.yh <- integer(diff(rx)+1);
+ bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt);
+ rm(bg.yt);
+
+ # adjust bg.weight according to bg.whs
+ bg.weight <- bg.weight*(whs-isize)/bg.whs;
+ } else {
+ # no background: flag it off and pass empty vectors
+ bg.subtract <- 0;
+ bg.xh <- bg.yh <- c();
+ }
+
+ # record masked positions
+ # (only positions inside rx that carry no tag are set to -1)
+ if(!is.null(mask.x) & length(mask.x)>0) {
+ mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt)));
+ mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]];
+
+ xh[mvx-rx[1]+1] <- -1;
+ }
+
+ if(!is.null(mask.y) & length(mask.y)>0) {
+ mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt)));
+ mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]];
+ yh[mvy-rx[1]+1] <- -1;
+ }
+
+ rm(xt,yt);
+ # convert the logical flag to 0/1 for the compiled routine
+ if(round.up) { round.up <- 1; } else { round.up <- 0; }
+
+ # coerce every argument to the storage mode passed to .Call
+ storage.mode(xh) <- storage.mode(yh) <- "integer";
+ storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer";
+ nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(isize) <- storage.mode(bg.whs) <- "integer";
+ rp <- as.integer(return.peaks);
+ storage.mode(rp) <- storage.mode(min.dist) <- "integer";
+ storage.mode(min.thr) <- "double";
+ storage.mode(tag.weight) <- "double";
+ storage.mode(bg.weight) <- "double";
+ storage.mode(bg.subtract) <- "integer";
+ storage.mode(round.up) <- "integer";
+
+ # allocate return arrays
+ #cc <- numeric(nx); storage.mode(cc) <- "double";
+ z <- .Call("lwcc",xh,yh,whs,isize,rp,min.dist,min.thr,tag.weight,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up);
+ if(return.peaks) {
+ # translate vector offsets back to coordinates in the original units
+ return(data.frame(x=(z$x+rx[1])*step,y=z$v));
+ } else {
+ return(list(x=rx*step,y=z));
+ }
+}
+
+
+# Apply lwcc() to the tags of a signed tag vector (ctv) that fall within [s,e].
+# Positive values are taken as positive-strand positions, negative values (sign
+# flipped) as negative-strand positions; background (bg.ctv) and mask (mask.ctv)
+# vectors are split the same way. If either strand has no tags in the window an
+# empty result is returned without calling lwcc().
+tag.lwcc <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) {
+  # strand-specific selection of positions inside the window
+  plus.strand <- function(v) v[v>=s & v<=e];
+  minus.strand <- function(v) (-1)*v[v<=-s & v>=-e];
+
+  x <- plus.strand(ctv);
+  y <- minus.strand(ctv);
+
+  bg.x <- bg.y <- NULL;
+  if(!is.null(bg.ctv)) {
+    bg.x <- plus.strand(bg.ctv);
+    bg.y <- minus.strand(bg.ctv);
+  }
+
+  mask.x <- mask.y <- NULL;
+  if(!is.null(mask.ctv)) {
+    mask.x <- plus.strand(mask.ctv);
+    mask.y <- minus.strand(mask.ctv);
+  }
+
+  if(length(x)>0 & length(y)>0) {
+    return(lwcc(x,y, s,e,return.peaks=return.peaks, bg.x=bg.x,bg.y=bg.y, mask.x=mask.x,mask.y=mask.y, ...))
+  }
+
+  # nothing to score on at least one strand
+  if(return.peaks) {
+    return(data.frame(x=c(),y=c()));
+  }
+  rx <- range(c(x,y));
+  return(list(x=rx,y=numeric(diff(rx)+1)));
+}
+
+# Determine mirror-based binding positions using a sliding window along each chromosome.
+# Every chromosome of tvl is handed, together with its background (bg.tl) and mask
+# (mask.tl) tags, to window.chr.call.mirror.binding(); with a cluster the
+# chromosomes are distributed through clusterApplyLB(). Extra parameters are passed on.
+# Returns a list of per-chromosome results named by chromosome.
+window.call.mirror.binding <- function(tvl,window.size=4e7, debug=T, cluster=NULL, bg.tl=NULL, mask.tl=NULL, background.density.scaling=T, ...) {
+  chrl <- names(tvl);
+
+  # background weight is only defined when a background is present
+  bg.weight <- NULL;
+  if(!is.null(bg.tl)) {
+    bg.weight <- dataset.density.ratio(tvl,bg.tl,background.density.scaling=background.density.scaling);
+  }
+  if(debug) {
+    cat("bg.weight=",bg.weight," ");
+  }
+
+  names(chrl) <- chrl;
+
+  # signal, background and mask tags of one chromosome
+  chr.bundle <- function(chr) {
+    chr.bg <- if(is.null(bg.tl)) NULL else bg.tl[[chr]];
+    chr.mask <- if(is.null(mask.tl)) NULL else mask.tl[[chr]];
+    list(ctv=tvl[[chr]],bg.ctv=chr.bg,mask.ctv=chr.mask)
+  }
+
+  if(!is.null(cluster)) {
+    # bundle bg.ctv and mask.ctv with the tags for the parallel call
+    tvll <- lapply(chrl,chr.bundle);
+    bl <- clusterApplyLB(cluster,tvll,window.chr.call.mirror.binding,window.size=window.size,debug=debug, bg.weight=bg.weight, ...);
+    names(bl) <- chrl;
+    return(bl);
+  }
+
+  return(lapply(chrl,function(chr) {
+    bundle <- chr.bundle(chr);
+    window.chr.call.mirror.binding(bundle,window.size=window.size,chr=chr,debug=debug, bg.weight=bg.weight, bg.ctv=bundle$bg.ctv, mask.ctv=bundle$mask.ctv, ...);
+  }));
+}
+
+# Calls binding positions along a single chromosome by applying method() to consecutive
+# windows of window.size that cover the range of absolute tag coordinates.
+# ctvl - list with $ctv (signed tag vector) and optional $bg.ctv / $mask.ctv; the bg.ctv and
+#        mask.ctv arguments are overwritten by the corresponding ctvl elements
+# method - function called as method(s=,e=,ctv=,return.peaks=T,bg.ctv=,mask.ctv=,...)
+# extra parameters are passed on to method()
+# returns a data frame with $x and $y taken from the first two columns of the combined method() results
+window.chr.call.mirror.binding <- function(ctvl,window.size,debug=T, chr="NA", cluster=NULL, method=tag.wtd, bg.ctv=NULL, mask.ctv=NULL, ...) {
+  ctv <- ctvl$ctv; bg.ctv <- ctvl$bg.ctv; mask.ctv <- ctvl$mask.ctv;
+  if(is.null(ctv)) { return(data.frame(x=c(),y=c())) }
+  if(length(ctv)<2) { return(data.frame(x=c(),y=c())) }
+
+  dr <- range(unlist(lapply(ctv,function(x) range(abs(x)))))
+  # always process at least one window: when all tags share a single coordinate diff(dr) is 0,
+  # and 1:0 would otherwise iterate over c(1,0), adding a spurious window before the data range
+  n.windows <- max(1,ceiling(diff(dr)/window.size));
+
+  pinfo <- c();
+  if(debug) {
+    cat(paste("processing ",chr," in ",n.windows," steps [",sep=""));
+  }
+  for(i in seq_len(n.windows)) {
+    s <- dr[1]+(i-1)*window.size;
+    npn <- method(s=s, e=s+window.size,ctv=ctv, return.peaks=T, bg.ctv=bg.ctv, mask.ctv=mask.ctv, ... );
+    if(length(npn) > 0) { pinfo <- rbind(pinfo,npn) }
+    if(debug) {
+      cat(".");
+    }
+  }
+  if(debug) {
+    cat(paste("] done (",dim(pinfo)[1],"positions)\n"));
+  } else {
+    cat(".");
+  }
+  return(data.frame(x=pinfo[,1],y=pinfo[,2]));
+}
+
+# Generates a randomized version of a tag dataset (list of signed tag vectors, one per chromosome).
+# strand.shuffle.only=T  - keep positions, multiply each tag by a random sign
+# shuffle.both.strands=T - block-shuffle positive tags and the remaining tags separately
+# shuffle.both.strands=F - block-shuffle positive tags only, leaving the remaining tags untouched
+# shuffle.window is passed to tag.block.shuffle() as window.size
+# returns the list of randomized tag vectors for the chromosomes in chrl
+generate.randomized.data <- function(data,shuffle.window=1,shuffle.both.strands=T,strand.shuffle.only=F,chrl=names(data)) {
+  names(chrl) <- unlist(chrl);
+  if(strand.shuffle.only) {
+    # shuffle just strand assignment, not tag positions
+    rt <- lapply(data[unlist(chrl)],function(tv) tv*sample(c(-1,1),length(tv),replace=T));
+  } else {
+    rt <- lapply(data[unlist(chrl)],function(tv) {
+      pti <- which(tv>0);
+      # complement of pti computed explicitly: tv[-pti] selects nothing when pti is empty,
+      # which silently dropped every tag of a chromosome without positive tags
+      nti <- setdiff(seq_along(tv),pti);
+      shuffled.plus <- tag.block.shuffle(tv[pti],window.size=shuffle.window);
+      other <- tv[nti];
+      if(shuffle.both.strands) {
+        other <- tag.block.shuffle(other,window.size=shuffle.window);
+      }
+      return(c(shuffled.plus,other));
+    });
+  }
+  return(rt);
+}
+
+# determine threshold based on E value
+# for efficiency chrl should include just one or two small chromosomes
+# peaks are called on randomized tags (or on the supplied control datasets) with
+# window.call.mirror.binding(), which also receives the optional parameters;
+# the threshold is the peak magnitude at which the number of random peaks equals the adjusted target E value
+# returns: the randomized peak data frame if return.rtp=T, list(root=) if the bracketing search
+# runs out of iterations, otherwise the uniroot() result
+determine.lwcc.threshold <- function(tvl,chrl=names(tvl),e.value=100, n.randomizations=1, min.thr=1, debug=F, tol=1e-2, shuffle.window=1, shuffle.both.strands=T, return.rtp=F, control=NULL, strand.shuffle=F, ...) {
+  names(chrl) <- unlist(chrl);
+
+  # determine fraction of total tags contained in the specified chromosomes
+  ntags <- sum(unlist(lapply(tvl,function(cv) length(cv))));
+  nctags <- sum(unlist(lapply(chrl, function(cn) length(tvl[[cn]]))));
+  # calculate actual target E value
+  if(!is.null(control)) {
+    n.randomizations <- length(control);
+  }
+  eval <- e.value*n.randomizations*nctags/ntags
+  if(eval<1) {
+    warning("specified e.value and set of chromosomes results in target e.value of less than 1");
+    eval <- 1;
+  }
+
+  if(debug) {
+    cat(paste("randomizations =",n.randomizations," chromosomes =",length(chrl),"\n"))
+    cat(paste("adjusted target eval =",eval,"\ngenerating randomized tag peaks ..."));
+  }
+
+  # randomize the signed tag vector of one chromosome
+  randomize.tags <- function(tv) {
+    if(strand.shuffle) {
+      # shuffle just strand assignment, not tag positions
+      return(tv*sample(c(-1,1),length(tv),replace=T));
+    }
+    pti <- which(tv>0);
+    # explicit complement: tv[-pti] would select nothing (dropping all tags) when pti is empty
+    nti <- setdiff(seq_along(tv),pti);
+    shuffled.plus <- tag.block.shuffle(tv[pti],window.size=shuffle.window);
+    other <- tv[nti];
+    if(shuffle.both.strands) {
+      other <- tag.block.shuffle(other,window.size=shuffle.window);
+    }
+    return(c(shuffled.plus,other));
+  }
+
+  # get peaks on randomized tags
+  if(is.null(control)) {
+    rtp <- data.frame(do.call(rbind,lapply(1:n.randomizations,function(i) {
+      rt <- lapply(tvl[unlist(chrl)],randomize.tags);
+      if(debug) {
+        cat(".");
+      }
+      rl <- window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...);
+
+      return(do.call(rbind,rl))
+    })));
+
+  } else {
+    if(debug) {
+      cat(" using provided controls ");
+    }
+    rtp <- data.frame(do.call(rbind,lapply(control,function(rt) do.call(rbind,window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...)))))
+  }
+
+  if(return.rtp) {
+    return(rtp)
+  }
+
+  if(debug) {
+    cat(" done\nfinding threshold .");
+  }
+
+  # determine range and starting value
+  rng <- c(min.thr,max(na.omit(rtp$y)))
+
+  # find E value threshold: positive while fewer than eval random peaks pass nthr
+  count.nucs.f <- function(nthr) {
+    return(eval-length(which(rtp$y>=nthr)));
+  }
+
+  # estimate position of the root by downward bisection iterations
+  # (mv/mvp hold function values/positions, most recent first)
+  mv <- c(eval); mvp <- c(rng[2]); ni <- 1;
+  max.it <- 2*as.integer(log2(rng[2]/rng[1])+0.5);
+  while((ni<=max.it) & (mv[1]>=0)) {
+    np <- mvp[1]/2;
+    npv <- count.nucs.f(np);
+    mv <- c(npv,mv);
+    mvp <- c(np,mvp);
+    ni <- ni+1;
+  }
+
+
+  if(ni>max.it) {
+    # determine lowest value
+    if(debug) {
+      cat(paste("exceeded max.it (",max.it,"), returning lowest point",signif(mvp[1],4)));
+    }
+    return(list(root=mvp[1]))
+  } else {
+    # bracket the root between the last two probed positions
+    rng <- mvp[1:2];
+    if(mv[2]==0) rng[2] <- mvp[3];
+    if(debug) {
+      cat(paste("bound to (",signif(rng[1],4),signif(rng[2],4),") "));
+    }
+  }
+
+  # find root on the right side
+  x <- uniroot(count.nucs.f,rng,tol=tol);
+  if(debug) {
+    cat(paste(" done (thr=",signif(x$root,4),")\n"));
+  }
+  return(x);
+
+}
+
+
+# determine membership of points in fragments
+# x - point positions; fs/fe - fragment start/end positions
+# the actual lookup is done by the native "points_within" routine, which receives the sorted points,
+# the sorted pooled fragment boundaries, the signed fragment index of each boundary
+# (positive for starts, negative for ends) and the three 0/1 flags
+# unless sorted=T or return.point.counts=T the result is mapped back to the original order of x
+points.within <- function(x,fs,fe,return.list=F,return.unique=F,sorted=F,return.point.counts=F) {
+  if(is.null(x) | length(x) < 1) { return(c()) };
+  if(!sorted) {
+    # remember where each point ends up after sorting
+    ox <- rank(x,ties.method="first");
+    x <- sort(x);
+  }
+
+  # pool fragment boundaries and tag each with its signed fragment index
+  bounds <- c(fs,fe);
+  frag.id <- seq(1:length(fs));
+  frag.id <- c(frag.id,-1*frag.id);
+  frag.id <- frag.id[order(bounds)];
+  bounds <- sort(bounds);
+
+  storage.mode(x) <- "integer";
+  storage.mode(frag.id) <- "integer";
+  storage.mode(bounds) <- "integer";
+
+  # logical switches are handed to the native code as integers
+  iu <- if(return.unique) 1L else 0L;
+  il <- if(return.list) 1L else 0L;
+  rpc <- if(return.point.counts) 1L else 0L;
+
+  result <- .Call("points_within",x,bounds,frag.id,il,iu,rpc);
+  if(!sorted & !return.point.counts) {
+    result <- result[ox];
+  }
+  return(result);
+}
+
+
+# determine coordinates of points x relative to signed
+# positions pos within size range
+# the work is done by the native "get_relative_coordinates" routine on integer-converted input
+# sorted=F: x is sorted, pos is ordered by absolute value, and the full result is returned
+#           with $i translated back to indices into the original pos
+# sorted=T: only the $i element of the native result is returned
+get.relative.coordinates <- function(x,pos,size,sorted=F) {
+  if(!sorted) {
+    op <- order(abs(pos));
+    x <- sort(x);
+    pos <- pos[op];
+  }
+
+  storage.mode(x) <- "integer";
+  storage.mode(pos) <- "integer";
+  storage.mode(size) <- "integer";
+  rf <- .Call("get_relative_coordinates",x,pos,size);
+
+  if(sorted) {
+    return(rf$i);
+  }
+  # map indices of the reordered positions back to the caller's ordering
+  rf$i <- op[rf$i];
+  return(rf);
+}
+
+# given magnitude values for signal (x) and control (y),
+# return a data frame with $evalue and $fdr, one row per element of x
+# $evalue - number of control values exceeding the signal value, plus 1
+# $fdr    - pseudo-count ratio of exceeding control to exceeding signal values,
+#           made constant at its minimum for all x at or above the smallest x reaching that minimum
+get.eval.fdr.vectors <- function(x,y) {
+  n.sig <- length(x);
+  n.ctl <- length(y);
+  if(n.sig==0) { return(data.frame(evalue=c(),fdr=c())) }
+  if(n.ctl==0) { return(data.frame(evalue=rep(0,n.sig),fdr=rep(1,n.sig))) }
+
+  sig.cdf <- ecdf(x);
+  ctl.cdf <- ecdf(y);
+
+  # counts of control / signal values strictly above each signal value
+  ctl.above <- (1-ctl.cdf(x))*n.ctl;
+  sig.above <- (1-sig.cdf(x))*n.sig;
+
+  fdr <- (ctl.above+0.5)/(sig.above+0.5); # with pseudo-counts
+  fdr[sig.above==0] <- min(fdr); # correct for undercounts
+
+  # smallest x attaining the minimal FDR; everything at or above it gets that FDR
+  best.fdr <- min(fdr);
+  best.fdr.x <- min(x[fdr==best.fdr]);
+  fdr[x>=best.fdr.x] <- best.fdr;
+
+  return(data.frame(evalue=(ctl.above+1),fdr=fdr));
+}
+
+
+# filter predictions according to whether they fall into the tag enrichment clusters
+# bd  - per-chromosome list of call data frames (with $x)
+# tec - per-chromosome list of cluster descriptions with $s/$e
+# exclude=F keeps calls located within a cluster, exclude=T keeps calls located outside of all clusters
+# chromosomes with NULL or zero-length entries yield NULL
+filter.binding.sites <- function(bd,tec,exclude=F) {
+  chrl <- names(bd); names(chrl) <- chrl;
+  lapply(chrl,function(chr) {
+    calls <- bd[[chr]];
+    if(is.null(calls)) { return(NULL) };
+    if(length(calls)==0) { return(NULL) };
+
+    # no rows - nothing to filter
+    if(!(dim(calls)[1]>0)) { return(calls); }
+
+    clusters <- tec[[chr]];
+    if(length(clusters$s)>0) {
+      # points.within() reports -1 for points outside of every fragment
+      membership <- points.within(calls$x,clusters$s,clusters$e);
+      if(exclude) {
+        keep <- which(membership== -1);
+      } else {
+        keep <- which(membership> -1);
+      }
+      return(calls[keep,]);
+    }
+
+    # no clusters on this chromosome
+    if(exclude) { return(calls); }
+    return(data.frame(x=c(),y=c()));
+  });
+}
+
+
+# PUBLIC
+# generate predictions on sequential (chained) subsamples of data
+# if step.size <1, it is interpreted as a fraction, and each subsequent subsample
+# is of a size (1-step.size)*N (N - size of the signal data at the preceding step);
+# otherwise the step.size is interpreted as a number of tags, and each subsample is of the size N-step.size
+# test.chromosomes - optional subset of chromosomes to run on (a step.size given in tags is rescaled accordingly)
+# excluded.steps - step indices for which no calls are generated (subsampling still proceeds)
+# min.ntags - the chain stops once the signal data falls below this many tags
+# extra parameters are passed on to find.binding.positions()
+# returns a list of find.binding.positions() results named by step index (step 0 is the full data)
+get.subsample.chain.calls <- function(signal.data,control.data,n.steps=NULL,step.size=1e6,subsample.control=F,debug=F,min.ntags=1e3, excluded.steps=c(), test.chromosomes=NULL, ... ) {
+
+ if(!is.null(test.chromosomes)) {
+ # adjust step size
+ sz <- sum(unlist(lapply(signal.data,length)))
+ signal.data <- signal.data[test.chromosomes];
+ control.data <- control.data[test.chromosomes];
+
+ if(step.size>1) {
+ step.size <- step.size*sum(unlist(lapply(signal.data,length)))/sz;
+ # cat("adjusted step.size=",step.size,"\n");
+ }
+ }
+
+ if(is.null(n.steps)) {
+ if(step.size<1) {
+ # down to 10%
+ # NOTE(review): each step retains a (1-step.size) fraction, so reaching 10% would take
+ # log(0.1)/log(1-step.size) steps; confirm whether log(step.size) is intended here
+ n.steps <- log(0.1)/log(step.size);
+ } else {
+ n.steps <- floor(sum(unlist(lapply(signal.data,length)))/step.size)
+ }
+ }
+ if(subsample.control & !is.null(control.data)) {
+ # normalize control to the signal size
+ if(debug) { cat("pre-subsampling control.\n"); }
+ bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length)))
+ # sampling is done with replacement only when the control has to grow (bg.weight>1)
+ control.data <- lapply(control.data,function(d) sample(d,length(d)*bg.weight,replace=(bg.weight>1)))
+ }
+ calls <- list();
+ callnames <- c();
+ for(i in 0:n.steps) {
+ if(debug) { cat("chained subsample step",i,":\n"); }
+ if(!i %in% excluded.steps) {
+ ans <- list(find.binding.positions(signal.data=signal.data,control.data=control.data,debug=debug, skip.control.normalization=T, ...));
+ names(ans) <- as.character(c(i));
+ calls <- c(calls,ans);
+ callnames <- c(callnames,i);
+ }
+ # subsample
+ if(step.size<1) {
+ # fraction steps
+ f <- 1-step.size;
+ } else {
+ # bin steps
+ sz <- sum(unlist(lapply(signal.data,length)));
+ f <- (sz-step.size)/sz;
+ # fewer tags left than a single step would remove
+ if(f<=0) break;
+ }
+ if(debug) { cat("chained subsampling using fraction",f,".\n"); }
+ # each chromosome is subsampled (without replacement) from the current, already reduced data
+ signal.data <- lapply(signal.data,function(d) sample(d,length(d)*f));
+ if(subsample.control & !is.null(control.data)) {
+ control.data <- lapply(control.data,function(d) sample(d,length(d)*f));
+ }
+ sz <- sum(unlist(lapply(signal.data,length)));
+ if(sz<min.ntags) break;
+ }
+ names(calls) <- callnames;
+ return(calls);
+}
+
+
+# chain-subsample dataset and calculate MSER interpolation
+# chains - optional precomputed list of subsample chains; when NULL, n.chains chains are generated
+#          with get.subsample.chain.calls() (which also receives the extra parameters)
+# for every value in enrichment.background.scales the enrichment field used is enr.field,
+# suffixed with ".<scale>" when scale>1
+# returns a list (one element per scale) of data frames with $n and $me ordered by decreasing $n,
+# where $me is the median (return.median=T) or trimmed mean of the per-chain values;
+# with return.lists=T the per-chain data frames ($n, $me, $nd) are returned instead
+mser.chain.interpolation <- function(signal.data=NULL,control.data=NULL,chains=NULL,n.chains=5,debug=F, enrichment.background.scales=c(1,5), test.agreement=0.99, agreement.distance=50, return.median=F, mean.trim=0.1, enr.field="enr", return.lists=F, ...) {
+ if(is.null(chains)) {
+ cn <- c(1:n.chains); names(cn) <- cn;
+ tf <- function(i, ...) get.subsample.chain.calls(signal.data,control.data,debug=debug, enrichment.background.scales=enrichment.background.scales, ...);
+ chains <- lapply(cn,tf,...);
+ }
+ names(enrichment.background.scales) <- enrichment.background.scales;
+ lapply(enrichment.background.scales,function(scale) {
+ actual.enr.field <- enr.field;
+ if(scale>1) {
+ actual.enr.field <- paste(actual.enr.field,scale,sep=".");
+ }
+
+ cvl <- lapply(chains,function(chain) {
+ # $n values of the chain elements in decreasing order; nd - differences between consecutive values
+ nn <- sort(unlist(lapply(chain,function(d) d$n)),decreasing=T);
+ nd <- diff(nn);
+ nn <- nn[-length(nn)];
+ # compare each chain element (as reference) with the one that follows it
+ me <- lapply(c(2:length(chain)),function(i) {
+ # chain[i] (single bracket) is a one-element list, as t.precalculate.ref.peak.agreement() iterates over its second argument
+ sla <- t.precalculate.ref.peak.agreement(chain[[i-1]],chain[i],agreement.distance=agreement.distance,enr.field=actual.enr.field)
+ me <- t.find.min.saturated.enr(sla,thr=1-test.agreement)
+ # larger of the two minimal enrichment values found in the compared elements, but not below 1
+ menr <- max(min(na.omit(unlist(lapply(chain[[i-1]]$npl,function(d) d[actual.enr.field])))),min(na.omit(unlist(lapply(chain[[i]]$npl,function(d) d[actual.enr.field])))),1)
+ # a result that does not exceed menr is reported as 1
+ if(me<=menr) { me <- 1; };
+ return(me);
+ })
+ data.frame(n=nn,me=unlist(me),nd=nd);
+ });
+ if(return.lists) { return(cvl) }
+ cvl <- na.omit(do.call(rbind,cvl));
+ # summarize across chains for each distinct $n
+ if(return.median) {
+ tv <- tapply(cvl$me,as.factor(cvl$n),median)
+ } else {
+ tv <- tapply(cvl$me,as.factor(cvl$n),mean,trim=mean.trim);
+ }
+ df <- data.frame(n=as.numeric(names(tv)),me=as.numeric(tv));
+ return(df[order(df$n,decreasing=T),])
+ })
+}
+
+
+
+# returns agreement as a function of dataset size, possibly filtering peaks by min.enr threshold, and by max.fdr
+# every element of a chain is compared against the first element of that chain (the reference)
+# returns a data frame with $n and $me (mean overlap, summarized across chains by median or
+# trimmed mean), ordered by decreasing $n
+chain.to.reference.comparison <- function(chains,min.enr=NULL,debug=F,agreement.distance=50, return.median=F, mean.trim=0.1, enr.field="enr",max.fdr=NULL) {
+ cvl <- lapply(chains,function(chain) {
+ # filter chain by fdr
+ if(!is.null(max.fdr)) {
+ chain <- lapply(chain,function(d) { d$npl <- lapply(d$npl,function(cd) cd[cd$fdr<=max.fdr,]); return(d); });
+ }
+ nn <- sort(unlist(lapply(chain,function(d) d$n)),decreasing=T);
+ nn <- nn[-length(nn)];
+ me <- lapply(c(2:length(chain)),function(i) {
+ # chain[i] (single bracket) is a one-element list, as t.precalculate.ref.peak.agreement() iterates over its second argument
+ sla <- t.precalculate.ref.peak.agreement(chain[[1]],chain[i],agreement.distance=agreement.distance,enr.field=enr.field)
+ # calculate overlap
+ x <- lapply(sla,function(mpd) {
+ if(!is.null(min.enr)) {
+
+ # keep reference peaks with enrichment of at least min.enr (NA counts as failing)
+ me <- mpd$re >= min.enr;
+ me[is.na(me)] <- F;
+ mpd <- mpd[me,];
+ # matches whose own enrichment is below min.enr (or NA) are not counted as overlaps
+ ome <- mpd$oe < min.enr;
+ ome[is.na(ome)] <- T;
+ mpd$ov[ome] <- 0;
+ }
+ return(mean(mpd$ov));
+ })
+ })
+
+ data.frame(n=nn,me=unlist(me));
+ });
+
+ cvl <- na.omit(do.call(rbind,cvl));
+ if(return.median) {
+ tv <- tapply(cvl$me,as.factor(cvl$n),median)
+ } else {
+ tv <- tapply(cvl$me,as.factor(cvl$n),mean,trim=mean.trim);
+ }
+ df <- data.frame(n=as.numeric(names(tv)),me=as.numeric(tv));
+ return(df[order(df$n,decreasing=T),])
+}
+
+
+# estimates enrichment confidence interval based on 2*tag.count.whs window around each position, and a z-score (alpha/2)
+# background.scales lists multipliers applied to the background window half-size; estimates obtained with a
+# multiplier >1 are stored in fields carrying the multiplier as a suffix (e.g. $enr.5)
+# adds $enr (lower bound), $enr.ub (upper bound) and $enr.mle fields to every data frame in binding.positions$npl
+# (with control data a background tag count field $nbgt is added as well)
+# without control data the estimates are relative to a uniform tag density over effective.genome.size
+calculate.enrichment.estimates <- function(binding.positions,signal.data=NULL,control.data=NULL,fraction=1,tag.count.whs=100,z=2,effective.genome.size=3e9,scale.down.control=F,background.scales=c(1),bg.weight=NULL) {
+ f <- fraction;
+ # upper tail probability corresponding to the z-score
+ qv <- pnorm(z,lower.tail=F);
+ cn <- names(binding.positions$npl); names(cn) <- cn;
+
+ if(is.null(control.data)) {
+ # estimate from gamma distribution
+ # expected number of tags in a window under uniform density
+ fg.lambda <- f*sum(unlist(lapply(signal.data,length)))*2*tag.count.whs/effective.genome.size;
+ binding.positions$npl <- lapply(binding.positions$npl,function(d) {
+ d$enr <- qgamma(qv,d$nt,scale=1)/fg.lambda;
+ d$enr.ub <- qgamma(1-qv,d$nt,scale=1)/fg.lambda;
+ d$enr.mle <- d$nt/fg.lambda;
+ return(d);
+ });
+ } else {
+ # estimate using beta distribution
+ if(is.null(bg.weight)) {
+ bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length)))
+ }
+
+ if(scale.down.control) {
+ # sample down control to be the same size as true signal.data (bg.weight*f)
+ control.data <- lapply(control.data,function(d) sample(d,length(d)*bg.weight*f,replace=(f*bg.weight>1)))
+ #bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length)))
+ bg.weight <- 1/f;
+
+ }
+
+ # record the parameters used
+ binding.positions$enrichment.bg.weight <- bg.weight;
+ binding.positions$enrichment.whs <- tag.count.whs;
+ binding.positions$enrichment.z <- z;
+
+ binding.positions$npl <- lapply(cn,function(chr) {
+ d <- binding.positions$npl[[chr]];
+
+ # one data frame of estimates per background scale
+ edf <- lapply(background.scales,function(background.width.multiplier) {
+ sig.mult <- bg.weight*f/background.width.multiplier;
+ # control tag counts in the (scaled) window around each position
+ nbg <- points.within(abs(control.data[[chr]]),d$x-tag.count.whs*background.width.multiplier,d$x+tag.count.whs*background.width.multiplier,return.point.counts=T,return.unique=F);
+
+ nfg <- d$nt;
+
+
+ # Poisson ratio Bayesian LB with non-informative prior (Clopper & Pearson 1934)
+ nf <- ((nfg+0.5)/(nbg+0.5))*qf(1-qv,2*(nfg+0.5),2*(nbg+0.5),lower.tail=F)
+ nf <- nf/sig.mult;
+
+ ub <- ((nfg+0.5)/(nbg+0.5))*qf(qv,2*(nfg+0.5),2*(nbg+0.5),lower.tail=F)
+ ub <- ub/sig.mult;
+
+ mle <- (nfg+0.5)/(nbg+0.5);
+ mle <- mle/sig.mult;
+ # replace NULLs with zero-length vectors so that data.frame() below gets four columns
+ if(is.null(nbg)) { nbg <- numeric(0) }
+ if(is.null(nf)) { nf <- numeric(0) }
+ if(is.null(ub)) { ub <- numeric(0) }
+ if(is.null(mle)) { mle <- numeric(0) }
+ return(data.frame(nbg=nbg,lb=nf,ub=ub,mle=mle))
+ })
+
+ # rename the columns (adding the scale suffix when scale>1) and bind all scales side by side
+ adf <- do.call(cbind,lapply(c(1:length(background.scales)),function(i) {
+ df <- edf[[i]];
+ cn <- c("nbgt","enr","enr.ub","enr.mle");
+ if(background.scales[i]>1) {
+ cn <- paste(cn,as.character(background.scales[i]),sep=".");
+ }
+ names(df) <- cn;
+ return(df);
+ }))
+
+ return(cbind(d,adf));
+ });
+ }
+
+ return(binding.positions);
+}
+
+
+# precalculate peak agreement of a sampling list given a reference
+# ref - reference call set (its $npl holds per-chromosome data frames with $x and the enr.field column)
+# sf  - list of sampled call sets of the same structure
+# returns, for each element of sf, a data frame with one row per reference peak:
+#   $ov - 1 if a sampled peak lies within agreement.distance of the reference peak, 0 otherwise
+#   $re - enrichment of the reference peak
+#   $oe - enrichment of the matched sampled peak (NA if none)
+t.precalculate.ref.peak.agreement <- function(ref,sf,agreement.distance=50,enr.field="enr") {
+  ref <- ref$npl;
+  cn <- names(ref); names(cn) <- cn;
+
+  # for each sampling round
+  lapply(sf,function(sd) {
+    per.chr <- lapply(cn,function(chr) {
+      ref.peaks <- ref[[chr]];
+      # chromosomes without reference peaks contribute nothing
+      if(dim(ref.peaks)[1]<1) { return(NULL) };
+
+      smp.peaks <- sd$npl[[chr]];
+      # index of the sampled peak window containing each reference peak (-1 -> no match)
+      pwi <- points.within(ref.peaks$x,smp.peaks$x-agreement.distance,smp.peaks$x+agreement.distance);
+      pwi[pwi==-1] <- NA;
+
+      renr <- ref.peaks[,enr.field]
+      oenr <- smp.peaks[,enr.field][pwi];
+      if(length(oenr)==0) { oenr <- rep(NA,length(renr)); }
+      return(cbind(ov=as.integer(!is.na(pwi)),re=renr,oe=oenr));
+    });
+    ov <- data.frame(do.call(rbind,per.chr))
+  })
+}
+
+
+# find minimal saturated enrichment given a list of replicate agreement matrices (for one fraction)
+# pal - list of agreement data frames with $ov, $re and $oe columns (see t.precalculate.ref.peak.agreement)
+# thr - tolerated fraction of non-overlapping positions (overlap fraction must stay >= 1-thr)
+# returns, depending on the flags: the number of reference peaks at or above the combined estimate divided
+# by the number of replicates (return.number.of.peaks), the vector (return.vector) or median (return.median)
+# of the per-replicate estimates, or the combined estimate itself
+t.find.min.saturated.enr <- function(pal,thr=0.01,plot=F,return.number.of.peaks=F,plot.individual=T,return.median=F,return.vector=F) {
+ nr <- length(pal);
+ # merge replicate data frames
+ mpd <- data.frame(do.call(rbind,pal));
+
+ # missing enrichment values are treated as infinitely large
+ mpd$re[is.na(mpd$re)] <- Inf;
+ mpd$oe[is.na(mpd$oe)] <- Inf;
+
+
+
+ # round up values to avoid miscounting
+ mpd$re <- round(mpd$re,digits=2);
+ mpd$oe <- round(mpd$oe,digits=2);
+
+ # order positions by decreasing minimum of the reference and matched enrichment
+ me <- pmin(mpd$re,mpd$oe);
+ ome <- order(me,decreasing=T);
+ df <- data.frame(me=me[ome],ov=mpd$ov[ome]);
+ # ecdf of negated values: recdf(-v)*ren is the number of reference peaks with enrichment >= v
+ recdf <- ecdf(-mpd$re); ren <- length(mpd$re);
+
+ # collapse equal peak heights
+ xk <- tapply(df$ov,as.factor(df$me),sum); xk <- data.frame(ov=as.numeric(xk),me=as.numeric(names(xk))); xk <- xk[order(xk$me,decreasing=T),];
+
+
+ # cumulative overlap count relative to the number of reference peaks at or above each level
+ cso <- cumsum(xk$ov)/(recdf(-xk$me)*ren);
+ cso[is.na(cso)] <- 0;
+ cso[!is.finite(cso)] <- 0;
+ # last (i.e. lowest enrichment) level at which the overlap fraction is still >= 1-thr
+ mv <- max(which(cso >= 1-thr))
+ menr <- xk$me[mv];
+
+ # same estimate for every replicate separately (without rounding or collapsing of ties)
+ ir <- lapply(pal,function(d) {
+ d$re[is.na(d$re)] <- Inf;
+ d$oe[is.na(d$oe)] <- Inf;
+
+ me <- pmin(d$re,d$oe);
+ ome <- order(me,decreasing=T);
+ df <- data.frame(me=me[ome],ov=d$ov[ome]);
+ cso <- cumsum(df$ov)/c(1:length(df$ov));
+ mv <- max(which(cso >= 1-thr))
+ menr <- df$me[mv];
+ return(list(df=df,menr=menr));
+ });
+
+ if(plot) {
+ par(mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8);
+ plot(df$me,cumsum(df$ov)/c(1:length(df$ov)),type='l',ylab="fraction of positions overlapping with reference",xlab="minimal enrichment of binding positions",xlim=c(min(df$me),2*menr));
+ abline(h=1-thr,lty=2,col=4)
+ if(plot.individual) {
+ lapply(ir,function(d) {
+ df <- d$df;
+ lines(df$me,cumsum(df$ov)/c(1:length(df$ov)),col=8);
+ # NOTE(review): menr here resolves to the combined estimate of the enclosing function, not d$menr,
+ # although the legend labels these lines "individual MSERs" - confirm intent
+ abline(v=menr,col="pink",lty=3)
+ });
+ lines(df$me,cumsum(df$ov)/c(1:length(df$ov)),col=1);
+ }
+ abline(v=menr,col=2,lty=2)
+ legend(x="bottomright",lty=c(1,2,1,3,2),col=c(1,2,8,"pink",4),legend=c("combined samples","combined sample MSER","individual samples","individual MSERs","consistency threshold"));
+ }
+
+ if(return.number.of.peaks) {
+ # re-merge to get the original (unrounded, NA-preserving) reference enrichment values
+ mpd <- data.frame(do.call(rbind,pal));
+ return(length(which(!is.na(mpd$re) & mpd$re >=menr))/nr);
+ } else {
+ if(return.vector) {
+ return(unlist(lapply(ir,function(d) d$menr)));
+ }
+ if(return.median) {
+ return(median(unlist(lapply(ir,function(d) d$menr))));
+ } else {
+ return(menr);
+ }
+ }
+}
+
+
+
+# determine d1/d2 dataset size ratio. If background.density.scaling=F, the ratio of tag counts is returned.
+# if background.density.scaling=T, regions of significant tag enrichment are masked prior to ratio calculation:
+# clusters are detected in each dataset with tag.enrichment.clusters(), extended by wsize/2 on both sides,
+# and only tags outside of the clusters of either dataset are counted (chromosomes common to d1 and d2 only)
+dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) {
+  if(!background.density.scaling) {
+    return(sum(unlist(lapply(d1,length)))/sum(unlist(lapply(d2,length))))
+  }
+
+  # enrichment clusters of one (unsigned) tag vector, without background
+  find.clusters <- function(tags) {
+    tag.enrichment.clusters(tags,c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F)
+  }
+
+  chrl <- intersect(names(d1),names(d2));
+  ntc <- do.call(rbind,lapply(chrl,function(chr) {
+    tags1 <- abs(d1[[chr]]);
+    tags2 <- abs(d2[[chr]]);
+    x1 <- find.clusters(tags1);
+    x2 <- find.clusters(tags2);
+    # pooled, padded cluster boundaries of both datasets
+    mask.s <- c(x1$s,x2$s)-wsize/2;
+    mask.e <- c(x1$e,x2$e)+wsize/2;
+    n1 <- length(which(points.within(tags1,mask.s,mask.e)==-1));
+    n2 <- length(which(points.within(tags2,mask.s,mask.e)==-1));
+    return(c(n1,n2))
+  }))
+  ntcs <- apply(ntc,2,sum);
+  return(ntcs[1]/ntcs[2])
+}
+
+# returns effective size of the dataset based on the same logic as dataset.density.ratio:
+# the plain tag count if background.density.scaling=F, otherwise the number of tags located outside
+# of the enrichment clusters (extended by wsize/2 on both sides) found by tag.enrichment.clusters()
+dataset.density.size <- function(d1,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) {
+  if(!background.density.scaling) {
+    return(sum(unlist(lapply(d1,length))))
+  }
+
+  chrl <- names(d1);
+  ntc <- lapply(chrl,function(chr) {
+    tags <- abs(d1[[chr]]);
+    x1 <- tag.enrichment.clusters(tags,c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F)
+    # tags that fall outside of every padded cluster
+    outside <- which(points.within(tags,x1$s-wsize/2,x1$e+wsize/2)==-1);
+    return(length(outside))
+  })
+  return(sum(unlist(ntc)))
+}
+
+# earlier variant of dataset.density.ratio: returns the d1/d2 ratio of background tag densities.
+# If background.density.scaling=F, the ratio of tag counts is returned. Otherwise, for every chromosome
+# the tags outside of the (padded, merged) enrichment clusters are counted, and the densities are
+# computed over the genome size (sum of per-chromosome tag ranges) minus the masked length.
+old.dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) {
+  if(!background.density.scaling) {
+    return(sum(unlist(lapply(d1,length)))/sum(unlist(lapply(d2,length))))
+  }
+
+  # per chromosome: n - unmasked tag count, s - span of tag coordinates, m - total masked length
+  t.chromosome.counts <- function(tl) {
+    lapply(tl,function(d) {
+      tags <- abs(d);
+      cl <- tag.enrichment.clusters(tags,c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F)
+      cl$s <- cl$s-wsize/2;
+      cl$e <- cl$e+wsize/2;
+      # merge clusters that overlap after padding
+      cl <- regionset.intersection.c(list(cl),do.union=T)
+      n.unmasked <- length(which(points.within(tags,cl$s,cl$e)==-1));
+      return(c(n=n.unmasked,s=diff(range(tags)),m=sum(cl$e-cl$s)));
+    })
+  }
+
+  l1 <- t.chromosome.counts(d1);
+  l2 <- t.chromosome.counts(d2);
+
+  # align the second dataset to the chromosomes of the first
+  l2 <- data.frame(do.call(rbind,l2[names(l1)]));
+  l1 <- data.frame(do.call(rbind,l1));
+
+  # genome size
+  gs <- sum(pmax(l1$s,l2$s))
+
+  den1 <- sum(l1$n)/(gs-sum(l1$m))
+  den2 <- sum(l2$n)/(gs-sum(l2$m))
+  return(den1/den2);
+}
+
+
+
+
+# calculate cumulative density based on sum of scaled gaussian curves
+# (by Michael Tolstorukov)
+#
+# vin - input vector; bw -- standard deviation, dw - gaussian cutoff in stdev
+# from/to/step - range and spacing of the positions at which the density is evaluated
+# (bw, dw, step and the range are converted to integers before being passed to the native "cdensum" routine)
+# output - if return.x=F vector of cumulative density values corresponding to positions from..to (by step)
+# output - if return.x=T a data structure with $x (first and last position), $y (density) and $step
+# optional match.wt.f is a function that will return weights for a tag vector
+densum <- function(vin,bw=5,dw=3,match.wt.f=NULL,return.x=T,from=min(vin),to=max(vin),step=1) {
+  # construct vector of unique tags and their counts
+  tc <- table(vin[vin>=from & vin<=to]);
+  pos <- as.numeric(names(tc)); storage.mode(pos) <- "double";
+  tc <- as.numeric(tc); storage.mode(tc) <- "double";
+  n <- length(pos)
+  # weight counts
+  if(!is.null(match.wt.f)) {
+    tc <- tc*match.wt.f(pos);
+  }
+
+  rng <- c(from,to);
+  if(rng[1]<0) { stop("range extends into negative values") }
+  if(range(pos)[1]<0) { stop("position vector contains negative values") }
+
+  storage.mode(n) <- storage.mode(rng) <- storage.mode(bw) <- storage.mode(dw) <- storage.mode(step) <- "integer";
+
+  spos <- rng[1]; storage.mode(spos) <- "double";
+
+  dlength <- floor((rng[2] - rng[1])/step) + 1; # length of output array
+  if(dlength<1) { stop("zero data range") }
+  dout <- numeric(dlength); storage.mode(dout) <- "double";
+  storage.mode(dlength) <- "integer";
+  # take the output array from the value returned by .C(): in-place modification through DUP=F
+  # is not available in current R (the argument is ignored), which left dout filled with zeros
+  dout <- .C("cdensum",n,pos,tc,spos,bw,dw,dlength,step,dout=dout)$dout;
+
+  if(return.x) {
+    return(list(x=c(rng[1],rng[1]+step*(dlength-1)),y=dout,step=step))
+  } else {
+    return(dout)
+  }
+}
+
+# count tags within sliding window of a specified size
+# vin - tag vector (positive values, pre-shifted)
+# window.size/window.step - window characteristics
+# from/to - first and last window position (by default half a window inside the tag range)
+# tv - optional, pre-sorted, pre-trimmed tag vector
+# returns the counts produced by the native "cwindow_n_tags" routine; with return.x=T they are wrapped
+# in a list with $x (first and last window position), $y (counts) and $step
+window.tag.count <- function(vin,window.size,window.step=1,return.x=T,from=min(vin)+floor(window.size/2),to=max(vin)-floor(window.size/2),tv=NULL) {
+  half.window <- floor(window.size/2);
+  # select tags with margins
+  if(is.null(tv)) {
+    tv <- sort(vin[vin>=from-half.window-1 & vin<=to+half.window+1])
+  }
+  storage.mode(tv) <- "double";
+
+  nsteps <- ceiling((to-from)/window.step);
+
+  # the native routine expects integer window parameters
+  storage.mode(nsteps) <- "integer";
+  storage.mode(window.size) <- "integer";
+  storage.mode(window.step) <- "integer";
+
+  spos <- from; storage.mode(spos) <- "double";
+
+  if(nsteps<1) { stop("zero data range") }
+  dout <- .Call("cwindow_n_tags",tv,spos,window.size,window.step,nsteps);
+
+  if(!return.x) {
+    return(dout)
+  }
+  return(list(x=c(from,from+(nsteps-1)*window.step),y=dout,step=window.step))
+}
+
+# count tags in windows around specified positions (pos)
+# vin - tag vector; tc - optional precomputed table(vin)
+# window.size - full window width (half of it, rounded down, is passed to the native "cwindow_n_tags_around" routine)
+# sorted - set to T if pos is already sorted; otherwise it is sorted here and the result is
+#          returned in the original order of pos
+# returns the counts, or a data frame with $x (positions) and $y (counts) if return.x=T
+window.tag.count.around <- function(vin,window.size,pos,return.x=T,tc=NULL,sorted=F) {
+  if(is.null(tc)) {
+    tc <- table(vin);
+  }
+  if(!sorted) {
+    op <- rank(pos);
+    pos <- sort(pos);
+  }
+  storage.mode(pos) <- "double";
+
+  # unique tag positions and their multiplicities
+  tpos <- as.integer(names(tc)); storage.mode(tpos) <- "double";
+  tc <- as.integer(tc); storage.mode(tc) <- "integer";
+
+  whs <- floor(window.size/2);
+  storage.mode(whs) <- "integer";
+
+  twc <- .Call("cwindow_n_tags_around",tpos,tc,pos,whs);
+
+  # restore the caller's ordering
+  if(!sorted) {
+    twc <- twc[op];
+    pos <- pos[op];
+  }
+
+  if(return.x) {
+    return(data.frame(x=pos,y=twc));
+  }
+  return(twc);
+}
+
+# given a tag vector (signed), identify and clean up (either remove or cap) singular positions that exceed local tag density
+# tags - tag vector
+# window.size - size of the window used to estimate the local tag density around each position
+# cap.fold - maximal fold enrichment over local density allowed for a single tag position, at which the tag count is capped
+# eliminate.fold - max fold enrichment that, when exceeded, results in exclusion of all the tags at that position (e.g. counted as anomaly)
+# z.threshold - Z-score used to determine max allowed counts
+# returns the cleaned tag vector ordered by absolute position
+filter.singular.positions.by.local.density <- function(tags,window.size=200,cap.fold=4,eliminate.fold=10,z.threshold=3) {
+  if(length(tags)<2) { return(tags); };
+
+  # tabulate tag positions
+  tag.table <- table(tags);
+  pos <- as.numeric(names(tag.table)); storage.mode(pos) <- "double";
+  tc <- as.integer(tag.table); storage.mode(tc) <- "integer";
+
+  whs <- floor(window.size/2);
+  storage.mode(whs) <- "integer";
+
+  # tags in the window around every position, excluding all but one tag of the position itself
+  twc <- .Call("cwindow_n_tags_around",pos,tc,pos,whs);
+  local.density <- (twc-tc+1)/window.size;
+
+  tail.p <- pnorm(z.threshold,lower.tail=F)
+
+  # exclude positions exceeding the elimination limit
+  elimination.limit <- qpois(tail.p,local.density*eliminate.fold,lower.tail=F)
+  tc[tc>elimination.limit] <- 0;
+
+  # cap positions exceeding the capping limit
+  cap.limit <- qpois(tail.p,local.density*cap.fold,lower.tail=F)
+  over.cap <- which(tc>cap.limit);
+  tc[over.cap] <- cap.limit[over.cap]+1;
+
+  # reconstruct tag vector
+  tv <- rep(pos,tc);
+  tv <- tv[order(abs(tv))];
+  return(tv);
+}
+
+
+
+# calculates enrichment bounds using multiple background scales
+# ft - foreground tags (pre-shifted, positive)
+# bt - background tags
+# fws - foreground window size
+# bwsl - background window size list
+# step - window step
+# rng - from/to coordinates (to will be adjusted according to step)
+# alpha - two-sided significance level of the bounds
+# bg.weight - foreground/background dataset size ratio used to normalize the enrichment
+# use.most.informative.scale - use a single background scale per position (the one with the highest
+#   window-size-normalized count for the lower bound and MLE, the lowest for the upper bound)
+#   instead of taking the most conservative bound across all scales
+# quick.calculation - use a closed-form approximation instead of F-distribution quantiles
+# pos - optional positions; if given, counts are taken in windows around these positions
+#   rather than in sliding windows along rng
+#
+# returns: a list with $x ($s $e $step), $lb vector and $mle vector ($ub if calculate.upper.bound=T)
+mbs.enrichment.bounds <- function(ft,bt,fws,bwsl,step=1,rng=NULL,alpha=0.05,calculate.upper.bound=F,bg.weight=length(ft)/length(bt),use.most.informative.scale=F,quick.calculation=F,pos=NULL) {
+  # determine range
+  if(is.null(rng)) {
+    rng <- range(range(ft));
+  }
+  # foreground counts
+  if(is.null(pos)) {
+    fwc <- window.tag.count(ft,fws,window.step=step,from=rng[1],to=rng[2],return.x=T);
+  } else {
+    fwc <- window.tag.count.around(ft,fws,pos,return.x=T)
+  }
+  fwc$y <- fwc$y+0.5;
+
+  zal <- qnorm(alpha/2,lower.tail=F);
+
+  # background counts
+  bt <- sort(bt);
+  if(!is.null(pos)) {
+    tc <- table(bt);
+  }
+  bgcm <- lapply(bwsl,function(bgws) {
+    if(is.null(pos)) {
+      window.tag.count(bt,bgws,window.step=step,from=rng[1],to=rng[2],return.x=F,tv=bt)+0.5;
+    } else {
+      window.tag.count.around(bt,bgws,pos,return.x=F,tc=tc)+0.5
+    }
+  })
+  if(!is.null(pos)) {
+    rm(tc);
+  }
+
+  # lower (upper=F) or upper (upper=T) enrichment bound for foreground counts nfg and background
+  # counts nbg obtained with background window size(s) bws; all intermediate quantities are derived
+  # from the nbg passed in, so a bound never mixes values belonging to different background scales
+  enrichment.bound <- function(nfg,nbg,bws,upper) {
+    if(quick.calculation) {
+      sgn <- if(upper) 1 else -1;
+      rte <- nfg+nbg-0.25*zal*zal; rte[rte<0] <- 0;
+      dn <- nbg - 0.25*zal*zal;
+      b <- (sqrt(nfg*nbg) + sgn*0.5*zal*sqrt(rte))/dn;
+      ivi <- which(b<0);
+      b <- b*b*bws/fws/bg.weight;
+      # positions where the approximation is not applicable get a neutral enrichment of 1
+      b[rte<=0] <- 1;
+      b[dn<=0] <- 1;
+      b[ivi] <- 1;
+      return(b);
+    }
+    p <- if(upper) alpha/2 else 1-alpha/2;
+    return((nfg/nbg)*qf(p,2*nfg,2*nbg,lower.tail=F)*bws/fws/bg.weight);
+  }
+
+  # pick most informative scale
+  if(use.most.informative.scale) {
+    # rows - scales, columns - positions
+    bgcm <- t(do.call(cbind,bgcm))
+    isi <- max.col(t((bgcm)/(bwsl/fws))) # scale with the highest normalized background count
+
+    bgc <- c(bgcm)[isi+dim(bgcm)[1]*(c(1:length(isi))-1)]
+
+    lbm <- enrichment.bound(fwc$y,bgc,bwsl[isi],upper=F);
+
+    mle <- fwc$y/bgc*bwsl[isi]/fws/bg.weight; mle[is.nan(mle)] <- Inf; mle[is.na(mle)] <- Inf;
+
+    rl <- list(x=list(s=fwc$x[1],e=fwc$x[2],step=fwc$step),lb=lbm,mle=mle);
+
+    if(calculate.upper.bound) {
+      isi <- max.col(t((-bgcm)/(bwsl/fws))) # scale with the lowest normalized background count
+      bgc <- c(bgcm)[isi+dim(bgcm)[1]*(c(1:length(isi))-1)]
+
+      ubm <- enrichment.bound(fwc$y,bgc,bwsl[isi],upper=T);
+      rl <- c(rl,list(ub=ubm));
+    }
+    return(rl);
+
+  } else {
+    # determine lower bounds: most conservative (smallest) across scales
+    lbm <- lapply(c(1:length(bgcm)),function(i) {
+      enrichment.bound(fwc$y,bgcm[[i]],bwsl[i],upper=F)
+    })
+    lbm <- do.call(pmin,lbm);
+
+    # calculate mle
+    mle <- do.call(pmin,lapply(c(1:length(bgcm)),function(i) {
+      bgc <- bgcm[[i]];
+      x <- fwc$y/bgc*bwsl[i]/fws/bg.weight; x[is.nan(x)] <- Inf; x[is.na(x)] <- Inf; return(x);
+    }))
+
+    rl <- list(x=list(s=fwc$x[1],e=fwc$x[2],step=fwc$step),lb=lbm,mle=mle);
+
+    if(calculate.upper.bound) {
+      # determine upper bound: largest across scales
+      ubm <- lapply(c(1:length(bgcm)),function(i) {
+        enrichment.bound(fwc$y,bgcm[[i]],bwsl[i],upper=T)
+      })
+      ubm <- do.call(pmax,ubm);
+      rl <- c(rl,list(ub=ubm));
+    }
+
+    return(rl);
+  }
+}
+
+# Write probe values as fixed-width segments into a wiggle/BED-style text file.
+#   chr, pos, val - chromosome name(s), segment start positions and segment values
+#   fname         - output file
+#   append        - append to fname instead of overwriting it
+#   feature       - text placed in the "description" field of the track line
+#   probe.length  - segment width; reduced (with a console message) when it is not
+#                   smaller than the smallest gap between consecutive positions
+#   header        - whether a "track" line is written ahead of the data rows
+write.probe.wig <- function(chr,pos,val,fname,append=F,feature="M",probe.length=35,header=T) {
+  smallest.gap <- min(diff(pos));
+  if(smallest.gap<=probe.length) {
+    probe.length <- smallest.gap-1;
+    cat("warning: adjusted down wig segment length to",probe.length,"\n");
+  }
+  segments <- data.frame(chr,as.integer(pos),as.integer(pos+probe.length),val)
+
+  # once a track line has been written, the data rows always go after it
+  append.rows <- append;
+  if(header) {
+    track.line <- paste("track type=wiggle_0 name=\"Bed Format\" description=\"",feature,"\" visibility=dense color=200,100,0 altColor=0,100,200 priority=20",sep="");
+    write(track.line,file=fname,append=append);
+    append.rows <- T;
+  }
+  write.table(segments,file=fname,col.names=F,row.names=F,quote=F,sep=" ",append=append.rows);
+}
+
+# Returns the intersection (or, with do.union=T, the union) of multiple region sets.
+# rsl is a list of region sets; each needs $s (starts), $e (ends) and may carry a $v column.
+# Every set is flattened into position-sorted boundaries flagged +1 (start) / -1 (end);
+# the flags are multiplied by the index of the set within rsl, the boundaries of all
+# sets are pooled and re-sorted by position, and the result is handed to the compiled
+# "region_intersection" routine. max.val is forced to 0 when the pooled table has no $v.
+# The list returned by the routine is column-bound into a data frame.
+regionset.intersection.c <- function(rsl,max.val=-1,do.union=F) {
+  # translate one region set into position/flag form
+  as.boundaries <- function(rs) {
+    pos <- c(rs$s,rs$e);
+    flag <- c(rep(c(1,-1),each=length(rs$s)));
+    idx <- order(pos);
+    pos <- pos[idx]; flag <- flag[idx];
+    if(is.null(rs$v)) {
+      return(data.frame(p=as.numeric(pos),f=as.integer(flag)));
+    }
+    val <- c(rs$v,rs$v)[idx];
+    return(data.frame(p=as.numeric(pos),f=as.integer(flag),v=as.numeric(val)));
+  }
+  rfl <- lapply(rsl,as.boundaries);
+
+  # tag each boundary with the index of the set it came from
+  tagged <- lapply(1:length(rfl),function(k) {
+    bd <- rfl[[k]]; bd$f <- bd$f*k; return(bd);
+  });
+  rfd <- data.frame(do.call(rbind,tagged));
+  rfd <- rfd[order(rfd$p),];
+
+  if(is.null(rfd$v)) { max.val <- 0; }
+  ur <- if(do.union) 1 else 0;
+  res <- .Call("region_intersection",as.integer(length(rfl)),as.numeric(rfd$p),as.integer(rfd$f),as.numeric(rfd$v),as.integer(max.val),as.integer(ur));
+  return(data.frame(do.call(cbind,res)));
+}
+
+
+# Identify whether each binding peak falls within a larger region of significant tag
+# enrichment and, if so, record the boundaries of that region.
+#   chip.tags, input.tags - tag data handed to find.significantly.enriched.regions()
+#   bp                    - binding results; bp$npl holds one peak table per chromosome
+#   window.size, z.thr    - forwarded to find.significantly.enriched.regions()
+# Returns bp in which every non-empty per-chromosome table gained the columns $rs and $re
+# (range of the enclosing enriched regions, NA when the peak lies in none of them).
+add.broad.peak.regions <- function(chip.tags,input.tags,bp,window.size=500,z.thr=2) {
+  se <- find.significantly.enriched.regions(chip.tags,input.tags,window.size=window.size,z.thr=z.thr,poisson.z=0,poisson.ratio=0,either=F)
+  chrl <- names(bp$npl); names(chrl) <- chrl;
+  bnpl <- lapply(chrl,function(chr) {
+    npl <- bp$npl[[chr]];
+    # "||" short-circuits; with the vectorised "|" a NULL entry produced a
+    # zero-length condition and if() stopped with an error
+    if(is.null(npl) || dim(npl)[1]<1) {
+      return(npl);
+    }
+    # one element per peak; each element is used below as an index into se[[chr]]
+    # (named "hits" so the base constant "pi" is not masked)
+    hits <- points.within(npl$x,se[[chr]]$s,se[[chr]]$e,return.list=T);
+
+    pm <- do.call(rbind,lapply(hits,function(rl) {
+      if(length(rl)>0) {
+        return(range(c(se[[chr]]$s[rl],se[[chr]]$e[rl])))
+      } else {
+        return(c(NA,NA));
+      }
+    }))
+
+    npl$rs <- pm[,1];
+    npl$re <- pm[,2];
+    return(npl);
+  })
+  bp$npl <- bnpl;
+  return(bp);
+}
+
+# Write binding results in narrowPeak format, incorporating broad region boundaries
+# ($rs/$re) when they are present. When broad region info is missing, margin determines
+# the region width; it defaults to the window half-size (bd$whs) used to call the binding
+# peaks, or to 50 when that is not available.
+#   bd     - binding results; bd$npl holds one peak table per chromosome
+#            (columns $x, $y, $fdr and optionally $rs/$re)
+#   fname  - output file (overwritten)
+#   margin - half-width applied around $x for peaks lacking $rs/$re
+#   npeaks - when not NA, only the npeaks highest-scoring peaks are written
+# Rows are ordered by decreasing $y; column 9 holds -log10(fdr). Chromosomes without
+# peaks are skipped, and an empty file is written when there are no peaks at all.
+write.narrowpeak.binding <- function(bd,fname,margin=bd$whs,npeaks=NA) { # Anshul: added npeaks option
+  if(is.null(margin)) { margin <- 50; }
+  chrl <- names(bd$npl); names(chrl) <- chrl;
+  # skip chromosomes without peaks: cbind() would yield a row with fewer columns for
+  # them, which breaks the rbind() below (same filter as in write.broadpeak.info)
+  chrl <- chrl[unlist(lapply(bd$npl,function(d) length(d$x)))>0]
+  md <- do.call(rbind,lapply(chrl,function(chr) {
+    df <- bd$npl[[chr]];
+    x <- df$x;
+    rs <- df$rs; if(is.null(rs)) { rs <- rep(NA,length(x)) }
+    re <- df$re; if(is.null(re)) { re <- rep(NA,length(x)) }
+    #ivi <- which(is.na(rs)); if(any(ivi)) {rs[ivi] <- x[ivi]-margin;}
+    ivi <- which(is.na(rs)); if(length(ivi)>0) {rs[ivi] <- pmax(0,x[ivi]-margin);} # Anshul: added the pmax (0, ...) to avoid negative peak starts
+    ivi <- which(is.na(re)); if(length(ivi)>0) {re[ivi] <- x[ivi]+margin;}
+    #cbind(chr,rs,re,".","0",".",df$y,-1,format(df$fdr,scientific=T,digits=3),x-rs)
+    cbind(chr,rs,re,".","0",".",df$y,-1,-log10(df$fdr),x-rs) # Anshul: converted fdr to -log10
+  }))
+  if(is.null(md)) {
+    # nothing to report: emit an empty file instead of failing on md[,7]
+    cat("",file=fname);
+    return(invisible(NULL));
+  }
+  # drop=FALSE keeps md a matrix when it has a single row; otherwise it collapses to a
+  # vector and write.table() would emit one field per line
+  md <- md[order(as.numeric(md[,7]),decreasing=T),,drop=FALSE]
+  if (!is.na(npeaks)) { # Anshul: added this option to print a limited number of peaks
+    npeaks <- min(nrow(md),npeaks)
+    # seq_len() selects no rows for npeaks=0 (1:0 would select row 1)
+    md <- md[seq_len(npeaks),,drop=FALSE]
+  }
+  write.table(md,file=fname,col.names=F,row.names=F,quote=F,sep="\t",append=F);
+}
+
+
+# Find broad clusters of significant signal enrichment over control.
+#   signal.data, control.data  - per-chromosome tag position vectors; positions are
+#                                shifted by tag.shift and their sign dropped with abs()
+#                                before being counted within each region
+#   window.size, z.thr, tag.shift, ... - passed on to find.significantly.enriched.regions()
+#   background.density.scaling - passed on to dataset.density.ratio() to get bg.weight
+# Returns a per-chromosome list of regions. For chromosomes with at least one region the
+# regions are passed through regionset.intersection.c(..., do.union=T) and a column $rv
+# = log2((signal count+1)/(control count+1)/bg.weight) is added.
+get.broad.enrichment.clusters <- function(signal.data,control.data,window.size=1e3,z.thr=3, tag.shift=146/2,background.density.scaling=F, ... ) {
+  # find significantly enriched clusters
+  bg.weight <- dataset.density.ratio(signal.data,control.data,background.density.scaling=background.density.scaling);
+  se <- find.significantly.enriched.regions(signal.data,control.data,window.size=window.size,z.thr=z.thr,tag.shift=tag.shift, bg.weight=bg.weight, ...)
+  chrl <- names(se); names(chrl) <- chrl;
+  se <- lapply(chrl,function(chr) {
+    d <- se[[chr]];
+    # was "length(d$s>1)" (misplaced parenthesis), which is simply length(d$s);
+    # the test that was actually performed is "any regions present"
+    if(length(d$s)>0) {
+      d <- regionset.intersection.c(list(d,d),do.union=T);
+      sc <- points.within(abs(signal.data[[chr]]+tag.shift),d$s,d$e,return.point.counts=T);
+      cc <- points.within(abs(control.data[[chr]]+tag.shift),d$s,d$e,return.point.counts=T);
+      d$rv <- log2((sc+1)/(cc+1)/bg.weight);
+      return(d);
+    } else {
+      return(d)
+    }
+  })
+  # return the result explicitly rather than as the invisible value of the assignment
+  return(se);
+}
+
+# Write broad enrichment regions to a tab-separated, 9-column broadPeak-style file.
+#   bp    - per-chromosome list of region tables with columns $s (start), $e (end)
+#           and $rv (written to column 7 and used as the sort key)
+#   fname - output file (overwritten)
+# Chromosomes without regions are skipped; rows are ordered by decreasing $rv.
+# When no chromosome has any region an empty file is written.
+write.broadpeak.info <- function(bp,fname) {
+  chrl <- names(bp); names(chrl) <- chrl;
+  chrl <- chrl[unlist(lapply(bp,function(d) length(d$s)))>0]
+  md <- do.call(rbind,lapply(chrl,function(chr) {
+    df <- bp[[chr]];
+    cbind(chr,df$s,df$e,".","0",".",df$rv,-1,-1)
+  }))
+  if(is.null(md)) {
+    # nothing to report: emit an empty file instead of failing on md[,7]
+    cat("",file=fname);
+    return(invisible(NULL));
+  }
+  # drop=FALSE keeps md a matrix when it has a single row; otherwise it collapses to a
+  # vector and write.table() would emit one field per line
+  md <- md[order(as.numeric(md[,7]),decreasing=T),,drop=FALSE]
+  write.table(md,file=fname,col.names=F,row.names=F,quote=F,sep="\t",append=F);
+}
+
+
+# Locate runs of identical consecutive values in x.
+# Only runs that are at least CL elements long and whose value is non-zero are kept.
+# Returns list(size, begin, end): the run lengths and the first/last index of each run in x.
+get.clusters2 <- function(x,CL) {
+  # positions after which the value changes
+  breaks <- which(diff(x) != 0)
+  run.start <- c(1, breaks + 1)
+  run.end <- c(breaks, length(x))
+  run.len <- run.end - run.start + 1
+
+  # first discard the runs that are too short ...
+  long.enough <- run.len >= CL
+  run.start <- run.start[long.enough]
+  run.end <- run.end[long.enough]
+  run.len <- run.len[long.enough]
+
+  # ... then the runs made of zeros
+  nonzero <- x[run.end] != 0
+  return (list(size=run.len[nonzero],begin=run.start[nonzero],end=run.end[nonzero]))
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/configure b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/configure
new file mode 100755
index 0000000..1cef55c
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/configure
@@ -0,0 +1,3856 @@
+#! /bin/sh
+# Guess values for system-dependent variables and create Makefiles.
+# Generated by GNU Autoconf 2.63 for SPP 1.7.
+#
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+# 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+## --------------------- ##
+## M4sh Initialization. ##
+## --------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in
+ *posix*) set -o posix ;;
+esac
+
+fi
+
+
+
+
+# PATH needs CR
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='printf %s\n'
+ as_echo_n='printf %s'
+else
+ if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+ as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+ as_echo_n='/usr/ucb/echo -n'
+ else
+ as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+ as_echo_n_body='eval
+ arg=$1;
+ case $arg in
+ *"$as_nl"*)
+ expr "X$arg" : "X\\(.*\\)$as_nl";
+ arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+ esac;
+ expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+ '
+ export as_echo_n_body
+ as_echo_n='sh -c $as_echo_n_body as_echo'
+ fi
+ export as_echo_body
+ as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ PATH_SEPARATOR=:
+ (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+ (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+ PATH_SEPARATOR=';'
+ }
+fi
+
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+ as_unset=unset
+else
+ as_unset=false
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order. Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" "" $as_nl"
+
+# Find who we are. Look in the path if we contain no directory separator.
+case $0 in
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
+IFS=$as_save_IFS
+
+ ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+ as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+ $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+ { (exit 1); exit 1; }
+fi
+
+# Work around bugs in pre-3.0 UWIN ksh.
+for as_var in ENV MAIL MAILPATH
+do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# Required to use basename.
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+
+# Name of the executable.
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+
+# CDPATH.
+$as_unset CDPATH
+
+
+if test "x$CONFIG_SHELL" = x; then
+ if (eval ":") 2>/dev/null; then
+ as_have_required=yes
+else
+ as_have_required=no
+fi
+
+ if test $as_have_required = yes && (eval ":
+(as_func_return () {
+ (exit \$1)
+}
+as_func_success () {
+ as_func_return 0
+}
+as_func_failure () {
+ as_func_return 1
+}
+as_func_ret_success () {
+ return 0
+}
+as_func_ret_failure () {
+ return 1
+}
+
+exitcode=0
+if as_func_success; then
+ :
+else
+ exitcode=1
+ echo as_func_success failed.
+fi
+
+if as_func_failure; then
+ exitcode=1
+ echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+ :
+else
+ exitcode=1
+ echo as_func_ret_success failed.
+fi
+
+if as_func_ret_failure; then
+ exitcode=1
+ echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = \"\$1\" ); then
+ :
+else
+ exitcode=1
+ echo positional parameters were not saved.
+fi
+
+test \$exitcode = 0) || { (exit 1); exit 1; }
+
+(
+ as_lineno_1=\$LINENO
+ as_lineno_2=\$LINENO
+ test \"x\$as_lineno_1\" != \"x\$as_lineno_2\" &&
+ test \"x\`expr \$as_lineno_1 + 1\`\" = \"x\$as_lineno_2\") || { (exit 1); exit 1; }
+") 2> /dev/null; then
+ :
+else
+ as_candidate_shells=
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ case $as_dir in
+ /*)
+ for as_base in sh bash ksh sh5; do
+ as_candidate_shells="$as_candidate_shells $as_dir/$as_base"
+ done;;
+ esac
+done
+IFS=$as_save_IFS
+
+
+ for as_shell in $as_candidate_shells $SHELL; do
+ # Try only shells that exist, to save several forks.
+ if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
+ { ("$as_shell") 2> /dev/null <<\_ASEOF
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in
+ *posix*) set -o posix ;;
+esac
+
+fi
+
+
+:
+_ASEOF
+}; then
+ CONFIG_SHELL=$as_shell
+ as_have_required=yes
+ if { "$as_shell" 2> /dev/null <<\_ASEOF
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in
+ *posix*) set -o posix ;;
+esac
+
+fi
+
+
+:
+(as_func_return () {
+ (exit $1)
+}
+as_func_success () {
+ as_func_return 0
+}
+as_func_failure () {
+ as_func_return 1
+}
+as_func_ret_success () {
+ return 0
+}
+as_func_ret_failure () {
+ return 1
+}
+
+exitcode=0
+if as_func_success; then
+ :
+else
+ exitcode=1
+ echo as_func_success failed.
+fi
+
+if as_func_failure; then
+ exitcode=1
+ echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+ :
+else
+ exitcode=1
+ echo as_func_ret_success failed.
+fi
+
+if as_func_ret_failure; then
+ exitcode=1
+ echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = "$1" ); then
+ :
+else
+ exitcode=1
+ echo positional parameters were not saved.
+fi
+
+test $exitcode = 0) || { (exit 1); exit 1; }
+
+(
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2") || { (exit 1); exit 1; }
+
+_ASEOF
+}; then
+ break
+fi
+
+fi
+
+ done
+
+ if test "x$CONFIG_SHELL" != x; then
+ for as_var in BASH_ENV ENV
+ do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+ done
+ export CONFIG_SHELL
+ exec "$CONFIG_SHELL" "$as_myself" ${1+"$@"}
+fi
+
+
+ if test $as_have_required = no; then
+ echo This script requires a shell more modern than all the
+ echo shells that I found on your system. Please install a
+ echo modern shell, or manually run the script under such a
+ echo shell if you do have one.
+ { (exit 1); exit 1; }
+fi
+
+
+fi
+
+fi
+
+
+
+(eval "as_func_return () {
+ (exit \$1)
+}
+as_func_success () {
+ as_func_return 0
+}
+as_func_failure () {
+ as_func_return 1
+}
+as_func_ret_success () {
+ return 0
+}
+as_func_ret_failure () {
+ return 1
+}
+
+exitcode=0
+if as_func_success; then
+ :
+else
+ exitcode=1
+ echo as_func_success failed.
+fi
+
+if as_func_failure; then
+ exitcode=1
+ echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+ :
+else
+ exitcode=1
+ echo as_func_ret_success failed.
+fi
+
+if as_func_ret_failure; then
+ exitcode=1
+ echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = \"\$1\" ); then
+ :
+else
+ exitcode=1
+ echo positional parameters were not saved.
+fi
+
+test \$exitcode = 0") || {
+ echo No shell found that supports shell functions.
+ echo Please tell bug-autoconf@gnu.org about your system,
+ echo including any error possibly output before this message.
+ echo This can help us improve future autoconf versions.
+ echo Configuration will now proceed without shell functions.
+}
+
+
+
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || {
+
+ # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+ # uniformly replaced by the line number. The first 'sed' inserts a
+ # line-number line after each line using $LINENO; the second 'sed'
+ # does the real work. The second script uses 'N' to pair each
+ # line-number line with the line containing $LINENO, and appends
+ # trailing '-' during substitution so that $LINENO is not a special
+ # case at line end.
+ # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+ # scripts with optimization help from Paolo Bonzini. Blame Lee
+ # E. McMahon (1931-1989) for sed's syntax. :-)
+ sed -n '
+ p
+ /[$]LINENO/=
+ ' <$as_myself |
+ sed '
+ s/[$]LINENO.*/&-/
+ t lineno
+ b
+ :lineno
+ N
+ :loop
+ s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+ t loop
+ s/-\n.*//
+ ' >$as_me.lineno &&
+ chmod +x "$as_me.lineno" ||
+ { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2
+ { (exit 1); exit 1; }; }
+
+ # Don't try to exec as it changes $[0], causing all sort of problems
+ # (the dirname of $[0] is not the place where we might find the
+ # original and so on. Autoconf is especially sensitive to this).
+ . "./$as_me.lineno"
+ # Exit status is that of the last command.
+ exit
+}
+
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+ as_dirname=dirname
+else
+ as_dirname=false
+fi
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in
+-n*)
+ case `echo 'x\c'` in
+ *c*) ECHO_T=' ';; # ECHO_T is single tab character.
+ *) ECHO_C='\c';;
+ esac;;
+*)
+ ECHO_N='-n';;
+esac
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+ rm -f conf$$.dir/conf$$.file
+else
+ rm -f conf$$.dir
+ mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+ if ln -s conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s='ln -s'
+ # ... but there are two gotchas:
+ # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+ # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+ # In both cases, we have to default to `cp -p'.
+ ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+ as_ln_s='cp -p'
+ elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+ else
+ as_ln_s='cp -p'
+ fi
+else
+ as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p=:
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+if test -x / >/dev/null 2>&1; then
+ as_test_x='test -x'
+else
+ if ls -dL / >/dev/null 2>&1; then
+ as_ls_L_option=L
+ else
+ as_ls_L_option=
+ fi
+ as_test_x='
+ eval sh -c '\''
+ if test -d "$1"; then
+ test -d "$1/.";
+ else
+ case $1 in
+ -*)set "./$1";;
+ esac;
+ case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in
+ ???[sx]*):;;*)false;;esac;fi
+ '\'' sh
+ '
+fi
+as_executable_p=$as_test_x
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+
+exec 7<&0 </dev/null 6>&1
+
+# Name of the host.
+# hostname on some systems (SVR3.2, Linux) returns a bogus exit status,
+# so uname gets run too.
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_clean_files=
+ac_config_libobj_dir=.
+LIBOBJS=
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+SHELL=${CONFIG_SHELL-/bin/sh}
+
+# Identity of this package.
+PACKAGE_NAME='SPP'
+PACKAGE_TARNAME='spp'
+PACKAGE_VERSION='1.7'
+PACKAGE_STRING='SPP 1.7'
+PACKAGE_BUGREPORT=''
+
+ac_subst_vars='LTLIBOBJS
+LIBOBJS
+HAVE_LIBBZ2
+OBJEXT
+EXEEXT
+ac_ct_CC
+CPPFLAGS
+LDFLAGS
+CFLAGS
+CC
+target_alias
+host_alias
+build_alias
+LIBS
+ECHO_T
+ECHO_N
+ECHO_C
+DEFS
+mandir
+localedir
+libdir
+psdir
+pdfdir
+dvidir
+htmldir
+infodir
+docdir
+oldincludedir
+includedir
+localstatedir
+sharedstatedir
+sysconfdir
+datadir
+datarootdir
+libexecdir
+sbindir
+bindir
+program_transform_name
+prefix
+exec_prefix
+PACKAGE_BUGREPORT
+PACKAGE_STRING
+PACKAGE_VERSION
+PACKAGE_TARNAME
+PACKAGE_NAME
+PATH_SEPARATOR
+SHELL'
+ac_subst_files=''
+ac_user_opts='
+enable_option_checking
+'
+ ac_precious_vars='build_alias
+host_alias
+target_alias
+CC
+CFLAGS
+LDFLAGS
+LIBS
+CPPFLAGS'
+
+
+# Initialize some variables set by options.
+ac_init_help=
+ac_init_version=false
+ac_unrecognized_opts=
+ac_unrecognized_sep=
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+cache_file=/dev/null
+exec_prefix=NONE
+no_create=
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+verbose=
+x_includes=NONE
+x_libraries=NONE
+
+# Installation directory options.
+# These are left unexpanded so users can "make install exec_prefix=/foo"
+# and all the variables that are supposed to be based on exec_prefix
+# by default will actually change.
+# Use braces instead of parens because sh, perl, etc. also accept them.
+# (The list follows the same order as the GNU Coding Standards.)
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datarootdir='${prefix}/share'
+datadir='${datarootdir}'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
+infodir='${datarootdir}/info'
+htmldir='${docdir}'
+dvidir='${docdir}'
+pdfdir='${docdir}'
+psdir='${docdir}'
+libdir='${exec_prefix}/lib'
+localedir='${datarootdir}/locale'
+mandir='${datarootdir}/man'
+
+ac_prev=
+ac_dashdash=
+for ac_option
+do
+ # If the previous option needs an argument, assign it.
+ if test -n "$ac_prev"; then
+ eval $ac_prev=\$ac_option
+ ac_prev=
+ continue
+ fi
+
+ case $ac_option in
+ *=*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
+ *) ac_optarg=yes ;;
+ esac
+
+ # Accept the important Cygnus configure options, so we can diagnose typos.
+
+ case $ac_dashdash$ac_option in
+ --)
+ ac_dashdash=yes ;;
+
+ -bindir | --bindir | --bindi | --bind | --bin | --bi)
+ ac_prev=bindir ;;
+ -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+ bindir=$ac_optarg ;;
+
+ -build | --build | --buil | --bui | --bu)
+ ac_prev=build_alias ;;
+ -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+ build_alias=$ac_optarg ;;
+
+ -cache-file | --cache-file | --cache-fil | --cache-fi \
+ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+ ac_prev=cache_file ;;
+ -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+ cache_file=$ac_optarg ;;
+
+ --config-cache | -C)
+ cache_file=config.cache ;;
+
+ -datadir | --datadir | --datadi | --datad)
+ ac_prev=datadir ;;
+ -datadir=* | --datadir=* | --datadi=* | --datad=*)
+ datadir=$ac_optarg ;;
+
+ -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \
+ | --dataroo | --dataro | --datar)
+ ac_prev=datarootdir ;;
+ -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \
+ | --dataroot=* | --dataroo=* | --dataro=* | --datar=*)
+ datarootdir=$ac_optarg ;;
+
+ -disable-* | --disable-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2
+ { (exit 1); exit 1; }; }
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"enable_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval enable_$ac_useropt=no ;;
+
+ -docdir | --docdir | --docdi | --doc | --do)
+ ac_prev=docdir ;;
+ -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*)
+ docdir=$ac_optarg ;;
+
+ -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv)
+ ac_prev=dvidir ;;
+ -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*)
+ dvidir=$ac_optarg ;;
+
+ -enable-* | --enable-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2
+ { (exit 1); exit 1; }; }
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"enable_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval enable_$ac_useropt=\$ac_optarg ;;
+
+ -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+ | --exec | --exe | --ex)
+ ac_prev=exec_prefix ;;
+ -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+ | --exec=* | --exe=* | --ex=*)
+ exec_prefix=$ac_optarg ;;
+
+ -gas | --gas | --ga | --g)
+ # Obsolete; use --with-gas.
+ with_gas=yes ;;
+
+ -help | --help | --hel | --he | -h)
+ ac_init_help=long ;;
+ -help=r* | --help=r* | --hel=r* | --he=r* | -hr*)
+ ac_init_help=recursive ;;
+ -help=s* | --help=s* | --hel=s* | --he=s* | -hs*)
+ ac_init_help=short ;;
+
+ -host | --host | --hos | --ho)
+ ac_prev=host_alias ;;
+ -host=* | --host=* | --hos=* | --ho=*)
+ host_alias=$ac_optarg ;;
+
+ -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht)
+ ac_prev=htmldir ;;
+ -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \
+ | --ht=*)
+ htmldir=$ac_optarg ;;
+
+ -includedir | --includedir | --includedi | --included | --include \
+ | --includ | --inclu | --incl | --inc)
+ ac_prev=includedir ;;
+ -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+ | --includ=* | --inclu=* | --incl=* | --inc=*)
+ includedir=$ac_optarg ;;
+
+ -infodir | --infodir | --infodi | --infod | --info | --inf)
+ ac_prev=infodir ;;
+ -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+ infodir=$ac_optarg ;;
+
+ -libdir | --libdir | --libdi | --libd)
+ ac_prev=libdir ;;
+ -libdir=* | --libdir=* | --libdi=* | --libd=*)
+ libdir=$ac_optarg ;;
+
+ -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+ | --libexe | --libex | --libe)
+ ac_prev=libexecdir ;;
+ -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+ | --libexe=* | --libex=* | --libe=*)
+ libexecdir=$ac_optarg ;;
+
+ -localedir | --localedir | --localedi | --localed | --locale)
+ ac_prev=localedir ;;
+ -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*)
+ localedir=$ac_optarg ;;
+
+ -localstatedir | --localstatedir | --localstatedi | --localstated \
+ | --localstate | --localstat | --localsta | --localst | --locals)
+ ac_prev=localstatedir ;;
+ -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+ | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*)
+ localstatedir=$ac_optarg ;;
+
+ -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+ ac_prev=mandir ;;
+ -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+ mandir=$ac_optarg ;;
+
+ -nfp | --nfp | --nf)
+ # Obsolete; use --without-fp.
+ with_fp=no ;;
+
+ -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+ | --no-cr | --no-c | -n)
+ no_create=yes ;;
+
+ -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+ no_recursion=yes ;;
+
+ -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+ | --oldin | --oldi | --old | --ol | --o)
+ ac_prev=oldincludedir ;;
+ -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+ oldincludedir=$ac_optarg ;;
+
+ -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+ ac_prev=prefix ;;
+ -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+ prefix=$ac_optarg ;;
+
+ -program-prefix | --program-prefix | --program-prefi | --program-pref \
+ | --program-pre | --program-pr | --program-p)
+ ac_prev=program_prefix ;;
+ -program-prefix=* | --program-prefix=* | --program-prefi=* \
+ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+ program_prefix=$ac_optarg ;;
+
+ -program-suffix | --program-suffix | --program-suffi | --program-suff \
+ | --program-suf | --program-su | --program-s)
+ ac_prev=program_suffix ;;
+ -program-suffix=* | --program-suffix=* | --program-suffi=* \
+ | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+ program_suffix=$ac_optarg ;;
+
+ -program-transform-name | --program-transform-name \
+ | --program-transform-nam | --program-transform-na \
+ | --program-transform-n | --program-transform- \
+ | --program-transform | --program-transfor \
+ | --program-transfo | --program-transf \
+ | --program-trans | --program-tran \
+ | --progr-tra | --program-tr | --program-t)
+ ac_prev=program_transform_name ;;
+ -program-transform-name=* | --program-transform-name=* \
+ | --program-transform-nam=* | --program-transform-na=* \
+ | --program-transform-n=* | --program-transform-=* \
+ | --program-transform=* | --program-transfor=* \
+ | --program-transfo=* | --program-transf=* \
+ | --program-trans=* | --program-tran=* \
+ | --progr-tra=* | --program-tr=* | --program-t=*)
+ program_transform_name=$ac_optarg ;;
+
+ -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd)
+ ac_prev=pdfdir ;;
+ -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*)
+ pdfdir=$ac_optarg ;;
+
+ -psdir | --psdir | --psdi | --psd | --ps)
+ ac_prev=psdir ;;
+ -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*)
+ psdir=$ac_optarg ;;
+
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ silent=yes ;;
+
+ -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+ ac_prev=sbindir ;;
+ -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+ | --sbi=* | --sb=*)
+ sbindir=$ac_optarg ;;
+
+ -sharedstatedir | --sharedstatedir | --sharedstatedi \
+ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+ | --sharedst | --shareds | --shared | --share | --shar \
+ | --sha | --sh)
+ ac_prev=sharedstatedir ;;
+ -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+ | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+ | --sha=* | --sh=*)
+ sharedstatedir=$ac_optarg ;;
+
+ -site | --site | --sit)
+ ac_prev=site ;;
+ -site=* | --site=* | --sit=*)
+ site=$ac_optarg ;;
+
+ -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+ ac_prev=srcdir ;;
+ -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+ srcdir=$ac_optarg ;;
+
+ -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+ | --syscon | --sysco | --sysc | --sys | --sy)
+ ac_prev=sysconfdir ;;
+ -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+ sysconfdir=$ac_optarg ;;
+
+ -target | --target | --targe | --targ | --tar | --ta | --t)
+ ac_prev=target_alias ;;
+ -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+ target_alias=$ac_optarg ;;
+
+ -v | -verbose | --verbose | --verbos | --verbo | --verb)
+ verbose=yes ;;
+
+ -version | --version | --versio | --versi | --vers | -V)
+ ac_init_version=: ;;
+
+ -with-* | --with-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2
+ { (exit 1); exit 1; }; }
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"with_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval with_$ac_useropt=\$ac_optarg ;;
+
+ -without-* | --without-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2
+ { (exit 1); exit 1; }; }
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"with_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval with_$ac_useropt=no ;;
+
+ --x)
+ # Obsolete; use --with-x.
+ with_x=yes ;;
+
+ -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+ | --x-incl | --x-inc | --x-in | --x-i)
+ ac_prev=x_includes ;;
+ -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+ x_includes=$ac_optarg ;;
+
+ -x-libraries | --x-libraries | --x-librarie | --x-librari \
+ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+ ac_prev=x_libraries ;;
+ -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+ x_libraries=$ac_optarg ;;
+
+ -*) { $as_echo "$as_me: error: unrecognized option: $ac_option
+Try \`$0 --help' for more information." >&2
+ { (exit 1); exit 1; }; }
+ ;;
+
+ *=*)
+ ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid variable name: $ac_envvar" >&2
+ { (exit 1); exit 1; }; }
+ eval $ac_envvar=\$ac_optarg
+ export $ac_envvar ;;
+
+ *)
+ # FIXME: should be removed in autoconf 3.0.
+ $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2
+ expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+ $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2
+ : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}
+ ;;
+
+ esac
+done
+
+if test -n "$ac_prev"; then
+ ac_option=--`echo $ac_prev | sed 's/_/-/g'`
+ { $as_echo "$as_me: error: missing argument to $ac_option" >&2
+ { (exit 1); exit 1; }; }
+fi
+
+if test -n "$ac_unrecognized_opts"; then
+ case $enable_option_checking in
+ no) ;;
+ fatal) { $as_echo "$as_me: error: unrecognized options: $ac_unrecognized_opts" >&2
+ { (exit 1); exit 1; }; } ;;
+ *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
+ esac
+fi
+
+# Check all directory arguments for consistency.
+for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
+ datadir sysconfdir sharedstatedir localstatedir includedir \
+ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
+ libdir localedir mandir
+do
+ eval ac_val=\$$ac_var
+ # Remove trailing slashes.
+ case $ac_val in
+ */ )
+ ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'`
+ eval $ac_var=\$ac_val;;
+ esac
+ # Be sure to have absolute directory names.
+ case $ac_val in
+ [\\/$]* | ?:[\\/]* ) continue;;
+ NONE | '' ) case $ac_var in *prefix ) continue;; esac;;
+ esac
+ { $as_echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2
+ { (exit 1); exit 1; }; }
+done
+
+# There might be people who depend on the old broken behavior: `$host'
+# used to hold the argument of --host etc.
+# FIXME: To remove some day.
+build=$build_alias
+host=$host_alias
+target=$target_alias
+
+# FIXME: To remove some day.
+if test "x$host_alias" != x; then
+ if test "x$build_alias" = x; then
+ cross_compiling=maybe
+ $as_echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host.
+ If a cross compiler is detected then cross compile mode will be used." >&2
+ elif test "x$build_alias" != "x$host_alias"; then
+ cross_compiling=yes
+ fi
+fi
+
+ac_tool_prefix=
+test -n "$host_alias" && ac_tool_prefix=$host_alias-
+
+test "$silent" = yes && exec 6>/dev/null
+
+
+ac_pwd=`pwd` && test -n "$ac_pwd" &&
+ac_ls_di=`ls -di .` &&
+ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` ||
+ { $as_echo "$as_me: error: working directory cannot be determined" >&2
+ { (exit 1); exit 1; }; }
+test "X$ac_ls_di" = "X$ac_pwd_ls_di" ||
+ { $as_echo "$as_me: error: pwd does not report name of working directory" >&2
+ { (exit 1); exit 1; }; }
+
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+ ac_srcdir_defaulted=yes
+ # Try the directory containing this script, then the parent directory.
+ ac_confdir=`$as_dirname -- "$as_myself" ||
+$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_myself" : 'X\(//\)[^/]' \| \
+ X"$as_myself" : 'X\(//\)$' \| \
+ X"$as_myself" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_myself" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ srcdir=$ac_confdir
+ if test ! -r "$srcdir/$ac_unique_file"; then
+ srcdir=..
+ fi
+else
+ ac_srcdir_defaulted=no
+fi
+if test ! -r "$srcdir/$ac_unique_file"; then
+ test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
+ { $as_echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2
+ { (exit 1); exit 1; }; }
+fi
+ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
+ac_abs_confdir=`(
+ cd "$srcdir" && test -r "./$ac_unique_file" || { $as_echo "$as_me: error: $ac_msg" >&2
+ { (exit 1); exit 1; }; }
+ pwd)`
+# When building in place, set srcdir=.
+if test "$ac_abs_confdir" = "$ac_pwd"; then
+ srcdir=.
+fi
+# Remove unnecessary trailing slashes from srcdir.
+# Double slashes in file names in object file debugging info
+# mess up M-x gdb in Emacs.
+case $srcdir in
+*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
+esac
+for ac_var in $ac_precious_vars; do
+ eval ac_env_${ac_var}_set=\${${ac_var}+set}
+ eval ac_env_${ac_var}_value=\$${ac_var}
+ eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
+ eval ac_cv_env_${ac_var}_value=\$${ac_var}
+done
+
+#
+# Report the --help message.
+#
+if test "$ac_init_help" = "long"; then
+ # Omit some internal or obsolete options to make the list less imposing.
+ # This message is too long to be a string in the A/UX 3.1 sh.
+ cat <<_ACEOF
+\`configure' configures SPP 1.7 to adapt to many kinds of systems.
+
+Usage: $0 [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE. See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+ -h, --help display this help and exit
+ --help=short display options specific to this package
+ --help=recursive display the short help of all the included packages
+ -V, --version display version information and exit
+ -q, --quiet, --silent do not print \`checking...' messages
+ --cache-file=FILE cache test results in FILE [disabled]
+ -C, --config-cache alias for \`--cache-file=config.cache'
+ -n, --no-create do not create output files
+ --srcdir=DIR find the sources in DIR [configure dir or \`..']
+
+Installation directories:
+ --prefix=PREFIX install architecture-independent files in PREFIX
+ [$ac_default_prefix]
+ --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX
+ [PREFIX]
+
+By default, \`make install' will install all the files in
+\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify
+an installation prefix other than \`$ac_default_prefix' using \`--prefix',
+for instance \`--prefix=\$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+ --bindir=DIR user executables [EPREFIX/bin]
+ --sbindir=DIR system admin executables [EPREFIX/sbin]
+ --libexecdir=DIR program executables [EPREFIX/libexec]
+ --sysconfdir=DIR read-only single-machine data [PREFIX/etc]
+ --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
+ --localstatedir=DIR modifiable single-machine data [PREFIX/var]
+ --libdir=DIR object code libraries [EPREFIX/lib]
+ --includedir=DIR C header files [PREFIX/include]
+ --oldincludedir=DIR C header files for non-gcc [/usr/include]
+ --datarootdir=DIR read-only arch.-independent data root [PREFIX/share]
+ --datadir=DIR read-only architecture-independent data [DATAROOTDIR]
+ --infodir=DIR info documentation [DATAROOTDIR/info]
+ --localedir=DIR locale-dependent data [DATAROOTDIR/locale]
+ --mandir=DIR man documentation [DATAROOTDIR/man]
+ --docdir=DIR documentation root [DATAROOTDIR/doc/spp]
+ --htmldir=DIR html documentation [DOCDIR]
+ --dvidir=DIR dvi documentation [DOCDIR]
+ --pdfdir=DIR pdf documentation [DOCDIR]
+ --psdir=DIR ps documentation [DOCDIR]
+_ACEOF
+
+ cat <<\_ACEOF
+_ACEOF
+fi
+
+if test -n "$ac_init_help"; then
+ case $ac_init_help in
+ short | recursive ) echo "Configuration of SPP 1.7:";;
+ esac
+ cat <<\_ACEOF
+
+Some influential environment variables:
+ CC C compiler command
+ CFLAGS C compiler flags
+ LDFLAGS linker flags, e.g. -L<lib dir> if you have libraries in a
+ nonstandard directory <lib dir>
+ LIBS libraries to pass to the linker, e.g. -l<library>
+ CPPFLAGS C/C++/Objective C preprocessor flags, e.g. -I<include dir> if
+ you have headers in a nonstandard directory <include dir>
+
+Use these variables to override the choices made by `configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+_ACEOF
+ac_status=$?
+fi
+
+if test "$ac_init_help" = "recursive"; then
+ # If there are subdirs, report their specific --help.
+ for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue
+ test -d "$ac_dir" ||
+ { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } ||
+ continue
+ ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+ ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+ # A ".." for each directory in $ac_dir_suffix.
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+ case $ac_top_builddir_sub in
+ "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+ *) ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+ esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+ .) # We are building in place.
+ ac_srcdir=.
+ ac_top_srcdir=$ac_top_builddir_sub
+ ac_abs_top_srcdir=$ac_pwd ;;
+ [\\/]* | ?:[\\/]* ) # Absolute name.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir
+ ac_abs_top_srcdir=$srcdir ;;
+ *) # Relative name.
+ ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_build_prefix$srcdir
+ ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+ cd "$ac_dir" || { ac_status=$?; continue; }
+ # Check for guested configure.
+ if test -f "$ac_srcdir/configure.gnu"; then
+ echo &&
+ $SHELL "$ac_srcdir/configure.gnu" --help=recursive
+ elif test -f "$ac_srcdir/configure"; then
+ echo &&
+ $SHELL "$ac_srcdir/configure" --help=recursive
+ else
+ $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+ fi || ac_status=$?
+ cd "$ac_pwd" || { ac_status=$?; break; }
+ done
+fi
+
+test -n "$ac_init_help" && exit $ac_status
+if $ac_init_version; then
+ cat <<\_ACEOF
+SPP configure 1.7
+generated by GNU Autoconf 2.63
+
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+_ACEOF
+ exit
+fi
+cat >config.log <<_ACEOF
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by SPP $as_me 1.7, which was
+generated by GNU Autoconf 2.63. Invocation command line was
+
+ $ $0 $@
+
+_ACEOF
+exec 5>>config.log
+{
+cat <<_ASUNAME
+## --------- ##
+## Platform. ##
+## --------- ##
+
+hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
+/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown`
+
+/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown`
+/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
+/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown`
+/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown`
+/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown`
+/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown`
+
+_ASUNAME
+
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ $as_echo "PATH: $as_dir"
+done
+IFS=$as_save_IFS
+
+} >&5
+
+cat >&5 <<_ACEOF
+
+
+## ----------- ##
+## Core tests. ##
+## ----------- ##
+
+_ACEOF
+
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Strip out --silent because we don't want to record it for future runs.
+# Also quote any args containing shell meta-characters.
+# Make two passes to allow for proper duplicate-argument suppression.
+ac_configure_args=
+ac_configure_args0=
+ac_configure_args1=
+ac_must_keep_next=false
+for ac_pass in 1 2
+do
+ for ac_arg
+ do
+ case $ac_arg in
+ -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ continue ;;
+ *\'*)
+ ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+ esac
+ case $ac_pass in
+ 1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;;
+ 2)
+ ac_configure_args1="$ac_configure_args1 '$ac_arg'"
+ if test $ac_must_keep_next = true; then
+ ac_must_keep_next=false # Got value, back to normal.
+ else
+ case $ac_arg in
+ *=* | --config-cache | -C | -disable-* | --disable-* \
+ | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \
+ | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \
+ | -with-* | --with-* | -without-* | --without-* | --x)
+ case "$ac_configure_args0 " in
+ "$ac_configure_args1"*" '$ac_arg' "* ) continue ;;
+ esac
+ ;;
+ -* ) ac_must_keep_next=true ;;
+ esac
+ fi
+ ac_configure_args="$ac_configure_args '$ac_arg'"
+ ;;
+ esac
+ done
+done
+$as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; }
+$as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; }
+
+# When interrupted or exit'd, cleanup temporary files, and complete
+# config.log. We remove comments because anyway the quotes in there
+# would cause problems or look ugly.
+# WARNING: Use '\'' to represent an apostrophe within the trap.
+# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
+trap 'exit_status=$?
+ # Save into config.log some information that might help in debugging.
+ {
+ echo
+
+ cat <<\_ASBOX
+## ---------------- ##
+## Cache variables. ##
+## ---------------- ##
+_ASBOX
+ echo
+ # The following way of writing the cache mishandles newlines in values,
+(
+ for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do
+ eval ac_val=\$$ac_var
+ case $ac_val in #(
+ *${as_nl}*)
+ case $ac_var in #(
+ *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+ esac
+ case $ac_var in #(
+ _ | IFS | as_nl) ;; #(
+ BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+ *) $as_unset $ac_var ;;
+ esac ;;
+ esac
+ done
+ (set) 2>&1 |
+ case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #(
+ *${as_nl}ac_space=\ *)
+ sed -n \
+ "s/'\''/'\''\\\\'\'''\''/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p"
+ ;; #(
+ *)
+ sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+ ;;
+ esac |
+ sort
+)
+ echo
+
+ cat <<\_ASBOX
+## ----------------- ##
+## Output variables. ##
+## ----------------- ##
+_ASBOX
+ echo
+ for ac_var in $ac_subst_vars
+ do
+ eval ac_val=\$$ac_var
+ case $ac_val in
+ *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+ esac
+ $as_echo "$ac_var='\''$ac_val'\''"
+ done | sort
+ echo
+
+ if test -n "$ac_subst_files"; then
+ cat <<\_ASBOX
+## ------------------- ##
+## File substitutions. ##
+## ------------------- ##
+_ASBOX
+ echo
+ for ac_var in $ac_subst_files
+ do
+ eval ac_val=\$$ac_var
+ case $ac_val in
+ *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+ esac
+ $as_echo "$ac_var='\''$ac_val'\''"
+ done | sort
+ echo
+ fi
+
+ if test -s confdefs.h; then
+ cat <<\_ASBOX
+## ----------- ##
+## confdefs.h. ##
+## ----------- ##
+_ASBOX
+ echo
+ cat confdefs.h
+ echo
+ fi
+ test "$ac_signal" != 0 &&
+ $as_echo "$as_me: caught signal $ac_signal"
+ $as_echo "$as_me: exit $exit_status"
+ } >&5
+ rm -f core *.core core.conftest.* &&
+ rm -f -r conftest* confdefs* conf$$* $ac_clean_files &&
+ exit $exit_status
+' 0
+for ac_signal in 1 2 13 15; do
+ trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal
+done
+ac_signal=0
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -f -r conftest* confdefs.h
+
+# Predefined preprocessor variables.
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_NAME "$PACKAGE_NAME"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_TARNAME "$PACKAGE_TARNAME"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_VERSION "$PACKAGE_VERSION"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_STRING "$PACKAGE_STRING"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
+_ACEOF
+
+
+# Let the site file select an alternate cache file if it wants to.
+# Prefer an explicitly selected file to automatically selected ones.
+ac_site_file1=NONE
+ac_site_file2=NONE
+if test -n "$CONFIG_SITE"; then
+ ac_site_file1=$CONFIG_SITE
+elif test "x$prefix" != xNONE; then
+ ac_site_file1=$prefix/share/config.site
+ ac_site_file2=$prefix/etc/config.site
+else
+ ac_site_file1=$ac_default_prefix/share/config.site
+ ac_site_file2=$ac_default_prefix/etc/config.site
+fi
+for ac_site_file in "$ac_site_file1" "$ac_site_file2"
+do
+ test "x$ac_site_file" = xNONE && continue
+ if test -r "$ac_site_file"; then
+ { $as_echo "$as_me:$LINENO: loading site script $ac_site_file" >&5
+$as_echo "$as_me: loading site script $ac_site_file" >&6;}
+ sed 's/^/| /' "$ac_site_file" >&5
+ . "$ac_site_file"
+ fi
+done
+
+if test -r "$cache_file"; then
+ # Some versions of bash will fail to source /dev/null (special
+ # files actually), so we avoid doing that.
+ if test -f "$cache_file"; then
+ { $as_echo "$as_me:$LINENO: loading cache $cache_file" >&5
+$as_echo "$as_me: loading cache $cache_file" >&6;}
+ case $cache_file in
+ [\\/]* | ?:[\\/]* ) . "$cache_file";;
+ *) . "./$cache_file";;
+ esac
+ fi
+else
+ { $as_echo "$as_me:$LINENO: creating cache $cache_file" >&5
+$as_echo "$as_me: creating cache $cache_file" >&6;}
+ >$cache_file
+fi
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in $ac_precious_vars; do
+ eval ac_old_set=\$ac_cv_env_${ac_var}_set
+ eval ac_new_set=\$ac_env_${ac_var}_set
+ eval ac_old_val=\$ac_cv_env_${ac_var}_value
+ eval ac_new_val=\$ac_env_${ac_var}_value
+ case $ac_old_set,$ac_new_set in
+ set,)
+ { $as_echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,set)
+ { $as_echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,);;
+ *)
+ if test "x$ac_old_val" != "x$ac_new_val"; then
+ # differences in whitespace do not lead to failure.
+ ac_old_val_w=`echo x $ac_old_val`
+ ac_new_val_w=`echo x $ac_new_val`
+ if test "$ac_old_val_w" != "$ac_new_val_w"; then
+ { $as_echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5
+$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
+ ac_cache_corrupted=:
+ else
+ { $as_echo "$as_me:$LINENO: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5
+$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;}
+ eval $ac_var=\$ac_old_val
+ fi
+ { $as_echo "$as_me:$LINENO: former value: \`$ac_old_val'" >&5
+$as_echo "$as_me: former value: \`$ac_old_val'" >&2;}
+ { $as_echo "$as_me:$LINENO: current value: \`$ac_new_val'" >&5
+$as_echo "$as_me: current value: \`$ac_new_val'" >&2;}
+ fi;;
+ esac
+ # Pass precious variables to config.status.
+ if test "$ac_new_set" = set; then
+ case $ac_new_val in
+ *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+ *) ac_arg=$ac_var=$ac_new_val ;;
+ esac
+ case " $ac_configure_args " in
+ *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy.
+ *) ac_configure_args="$ac_configure_args '$ac_arg'" ;;
+ esac
+ fi
+done
+if $ac_cache_corrupted; then
+ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+ { $as_echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5
+$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;}
+ { { $as_echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5
+$as_echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;}
+ { (exit 1); exit 1; }; }
+fi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}gcc; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_CC="${ac_tool_prefix}gcc"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:$LINENO: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_CC"; then
+ ac_ct_CC=$CC
+ # Extract the first word of "gcc", so it can be a program name with args.
+set dummy gcc; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_ac_ct_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$ac_ct_CC"; then
+ ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_ac_ct_CC="gcc"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+ { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+ if test "x$ac_ct_CC" = x; then
+ CC=""
+ else
+ case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+ CC=$ac_ct_CC
+ fi
+else
+ CC="$ac_cv_prog_CC"
+fi
+
+if test -z "$CC"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}cc; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_CC="${ac_tool_prefix}cc"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:$LINENO: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ fi
+fi
+if test -z "$CC"; then
+ # Extract the first word of "cc", so it can be a program name with args.
+set dummy cc; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+ ac_prog_rejected=no
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
+ ac_prog_rejected=yes
+ continue
+ fi
+ ac_cv_prog_CC="cc"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+if test $ac_prog_rejected = yes; then
+ # We found a bogon in the path, so make sure we never use it.
+ set dummy $ac_cv_prog_CC
+ shift
+ if test $# != 0; then
+ # We chose a different compiler from the bogus one.
+ # However, it has the same basename, so the bogon will be chosen
+ # first if we set CC to just the basename; use the full file name.
+ shift
+ ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@"
+ fi
+fi
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:$LINENO: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$CC"; then
+ if test -n "$ac_tool_prefix"; then
+ for ac_prog in cl.exe
+ do
+ # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:$LINENO: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ test -n "$CC" && break
+ done
+fi
+if test -z "$CC"; then
+ ac_ct_CC=$CC
+ for ac_prog in cl.exe
+do
+ # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_ac_ct_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$ac_ct_CC"; then
+ ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_ac_ct_CC="$ac_prog"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+ { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ test -n "$ac_ct_CC" && break
+done
+
+ if test "x$ac_ct_CC" = x; then
+ CC=""
+ else
+ case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+ CC=$ac_ct_CC
+ fi
+fi
+
+fi
+
+
+test -z "$CC" && { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+
+# Provide some information about the compiler.
+$as_echo "$as_me:$LINENO: checking for C compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+{ (ac_try="$ac_compiler --version >&5"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compiler --version >&5") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }
+{ (ac_try="$ac_compiler -v >&5"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compiler -v >&5") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }
+{ (ac_try="$ac_compiler -V >&5"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compiler -V >&5") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }
+
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out"
+# Try to create an executable without -o first, disregarding a.out.
+# This helps us diagnose broken compilers and gives us a first guess
+# at the executable suffix (exeext).
+{ $as_echo "$as_me:$LINENO: checking for C compiler default output file name" >&5
+$as_echo_n "checking for C compiler default output file name... " >&6; }
+ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
+
+# The possible output files:
+ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*"
+
+ac_rmfiles=
+for ac_file in $ac_files
+do
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+ * ) ac_rmfiles="$ac_rmfiles $ac_file";;
+ esac
+done
+rm -f $ac_rmfiles
+
+if { (ac_try="$ac_link_default"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_link_default") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; then
+ # Autoconf-2.13 could set the ac_cv_exeext variable to `no'.
+# So ignore a value of `no', otherwise this would lead to `EXEEXT = no'
+# in a Makefile. We should not override ac_cv_exeext if it was cached,
+# so that the user can short-circuit this test for compilers unknown to
+# Autoconf.
+for ac_file in $ac_files ''
+do
+ test -f "$ac_file" || continue
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj )
+ ;;
+ [ab].out )
+ # We found the default executable, but exeext='' is most
+ # certainly right.
+ break;;
+ *.* )
+ if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
+ then :; else
+ ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+ fi
+ # We set ac_cv_exeext here because the later test for it is not
+ # safe: cross compilers may not add the suffix if given an `-o'
+ # argument, so we may need to know it at that point already.
+ # Even if this section looks crufty: it has the advantage of
+ # actually working.
+ break;;
+ * )
+ break;;
+ esac
+done
+test "$ac_cv_exeext" = no && ac_cv_exeext=
+
+else
+ ac_file=''
+fi
+
+{ $as_echo "$as_me:$LINENO: result: $ac_file" >&5
+$as_echo "$ac_file" >&6; }
+if test -z "$ac_file"; then
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: C compiler cannot create executables
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: C compiler cannot create executables
+See \`config.log' for more details." >&2;}
+ { (exit 77); exit 77; }; }; }
+fi
+
+ac_exeext=$ac_cv_exeext
+
+# Check that the compiler produces executables we can run. If not, either
+# the compiler is broken, or we cross compile.
+{ $as_echo "$as_me:$LINENO: checking whether the C compiler works" >&5
+$as_echo_n "checking whether the C compiler works... " >&6; }
+# FIXME: These cross compiler hacks should be removed for Autoconf 3.0
+# If not cross compiling, check that we can run a simple program.
+if test "$cross_compiling" != yes; then
+ if { ac_try='./$ac_file'
+ { (case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ cross_compiling=no
+ else
+ if test "$cross_compiling" = maybe; then
+ cross_compiling=yes
+ else
+ { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: cannot run C compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot run C compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+ fi
+ fi
+fi
+{ $as_echo "$as_me:$LINENO: result: yes" >&5
+$as_echo "yes" >&6; }
+
+rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out
+ac_clean_files=$ac_clean_files_save
+# Report whether we are cross compiling; the value of cross_compiling
+# was settled by the run test above.
+{ $as_echo "$as_me:$LINENO: checking whether we are cross compiling" >&5
+$as_echo_n "checking whether we are cross compiling... " >&6; }
+{ $as_echo "$as_me:$LINENO: result: $cross_compiling" >&5
+$as_echo "$cross_compiling" >&6; }
+
+{ $as_echo "$as_me:$LINENO: checking for suffix of executables" >&5
+$as_echo_n "checking for suffix of executables... " >&6; }
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; then
+ # If both `conftest.exe' and `conftest' are `present' (well, observable)
+# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will
+# work properly (i.e., refer to `conftest.exe'), while it won't with
+# `rm'.
+for ac_file in conftest.exe conftest conftest.*; do
+ test -f "$ac_file" || continue
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+ *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+ break;;
+ * ) break;;
+ esac
+done
+else
+ { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+fi
+
+rm -f conftest$ac_cv_exeext
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_exeext" >&5
+$as_echo "$ac_cv_exeext" >&6; }
+
+rm -f conftest.$ac_ext
+EXEEXT=$ac_cv_exeext
+ac_exeext=$EXEEXT
+{ $as_echo "$as_me:$LINENO: checking for suffix of object files" >&5
+$as_echo_n "checking for suffix of object files... " >&6; }
+if test "${ac_cv_objext+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.o conftest.obj
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; then
+ for ac_file in conftest.o conftest.obj conftest.*; do
+ test -f "$ac_file" || continue;
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;;
+ *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'`
+ break;;
+ esac
+done
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of object files: cannot compile
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compute suffix of object files: cannot compile
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+fi
+
+rm -f conftest.$ac_cv_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_objext" >&5
+$as_echo "$ac_cv_objext" >&6; }
+OBJEXT=$ac_cv_objext
+ac_objext=$OBJEXT
+{ $as_echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5
+$as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
+if test "${ac_cv_c_compiler_gnu+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+#ifndef __GNUC__
+ choke me
+#endif
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_compiler_gnu=yes
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_compiler_gnu=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_c_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5
+$as_echo "$ac_cv_c_compiler_gnu" >&6; }
+if test $ac_compiler_gnu = yes; then
+ GCC=yes
+else
+ GCC=
+fi
+ac_test_CFLAGS=${CFLAGS+set}
+ac_save_CFLAGS=$CFLAGS
+{ $as_echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5
+$as_echo_n "checking whether $CC accepts -g... " >&6; }
+if test "${ac_cv_prog_cc_g+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ ac_save_c_werror_flag=$ac_c_werror_flag
+ ac_c_werror_flag=yes
+ ac_cv_prog_cc_g=no
+ CFLAGS="-g"
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_cv_prog_cc_g=yes
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ CFLAGS=""
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ :
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_c_werror_flag=$ac_save_c_werror_flag
+ CFLAGS="-g"
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_cv_prog_cc_g=yes
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ ac_c_werror_flag=$ac_save_c_werror_flag
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5
+$as_echo "$ac_cv_prog_cc_g" >&6; }
+if test "$ac_test_CFLAGS" = set; then
+ CFLAGS=$ac_save_CFLAGS
+elif test $ac_cv_prog_cc_g = yes; then
+ if test "$GCC" = yes; then
+ CFLAGS="-g -O2"
+ else
+ CFLAGS="-g"
+ fi
+else
+ if test "$GCC" = yes; then
+ CFLAGS="-O2"
+ else
+ CFLAGS=
+ fi
+fi
+{ $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5
+$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
+if test "${ac_cv_prog_cc_c89+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <stdarg.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+ char **p;
+ int i;
+{
+ return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+ char *s;
+ va_list v;
+ va_start (v,p);
+ s = g (p, va_arg (v,int));
+ va_end (v);
+ return s;
+}
+
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has
+ function prototypes and stuff, but not '\xHH' hex character constants.
+ These don't provoke an error unfortunately, instead are silently treated
+ as 'x'. The following induces an error, until -std is added to get
+ proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an
+ array size at least. It's necessary to write '\x00'==0 to get something
+ that's true only with -std. */
+int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+ inside strings and character constants. */
+#define FOO(x) 'x'
+int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+int
+main ()
+{
+return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1];
+ ;
+ return 0;
+}
+_ACEOF
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
+ -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+ CC="$ac_save_CC $ac_arg"
+ rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_cv_prog_cc_c89=$ac_arg
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext
+ test "x$ac_cv_prog_cc_c89" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+
+fi
+# End of AC_CACHE_VAL: report the outcome of the C89 option check.
+case "x$ac_cv_prog_cc_c89" in
+ x)
+ { $as_echo "$as_me:$LINENO: result: none needed" >&5
+$as_echo "none needed" >&6; } ;;
+ xno)
+ { $as_echo "$as_me:$LINENO: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;;
+ *)
+ CC="$CC $ac_cv_prog_cc_c89"
+ { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5
+$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
+esac
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+{ $as_echo "$as_me:$LINENO: checking for BZ2_bzDecompressInit in -lbz2" >&5
+$as_echo_n "checking for BZ2_bzDecompressInit in -lbz2... " >&6; }
+if test "${ac_cv_lib_bz2_BZ2_bzDecompressInit+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-lbz2 $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char BZ2_bzDecompressInit ();
+int
+main ()
+{
+return BZ2_bzDecompressInit ();
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_link") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest$ac_exeext && {
+ test "$cross_compiling" = yes ||
+ $as_test_x conftest$ac_exeext
+ }; then
+ ac_cv_lib_bz2_BZ2_bzDecompressInit=yes
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_cv_lib_bz2_BZ2_bzDecompressInit=no
+fi
+
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_lib_bz2_BZ2_bzDecompressInit" >&5
+$as_echo "$ac_cv_lib_bz2_BZ2_bzDecompressInit" >&6; }
+if test "x$ac_cv_lib_bz2_BZ2_bzDecompressInit" = x""yes; then
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBBZ2 1
+_ACEOF
+
+ LIBS="-lbz2 $LIBS"
+
+fi
+
+
+ac_config_files="$ac_config_files src/Makevars"
+
+cp confdefs.h src/config.h
+cat >confcache <<\_ACEOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs, see configure's option --config-cache.
+# It is not useful on other systems. If it contains results you don't
+# want to keep, you may remove or edit it.
+#
+# config.status only pays attention to the cache file if you give it
+# the --recheck option to rerun configure.
+#
+# `ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* `ac_cv_foo' will be assigned the
+# following values.
+
+_ACEOF
+
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, we kill variables containing newlines.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(
+ for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do
+ eval ac_val=\$$ac_var
+ case $ac_val in #(
+ *${as_nl}*)
+ case $ac_var in #(
+ *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+ esac
+ case $ac_var in #(
+ _ | IFS | as_nl) ;; #(
+ BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+ *) $as_unset $ac_var ;;
+ esac ;;
+ esac
+ done
+
+ (set) 2>&1 |
+ case $as_nl`(ac_space=' '; set) 2>&1` in #(
+ *${as_nl}ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote
+ # substitution turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ "s/'/'\\\\''/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+ ;; #(
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+ ;;
+ esac |
+ sort
+) |
+ sed '
+ /^ac_cv_env_/b end
+ t clear
+ :clear
+ s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+ t end
+ s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+ :end' >>confcache
+# Write the freshly generated confcache back to $cache_file, but only when
+# its contents differ from what is already there and the file is writable.
+if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
+ if test -w "$cache_file"; then
+ test "x$cache_file" != "x/dev/null" &&
+ { $as_echo "$as_me:$LINENO: updating cache $cache_file" >&5
+$as_echo "$as_me: updating cache $cache_file" >&6;}
+ # Quote the redirection target, as every other use of $cache_file here
+ # does: some shells word-split an unquoted target and fail with an
+ # "ambiguous redirect" error when the path contains whitespace.
+ cat confcache >"$cache_file"
+ else
+ { $as_echo "$as_me:$LINENO: not updating unwritable cache $cache_file" >&5
+$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;}
+ fi
+fi
+# The temporary cache copy is no longer needed either way.
+rm -f confcache
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+# Transform confdefs.h into DEFS.
+# Protect against shell expansion while executing Makefile rules.
+# Protect against Makefile macro expansion.
+#
+# If the first sed substitution is executed (which looks for macros that
+# take arguments), then branch to the quote section. Otherwise,
+# look for a macro that doesn't take arguments.
+ac_script='
+:mline
+/\\$/{
+ N
+ s,\\\n,,
+ b mline
+}
+t clear
+:clear
+s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g
+t quote
+s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g
+t quote
+b any
+:quote
+s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g
+s/\[/\\&/g
+s/\]/\\&/g
+s/\$/$$/g
+H
+:any
+${
+ g
+ s/^\n//
+ s/\n/ /g
+ p
+}
+'
+DEFS=`sed -n "$ac_script" confdefs.h`
+
+
+ac_libobjs=
+ac_ltlibobjs=
+for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
+ # 1. Remove the extension, and $U if already installed.
+ ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
+ ac_i=`$as_echo "$ac_i" | sed "$ac_script"`
+ # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR
+ # will be set to the directory where LIBOBJS objects are built.
+ ac_libobjs="$ac_libobjs \${LIBOBJDIR}$ac_i\$U.$ac_objext"
+ ac_ltlibobjs="$ac_ltlibobjs \${LIBOBJDIR}$ac_i"'$U.lo'
+done
+LIBOBJS=$ac_libobjs
+
+LTLIBOBJS=$ac_ltlibobjs
+
+
+
+: ${CONFIG_STATUS=./config.status}
+ac_write_fail=0
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files $CONFIG_STATUS"
+{ $as_echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5
+$as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
+cat >$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+#! $SHELL
+# Generated by $as_me.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+
+debug=false
+ac_cs_recheck=false
+ac_cs_silent=false
+SHELL=\${CONFIG_SHELL-$SHELL}
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+## --------------------- ##
+## M4sh Initialization. ##
+## --------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in
+ *posix*) set -o posix ;;
+esac
+
+fi
+
+
+
+
+# PATH needs CR
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='printf %s\n'
+ as_echo_n='printf %s'
+else
+ if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+ as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+ as_echo_n='/usr/ucb/echo -n'
+ else
+ as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+ as_echo_n_body='eval
+ arg=$1;
+ case $arg in
+ *"$as_nl"*)
+ expr "X$arg" : "X\\(.*\\)$as_nl";
+ arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+ esac;
+ expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+ '
+ export as_echo_n_body
+ as_echo_n='sh -c $as_echo_n_body as_echo'
+ fi
+ export as_echo_body
+ as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ PATH_SEPARATOR=:
+ (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+ (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+ PATH_SEPARATOR=';'
+ }
+fi
+
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+ as_unset=unset
+else
+ as_unset=false
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order. Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" "" $as_nl"
+
+# Find who we are. Look in the path if we contain no directory separator.
+case $0 in
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
+IFS=$as_save_IFS
+
+ ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+ as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+ $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+ { (exit 1); exit 1; }
+fi
+
+# Work around bugs in pre-3.0 UWIN ksh.
+for as_var in ENV MAIL MAILPATH
+do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# Required to use basename.
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+
+# Name of the executable.
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+
+# CDPATH.
+$as_unset CDPATH
+
+
+
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || {
+
+ # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+ # uniformly replaced by the line number. The first 'sed' inserts a
+ # line-number line after each line using $LINENO; the second 'sed'
+ # does the real work. The second script uses 'N' to pair each
+ # line-number line with the line containing $LINENO, and appends
+ # trailing '-' during substitution so that $LINENO is not a special
+ # case at line end.
+ # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+ # scripts with optimization help from Paolo Bonzini. Blame Lee
+ # E. McMahon (1931-1989) for sed's syntax. :-)
+ sed -n '
+ p
+ /[$]LINENO/=
+ ' <$as_myself |
+ sed '
+ s/[$]LINENO.*/&-/
+ t lineno
+ b
+ :lineno
+ N
+ :loop
+ s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+ t loop
+ s/-\n.*//
+ ' >$as_me.lineno &&
+ chmod +x "$as_me.lineno" ||
+ { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2
+ { (exit 1); exit 1; }; }
+
+ # Don't try to exec as it changes $0, causing all sorts of problems
+ # (the dirname of $0 is not the place where we might find the
+ # original and so on. Autoconf is especially sensitive to this).
+ . "./$as_me.lineno"
+ # Exit status is that of the last command.
+ exit
+}
+
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+ as_dirname=dirname
+else
+ as_dirname=false
+fi
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in
+-n*)
+ case `echo 'x\c'` in
+ *c*) ECHO_T=' ';; # ECHO_T is single tab character.
+ *) ECHO_C='\c';;
+ esac;;
+*)
+ ECHO_N='-n';;
+esac
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+ rm -f conf$$.dir/conf$$.file
+else
+ rm -f conf$$.dir
+ mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+ if ln -s conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s='ln -s'
+ # ... but there are two gotchas:
+ # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+ # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+ # In both cases, we have to default to `cp -p'.
+ ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+ as_ln_s='cp -p'
+ elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+ else
+ as_ln_s='cp -p'
+ fi
+else
+ as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p=:
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+if test -x / >/dev/null 2>&1; then
+ as_test_x='test -x'
+else
+ if ls -dL / >/dev/null 2>&1; then
+ as_ls_L_option=L
+ else
+ as_ls_L_option=
+ fi
+ as_test_x='
+ eval sh -c '\''
+ if test -d "$1"; then
+ test -d "$1/.";
+ else
+ case $1 in
+ -*)set "./$1";;
+ esac;
+ case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in
+ ???[sx]*):;;*)false;;esac;fi
+ '\'' sh
+ '
+fi
+as_executable_p=$as_test_x
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+exec 6>&1
+
+# Save the log message, to keep $0 and so on meaningful, and to
+# report actual input values of CONFIG_FILES etc. instead of their
+# values after options handling.
+ac_log="
+This file was extended by SPP $as_me 1.7, which was
+generated by GNU Autoconf 2.63. Invocation command line was
+
+ CONFIG_FILES = $CONFIG_FILES
+ CONFIG_HEADERS = $CONFIG_HEADERS
+ CONFIG_LINKS = $CONFIG_LINKS
+ CONFIG_COMMANDS = $CONFIG_COMMANDS
+ $ $0 $@
+
+on `(hostname || uname -n) 2>/dev/null | sed 1q`
+"
+
+_ACEOF
+
+case $ac_config_files in *"
+"*) set x $ac_config_files; shift; ac_config_files=$*;;
+esac
+
+
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+# Files that config.status was made for.
+config_files="$ac_config_files"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+ac_cs_usage="\
+\`$as_me' instantiates files from templates according to the
+current configuration.
+
+Usage: $0 [OPTION]... [FILE]...
+
+ -h, --help print this help, then exit
+ -V, --version print version number and configuration settings, then exit
+ -q, --quiet, --silent
+ do not print progress messages
+ -d, --debug don't remove temporary files
+ --recheck update $as_me by reconfiguring in the same conditions
+ --file=FILE[:TEMPLATE]
+ instantiate the configuration file FILE
+
+Configuration files:
+$config_files
+
+Report bugs to <bug-autoconf@gnu.org>."
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_cs_version="\\
+SPP config.status 1.7
+configured by $0, generated by GNU Autoconf 2.63,
+ with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
+
+Copyright (C) 2008 Free Software Foundation, Inc.
+This config.status script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it."
+
+ac_pwd='$ac_pwd'
+srcdir='$srcdir'
+test -n "\$AWK" || AWK=awk
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# The default lists apply if the user does not specify any file.
+ac_need_defaults=:
+while test $# != 0
+do
+ case $1 in
+ --*=*)
+ ac_option=`expr "X$1" : 'X\([^=]*\)='`
+ ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'`
+ ac_shift=:
+ ;;
+ *)
+ ac_option=$1
+ ac_optarg=$2
+ ac_shift=shift
+ ;;
+ esac
+
+ case $ac_option in
+ # Handling of the options.
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+ ac_cs_recheck=: ;;
+ --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
+ $as_echo "$ac_cs_version"; exit ;;
+ --debug | --debu | --deb | --de | --d | -d )
+ debug=: ;;
+ --file | --fil | --fi | --f )
+ $ac_shift
+ case $ac_optarg in
+ *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+ esac
+ CONFIG_FILES="$CONFIG_FILES '$ac_optarg'"
+ ac_need_defaults=false;;
+ --he | --h | --help | --hel | -h )
+ $as_echo "$ac_cs_usage"; exit ;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil | --si | --s)
+ ac_cs_silent=: ;;
+
+ # This is an error.
+ -*) { $as_echo "$as_me: error: unrecognized option: $1
+Try \`$0 --help' for more information." >&2
+ { (exit 1); exit 1; }; } ;;
+
+ *) ac_config_targets="$ac_config_targets $1"
+ ac_need_defaults=false ;;
+
+ esac
+ shift
+done
+
+ac_configure_extra_args=
+
+if $ac_cs_silent; then
+ exec 6>/dev/null
+ ac_configure_extra_args="$ac_configure_extra_args --silent"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+if \$ac_cs_recheck; then
+ set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+ shift
+ \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6
+ CONFIG_SHELL='$SHELL'
+ export CONFIG_SHELL
+ exec "\$@"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+exec 5>>config.log
+{
+ echo
+ sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
+## Running $as_me. ##
+_ASBOX
+ $as_echo "$ac_log"
+} >&5
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+
+# Handling of arguments.
+for ac_config_target in $ac_config_targets
+do
+ case $ac_config_target in
+ "src/Makevars") CONFIG_FILES="$CONFIG_FILES src/Makevars" ;;
+
+ *) { { $as_echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5
+$as_echo "$as_me: error: invalid argument: $ac_config_target" >&2;}
+ { (exit 1); exit 1; }; };;
+ esac
+done
+
+
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used. Set only those that are not.
+# We use the long form for the default assignment because of an extremely
+# bizarre bug on SunOS 4.1.3.
+if $ac_need_defaults; then
+ test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
+fi
+
+# Have a temporary directory for convenience. Make it in the build tree
+# simply because there is no reason against having it here, and in addition,
+# creating and moving files from /tmp can sometimes cause problems.
+# Hook for its removal unless debugging.
+# Note that there is a small window in which the directory will not be cleaned:
+# after its creation but before its name has been assigned to `$tmp'.
+$debug ||
+{
+ tmp=
+ trap 'exit_status=$?
+ { test -z "$tmp" || test ! -d "$tmp" || rm -fr "$tmp"; } && exit $exit_status
+' 0
+ trap '{ (exit 1); exit 1; }' 1 2 13 15
+}
+# Create a (secure) tmp directory for tmp files.
+
+{
+ tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` &&
+ test -n "$tmp" && test -d "$tmp"
+} ||
+{
+ tmp=./conf$$-$RANDOM
+ (umask 077 && mkdir "$tmp")
+} ||
+{
+ $as_echo "$as_me: cannot create a temporary directory in ." >&2
+ { (exit 1); exit 1; }
+}
+
+# Set up the scripts for CONFIG_FILES section.
+# No need to generate them if there are no CONFIG_FILES.
+# This happens for instance with `./config.status config.h'.
+if test -n "$CONFIG_FILES"; then
+
+
+ac_cr=' '
+ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null`
+if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then
+ ac_cs_awk_cr='\\r'
+else
+ ac_cs_awk_cr=$ac_cr
+fi
+
+echo 'BEGIN {' >"$tmp/subs1.awk" &&
+_ACEOF
+
+
+{
+ echo "cat >conf$$subs.awk <<_ACEOF" &&
+ echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' &&
+ echo "_ACEOF"
+} >conf$$subs.sh ||
+ { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+ { (exit 1); exit 1; }; }
+ac_delim_num=`echo "$ac_subst_vars" | grep -c '$'`
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+ . ./conf$$subs.sh ||
+ { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+ { (exit 1); exit 1; }; }
+
+ ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X`
+ if test $ac_delim_n = $ac_delim_num; then
+ break
+ elif $ac_last_try; then
+ { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+ { (exit 1); exit 1; }; }
+ else
+ ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+ fi
+done
+rm -f conf$$subs.sh
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+cat >>"\$tmp/subs1.awk" <<\\_ACAWK &&
+_ACEOF
+sed -n '
+h
+s/^/S["/; s/!.*/"]=/
+p
+g
+s/^[^!]*!//
+:repl
+t repl
+s/'"$ac_delim"'$//
+t delim
+:nl
+h
+s/\(.\{148\}\).*/\1/
+t more1
+s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/
+p
+n
+b repl
+:more1
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t nl
+:delim
+h
+s/\(.\{148\}\).*/\1/
+t more2
+s/["\\]/\\&/g; s/^/"/; s/$/"/
+p
+b
+:more2
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t delim
+' <conf$$subs.awk | sed '
+/^[^""]/{
+ N
+ s/\n//
+}
+' >>$CONFIG_STATUS || ac_write_fail=1
+rm -f conf$$subs.awk
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACAWK
+cat >>"\$tmp/subs1.awk" <<_ACAWK &&
+ for (key in S) S_is_set[key] = 1
+ FS = ""
+
+}
+{
+ line = $ 0
+ nfields = split(line, field, "@")
+ substed = 0
+ len = length(field[1])
+ for (i = 2; i < nfields; i++) {
+ key = field[i]
+ keylen = length(key)
+ if (S_is_set[key]) {
+ value = S[key]
+ line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3)
+ len += length(value) + length(field[++i])
+ substed = 1
+ } else
+ len += 1 + keylen
+ }
+
+ print line
+}
+
+_ACAWK
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then
+ sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g"
+else
+ cat
+fi < "$tmp/subs1.awk" > "$tmp/subs.awk" \
+ || { { $as_echo "$as_me:$LINENO: error: could not setup config files machinery" >&5
+$as_echo "$as_me: error: could not setup config files machinery" >&2;}
+ { (exit 1); exit 1; }; }
+_ACEOF
+
+# VPATH may cause trouble with some makes, so we remove $(srcdir),
+# ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and
+# trailing colons and then remove the whole line if VPATH becomes empty
+# (actually we leave an empty line to preserve line numbers).
+if test "x$srcdir" = x.; then
+ ac_vpsub='/^[ ]*VPATH[ ]*=/{
+s/:*\$(srcdir):*/:/
+s/:*\${srcdir}:*/:/
+s/:*@srcdir@:*/:/
+s/^\([^=]*=[ ]*\):*/\1/
+s/:*$//
+s/^[^=]*=[ ]*$//
+}'
+fi
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+fi # test -n "$CONFIG_FILES"
+
+
+eval set X " :F $CONFIG_FILES "
+shift
+for ac_tag
+do
+ case $ac_tag in
+ :[FHLC]) ac_mode=$ac_tag; continue;;
+ esac
+ case $ac_mode$ac_tag in
+ :[FHL]*:*);;
+ :L* | :C*:*) { { $as_echo "$as_me:$LINENO: error: invalid tag $ac_tag" >&5
+$as_echo "$as_me: error: invalid tag $ac_tag" >&2;}
+ { (exit 1); exit 1; }; };;
+ :[FH]-) ac_tag=-:-;;
+ :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
+ esac
+ ac_save_IFS=$IFS
+ IFS=:
+ set x $ac_tag
+ IFS=$ac_save_IFS
+ shift
+ ac_file=$1
+ shift
+
+ case $ac_mode in
+ :L) ac_source=$1;;
+ :[FH])
+ ac_file_inputs=
+ for ac_f
+ do
+ case $ac_f in
+ -) ac_f="$tmp/stdin";;
+ *) # Look for the file first in the build tree, then in the source tree
+ # (if the path is not absolute). The absolute path cannot be DOS-style,
+ # because $ac_f cannot contain `:'.
+ test -f "$ac_f" ||
+ case $ac_f in
+ [\\/$]*) false;;
+ *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
+ esac ||
+ { { $as_echo "$as_me:$LINENO: error: cannot find input file: $ac_f" >&5
+$as_echo "$as_me: error: cannot find input file: $ac_f" >&2;}
+ { (exit 1); exit 1; }; };;
+ esac
+ case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
+ ac_file_inputs="$ac_file_inputs '$ac_f'"
+ done
+
+ # Let's still pretend it is `configure' which instantiates (i.e., don't
+ # use $as_me), people would be surprised to read:
+ # /* config.h. Generated by config.status. */
+ configure_input='Generated from '`
+ $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g'
+ `' by configure.'
+ if test x"$ac_file" != x-; then
+ configure_input="$ac_file. $configure_input"
+ { $as_echo "$as_me:$LINENO: creating $ac_file" >&5
+$as_echo "$as_me: creating $ac_file" >&6;}
+ fi
+ # Neutralize special characters interpreted by sed in replacement strings.
+ case $configure_input in #(
+ *\&* | *\|* | *\\* )
+ ac_sed_conf_input=`$as_echo "$configure_input" |
+ sed 's/[\\\\&|]/\\\\&/g'`;; #(
+ *) ac_sed_conf_input=$configure_input;;
+ esac
+
+ case $ac_tag in
+ *:-:* | *:-) cat >"$tmp/stdin" \
+ || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+ { (exit 1); exit 1; }; } ;;
+ esac
+ ;;
+ esac
+
+ ac_dir=`$as_dirname -- "$ac_file" ||
+$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$ac_file" : 'X\(//\)[^/]' \| \
+ X"$ac_file" : 'X\(//\)$' \| \
+ X"$ac_file" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$ac_file" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ { as_dir="$ac_dir"
+ case $as_dir in #(
+ -*) as_dir=./$as_dir;;
+ esac
+ test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || {
+ as_dirs=
+ while :; do
+ case $as_dir in #(
+ *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+ *) as_qdir=$as_dir;;
+ esac
+ as_dirs="'$as_qdir' $as_dirs"
+ as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_dir" : 'X\(//\)[^/]' \| \
+ X"$as_dir" : 'X\(//\)$' \| \
+ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ test -d "$as_dir" && break
+ done
+ test -z "$as_dirs" || eval "mkdir $as_dirs"
+ } || test -d "$as_dir" || { { $as_echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5
+$as_echo "$as_me: error: cannot create directory $as_dir" >&2;}
+ { (exit 1); exit 1; }; }; }
+ ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+ ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+ # A ".." for each directory in $ac_dir_suffix.
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+ case $ac_top_builddir_sub in
+ "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+ *) ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+ esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+ .) # We are building in place.
+ ac_srcdir=.
+ ac_top_srcdir=$ac_top_builddir_sub
+ ac_abs_top_srcdir=$ac_pwd ;;
+ [\\/]* | ?:[\\/]* ) # Absolute name.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir
+ ac_abs_top_srcdir=$srcdir ;;
+ *) # Relative name.
+ ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_build_prefix$srcdir
+ ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+
+ case $ac_mode in
+ :F)
+ #
+ # CONFIG_FILE
+ #
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# If the template does not know about datarootdir, expand it.
+# FIXME: This hack should be removed a few years after 2.60.
+ac_datarootdir_hack=; ac_datarootdir_seen=
+
+ac_sed_dataroot='
+/datarootdir/ {
+ p
+ q
+}
+/@datadir@/p
+/@docdir@/p
+/@infodir@/p
+/@localedir@/p
+/@mandir@/p
+'
+case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in
+*datarootdir*) ac_datarootdir_seen=yes;;
+*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
+ { $as_echo "$as_me:$LINENO: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ ac_datarootdir_hack='
+ s&@datadir@&$datadir&g
+ s&@docdir@&$docdir&g
+ s&@infodir@&$infodir&g
+ s&@localedir@&$localedir&g
+ s&@mandir@&$mandir&g
+ s&\\\${datarootdir}&$datarootdir&g' ;;
+esac
+_ACEOF
+
+# Neutralize VPATH when `$srcdir' = `.'.
+# Shell code in configure.ac might set extrasub.
+# FIXME: do we really want to maintain this feature?
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_sed_extra="$ac_vpsub
+$extrasub
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s|@configure_input@|$ac_sed_conf_input|;t t
+s&@top_builddir@&$ac_top_builddir_sub&;t t
+s&@top_build_prefix@&$ac_top_build_prefix&;t t
+s&@srcdir@&$ac_srcdir&;t t
+s&@abs_srcdir@&$ac_abs_srcdir&;t t
+s&@top_srcdir@&$ac_top_srcdir&;t t
+s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t
+s&@builddir@&$ac_builddir&;t t
+s&@abs_builddir@&$ac_abs_builddir&;t t
+s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
+$ac_datarootdir_hack
+"
+eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$tmp/subs.awk" >$tmp/out \
+ || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+ { (exit 1); exit 1; }; }
+
+test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
+ { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } &&
+ { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' "$tmp/out"`; test -z "$ac_out"; } &&
+ { $as_echo "$as_me:$LINENO: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined. Please make sure it is defined." >&5
+$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined. Please make sure it is defined." >&2;}
+
+ rm -f "$tmp/stdin"
+ case $ac_file in
+ -) cat "$tmp/out" && rm -f "$tmp/out";;
+ *) rm -f "$ac_file" && mv "$tmp/out" "$ac_file";;
+ esac \
+ || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+ { (exit 1); exit 1; }; }
+ ;;
+
+
+
+ esac
+
+done # for ac_tag
+
+
+{ (exit 0); exit 0; }
+_ACEOF
+chmod +x $CONFIG_STATUS
+ac_clean_files=$ac_clean_files_save
+
+test $ac_write_fail = 0 ||
+ { { $as_echo "$as_me:$LINENO: error: write failure creating $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: write failure creating $CONFIG_STATUS" >&2;}
+ { (exit 1); exit 1; }; }
+
+
+# configure is writing to config.log, and then calls config.status.
+# config.status does its own redirection, appending to config.log.
+# Unfortunately, on DOS this fails, as config.log is still kept open
+# by configure, so config.status won't be able to write to it; its
+# output is simply discarded. So we exec the FD to /dev/null,
+# effectively closing config.log, so it can be properly (re)opened and
+# appended to by config.status. When coming back to configure, we
+# need to make the FD available again.
+if test "$no_create" != yes; then
+ ac_cs_success=:
+ ac_config_status_args=
+ test "$silent" = yes &&
+ ac_config_status_args="$ac_config_status_args --quiet"
+ exec 5>/dev/null
+ $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false
+ exec 5>>config.log
+ # Use ||, not &&, to avoid exiting from the if with $? = 1, which
+ # would make configure fail if this is the last instruction.
+ $ac_cs_success || { (exit 1); exit 1; }
+fi
+if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
+ { $as_echo "$as_me:$LINENO: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
+fi
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/configure.ac b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/configure.ac
new file mode 100755
index 0000000..db87fcd
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/configure.ac
@@ -0,0 +1,7 @@
+AC_INIT([SPP], 1.7)
+
+AC_CHECK_LIB(bz2, BZ2_bzDecompressInit)
+AC_SUBST(HAVE_LIBBZ2)
+AC_CONFIG_FILES([src/Makevars])
+cp confdefs.h src/config.h
+AC_OUTPUT
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/add.broad.peak.regions.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/add.broad.peak.regions.Rd
new file mode 100755
index 0000000..24355db
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/add.broad.peak.regions.Rd
@@ -0,0 +1,27 @@
+\name{add.broad.peak.regions}
+\alias{add.broad.peak.regions}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Add broad regions of enrichment to determined peak positions }
+\description{
+ Looks for broader regions of enrichment associated with the determined
+ peak positions, adds them to the $npl data as $rs, $re columns.
+}
+\usage{
+add.broad.peak.regions(signal.tags, control.tags, binding.postions,window.size=500,z.thr=2)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output
+ of \code{\link{select.informative.tags}}) }
+ \item{control.tags}{ optional control (input) tags }
+ \item{binding.positions}{ output of find.binding.positions call }
+ \item{window.size}{ window size to be used in calculating enrichment }
+ \item{z.thr}{ Z-score corresponding to the Poisson ratio threshold
+ used to flag significantly enriched windows}
+}
+\value{
+ A structure identical to binding.positions with two additional columns
+ added (rs and re) corresponding to start and end of the associated
+ significantly enriched region. If no region was associated with a
+ particular peak, NA values are reported.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/find.binding.positions.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/find.binding.positions.Rd
new file mode 100755
index 0000000..5b67e88
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/find.binding.positions.Rd
@@ -0,0 +1,128 @@
+\name{find.binding.positions}
+\alias{find.binding.positions}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Determine significant point protein binding positions (peaks) }
+\description{
+ Given the signal and optional control (input) data, determine location of the
+ statistically significant point binding positions. If the control data
+ is not provided, the statistical significance can be assessed based on
+ tag randomization. The method also provides options for masking
+ regions exhibiting strong signals within the control data.
+}
+\usage{
+find.binding.positions(signal.data, e.value = NULL, fdr = NULL, masked.data = NULL, control.data = NULL, min.dist = 200, window.size = 4e+07, cluster = NULL, debug = T, n.randomizations = 3, shuffle.window = 1, min.thr = 0, topN = NULL, tag.count.whs = 100, enrichment.z = 2, method = tag.wtd, tec.filter = T, tec.window.size = 10000, tec.masking.window.size=tec.window.size, tec.z = 5, tec.poisson.z=5,tec.poisson.ratio=5, n.control.samples = 1, enrichment.background.scales = c(1, 5, 10), background.density.scaling = F, use.randomized.controls = F, ...)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ ~~ tag data ~~
+ \item{signal.data}{ signal tag vector list }
+ \item{control.data}{ optional control (input) tag vector list }
+
+ ~~ position stringency criteria ~~
+ \item{e.value}{ E-value defining the desired statistical significance
+ of binding positions. }
+ \item{fdr}{ FDR defining statistical significance of binding positions }
+ \item{topN}{ instead of determining statistical significance
+ thresholds, return the specified number of highest-scoring
+ positions}
+
+ ~~ other params ~~
+ \item{whs}{ window half-size that should be used for binding
+ detection (e.g. determined from cross-correlation profiles)}
+ \item{masked.data}{ optional set of coordinates that should be masked
+ (e.g. known non-unique regions) }
+ \item{min.dist}{ minimal distance that must separate detected binding
+ positions. In case multiple binding positions are detected within
+ such distance, the position with the highest score is returned. }
+ \item{window.size}{ size of the window used to segment the chromosome
+ during calculations to reduce memory usage. }
+ \item{cluster}{ optional \code{snow} cluster to parallelize the
+ processing on }
+ \item{min.thr}{ minimal score requirement for a peak }
+ \item{background.density.scaling}{ If TRUE, regions of significant tag
+ enrichment will be masked out when calculating size ratio of the
+ signal to control datasets (to estimate ratio of the background tag
+ density). If FALSE, the dataset ratio will be equal to the ratio of
+ the number of tags in each dataset.}
+
+ ~~ randomized controls ~~
+ \item{n.randomizations}{ number of tag randomizations that should be
+ performed (when the control data is not provided) }
+ \item{use.randomized.controls}{ Use randomized tag control, even if
+ \code{control.data} is supplied. }
+ \item{shuffle.window}{ during tag randomizations, tags will be split
+ into groups of \code{shuffle.window} and will be maintained
+ together throughout the randomization. }
+
+ ~~ fold-enrichment confidence intervals ~~
+ \item{tag.count.whs}{ half-size of a window used to assess fold
+ enrichment of a binding position}
+ \item{enrichment.z}{ Z-score used to define the significance level of
+ the fold-enrichment confidence intervals }
+ \item{enrichment.background.scales}{ In estimating the peak
+ fold-enrichment confidence intervals, the background tag density is
+ estimated based on windows with half-sizes of
+ \code{2*tag.count.whs*enrichment.background.scales}. }
+ \item{method}{ either \code{tag.wtd} for WTD method, or
+ \code{tag.lwcc} for MTC method}
+ \item{mle.filter}{ If turned on, will exclude predicted positions
+ whose MLE enrichment ratio (for any of the background scales) is
+ below a specified min.mle.threshold }
+ \item{min.mle.threshold}{ MLE enrichment ratio threshold that each
+ predicted position must exceed if mle.filter is turned on. }
+
+ ~~ masking regions of significant control enrichment ~~
+ \item{tec.filter}{ Whether to mask out the regions exhibiting
+ significant enrichment in the control data in doing other
+ calculations. The regions are identified using Poisson statistics
+ within sliding windows, either relative to the scaled signal (tec.z), or
+ relative to randomly-distributed expectation (tec.poisson.z).}
+ \item{tec.window.size}{ size of the window used to determine
+ significantly enriched control regions }
+ \item{tec.masking.window.size}{ size of the window used to mask
+ the area around significantly enriched control regions }
+ \item{tec.z}{ Z-score defining statistical stringency by which a given
+ window is determined to be significantly higher in the input than in
+ the signal, and masked if that is the case.}
+ \item{tec.poisson.z}{ Z-score defining statistical stringency by which a given
+ window is determined to be significantly higher than the
+ tec.poisson.ratio above the expected uniform input background. }
+ \item{tec.poisson.ratio}{ Fold ratio by which input must exceed the
+ level expected from the uniform distribution. }
+
+
+
+
+}
+\value{
+ \item{npl}{A per-chromosome list containing data frames describing
+ determined binding positions. Column description:
+ \item{x}{ position }
+ \item{y}{ score }
+ \item{evalue}{ E-value }
+ \item{fdr}{ FDR. For peaks higher than the maximum control peak,
+ the highest dataset FDR is reported }
+ \item{enr}{ lower bound of the fold-enrichment ratio confidence
+ interval. This is the estimate determined using scale of
+ 1. Estimates corresponding to higher scales are returned in other enr columns
+ with scale appearing in the name.}
+ \item{enr.mle}{ enrichment ratio maximum likelihood estimate }
+ }
+ \item{thr}{ info on the chosen statistical threshold of the peak scores}
+}
+
+\examples{
+ # find binding positions using WTD method, 200bp half-window size,
+ # control data, 1\% FDR
+ bp <-
+find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=0.01,method=tag.wtd,whs=200);
+
+ # find binding positions using MTC method, using 5 tag randomizations,
+ # keeping pairs of tag positions together (shuffle.window=2)
+ bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=0.01,method=tag.lwcc,whs=200,use.randomized.controls=T,n.randomizations=5,shuffle.window=2)
+
+ # print out the number of determined positions
+ print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks"));
+
+
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.binding.characteristics.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.binding.characteristics.Rd
new file mode 100755
index 0000000..07f2ae4
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.binding.characteristics.Rd
@@ -0,0 +1,55 @@
+\name{get.binding.characteristics}
+\alias{get.binding.characteristics}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Calculate characteristics of observed DNA-binding signal from
+ cross-correlation profiles }
+\description{
+ The method calculates the strand cross-correlation profile to determine binding
+ peak separation distance and approximate window size that should be used
+ for binding detection. If quality scores were given for the tags, the method
+ also determines which quality bins improve the cross-correlation pattern.
+}
+\usage{
+get.binding.characteristics(data, srange = c(50, 500), bin = 5, cluster = NULL, debug = F, min.tag.count = 1000, acceptance.z.score = 3, remove.tag.anomalies = T, anomalies.z = 5,accept.all.tags=F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{data}{ Tag/quality data: output of \code{read.eland.tags} or similar function }
+ \item{srange}{ A range within which the binding peak separation is
+ expected to fall. Should be larger than probe size to avoid artifacts. }
+ \item{bin}{ Resolution (in basepairs) at which cross-correlation
+ should be calculated. bin=1 is ideal, but takes longer to calculate. }
+ \item{cluster}{ optional snow cluster for parallel processing }
+ \item{debug}{ whether to print debug messages }
+ \item{min.tag.count}{ minimal number of tags on the chromosome to be
+ considered in the cross-correlation calculations }
+ \item{acceptance.z.score}{ A Z-score used to determine if a given tag
+ quality bin provides significant improvement to the strand cross-correlation }
+ \item{remove.tag.anomalies}{ Whether to remove singular tag count peaks prior to
+ calculation. This is recommended, since such positions may distort the
+ cross-correlation profile and increase the necessary computational time. }
+ \item{anomalies.z}{ Z-score for determining if the number of tags at a
+ given position is significantly higher above background, and should be
+ considered an anomaly.}
+ \item{accept.all.tags}{ Whether tag alignment quality calculations
+ should be skipped and all available tags should be accepted in the
+ downstream analysis.}
+}
+\value{
+ \item{cross.correlation }{ Cross-correlation profile as an $x/$y data.frame}
+ \item{peak }{Position ($x) and height ($y) of automatically detected
+ cross-correlation peak.}
+ \item{whs} { Optimized window half-size for binding detection (based
+ on the width of the cross-correlation peak) }
+ \item{quality.bin.acceptance} { A list structure, describing the
+ effect of inclusion of different tag quality bins on
+ cross-correlation, and a resolution on which bins should be
+ considered.
+ \item{informative.bins} { A boolean vector indicating whether the
+ inclusion of tags from the tag quality bin specified in the name
+ attribute significantly increases cross-correlation profile near
+ the peak.}
+ \item{quality.cc} { A list giving the cross-correlation profile
+ after the inclusion of the tags from different quality bins }
+ }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.broad.enrichment.clusters.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.broad.enrichment.clusters.Rd
new file mode 100755
index 0000000..1a6cff0
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.broad.enrichment.clusters.Rd
@@ -0,0 +1,27 @@
+\name{get.broad.enrichment.clusters}
+\alias{get.broad.enrichment.clusters}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Determine broad clusters of enrichment }
+\description{
+ Scan chromosomes with a pre-defined window size, comparing scaled ChIP
+ and input tag counts to see if their ratio exceeds that expected from
+ a Poisson process (normalized for dataset size).
+}
+\usage{
+get.broad.enrichment.clusters(chip.tags, input.tags, window.size=1e3,z.thr=3,tag.shift=146/2)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{chip.tags}{ foreground tag vector list }
+ \item{input.tags}{ background tag vector list }
+ \item{window.size}{ window size to be used for tag counting }
+ \item{z.thr}{ Z-score to be used as a significance threshold }
+ \item{tag.shift}{ number of base pairs by which positive and negative
+ tag coordinates should be shifted towards each other (half of binding
+ peak separation distance)}
+}
+\value{
+ A list of elements corresponding to chromosomes, with each element
+ being an $s/$e/$rv data.frame giving the starting, ending positions and the log2
+ enrichment estimate for that region.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.conservative.fold.enrichment.profile.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.conservative.fold.enrichment.profile.Rd
new file mode 100755
index 0000000..0b20432
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.conservative.fold.enrichment.profile.Rd
@@ -0,0 +1,59 @@
+\name{get.conservative.fold.enrichment.profile}
+\alias{get.conservative.fold.enrichment.profile}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Estimate minimal fold enrichment/depletion along the chromosomes }
+\description{
+ The method provides a statistical assessment of enrichment/depletion
+ along the chromosomes. To assess tag density enrichment/depletion, a
+ sliding window of a specified size (\code{fws}) is used to calculate
+ the density of the foreground tags (\code{ftl}). Multiple, typically
+ larger windows are used to estimate background tag (\code{btl}) density around the
+ same location. The densities are compared as ratios of two Poisson
+ processes to estimate lower bound of foreground enrichment, or upper
+ bound of foreground depletion. If multiple window sizes were used to
+ estimate the background tag density, the most conservative one is
+ chosen for each point.
+}
+\usage{
+get.conservative.fold.enrichment.profile(ftl, btl, fws, bwsl = c(1, 5, 25, 50) * fws, step = 50, tag.shift = 146/2, alpha = 0.05, use.most.informative.scale = F, quick.calculation = T)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{ftl}{ foreground tag vector list }
+ \item{btl}{ background tag vector list }
+ \item{fws}{ foreground window size }
+ \item{bwsl}{ background window scales. The size(s) of background windows
+ will be \code{fws*bwsl}. }
+ \item{step}{ spacing between positions at which the
+ enrichment/depletion is evaluated }
+ \item{tag.shift}{ number of basepairs by which positive and negative
+ tag coordinates should be shifted towards each other (half of binding
+ peak separation distance)}
+ \item{alpha}{ desired level of statistical significance }
+ \item{use.most.informative.scale}{ for each position, instead of
+ evaluating enrichment ratio bounds for all background window scales,
+ choose the one with the highest observed density to speed up the calculations}
+ \item{quick.calculation}{ Use square root transformation method
+ instead of a Bayesian method. This speeds up the calculation
+ considerably and is turned on by default. }
+ \item{background.density.scaling}{ If TRUE, regions of significant tag
+ enrichment will be masked out when calculating size ratio of the
+ signal to control datasets (to estimate ratio of the background tag
+ density). If FALSE, the dataset ratio will be equal to the ratio of
+ the number of tags in each dataset.}
+}
+\value{
+ A list of elements corresponding to chromosomes, with each element
+ being an $x/$y data.frame giving the position and the log2
+ conservative estimate of enrichment/depletion fold ratios around that
+ position.
+ Use \code{\link{writewig}} to output the structure to a WIG
+ file.
+}
+\references{ R.M.Price, D.G. Bonett "Estimating the ratio of two Poisson
+ rates", Comp. Stat & Data Anal. 32(2000) 345}
+\seealso{ \code{\link{get.smoothed.tag.density}} }
+\examples{
+ enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=2*binding.characteristics$whs,step=100,alpha=0.01);
+ writewig(enrichment.estimates,"example.enrichment.estimates.wig","Example conservative fold-enrichment/depletion estimates shown on log2 scale");
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.mser.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.mser.Rd
new file mode 100755
index 0000000..cf60fe8
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.mser.Rd
@@ -0,0 +1,46 @@
+\name{get.mser}
+\alias{get.mser}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Calculate minimal saturated enrichment fold ratio }
+\description{
+ Determine if the dataset has reached absolute saturation, or otherwise
+ find minimal fold enrichment ratio above which the detection of peaks
+ has stabilized enough to meet the saturation criteria.
+}
+\usage{
+get.mser(signal.data, control.data, n.chains = 5, step.size = 1e+05, chains = NULL, cluster = NULL, test.agreement = 0.99, return.chains = F, enrichment.background.scales = c(1), n.steps = 1, ...)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.data}{ signal tag vector list }
+ \item{control.data}{ control tag vector list }
+ \item{n.chains}{ number of dataset subsamples to use }
+ \item{step.size}{ subsampling step describing the saturation
+ criteria. The criteria requires the set of detected binding sites to
+ be stable (as described by the \code{test.agreement} param) when the
+ number of tags in the dataset is reduced by \code{step.size}. The
+ value can either be an integer above one, in which case it specifies a fixed
+ number of tags, or a real value below one, in which case it
+ specifies the fraction of tags that should be removed (e.g. 0.1 will
+ remove 10% of tags).
+ }
+ \item{test.agreement}{ Fraction of the detected peaks that should
+ agree between the full and subsampled datasets. }
+ \item{chains}{ optional parameter, giving pre-calculated chains }
+ \item{cluster}{ optional \code{snow} cluster to parallelize processing }
+
+ \item{return.chains}{ whether subsampled dataset results should be returned as
+ well }
+ \item{enrichment.background.scales}{ one or multiple window scales at
+ which the background tag density should be assessed. See
+ \code{enrichment.background.scales} in
+ \code{\link{find.binding.positions}}. If multiple scales are provided,
+ multiple MSER estimates will be returned.}
+ \item{\dots}{ additional parameters should be the same as those passed
+ to the \code{\link{find.binding.positions}}}
+}
+\value{
+ A single, or multiple (if multiple \code{enrichment.background.scales} were
+ provided) MSER value. A value of 1 or very close to it implies that
+ the dataset has reached absolute saturation based on the given criteria.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.mser.interpolation.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.mser.interpolation.Rd
new file mode 100755
index 0000000..e10b81e
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.mser.interpolation.Rd
@@ -0,0 +1,56 @@
+\name{get.mser.interpolation}
+\alias{get.mser.interpolation}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Interpolate MSER dependency on the tag count }
+\description{
+ MSER generally decreases with increasing sequencing depth. This
+ function interpolates the dependency of MSER on tag counts as a
+ log-log linear function. The log-log fit is used to estimate the depth
+ of sequencing required to reach desired \code{target.fold.enrichment}.
+}
+\usage{
+get.mser.interpolation(signal.data, control.data, target.fold.enrichment = 5, n.chains = 10, n.steps = 6, step.size = 1e+05, chains = NULL, test.agreement = 0.99, return.chains = F, enrichment.background.scales = c(1), excluded.steps = c(seq(2, n.steps - 2)), ...)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.data}{ signal chromosome tag vector list }
+ \item{control.data}{ control chromosome tag vector list }
+ \item{target.fold.enrichment}{ target MSER for which the depth should
+ be estimated}
+ \item{n.steps}{ number of steps in each subset chain. }
+ \item{step.size}{ Either number of tags or fraction of the dataset
+ size, see \code{step.size} parameter for \code{\link{get.mser}}. }
+ \item{test.agreement}{ Fraction of the detected peaks that should
+ agree between the full and subsampled datasets. See \code{test.agreement} parameter for \code{\link{get.mser}}}
+ \item{n.chains}{ number of random subset chains }
+ \item{chains}{ optional structure of pre-calculated chains
+ (e.g. generated by an earlier call with \code{return.chains=T}).}
+
+ \item{return.chains}{ whether to return peak predictions calculated on
+ random chains. These can be passed back using \code{chains} argument
+ to skip subsampling/prediction steps, and just recalculate the depth
+ estimate for a different MSER.}
+ \item{enrichment.background.scales}{ see \code{enrichment.background.scales} parameter for \code{\link{get.mser}} }
+ \item{excluded.steps}{ Intermediate subsampling steps that should be excluded from
+ the chains to speed up the calculation. By default, all intermediate
+ steps except for first two and last two are skipped. Adding
+ intermediate steps improves interpolation at the expense of
+ computational time.}
+ \item{\dots}{ additional parameters are passed to \code{\link{get.mser}} }
+}
+\details{
+ To simulate sequencing growth, the method calculates peak predictions
+ on random chains. Each chain is produced by sequential random
+ subsampling of the original data. The number of steps in the chain
+ indicates how many times the random subsampling will be performed.
+}
+\value{
+ Normally returns a list, specifying for each background scale:
+ \item{prediction}{estimated sequencing depth required to reach
+ specified target MSER}
+ \item{log10.fit}{linear fit model, a result of \code{lm()} call}
+
+ If \code{return.chains=T}, the above structure is returned under
+ \code{interpolation} field, along with \code{chains} field containing
+ results of \code{\link{find.binding.positions}} calls on subsampled chains.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.smoothed.enrichment.mle.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.smoothed.enrichment.mle.Rd
new file mode 100755
index 0000000..fe80329
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.smoothed.enrichment.mle.Rd
@@ -0,0 +1,35 @@
+\name{get.smoothed.enrichment.mle}
+\alias{get.smoothed.enrichment.mle}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Calculate chromosome-wide profiles of smoothed enrichment estimate }
+\description{
+ Given signal and control tag positions, the method calculates log2
+ signal to control enrichment estimates (maximum likelihood) for each
+ chromosome, based on the smoothed tag density profile (see \link{get.smoothed.tag.density}).
+}
+\usage{
+get.smoothed.enrichment.mle(signal.tags, control.tags, bandwidth = 150,tag.shift = 146/2, step = 50)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output
+ of \code{\link{select.informative.tags}}) }
+ \item{control.tags}{ control (input) tags }
+ \item{pseudocount}{ pseudocount value to be added to tag density -
+ defaults to 1 }
+ other parameters (such as bandwidth, step.size and tag.shift) are
+ passed to \link{get.smoothed.tag.density} - see appropriate reference
+ for details.
+}
+\value{
+ A list of elements corresponding to chromosomes, with each element
+ being an $x/$y data.frame giving the position and associated
+ log2 signal/control enrichment estimate.
+}
+\seealso{ \code{\link{writewig}} }
+\examples{
+ # get smoothed enrichment estimate profile using 500bp bandwidth at
+ # 50bp steps
+ smoothed.M <- get.smoothed.enrichment.mle(chip.data,bandwidth=500,step=50);
+ writewig(smoothed.M,"example.smoothedM.wig","Example smoothed log2 intensity ratio estimate");
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.smoothed.tag.density.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.smoothed.tag.density.Rd
new file mode 100755
index 0000000..9807249
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/get.smoothed.tag.density.Rd
@@ -0,0 +1,45 @@
+\name{get.smoothed.tag.density}
+\alias{get.smoothed.tag.density}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Calculate chromosome-wide profiles of smoothed tag density }
+\description{
+ Given tag positions, the method calculates for each chromosome a tag
+ density profile, smoothed by the Gaussian kernel. If the optional
+ control tags are provided, the difference between ChIP and control tag
+ density is returned.
+}
+\usage{
+get.smoothed.tag.density(signal.tags, control.tags = NULL, bandwidth = 150, bg.weight = NULL, tag.shift = 146/2, step = round(bandwidth/3))
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output
+ of \code{\link{select.informative.tags}}) }
+ \item{control.tags}{ optional control (input) tags }
+ \item{bandwidth}{ standard deviation of the Gaussian kernel }
+ \item{bg.weight}{ optional weight by which the background density
+ should be multiplied for scaling. If not supplied, the weight is
+ calculated based on the ratio of the reduced ChIP to input dataset sizes. }
+ \item{tag.shift}{ Distance by which the positive and negative strand
+ tags should be shifted towards each other. This
+ normally corresponds to the half of the cross-correlation peak
+ position (e.g. \code{get.binding.characteristics()}$peak$x/2) }
+ \item{step}{ The distance between the regularly spaced points for
+ which the values should be calculated. }
+ \item{background.density.scaling}{ If TRUE, regions of significant tag
+ enrichment will be masked out when calculating size ratio of the
+ signal to control datasets (to estimate ratio of the background tag
+ density). If FALSE, the dataset ratio will be equal to the ratio of
+ the number of tags in each dataset.}
+}
+\value{
+ A list of elements corresponding to chromosomes, with each element
+ being an $x/$y data.frame giving the position and associated tag
+ density. Use \code{\link{writewig}} to output the structure to a WIG
+ file.
+}
+\seealso{ \code{\link{writewig}} }
+\examples{
+ smoothed.density <- get.smoothed.tag.density(chip.data,control.tags=input.data,bandwidth=200,step=100,tag.shift=round(binding.characteristics$peak$x/2));
+ writewig(smoothed.density,"example.density.wig","Example smoothed, background-subtracted tag density");
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/output.binding.results.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/output.binding.results.Rd
new file mode 100755
index 0000000..eddfe35
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/output.binding.results.Rd
@@ -0,0 +1,24 @@
+\name{output.binding.results}
+\alias{output.binding.results}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Write out determined binding peaks into a text file table }
+\description{
+ Writes out determined binding positions into a text file. The file
+ will contain a table with each row corresponding to a detected
+ position, with the following columns:
+ \item{chr}{ chromosome or target sequence }
+ \item{pos}{ position of detected binding site on the chromosome/sequence}
+ \item{score}{a score reflecting magnitude of the binding}
+ \item{Evalue}{E-value corresponding to the peak magnitude}
+ \item{FDR}{FDR corresponding to the peak magnitude}
+ \item{enrichment.lb}{lower bound of the fold-enrichment ratio}
+ \item{enrichment.mle}{maximum likelihood estimate of the fold-enrichment ratio}
+}
+\usage{
+output.binding.results(results, filename)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{results}{ output of the \code{\link{find.binding.positions}} }
+ \item{filename}{ file name }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bam.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bam.tags.Rd
new file mode 100755
index 0000000..c4a579e
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bam.tags.Rd
@@ -0,0 +1,24 @@
+\name{read.bam.tags}
+\alias{read.bam.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read BAM alignment file }
+\description{
+ Reads in aligned reads from BAM file. Note: no split (non-unique)
+ alignments should be reported in the BAM file.
+}
+\usage{
+read.bam.tags(filename, read.tag.names = F, fix.chromosome.names = F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ BAM file }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bin.maqmap.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bin.maqmap.tags.Rd
new file mode 100755
index 0000000..8260d61
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bin.maqmap.tags.Rd
@@ -0,0 +1,23 @@
+\name{read.bin.maqmap.tags}
+\alias{read.bin.maqmap.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read MAQ binary alignment map file }
+\description{
+ Reads in MAQ binary map alignment result file
+}
+\usage{
+read.bin.maqmap.tags(filename, read.tag.names = F, fix.chromosome.names = T)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ MAQ map output file (binary) }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bowtie.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bowtie.tags.Rd
new file mode 100755
index 0000000..678e9fc
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.bowtie.tags.Rd
@@ -0,0 +1,23 @@
+\name{read.bowtie.tags}
+\alias{read.bowtie.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read bowtie text alignment output file }
+\description{
+ Reads in bowtie alignment results in text format
+}
+\usage{
+read.bowtie.tags(filename, read.tag.names = F, fix.chromosome.names = F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ bowtie text output file }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.eland.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.eland.tags.Rd
new file mode 100755
index 0000000..aa29d6b
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.eland.tags.Rd
@@ -0,0 +1,30 @@
+\name{read.eland.tags}
+\alias{read.eland.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read eland output file }
+\description{
+ Reads in ELAND output file, returning 5'-end tag coordinates and
+ number of mismatches associated with each mapped tag.
+}
+\usage{
+read.eland.tags(filename, read.tag.names = F, fix.chromosome.names = T, max.eland.tag.length = -1,extended=F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ ELAND output file }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+ \item{max.eland.tag.length}{ Specifies max length of the tag sequence
+ considered by ELAND. This needs to be specified if the tags are
+ longer than the sequences considered by ELAND during alignment. }
+ \item{extended}{ Whether the file is written out in "extended" format
+ provided in GA pipeline 1.0. }
+ \item{multi}{ Whether the file is written in "multi" format, showing multiple alignments of the reads }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.maqmap.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.maqmap.tags.Rd
new file mode 100755
index 0000000..31c5309
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.maqmap.tags.Rd
@@ -0,0 +1,23 @@
+\name{read.maqmap.tags}
+\alias{read.maqmap.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read MAQ text alignment output file }
+\description{
+ Reads in MAQ alignment results in text format (that results from "maq mapview" command.)
+}
+\usage{
+read.maqmap.tags(filename, read.tag.names = F, fix.chromosome.names = T)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ MAQ text output file }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.meland.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.meland.tags.Rd
new file mode 100755
index 0000000..c21a815
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/read.meland.tags.Rd
@@ -0,0 +1,29 @@
+\name{read.meland.tags}
+\alias{read.meland.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read modified BED tag alignment file that contains variable match
+ length information }
+\description{
+ Reads in an extended BED tag alignment file. An example line given below:
+ \code{49 . U1 . 1 . . 23 chr2 -234567}
+ The line above specifies that a 23-bp portion of the tag with id 49 was
+ aligned with 1 mismatch to the negative strand of chr2 at position 234567.
+}
+\usage{
+read.meland.tags(filename, read.tag.names = F, fix.chromosome.names = T)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ name of the extended BED file }
+ \item{read.tag.names}{ whether to read in tag names }
+ \item{fix.chromosome.names}{ whether to remove ".fa" from the sequence
+ name ends. }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Quality expressed as a float x.y, where x is
+ tag.length - aligned.tag.portion.length, and y is the number of
+ mismatches (must be less than 10). }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/remove.local.tag.anomalies.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/remove.local.tag.anomalies.Rd
new file mode 100755
index 0000000..705705f
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/remove.local.tag.anomalies.Rd
@@ -0,0 +1,46 @@
+\name{remove.local.tag.anomalies}
+\alias{remove.local.tag.anomalies}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Restrict or remove positions with too many tags relative to
+ local background. }
+\description{
+ In Solexa ChIP-seq experiments some anomalous positions contain
+ extremely high number of tags at the exact coordinates. The function
+ scans the chromosomes, determining local tag density based on a
+ provided \code{window.size}, doing two types of corrections:
+ 1. removing all tags from positions that exceed local density by
+ \code{eliminate.fold}; 2. reducing the tag count at positions
+ exceeding \code{cap.fold} to the maximal allowed count. The
+ statistical significance of counts exceeding either of these two
+ threshold densities is calculated based on Poisson model, with
+ confidence interval determined by the \code{z.threshold} Z-score parameter.
+}
+\usage{
+remove.local.tag.anomalies(tags, window.size = 200, eliminate.fold = 10, cap.fold = 4, z.threshold = 3)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{tags}{ Chromosome-list of tag vectors }
+ \item{window.size}{ Size of the window used to assess local
+ density. Increasing the window size considerably beyond the size of
+ the binding features will result in flattened profiles, with bound
+ positions exhibiting a difference of just 1 tag beyond the background. }
+ \item{eliminate.fold}{ Threshold defining fold-over background
+ density above which the position is considered anomalous and removed
+ completely.}
+ \item{cap.fold}{ Threshold fold-over background density above which
+ the position is capped to the maximum statistically likely given
+ local tag density }
+ \item{z.threshold}{ Z-score used to assess significance of a given
+ position exceeding either of the two density thresholds. }
+}
+\value{
+ A modified chromosome-wise tag vector list.
+}
+\references{ ~put references to the literature/web site here ~ }
+
+\note{ ~~further notes~~
+ Increasing window.size to very large values will result in flat
+ profiles similar to those described by Zhang et al. "Model-based
+ Analysis of ChIP-Seq (MACS)." Genome Biol. 2008 Sep 17;9(9):R137.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/select.informative.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/select.informative.tags.Rd
new file mode 100755
index 0000000..73a4155
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/select.informative.tags.Rd
@@ -0,0 +1,29 @@
+\name{select.informative.tags}
+\alias{select.informative.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Choose informative tags }
+\description{
+ For datasets with tag alignment quality information (e.g. number of
+ mismatches for Eland alignments),
+ \code{\link{get.binding.characteristics}} determines whether inclusion
+ of tags from each specific quality bin improves the cross-correlation
+ profile. The present function is then used to actually select these
+ informative tags, discarding all other information, including quality
+ scores that are not used in further processing.
+}
+\usage{
+select.informative.tags(data, binding.characteristics)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{data}{ Full alignment data (a list with $tags and $quality elements) }
+ \item{binding.characteristics}{ result of a
+ \code{\link{get.binding.characteristics}} call. If NULL value is
+ supplied, all tags will be accepted. }
+}
+\value{
+ A chromosome-wise tag list. Each element of the list corresponds to a
+ chromosome and is a numeric vector of 5' tag coordinates, with sign
+ designating DNA strand.
+ This form of tag data is used for most of the other processing.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/spp-package.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/spp-package.Rd
new file mode 100755
index 0000000..542bafc
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/spp-package.Rd
@@ -0,0 +1,144 @@
+\name{spp-package}
+\alias{spp-package}
+\alias{spp}
+\docType{package}
+\title{
+ChIP-seq (Solexa) Processing Pipeline
+}
+\description{
+A set of routines for reading short sequence alignments, calculating tag
+density, estimates of statistically significant enrichment/depletion
+along the chromosome, identifying point binding positions (peaks), and
+characterizing saturation properties related to sequencing depth.
+}
+\details{
+\tabular{ll}{
+Package: \tab spp\cr
+Type: \tab Package\cr
+Version: \tab 1.8\cr
+Date: \tab 2008-11-14\cr
+License: \tab What license is it under?\cr
+LazyLoad: \tab yes\cr
+}
+See example below for typical processing sequence.
+}
+\author{Peter Kharchenko <peter.kharchenko@post.harvard.edu>}
+\references{
+Kharchenko P., Tolstorukov M., Park P. "Design and analysis of ChIP-seq
+experiments for DNA-binding proteins." Nature Biotech. doi:10.1038/nbt.1508
+}
+
+\examples{
+
+ # load the library
+ library(spp);
+
+ ## The following section shows how to initialize a cluster of 2 nodes for parallel processing
+ ## To enable parallel processing, uncomment the next three lines, and comment out "cluster<-NULL";
+ ## see "snow" package manual for details.
+ #library(snow)
+ #cluster <- makeCluster(2);
+ #invisible(clusterCall(cluster,source,"routines.r"));
+ cluster <- NULL;
+
+
+
+ # read in tag alignments
+ chip.data <- read.eland.tags("chip.eland.alignment");
+ input.data <- read.eland.tags("input.eland.alignment");
+
+ # get binding info from cross-correlation profile
+ # srange gives the possible range for the size of the protected region;
+ # srange should be higher than tag length; making the upper boundary too high will increase calculation time
+ #
+ # bin - bin tags within the specified number of basepairs to speed up calculation;
+ # increasing bin size decreases the accuracy of the determined parameters
+ binding.characteristics <- get.binding.characteristics(chip.data,srange=c(50,500),bin=5,cluster=cluster);
+
+
+ # plot cross-correlation profile
+ pdf(file="example.crosscorrelation.pdf",width=5,height=5)
+ par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8);
+ plot(binding.characteristics$cross.correlation,type='l',xlab="strand shift",ylab="cross-correlation");
+ abline(v=binding.characteristics$peak$x,lty=2,col=2)
+ dev.off();
+
+ # select informative tags based on the binding characteristics
+ chip.data <- select.informative.tags(chip.data,binding.characteristics);
+ input.data <- select.informative.tags(input.data,binding.characteristics);
+
+ # restrict or remove positions with anomalous number of tags relative
+ # to the local density
+ chip.data <- remove.local.tag.anomalies(chip.data);
+ input.data <- remove.local.tag.anomalies(input.data);
+
+
+ # output smoothed tag density (subtracting re-scaled input) into a WIG file
+ # note that the tags are shifted by half of the peak separation distance
+ smoothed.density <- get.smoothed.tag.density(chip.data,control.tags=input.data,bandwidth=200,step=100,tag.shift=round(binding.characteristics$peak$x/2));
+ writewig(smoothed.density,"example.density.wig","Example smoothed, background-subtracted tag density");
+ rm(smoothed.density);
+
+ # output conservative enrichment estimates
+ # alpha specifies significance level at which confidence intervals will be estimated
+ enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=2*binding.characteristics$whs,step=100,alpha=0.01);
+ writewig(enrichment.estimates,"example.enrichment.estimates.wig","Example conservative fold-enrichment/depletion estimates shown on log2 scale");
+ rm(enrichment.estimates);
+
+
+ # binding detection parameters
+ # desired FDR. Alternatively, an E-value can be supplied to the method calls below instead of the fdr parameter
+ fdr <- 1e-2;
+ # the binding.characteristics contains the optimized half-size for binding detection window
+ detection.window.halfsize <- binding.characteristics$whs;
+
+ # determine binding positions using wtd method
+ bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize,cluster=cluster)
+
+ # alternatively, determine binding positions using lwcc method (note: this takes longer than wtd)
+ # bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.lwcc,whs=detection.window.halfsize,cluster=cluster)
+
+ print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks"));
+
+ # output detected binding positions
+ output.binding.results(bp,"example.binding.positions.txt");
+
+
+ # -------------------------------------------------------------------------------------------
+ # the set of commands in the following section illustrates methods for saturation analysis
+ # these are separated from the previous section, since they are highly CPU intensive
+ # -------------------------------------------------------------------------------------------
+
+ # determine MSER
+ # note: this will take approximately 10-15x the amount of time the initial binding detection did
+ # The saturation criteria here is 0.99 consistency in the set of binding positions when adding 1e5 tags.
+ # To ensure convergence the number of subsampled chains (n.chains) should be higher (80)
+ mser <- get.mser(chip.data,input.data,step.size=1e5,test.agreement=0.99,n.chains=8,cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize)
+
+ print(paste("MSER at a current depth is",mser));
+
+ # note: an MSER value of 1 or very near one implies that the set of detected binding positions satisfies saturation criteria without
+ # additional selection by fold-enrichment ratios. In other words, the dataset has reached saturation in a traditional sense (absolute saturation).
+
+ # interpolate MSER dependency on tag count
+ # note: this requires considerably more calculations than the previous steps (~ 3x more than the first MSER calculation)
+ # Here we interpolate MSER dependency to determine a point at which MSER of 2 is reached
+ # The interpolation will be based on the difference in MSER at the current depth, and a depth at 5e5 fewer tags (n.steps=6);
+ # evaluation of the intermediate points is omitted here to speed up the calculation (excluded.steps parameter)
+ # A total of 7 chains is used here to speed up calculation, whereas a higher number of chains (50) would give good convergence
+ msers <- get.mser.interpolation(chip.data,input.data,step.size=1e5,test.agreement=0.99, target.fold.enrichment=2, n.chains=7,n.steps=6,excluded.steps=c(2:4),cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize)
+
+ print(paste("predicted sequencing depth =",round(unlist(lapply(msers,function(x) x$prediction))/1e6,5)," million tags"))
+
+
+ # note: the interpolation will return NA prediction if the dataset has reached absolute saturation at the current depth.
+ # note: use return.chains=T to also return the calculated random chains (returned under msers$chains field) - these can be passed back as
+ # "get.mser.interpolation( ..., chains=msers$chains)" to calculate predictions for another target.fold.enrichment value
+ # without having to recalculate the random chain predictions.
+
+ ## stop cluster if it was initialized
+ #stopCluster(cluster);
+
+
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/write.broadpeak.info.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/write.broadpeak.info.Rd
new file mode 100755
index 0000000..0ed5f66
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/write.broadpeak.info.Rd
@@ -0,0 +1,16 @@
+\name{write.broadpeak.info}
+\alias{write.broadpeak.info}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Write out determined broad enrichment regions using broadPeak format }
+\description{
+ Writes out broad regions of enrichment determined by the
+ get.broad.enrichment.clusters method in a broadPeak format.
+}
+\usage{
+write.broadpeak.info(broadpeak.results, filename)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{broadpeak.results}{ output of the \code{\link{get.broad.enrichment.clusters}} }
+ \item{filename}{ file name }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/write.narrowpeak.binding.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/write.narrowpeak.binding.Rd
new file mode 100755
index 0000000..ca259bb
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/write.narrowpeak.binding.Rd
@@ -0,0 +1,21 @@
+\name{write.narrowpeak.binding}
+\alias{write.narrowpeak.binding}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Write out determined binding peaks using narrowPeak format }
+\description{
+ Writes out determined binding positions into a narrowPeak file.
+ The region will correspond to associated broad enrichment region, if
+ such were added using add.broad.peak.regions method. Otherwise the
+ region size will be determined using margin (which defaults to the
+ window half size that was used to determine binding positions)
+}
+\usage{
+write.narrowpeak.binding(results, filename,margin=results$whs)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{results}{ output of the \code{\link{find.binding.positions}} }
+ \item{filename}{ file name }
+ \item{margin}{ explicit value of the margin to be used if the broad
+ region information is absent (defaults to peak detection window half-size) }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/writewig.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/writewig.Rd
new file mode 100755
index 0000000..f7e23d9
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/man/writewig.Rd
@@ -0,0 +1,31 @@
+\name{writewig}
+\alias{writewig}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ A function to save a list of chromosome-wise x/y data frames
+ into a WIG file format. }
+\description{
+ Takes a list that contains an $x and $y data.frame for a number of
+ chromosomes and writes it out to a WIG BED style format.
+}
+\usage{
+writewig(dat, fname, feature, threshold = 5, zip = F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{dat}{ Chromosome coordinate-value data. \code{dat} is a list,
+ each member of a list is a data frame with $x and $y columns
+ containing chromosome positions and associated values. The names of
+ the list elements correspond to the chromosomes. }
+ \item{fname}{ Filename to which the output should be written }
+ \item{feature}{ Data description to be incorporated into the WIG header }
+ \item{threshold}{ Optional threshold to be saved in the WIG file}
+ \item{zip}{ Whether to invoke a zip program to compress the file }
+}
+
+\seealso{ ~~objects to See Also as \code{\link{help}}, ~~~ }
+\examples{
+
+data <- list("chr1"=data.frame(x=c(100,130,200),y=c(1.2,4.0,2.3)));
+writewig(data,"filename");
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BGZF.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BGZF.cpp
new file mode 100755
index 0000000..6a89987
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BGZF.cpp
@@ -0,0 +1,398 @@
+// ***************************************************************************
+// BGZF.cpp (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading & writing BGZF files
+// ***************************************************************************
+
+#include <BGZF.h>
+using namespace BamTools;
+
+#include <algorithm>
+using namespace std;
+
+// Default constructor: puts every member into its closed/empty state and
+// allocates the two working buffers (one for compressed, one for
+// uncompressed block data). If either allocation throws std::bad_alloc,
+// an error is printed to stderr and the process exits with status 1.
+BgzfData::BgzfData(void)
+    : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
+    , CompressedBlockSize(MAX_BLOCK_SIZE)
+    , BlockLength(0)
+    , BlockOffset(0)
+    , BlockAddress(0)
+    , IsOpen(false)
+    , IsWriteOnly(false)
+    , IsWriteUncompressed(false)
+    , Stream(NULL)
+    , UncompressedBlock(NULL)
+    , CompressedBlock(NULL)
+{
+    try {
+        CompressedBlock   = new char[CompressedBlockSize];
+        UncompressedBlock = new char[UncompressedBlockSize];
+    }
+    catch ( const std::bad_alloc& ) {
+        // running without the block buffers is impossible, so give up
+        fprintf(stderr, "BGZF ERROR: unable to allocate memory for our BGZF object.\n");
+        exit(1);
+    }
+}
+
+// destructor: releases both block buffers
+// (delete[] on a null pointer does nothing, so no explicit null test is needed)
+BgzfData::~BgzfData(void) {
+    delete[] CompressedBlock;
+    delete[] UncompressedBlock;
+}
+
+// closes BGZF file
+//
+// Does nothing when the file is not open. In write mode the buffered data
+// is flushed first and one further (empty) BGZF block is written as the
+// EOF marker. Afterwards the stream is closed, the handle is cleared so
+// that no dangling FILE* is left behind, and the open/uncompressed flags
+// are reset.
+void BgzfData::Close(void) {
+
+    // skip if file not open
+    if ( !IsOpen ) return;
+
+    // if writing to file, flush the current BGZF block,
+    // then write an empty block (as EOF marker)
+    if ( IsWriteOnly ) {
+        FlushBlock();
+        const int blockLength = DeflateBlock();
+        const size_t numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
+
+        // report (but do not abort on) a truncated EOF marker
+        if ( numBytesWritten != (size_t)blockLength )
+            fprintf(stderr, "BGZF ERROR: unable to write EOF marker block while closing.\n");
+    }
+
+    // flush and close, then drop the now-invalid handle
+    fflush(Stream);
+    fclose(Stream);
+    Stream = NULL;
+    IsWriteUncompressed = false;
+    IsOpen = false;
+}
+
+// compresses the current block
+//
+// Deflates the BlockOffset bytes buffered in UncompressedBlock into
+// CompressedBlock as one complete BGZF block: gzip header carrying the BGZF
+// extra field, the deflate payload, then the CRC32 and input-size footer.
+// When the data does not fit into one block, the input length is reduced
+// in 1024-byte steps and compression is retried; the bytes that were not
+// consumed are moved to the front of UncompressedBlock and BlockOffset is
+// set to their count.
+//
+// Returns the total length in bytes of the block stored in CompressedBlock.
+// Every zlib failure is fatal: a message is printed and the process exits.
+int BgzfData::DeflateBlock(void) {
+
+    // initialize the gzip header
+    char* buffer = CompressedBlock;
+    memset(buffer, 0, 18);
+    buffer[0]  = GZIP_ID1;
+    buffer[1]  = (char)GZIP_ID2;
+    buffer[2]  = CM_DEFLATE;
+    buffer[3]  = FLG_FEXTRA;
+    buffer[9]  = (char)OS_UNKNOWN;
+    buffer[10] = BGZF_XLEN;
+    buffer[12] = BGZF_ID1;
+    buffer[13] = BGZF_ID2;
+    buffer[14] = BGZF_LEN;
+
+    // set compression level
+    const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );
+
+    // loop to retry for blocks that do not compress enough
+    int inputLength = BlockOffset;
+    int compressedLength = 0;
+    unsigned int bufferSize = CompressedBlockSize;
+
+    while ( true ) {
+
+        // initialize zstream values
+        // (zlib requires zalloc, zfree AND opaque to be set before deflateInit2)
+        z_stream zs;
+        zs.zalloc    = NULL;
+        zs.zfree     = NULL;
+        zs.opaque    = NULL;
+        zs.next_in   = (Bytef*)UncompressedBlock;
+        zs.avail_in  = inputLength;
+        zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
+        zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+        // initialize the zlib compression algorithm
+        if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {
+            fprintf(stderr, "BGZF ERROR: zlib deflate initialization failed.\n");
+            exit(1);
+        }
+
+        // compress the data
+        int status = deflate(&zs, Z_FINISH);
+        if ( status != Z_STREAM_END ) {
+
+            deflateEnd(&zs);
+
+            // Z_OK here means the output buffer filled up before the stream
+            // could be finished: reduce the input length and try again
+            if ( status == Z_OK ) {
+                inputLength -= 1024;
+                if( inputLength < 0 ) {
+                    fprintf(stderr, "BGZF ERROR: input reduction failed.\n");
+                    exit(1);
+                }
+                continue;
+            }
+
+            // any other status is a genuine failure of deflate() itself
+            fprintf(stderr, "BGZF ERROR: zlib::deflate() failed.\n");
+            exit(1);
+        }
+
+        // finalize the compression routine
+        if ( deflateEnd(&zs) != Z_OK ) {
+            fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n");
+            exit(1);
+        }
+
+        compressedLength = zs.total_out;
+        compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+        if ( compressedLength > MAX_BLOCK_SIZE ) {
+            fprintf(stderr, "BGZF ERROR: deflate overflow.\n");
+            exit(1);
+        }
+
+        break;
+    }
+
+    // store the compressed length (total block size minus one)
+    BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
+
+    // store the CRC32 checksum and the uncompressed size in the footer
+    unsigned int crc = crc32(0, NULL, 0);
+    crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+    // ensure that we have less than a block of data left
+    int remaining = BlockOffset - inputLength;
+    if ( remaining > 0 ) {
+        if ( remaining > inputLength ) {
+            fprintf(stderr, "BGZF ERROR: after deflate, remainder too large.\n");
+            exit(1);
+        }
+        // remaining <= inputLength, so source and destination cannot overlap
+        memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
+    }
+
+    BlockOffset = remaining;
+    return compressedLength;
+}
+
+// flushes the data in the BGZF block
+//
+// Compresses and writes blocks until no buffered bytes are left, advancing
+// BlockAddress by the size of every block written. A short write is fatal:
+// an error is printed and the process exits.
+void BgzfData::FlushBlock(void) {
+
+    // DeflateBlock() updates BlockOffset, so loop until the buffer is drained
+    while ( BlockOffset != 0 ) {
+
+        const int compressedBytes = DeflateBlock();
+        const int writtenBytes    = fwrite(CompressedBlock, 1, compressedBytes, Stream);
+
+        if ( writtenBytes != compressedBytes ) {
+            fprintf(stderr, "BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", compressedBytes, writtenBytes);
+            exit(1);
+        }
+
+        BlockAddress += compressedBytes;
+    }
+}
+
+// de-compresses the current block
+//
+// Inflates the BGZF block held in CompressedBlock into UncompressedBlock.
+// 'blockLength' is the total length of the compressed block, header and
+// footer included. Returns the number of uncompressed bytes produced, or
+// -1 (after printing an error) when the length is implausible or zlib
+// reports a failure.
+int BgzfData::InflateBlock(const int& blockLength) {
+
+    // a block shorter than header + footer cannot be valid; rejecting it
+    // also keeps avail_in below from going negative
+    if ( blockLength < BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH ) {
+        fprintf(stderr, "BGZF ERROR: could not decompress block - invalid block length\n");
+        return -1;
+    }
+
+    // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
+    // (zlib requires zalloc, zfree AND opaque to be set before inflateInit2)
+    z_stream zs;
+    zs.zalloc    = NULL;
+    zs.zfree     = NULL;
+    zs.opaque    = NULL;
+    zs.next_in   = (Bytef*)CompressedBlock + 18;
+    zs.avail_in  = blockLength - 16;
+    zs.next_out  = (Bytef*)UncompressedBlock;
+    zs.avail_out = UncompressedBlockSize;
+
+    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+    if ( status != Z_OK ) {
+        fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");
+        return -1;
+    }
+
+    status = inflate(&zs, Z_FINISH);
+    if ( status != Z_STREAM_END ) {
+        inflateEnd(&zs);
+        fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflate() failed\n");
+        return -1;
+    }
+
+    status = inflateEnd(&zs);
+    if ( status != Z_OK ) {
+        fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");
+        return -1;
+    }
+
+    return zs.total_out;
+}
+
+// opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)
+//
+// 'filename' may be a path, or the special names "stdin" (read mode only)
+// and "stdout" (write mode only), which re-open the corresponding standard
+// stream in binary mode. 'isWriteUncompressed' is stored and later selects
+// compression level 0 when blocks are deflated.
+//
+// Returns true on success. Returns false (after printing an error) for an
+// unknown mode, for "stdin"/"stdout" combined with the wrong mode, or when
+// the stream cannot be opened; in those cases Stream is left untouched, so
+// a stale handle is never mistaken for a freshly opened one.
+bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) {
+
+    // determine open mode
+    const bool isReadMode  = ( strcmp(mode, "rb") == 0 );
+    const bool isWriteMode = ( strcmp(mode, "wb") == 0 );
+    if ( !isReadMode && !isWriteMode ) {
+        fprintf(stderr, "BGZF ERROR: unknown file mode: %s\n", mode);
+        return false;
+    }
+    IsWriteOnly = isWriteMode;
+
+    // ----------------------------------------------------------------
+    // open Stream to read to/write from file, stdin, or stdout
+    // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)
+
+    // the result is kept in a local so that an unsupported name/mode
+    // combination cannot fall through with a previous value of Stream
+    FILE* opened = NULL;
+
+    // read BGZF data from stdin
+    if ( filename == "stdin" ) {
+        if ( isReadMode ) opened = freopen(NULL, mode, stdin);
+    }
+
+    // write BGZF data to stdout
+    else if ( filename == "stdout" ) {
+        if ( isWriteMode ) opened = freopen(NULL, mode, stdout);
+    }
+
+    // read/write BGZF data to/from a file
+    else
+        opened = fopen(filename.c_str(), mode);
+
+    if ( !opened ) {
+        fprintf(stderr, "BGZF ERROR: unable to open file %s\n", filename.c_str() );
+        return false;
+    }
+
+    // set stream & flags, return success
+    Stream = opened;
+    IsOpen = true;
+    IsWriteUncompressed = isWriteUncompressed;
+    return true;
+}
+
+// reads BGZF data into a byte buffer
+//
+// Copies up to 'dataLength' decompressed bytes into 'data', loading further
+// blocks through ReadBlock() whenever the current one is used up.
+// Returns the number of bytes copied (which is smaller than 'dataLength'
+// when no more data is available), 0 when the file is not open, is open
+// for writing, or 'dataLength' is 0, and -1 when ReadBlock() fails.
+int BgzfData::Read(char* data, const unsigned int dataLength) {
+
+ if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0;
+
+ char* output = data;
+ unsigned int numBytesRead = 0;
+ while ( numBytesRead < dataLength ) {
+
+ // the unsigned difference is stored in an int, so an offset beyond the
+ // block length (possible after Seek(), which sets BlockLength to 0)
+ // shows up as a non-positive value and triggers a block read
+ int bytesAvailable = BlockLength - BlockOffset;
+ if ( bytesAvailable <= 0 ) {
+ if ( !ReadBlock() ) return -1;
+ bytesAvailable = BlockLength - BlockOffset;
+ // still nothing to copy after loading a block: stop reading
+ if ( bytesAvailable <= 0 ) break;
+ }
+
+ // copy as much as is both requested and available in this block
+ char* buffer = UncompressedBlock;
+ int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
+ memcpy(output, buffer + BlockOffset, copyLength);
+
+ BlockOffset += copyLength;
+ output += copyLength;
+ numBytesRead += copyLength;
+ }
+
+ // block fully consumed: record the current file position as the address
+ // of the next block and reset the in-block state
+ if ( BlockOffset == BlockLength ) {
+ BlockAddress = ftell64(Stream);
+ BlockOffset = 0;
+ BlockLength = 0;
+ }
+
+ return numBytesRead;
+}
+
+// reads a BGZF block
+//
+// Reads the next block header from Stream, validates it, reads the rest of
+// the block into CompressedBlock and inflates it into UncompressedBlock.
+// On success BlockAddress is set to the file position at which the block
+// started and BlockLength to the number of decompressed bytes.
+//
+// Returns true on success and also when no header bytes could be read at
+// all (BlockLength is then 0). Returns false, after printing an error, on a
+// truncated or invalid header, an implausible block length, a short read
+// of the block body, or a decompression failure.
+bool BgzfData::ReadBlock(void) {
+
+    char header[BLOCK_HEADER_LENGTH];
+    int64_t blockAddress = ftell64(Stream);
+
+    const size_t headerBytes = fread(header, 1, sizeof(header), Stream);
+    if ( headerBytes == 0 ) {
+        BlockLength = 0;
+        return true;
+    }
+
+    if ( headerBytes != sizeof(header) ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - could not read block header\n");
+        return false;
+    }
+
+    if ( !BgzfData::CheckBlockHeader(header) ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - invalid block header\n");
+        return false;
+    }
+
+    // header bytes 16-17 hold the total block length minus one
+    const int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
+
+    // The length comes straight from the file. Reject values that cannot
+    // describe a real block (shorter than header + footer) or that exceed
+    // the buffer; otherwise 'remaining' below could become negative and be
+    // turned into a huge byte count by fread().
+    if ( blockLength < BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH ||
+         (unsigned int)blockLength > CompressedBlockSize ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - invalid block length\n");
+        return false;
+    }
+
+    char* compressedBlock = CompressedBlock;
+    memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
+    const int remaining = blockLength - BLOCK_HEADER_LENGTH;
+
+    const size_t bodyBytes = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
+    if ( bodyBytes != (size_t)remaining ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - could not read data from block\n");
+        return false;
+    }
+
+    const int count = InflateBlock(blockLength);
+    if ( count < 0 ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - could not decompress block data\n");
+        return false;
+    }
+
+    // keep an offset that was set while no block was loaded (BlockLength == 0,
+    // e.g. after Seek()); otherwise start at the beginning of the new block
+    if ( BlockLength != 0 )
+        BlockOffset = 0;
+
+    BlockAddress = blockAddress;
+    BlockLength  = count;
+    return true;
+}
+
+// seek to position in BGZF file
+//
+// 'position' packs two values: the bits above the lowest 16 (masked to 48
+// bits) are the file offset of a block, the lowest 16 bits the offset
+// inside that block's uncompressed data. Only the file position is moved
+// here; BlockLength is reset to 0 so that no block counts as loaded.
+// Returns false when the file is not open or the underlying seek fails.
+bool BgzfData::Seek(int64_t position) {
+
+    if ( !IsOpen ) return false;
+
+    const int64_t fileAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
+    const int     withinBlock = (position & 0xFFFF);
+
+    const bool seekFailed = ( fseek64(Stream, fileAddress, SEEK_SET) != 0 );
+    if ( seekFailed ) {
+        fprintf(stderr, "BGZF ERROR: unable to seek in file\n");
+        return false;
+    }
+
+    BlockAddress = fileAddress;
+    BlockOffset  = withinBlock;
+    BlockLength  = 0;
+    return true;
+}
+
+// get file position in BGZF file
+//
+// Returns BlockAddress shifted left by 16 bits combined with the low 16
+// bits of BlockOffset, or 0 when the file is not open.
+int64_t BgzfData::Tell(void) {
+    if ( IsOpen )
+        return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
+    return 0;
+}
+
+// writes the supplied data into the BGZF buffer
+//
+// Appends 'dataLen' bytes from 'data' to the uncompressed block buffer and
+// calls FlushBlock() every time the buffer becomes full. Returns the number
+// of bytes accepted, or 0 when the file is not open for writing.
+unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
+
+    // nothing can be written unless the file is open in write mode
+    if ( !(IsOpen && IsWriteOnly) ) return 0;
+
+    const unsigned int capacity = UncompressedBlockSize;
+    unsigned int written = 0;
+
+    // copy the data to the buffer, one buffer-sized piece at a time
+    while ( written < dataLen ) {
+
+        const unsigned int room    = capacity - BlockOffset;
+        const unsigned int pending = dataLen - written;
+        const unsigned int chunk   = ( pending < room ) ? pending : room;
+
+        memcpy(UncompressedBlock + BlockOffset, data + written, chunk);
+        BlockOffset += chunk;
+        written     += chunk;
+
+        // buffer full: compress it and write it out
+        if ( BlockOffset == capacity )
+            FlushBlock();
+    }
+
+    return written;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BGZF.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BGZF.h
new file mode 100755
index 0000000..46b82a3
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BGZF.h
@@ -0,0 +1,322 @@
+// ***************************************************************************
+// BGZF.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading & writing BGZF files
+// ***************************************************************************
+
+#ifndef BGZF_H
+#define BGZF_H
+
+#include <api_global.h>
+#include <zlib.h>
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+// Platform-specific large-file support
+#ifndef BAMTOOLS_LFS
+#define BAMTOOLS_LFS
+ #ifdef WIN32
+ #define ftell64(a) _ftelli64(a)
+ #define fseek64(a,b,c) _fseeki64(a,b,c)
+ #else
+ #define ftell64(a) ftello(a)
+ #define fseek64(a,b,c) fseeko(a,b,c)
+ #endif
+#endif // BAMTOOLS_LFS
+
+// Platform-specific type definitions
+#ifndef BAMTOOLS_TYPES
+#define BAMTOOLS_TYPES
+ #ifdef _MSC_VER
+ typedef char int8_t;
+ typedef unsigned char uint8_t;
+ typedef short int16_t;
+ typedef unsigned short uint16_t;
+ typedef int int32_t;
+ typedef unsigned int uint32_t;
+ typedef long long int64_t;
+ typedef unsigned long long uint64_t;
+ #else
+ #include <stdint.h>
+ #endif
+#endif // BAMTOOLS_TYPES
+
+namespace BamTools {
+
+// zlib constants
+// (values written into / expected in the fixed positions of each block
+// header, plus the parameters handed to zlib when (de)compressing)
+const int GZIP_ID1 = 31; // header byte 0
+const int GZIP_ID2 = 139; // header byte 1
+const int CM_DEFLATE = 8; // header byte 2
+const int FLG_FEXTRA = 4; // header byte 3 (flag bit that must be set)
+const int OS_UNKNOWN = 255; // header byte 9
+const int BGZF_XLEN = 6; // header bytes 10-11
+const int BGZF_ID1 = 66; // header byte 12
+const int BGZF_ID2 = 67; // header byte 13
+const int BGZF_LEN = 2; // header bytes 14-15
+const int GZIP_WINDOW_BITS = -15; // windowBits argument of deflateInit2/inflateInit2
+const int Z_DEFAULT_MEM_LEVEL = 8; // memLevel argument of deflateInit2
+
+// BGZF constants
+const int BLOCK_HEADER_LENGTH = 18; // bytes preceding the deflate payload
+const int BLOCK_FOOTER_LENGTH = 8; // CRC32 (4 bytes) + uncompressed size (4 bytes)
+const int MAX_BLOCK_SIZE = 65536; // upper bound for a compressed block
+const int DEFAULT_BLOCK_SIZE = 65536; // size of the uncompressed block buffer
+
+// State and operations for reading or writing one BGZF file: the underlying
+// FILE stream, one buffer holding a compressed block and one holding its
+// uncompressed data, and the position bookkeeping for the current block.
+//
+// NOTE(review): no copy constructor or assignment operator is declared,
+// while the destructor delete[]s both buffers - copying an instance would
+// presumably free the buffers twice; confirm that no caller copies it.
+struct API_EXPORT BgzfData {
+
+ // data members
+ public:
+ unsigned int UncompressedBlockSize; // allocated size of UncompressedBlock
+ unsigned int CompressedBlockSize; // allocated size of CompressedBlock
+ unsigned int BlockLength; // reading: number of uncompressed bytes in the loaded block
+ unsigned int BlockOffset; // reading: next byte to hand out; writing: bytes buffered so far
+ uint64_t BlockAddress; // file offset associated with the current block
+ bool IsOpen; // true between a successful Open() and Close()
+ bool IsWriteOnly; // true when opened with mode "wb"
+ bool IsWriteUncompressed; // true selects compression level 0 when deflating
+ FILE* Stream; // underlying file, stdin or stdout
+ char* UncompressedBlock; // buffer of UncompressedBlockSize bytes
+ char* CompressedBlock; // buffer of CompressedBlockSize bytes
+
+ // constructor & destructor
+ public:
+ BgzfData(void);
+ ~BgzfData(void);
+
+ // main interface methods
+ public:
+ // closes BGZF file
+ void Close(void);
+ // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)
+ bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false);
+ // reads BGZF data into a byte buffer
+ int Read(char* data, const unsigned int dataLength);
+ // seek to position in BGZF file
+ bool Seek(int64_t position);
+ // get file position in BGZF file
+ int64_t Tell(void);
+ // writes the supplied data into the BGZF buffer
+ unsigned int Write(const char* data, const unsigned int dataLen);
+
+ // internal methods
+ private:
+ // compresses the current block
+ int DeflateBlock(void);
+ // flushes the data in the BGZF block
+ void FlushBlock(void);
+ // de-compresses the current block
+ int InflateBlock(const int& blockLength);
+ // reads a BGZF block
+ bool ReadBlock(void);
+
+ // static 'utility' methods
+ // (the Pack* helpers store the least significant byte first; the Unpack*
+ // helpers copy the buffer bytes into the value's memory in buffer order)
+ public:
+ // checks BGZF block header
+ static inline bool CheckBlockHeader(char* header);
+ // packs an unsigned integer into the specified buffer
+ static inline void PackUnsignedInt(char* buffer, unsigned int value);
+ // packs an unsigned short into the specified buffer
+ static inline void PackUnsignedShort(char* buffer, unsigned short value);
+ // unpacks a buffer into a double
+ static inline double UnpackDouble(char* buffer);
+ static inline double UnpackDouble(const char* buffer);
+ // unpacks a buffer into a float
+ static inline float UnpackFloat(char* buffer);
+ static inline float UnpackFloat(const char* buffer);
+ // unpacks a buffer into a signed int
+ static inline signed int UnpackSignedInt(char* buffer);
+ static inline signed int UnpackSignedInt(const char* buffer);
+ // unpacks a buffer into a signed short
+ static inline signed short UnpackSignedShort(char* buffer);
+ static inline signed short UnpackSignedShort(const char* buffer);
+ // unpacks a buffer into an unsigned int
+ static inline unsigned int UnpackUnsignedInt(char* buffer);
+ static inline unsigned int UnpackUnsignedInt(const char* buffer);
+ // unpacks a buffer into an unsigned short
+ static inline unsigned short UnpackUnsignedShort(char* buffer);
+ static inline unsigned short UnpackUnsignedShort(const char* buffer);
+};
+
+// -------------------------------------------------------------
+// static 'utility' method implementations
+
+// checks BGZF block header
+//
+// Returns true only when every fixed field of the 18-byte header carries
+// the value a BGZF block must have; the first mismatch ends the check.
+inline
+bool BgzfData::CheckBlockHeader(char* header) {
+
+    // gzip identification bytes and compression method
+    if ( header[0] != GZIP_ID1 )         return false;
+    if ( header[1] != (char)GZIP_ID2 )   return false;
+    if ( header[2] != Z_DEFLATED )       return false;
+
+    // the "extra field present" flag must be set
+    if ( (header[3] & FLG_FEXTRA) == 0 ) return false;
+
+    // extra field: total length, BGZF subfield identifier, subfield length
+    if ( BgzfData::UnpackUnsignedShort(&header[10]) != BGZF_XLEN ) return false;
+    if ( header[12] != BGZF_ID1 )        return false;
+    if ( header[13] != BGZF_ID2 )        return false;
+
+    return ( BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN );
+}
+
+// 'packs' an unsigned integer into the specified buffer
+// (writes the four low-order bytes of 'value', least significant byte first)
+inline
+void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) {
+    for ( int byteIndex = 0; byteIndex < 4; ++byteIndex )
+        buffer[byteIndex] = (char)(value >> (8 * byteIndex));
+}
+
+// 'packs' an unsigned short into the specified buffer
+// (writes the two low-order bytes of 'value', least significant byte first)
+inline
+void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) {
+    for ( int byteIndex = 0; byteIndex < 2; ++byteIndex )
+        buffer[byteIndex] = (char)(value >> (8 * byteIndex));
+}
+
+// 'unpacks' a buffer into a double (includes both non-const & const char* flavors)
+//
+// Copies the first 8 bytes of 'buffer', in buffer order, into the memory of
+// a zero-initialized double and returns it. std::memcpy is used instead of
+// writing one union member and reading another, which C++ does not define.
+inline
+double BgzfData::UnpackDouble(char* buffer) {
+    // same work as the const flavor - forward to it
+    return BgzfData::UnpackDouble((const char*)buffer);
+}
+
+inline
+double BgzfData::UnpackDouble(const char* buffer) {
+    double value = 0;
+    std::memcpy(&value, buffer, 8);
+    return value;
+}
+
+// 'unpacks' a buffer into a float (includes both non-const & const char* flavors)
+//
+// Copies the first 4 bytes of 'buffer', in buffer order, into the memory of
+// a zero-initialized float and returns it. std::memcpy is used instead of
+// writing one union member and reading another, which C++ does not define.
+inline
+float BgzfData::UnpackFloat(char* buffer) {
+    // same work as the const flavor - forward to it
+    return BgzfData::UnpackFloat((const char*)buffer);
+}
+
+inline
+float BgzfData::UnpackFloat(const char* buffer) {
+    float value = 0;
+    std::memcpy(&value, buffer, 4);
+    return value;
+}
+
+// 'unpacks' a buffer into a signed int (includes both non-const & const char* flavors)
+//
+// Copies the first 4 bytes of 'buffer', in buffer order, into the memory of
+// a zero-initialized signed int and returns it. std::memcpy is used instead
+// of writing one union member and reading another, which C++ does not define.
+inline
+signed int BgzfData::UnpackSignedInt(char* buffer) {
+    // same work as the const flavor - forward to it
+    return BgzfData::UnpackSignedInt((const char*)buffer);
+}
+
+inline
+signed int BgzfData::UnpackSignedInt(const char* buffer) {
+    signed int value = 0;
+    std::memcpy(&value, buffer, 4);
+    return value;
+}
+
+// 'unpacks' a buffer into a signed short (includes both non-const & const char* flavors)
+//
+// Copies the first 2 bytes of 'buffer', in buffer order, into the memory of
+// a zero-initialized signed short and returns it. std::memcpy is used instead
+// of writing one union member and reading another, which C++ does not define.
+inline
+signed short BgzfData::UnpackSignedShort(char* buffer) {
+    // same work as the const flavor - forward to it
+    return BgzfData::UnpackSignedShort((const char*)buffer);
+}
+
+inline
+signed short BgzfData::UnpackSignedShort(const char* buffer) {
+    signed short value = 0;
+    std::memcpy(&value, buffer, 2);
+    return value;
+}
+
+// 'unpacks' a buffer into an unsigned int (includes both non-const & const char* flavors)
+//
+// Copies the first 4 bytes of 'buffer', in buffer order, into the memory of
+// a zero-initialized unsigned int and returns it. std::memcpy is used instead
+// of writing one union member and reading another, which C++ does not define.
+inline
+unsigned int BgzfData::UnpackUnsignedInt(char* buffer) {
+    // same work as the const flavor - forward to it
+    return BgzfData::UnpackUnsignedInt((const char*)buffer);
+}
+
+inline
+unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) {
+    unsigned int value = 0;
+    std::memcpy(&value, buffer, 4);
+    return value;
+}
+
+// 'unpacks' a buffer into an unsigned short (includes both non-const & const char* flavors)
+//
+// Copies the first 2 bytes of 'buffer', in buffer order, into the memory of
+// a zero-initialized unsigned short and returns it. std::memcpy is used instead
+// of writing one union member and reading another, which C++ does not define.
+inline
+unsigned short BgzfData::UnpackUnsignedShort(char* buffer) {
+    // same work as the const flavor - forward to it
+    return BgzfData::UnpackUnsignedShort((const char*)buffer);
+}
+
+inline
+unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) {
+    unsigned short value = 0;
+    std::memcpy(&value, buffer, 2);
+    return value;
+}
+
+} // namespace BamTools
+
+#endif // BGZF_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAlignment.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAlignment.cpp
new file mode 100755
index 0000000..73a586c
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAlignment.cpp
@@ -0,0 +1,696 @@
+// ***************************************************************************
+// BamAlignment.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 13 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#include <BamAlignment.h>
+using namespace BamTools;
+
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <map>
+#include <utility>
+using namespace std;
+
+// default ctor
+// gives every scalar member a defined value: the reference IDs and positions
+// start at -1, everything else at 0. The SetIs<something>() methods
+// read-modify-write AlignmentFlag, so it must not be left indeterminate.
+// (initializers are listed in member declaration order)
+BamAlignment::BamAlignment(void)
+    : Length(0)
+    , RefID(-1)
+    , Position(-1)
+    , Bin(0)
+    , MapQuality(0)
+    , AlignmentFlag(0)
+    , MateRefID(-1)
+    , MatePosition(-1)
+    , InsertSize(0)
+{ }
+
+// copy ctor
+// duplicates 'other' one member at a time, including the private SupportData
+BamAlignment::BamAlignment(const BamAlignment& other) :
+    // read name, length and sequence/quality/tag strings
+    Name(other.Name),
+    Length(other.Length),
+    QueryBases(other.QueryBases),
+    AlignedBases(other.AlignedBases),
+    Qualities(other.Qualities),
+    TagData(other.TagData),
+    // placement of this alignment
+    RefID(other.RefID),
+    Position(other.Position),
+    Bin(other.Bin),
+    MapQuality(other.MapQuality),
+    AlignmentFlag(other.AlignmentFlag),
+    CigarData(other.CigarData),
+    // placement of the mate
+    MateRefID(other.MateRefID),
+    MatePosition(other.MatePosition),
+    InsertSize(other.InsertSize),
+    // reader/writer bookkeeping
+    SupportData(other.SupportData)
+{
+}
+
+// dtor
+// empty body: nothing is released explicitly here
+BamAlignment::~BamAlignment(void)
+{
+}
+
+// Queries against alignment flags
+// each query tests one bit of AlignmentFlag; the "mapped" and "primary"
+// queries are true when the corresponding bit is CLEAR
+bool BamAlignment::IsDuplicate(void) const {
+    return (AlignmentFlag & DUPLICATE) ? true : false;
+}
+
+bool BamAlignment::IsFailedQC(void) const {
+    return (AlignmentFlag & QC_FAILED) ? true : false;
+}
+
+bool BamAlignment::IsFirstMate(void) const {
+    return (AlignmentFlag & READ_1) ? true : false;
+}
+
+bool BamAlignment::IsMapped(void) const {
+    return (AlignmentFlag & UNMAPPED) ? false : true;
+}
+
+bool BamAlignment::IsMateMapped(void) const {
+    return (AlignmentFlag & MATE_UNMAPPED) ? false : true;
+}
+
+bool BamAlignment::IsMateReverseStrand(void) const {
+    return (AlignmentFlag & MATE_REVERSE) ? true : false;
+}
+
+bool BamAlignment::IsPaired(void) const {
+    return (AlignmentFlag & PAIRED) ? true : false;
+}
+
+bool BamAlignment::IsPrimaryAlignment(void) const {
+    return (AlignmentFlag & SECONDARY) ? false : true;
+}
+
+bool BamAlignment::IsProperPair(void) const {
+    return (AlignmentFlag & PROPER_PAIR) ? true : false;
+}
+
+bool BamAlignment::IsReverseStrand(void) const {
+    return (AlignmentFlag & REVERSE) ? true : false;
+}
+
+bool BamAlignment::IsSecondMate(void) const {
+    return (AlignmentFlag & READ_2) ? true : false;
+}
+
+// Manipulate alignment flags
+// each setter raises its bit in AlignmentFlag when 'ok' is true and clears it
+// otherwise; SetIsMapped, SetIsMateMapped and SetIsPrimaryAlignment forward the
+// negated argument to the setter of the complementary flag
+void BamAlignment::SetIsDuplicate(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~DUPLICATE; return; }
+    AlignmentFlag |= DUPLICATE;
+}
+
+void BamAlignment::SetIsFailedQC(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~QC_FAILED; return; }
+    AlignmentFlag |= QC_FAILED;
+}
+
+void BamAlignment::SetIsFirstMate(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~READ_1; return; }
+    AlignmentFlag |= READ_1;
+}
+
+void BamAlignment::SetIsMapped(bool ok) {
+    const bool unmapped = !ok;
+    SetIsUnmapped(unmapped);
+}
+
+void BamAlignment::SetIsMateMapped(bool ok) {
+    const bool mateUnmapped = !ok;
+    SetIsMateUnmapped(mateUnmapped);
+}
+
+void BamAlignment::SetIsMateUnmapped(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~MATE_UNMAPPED; return; }
+    AlignmentFlag |= MATE_UNMAPPED;
+}
+
+void BamAlignment::SetIsMateReverseStrand(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~MATE_REVERSE; return; }
+    AlignmentFlag |= MATE_REVERSE;
+}
+
+void BamAlignment::SetIsPaired(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~PAIRED; return; }
+    AlignmentFlag |= PAIRED;
+}
+
+void BamAlignment::SetIsPrimaryAlignment(bool ok) {
+    const bool secondary = !ok;
+    SetIsSecondaryAlignment(secondary);
+}
+
+void BamAlignment::SetIsProperPair(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~PROPER_PAIR; return; }
+    AlignmentFlag |= PROPER_PAIR;
+}
+
+void BamAlignment::SetIsReverseStrand(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~REVERSE; return; }
+    AlignmentFlag |= REVERSE;
+}
+
+void BamAlignment::SetIsSecondaryAlignment(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~SECONDARY; return; }
+    AlignmentFlag |= SECONDARY;
+}
+
+void BamAlignment::SetIsSecondMate(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~READ_2; return; }
+    AlignmentFlag |= READ_2;
+}
+
+void BamAlignment::SetIsUnmapped(bool ok) {
+    if ( !ok ) { AlignmentFlag &= ~UNMAPPED; return; }
+    AlignmentFlag |= UNMAPPED;
+}
+
+// computes the alignment end position by adding CIGAR operation lengths to Position
+// @usePadded - if true, 'I' operations are added as well
+// @zeroBased - if true, the sum is reduced by one before being returned
+int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {
+
+    // running total, seeded with the alignment start
+    int endPosition = Position;
+
+    // walk the CIGAR operations
+    for ( vector<CigarOp>::const_iterator op = CigarData.begin(); op != CigarData.end(); ++op ) {
+        switch ( op->Type ) {
+
+            // 'M', 'D' and 'N' always count
+            case 'M':
+            case 'D':
+            case 'N':
+                endPosition += op->Length;
+                break;
+
+            // 'I' counts only when padding is requested
+            case 'I':
+                if ( usePadded )
+                    endPosition += op->Length;
+                break;
+
+            // every other operation leaves the total unchanged
+            default:
+                break;
+        }
+    }
+
+    // adjust for zeroBased, if necessary
+    return zeroBased ? (endPosition - 1) : endPosition;
+}
+
+// adds a new string-valued tag to the end of TagData
+// @tag   - two character tag name
+// @type  - storage type, must be "Z" or "H"
+// @value - text to store; it is written up to its first '\0' (if any) and
+//          followed by a single null-terminator
+// returns true if the tag was appended; false (TagData untouched) if this
+// alignment holds core data only, @tag/@type are malformed, or @tag already
+// exists (use EditTag() to change an existing tag)
+bool BamAlignment::AddTag(const string& tag, const string& type, const string& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type != "Z" && type != "H" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+    // build the new entry: name, storage type, text, null-terminator
+    // value.c_str() is used so that the entry always contains exactly one terminator
+    string newTag = tag + type;
+    newTag.append(value.c_str());
+    newTag.push_back('\0');
+
+    // append in place; std::string owns the storage, so no stack buffer
+    // sized by caller-supplied data is needed
+    TagData.append(newTag);
+
+    // return success
+    return true;
+}
+
+// adds a new integer-valued tag to the end of TagData
+// @tag   - two character tag name
+// @type  - storage type: A/c/C (1 byte), s/S (2 bytes) or i/I (4 bytes)
+// @value - value to store; only as many bytes as @type occupies are written,
+//          so the entry has the width SkipToNextTag() expects for that type
+// returns true if the tag was appended; false (TagData untouched) if this
+// alignment holds core data only, @tag/@type are malformed or not an integer
+// storage type, or @tag already exists (use EditTag() instead)
+bool BamAlignment::AddTag(const string& tag, const string& type, const uint32_t& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type == "f" || type == "Z" || type == "H" ) return false;
+
+    // number of value bytes the requested storage type occupies
+    unsigned int valueLength = 0;
+    switch ( type[0] ) {
+        case 'A':
+        case 'c':
+        case 'C':
+            valueLength = 1;
+            break;
+        case 's':
+        case 'S':
+            valueLength = 2;
+            break;
+        case 'i':
+        case 'I':
+            valueLength = 4;
+            break;
+        default:
+            // unknown storage type: its width is unknown, so nothing valid can be written
+            return false;
+    }
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+    // expose the value's bytes
+    // the leading bytes of the in-memory representation are stored, mirroring how
+    // GetTag() copies stored bytes back to the start of its destination
+    // NOTE(review): these are the low-order bytes only on a little-endian host - confirm
+    union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;
+    un.value = value;
+
+    // append name, storage type and value bytes
+    TagData.append(tag);
+    TagData.append(type);
+    TagData.append(un.valueBuffer, valueLength);
+
+    // return success
+    return true;
+}
+
+// signed flavor: converts the value to uint32_t and forwards to the unsigned overload
+bool BamAlignment::AddTag(const string& tag, const string& type, const int32_t& value) {
+    const uint32_t unsignedValue = static_cast<uint32_t>(value);
+    return AddTag(tag, type, unsignedValue);
+}
+
+// adds a new tag holding the 4 raw bytes of a float to the end of TagData
+// @tag   - two character tag name
+// @type  - storage type; must be one of the 4-byte types (f, i, I), since
+//          the value always occupies sizeof(float) bytes. Narrower types are
+//          rejected: writing 4 bytes under a 1- or 2-byte type code would make
+//          the following tag data unparseable
+// @value - value to store
+// returns true if the tag was appended; false (TagData untouched) if this
+// alignment holds core data only, @tag/@type are malformed or unsupported,
+// or @tag already exists (use EditTag() instead)
+bool BamAlignment::AddTag(const string& tag, const string& type, const float& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type != "f" && type != "i" && type != "I" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+    // expose the value's bytes
+    union { float value; char valueBuffer[sizeof(float)]; } un;
+    un.value = value;
+
+    // append name, storage type and value bytes
+    TagData.append(tag);
+    TagData.append(type);
+    TagData.append(un.valueBuffer, sizeof(float));
+
+    // return success
+    return true;
+}
+
+// replaces the value of an existing string tag, or adds the tag if it is absent
+// @tag   - two character tag name
+// @type  - storage type, must be "Z" or "H"; it also replaces the stored type code
+// @value - new text; written up to its first '\0' (if any) plus one null-terminator
+// returns true if TagData now holds the new value; false if this alignment
+// holds core data only, @tag/@type are malformed, or the existing tag data
+// cannot be parsed (TagData is left untouched in every failure case)
+bool BamAlignment::EditTag(const string& tag, const string& type, const string& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type != "Z" && type != "H" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, attempt AddTag
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return AddTag(tag, type, value);
+
+    // FindTag() stops on the first byte of the old value;
+    // the old storage type code is the byte just before it
+    if ( numBytesParsed > originalTagDataLength ) return false;
+    const unsigned int typeOffset = numBytesParsed - 1;
+    const char oldStorageType = *(pTagData - 1);
+
+    // step over the old value; fail (rather than report a bogus success)
+    // if its storage type is unknown or the value runs past the end of the data
+    if ( !SkipToNextTag(oldStorageType, pTagData, numBytesParsed) ) return false;
+    if ( numBytesParsed > originalTagDataLength ) return false;
+
+    // rebuild: [everything up to the old type code][type][value]['\0'][remaining tags]
+    // std::string grows as needed, so no fixed-size buffer can be overrun
+    string newTagData;
+    newTagData.reserve(originalTagDataLength + value.size() + 1);
+    newTagData.append(TagData, 0, typeOffset);
+    newTagData.append(type);
+    newTagData.append(value.c_str());
+    newTagData.push_back('\0');
+    newTagData.append(TagData, numBytesParsed, string::npos);
+
+    // save new tag data
+    TagData.swap(newTagData);
+    return true;
+}
+
+// replaces the value of an existing integer tag, or adds the tag if it is absent
+// @tag   - two character tag name
+// @type  - storage type: A/c/C (1 byte), s/S (2 bytes) or i/I (4 bytes);
+//          it also replaces the stored type code, so code and width always agree
+// @value - new value; only as many bytes as @type occupies are written
+// returns true if TagData now holds the new value; false if this alignment
+// holds core data only, @tag/@type are malformed or unsupported, or the
+// existing tag data cannot be parsed (TagData is left untouched on failure)
+bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type == "f" || type == "Z" || type == "H" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, attempt AddTag
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return AddTag(tag, type, value);
+
+    // number of value bytes the requested storage type occupies
+    unsigned int valueLength = 0;
+    switch ( type[0] ) {
+        case 'A':
+        case 'c':
+        case 'C':
+            valueLength = 1;
+            break;
+        case 's':
+        case 'S':
+            valueLength = 2;
+            break;
+        case 'i':
+        case 'I':
+            valueLength = 4;
+            break;
+        default:
+            // unknown storage type: width unknown, nothing valid can be written
+            return false;
+    }
+
+    // FindTag() stops on the first byte of the old value;
+    // the old storage type code is the byte just before it
+    if ( numBytesParsed > originalTagDataLength ) return false;
+    const unsigned int typeOffset = numBytesParsed - 1;
+    const char oldStorageType = *(pTagData - 1);
+
+    // step over the old value; fail (rather than report a bogus success)
+    // if its storage type is unknown or the value runs past the end of the data
+    if ( !SkipToNextTag(oldStorageType, pTagData, numBytesParsed) ) return false;
+    if ( numBytesParsed > originalTagDataLength ) return false;
+
+    // expose the value's bytes (leading bytes of the in-memory representation,
+    // mirroring how GetTag() copies stored bytes back)
+    // NOTE(review): these are the low-order bytes only on a little-endian host - confirm
+    union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;
+    un.value = value;
+
+    // rebuild: [everything up to the old type code][type][value bytes][remaining tags]
+    // std::string grows as needed, so no fixed-size buffer can be overrun
+    string newTagData;
+    newTagData.reserve(originalTagDataLength + sizeof(unsigned int));
+    newTagData.append(TagData, 0, typeOffset);
+    newTagData.append(type);
+    newTagData.append(un.valueBuffer, valueLength);
+    newTagData.append(TagData, numBytesParsed, string::npos);
+
+    // save new tag data
+    TagData.swap(newTagData);
+    return true;
+}
+
+// signed flavor: converts the value to uint32_t and forwards to the unsigned overload
+bool BamAlignment::EditTag(const string& tag, const string& type, const int32_t& value) {
+    const uint32_t unsignedValue = static_cast<uint32_t>(value);
+    return EditTag(tag, type, unsignedValue);
+}
+
+// replaces the value of an existing tag with the 4 raw bytes of a float,
+// or adds the tag if it is absent
+// @tag   - two character tag name
+// @type  - storage type; must be one of the 4-byte types (f, i, I), since the
+//          value always occupies sizeof(float) bytes. It also replaces the
+//          stored type code, so code and width always agree
+// @value - new value
+// returns true if TagData now holds the new value; false if this alignment
+// holds core data only, @tag/@type are malformed or unsupported, or the
+// existing tag data cannot be parsed (TagData is left untouched on failure)
+bool BamAlignment::EditTag(const string& tag, const string& type, const float& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type != "f" && type != "i" && type != "I" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, attempt AddTag
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return AddTag(tag, type, value);
+
+    // FindTag() stops on the first byte of the old value;
+    // the old storage type code is the byte just before it
+    if ( numBytesParsed > originalTagDataLength ) return false;
+    const unsigned int typeOffset = numBytesParsed - 1;
+    const char oldStorageType = *(pTagData - 1);
+
+    // step over the old value; fail (rather than report a bogus success)
+    // if its storage type is unknown or the value runs past the end of the data
+    if ( !SkipToNextTag(oldStorageType, pTagData, numBytesParsed) ) return false;
+    if ( numBytesParsed > originalTagDataLength ) return false;
+
+    // expose the value's bytes
+    union { float value; char valueBuffer[sizeof(float)]; } un;
+    un.value = value;
+
+    // rebuild: [everything up to the old type code][type][value bytes][remaining tags]
+    // std::string grows as needed, so no fixed-size buffer can be overrun
+    string newTagData;
+    newTagData.reserve(originalTagDataLength + sizeof(float));
+    newTagData.append(TagData, 0, typeOffset);
+    newTagData.append(type);
+    newTagData.append(un.valueBuffer, sizeof(float));
+    newTagData.append(TagData, numBytesParsed, string::npos);
+
+    // save new tag data
+    TagData.swap(newTagData);
+    return true;
+}
+
+// looks up the "NM" tag - originally contributed by Aaron Quinlan
+// on success the value is written to 'editDistance'; returns success/fail
+bool BamAlignment::GetEditDistance(uint32_t& editDistance) const {
+    const string editDistanceTag("NM");
+    return GetTag(editDistanceTag, editDistance);
+}
+
+// looks up the "RG" tag
+// on success the value is written to 'readGroup'; returns success/fail
+bool BamAlignment::GetReadGroup(string& readGroup) const {
+    const string readGroupTag("RG");
+    return GetTag(readGroupTag, readGroup);
+}
+
+// retrieves the value of @tag as text
+// @tag         - two character tag name
+// @destination - receives the bytes from the start of the tag's value up to
+//                (not including) the first '\0', or up to the end of TagData
+//                if no terminator is present
+// returns true if the tag was found; false if this alignment holds core data
+// only, has no tag data, or the tag is absent/truncated (@destination untouched)
+bool BamAlignment::GetTag(const string& tag, string& destination) const {
+
+    // make sure tag data exists
+    if ( SupportData.HasCoreOnly || TagData.empty() )
+        return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // tag header ran past the end of the data: nothing valid to read
+    if ( numBytesParsed > tagDataLength )
+        return false;
+
+    // search for the terminator only within the bytes that belong to TagData,
+    // so a missing '\0' cannot cause a read beyond the buffer
+    const unsigned int remaining = tagDataLength - numBytesParsed;
+    const void* pTerminator = memchr(pTagData, '\0', remaining);
+    const unsigned int dataLength = ( pTerminator != 0 )
+                                  ? (unsigned int)( (const char*)pTerminator - pTagData )
+                                  : remaining;
+
+    // store the text (assign() avoids writing through destination.data())
+    destination.assign(pTagData, dataLength);
+    return true;
+}
+
+// retrieves the value of an integer-typed tag
+// @tag         - two character tag name
+// @destination - zeroed, then filled with the 1, 2 or 4 stored bytes of the value
+// returns false if there is no tag data, the tag is absent, or its storage
+// type is float, string, or unknown (an error is printed to stderr for those)
+bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const {
+
+    // nothing to search when only core data was loaded or no tags are stored
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( TagData.empty() ) return false;
+
+    // set up a cursor over the raw tag bytes
+    char* cursor = (char*)TagData.data();
+    const unsigned int totalBytes = TagData.size();
+    unsigned int consumedBytes = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, cursor, totalBytes, consumedBytes) )
+        return false;
+
+    // the storage type code sits immediately before the value
+    const char storageType = *(cursor - 1);
+    int valueLength = 0;
+
+    // map storage type to value width
+    if ( storageType == 'A' || storageType == 'c' || storageType == 'C' ) {
+        valueLength = 1;
+    }
+    else if ( storageType == 's' || storageType == 'S' ) {
+        valueLength = 2;
+    }
+    else if ( storageType == 'i' || storageType == 'I' ) {
+        valueLength = 4;
+    }
+    else if ( storageType == 'f' || storageType == 'Z' || storageType == 'H' ) {
+        // float or var-length strings cannot go into an integer
+        fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", storageType);
+        return false;
+    }
+    else {
+        // unknown tag type
+        fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);
+        return false;
+    }
+
+    // copy the stored bytes over a zeroed destination
+    destination = 0;
+    memcpy(&destination, cursor, valueLength);
+    return true;
+}
+
+// signed flavor: lets the unsigned overload fill the same storage
+bool BamAlignment::GetTag(const string& tag, int32_t& destination) const {
+    uint32_t& unsignedView = reinterpret_cast<uint32_t&>(destination);
+    return GetTag(tag, unsignedView);
+}
+
+// retrieves the value of a numeric tag into a float
+// @tag         - two character tag name
+// @destination - set to 0.0, then its leading 1, 2 or 4 bytes are overwritten
+//                with the stored bytes of the value (a raw byte copy, no
+//                numeric conversion is performed)
+// returns false if there is no tag data, the tag is absent, or its storage
+// type is a string or unknown (an error is printed to stderr for those)
+bool BamAlignment::GetTag(const string& tag, float& destination) const {
+
+    // make sure tag data exists
+    if ( SupportData.HasCoreOnly || TagData.empty() )
+        return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // determine data byte-length from the storage type code,
+    // which sits immediately before the value
+    const char type = *(pTagData - 1);
+    int destinationLength = 0;
+    switch (type) {
+
+        // 1 byte data
+        case 'A':
+        case 'c':
+        case 'C':
+            destinationLength = 1;
+            break;
+
+        // 2 byte data
+        case 's':
+        case 'S':
+            destinationLength = 2;
+            break;
+
+        // 4 byte data
+        case 'f':
+        case 'i':
+        case 'I':
+            destinationLength = 4;
+            break;
+
+        // unsupported type (var-length strings)
+        // the message names the float destination this overload actually fills
+        case 'Z':
+        case 'H':
+            fprintf(stderr, "ERROR: Cannot store tag of type %c in float destination\n", type);
+            return false;
+
+        // unknown tag type
+        default:
+            fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
+            return false;
+    }
+
+    // store in destination
+    destination = 0.0;
+    memcpy(&destination, pTagData, destinationLength);
+    return true;
+}
+
+// reports the storage type code of @tag
+// @tag  - two character tag name
+// @type - receives the type code whenever the tag is found (even if the code
+//         turns out not to be a recognized one)
+// returns true only if the tag is found and its code is one of A c C s S f i I Z H;
+// an unrecognized code is reported on stderr
+bool BamAlignment::GetTagType(const string& tag, char& type) const {
+
+    // nothing to search when only core data was loaded or no tags are stored
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( TagData.empty() ) return false;
+
+    // set up a cursor over the raw tag bytes
+    char* cursor = (char*)TagData.data();
+    const unsigned int totalBytes = TagData.size();
+    unsigned int consumedBytes = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, cursor, totalBytes, consumedBytes) )
+        return false;
+
+    // the type code sits immediately before the value
+    type = *(cursor - 1);
+
+    // accept only the known BAM tag type codes
+    // ('\0' is excluded explicitly because strchr() would match the terminator)
+    const bool isKnownType = ( type != '\0' ) && ( strchr("AcCsSfiIZH", type) != 0 );
+    if ( isKnownType )
+        return true;
+
+    // unknown tag type
+    fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
+    return false;
+}
+
+// removes @tag (name, storage type and value) from TagData
+// @tag - two character tag name
+// returns true if the tag was removed; false if this alignment holds core
+// data only, has no tag data, the tag is absent, or the tag's value cannot
+// be parsed (TagData is left untouched in every failure case)
+bool BamAlignment::RemoveTag(const string& tag) {
+
+    // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed
+    // also, return false if no data present to remove
+    if ( SupportData.HasCoreOnly || TagData.empty() ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, no removal - return failure
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return false;
+
+    // FindTag() stops on the first byte of the value, 3 bytes past the start
+    // of the entry (2-char name + 1-char storage type)
+    if ( numBytesParsed > originalTagDataLength ) return false;
+    const unsigned int tagOffset = numBytesParsed - 3;
+    const char storageType = *(pTagData - 1);
+
+    // step over the value; fail (rather than report a removal that did not
+    // happen) if its storage type is unknown or it runs past the end of the data
+    if ( !SkipToNextTag(storageType, pTagData, numBytesParsed) ) return false;
+    if ( numBytesParsed > originalTagDataLength ) return false;
+
+    // cut the entry out in place; everything after it slides forward
+    TagData.erase(tagOffset, numBytesParsed - tagOffset);
+    return true;
+}
+
+// scans raw tag data for an entry named @tag
+// @tag            - two character tag name to look for
+// @pTagData       - in: start of the entry to examine first
+//                   out (on success): first byte of the matching entry's value,
+//                   with the storage type code at pTagData[-1]
+// @tagDataLength  - total number of bytes of tag data
+// @numBytesParsed - in: offset of @pTagData within the tag data
+//                   out (on success): offset of the matching entry's value
+// returns true on a match; false if no entry matches or the data cannot be
+// walked further (the out-parameters are then not meaningful)
+bool BamAlignment::FindTag(const string& tag,
+                           char* &pTagData,
+                           const unsigned int& tagDataLength,
+                           unsigned int& numBytesParsed)
+{
+
+    while ( numBytesParsed < tagDataLength ) {
+
+        // every entry starts with a 3-byte header (2-char name + 1-char storage type);
+        // stop if fewer bytes remain instead of reading past the end of the data
+        if ( tagDataLength - numBytesParsed < 3 ) return false;
+
+        const char* pTagType = pTagData;
+        const char* pTagStorageType = pTagData + 2;
+        pTagData += 3;
+        numBytesParsed += 3;
+
+        // check the current tag, return true on match
+        if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
+            return true;
+
+        // get the storage class and find the next tag
+        if ( *pTagStorageType == '\0' ) return false;
+        if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
+
+        // only look at the next byte if it is still inside the tag data
+        if ( numBytesParsed >= tagDataLength ) return false;
+        if ( *pTagData == '\0' ) return false;
+    }
+
+    // checked all tags, none match
+    return false;
+}
+
+// advances the cursor past one tag value
+// @storageType    - type code of the value the cursor currently points at
+// @pTagData       - cursor; moved forward by the value's width
+// @numBytesParsed - running offset; increased by the same amount
+// widths: A/c/C = 1 byte, s/S = 2, f/i/I = 4, Z/H = text length plus its '\0'
+// returns false (cursor and offset untouched, error on stderr) for any other type code
+bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
+
+    // how far to move, decided by the storage type
+    unsigned int stride = 0;
+
+    if ( storageType == 'A' || storageType == 'c' || storageType == 'C' ) {
+        stride = 1;
+    }
+    else if ( storageType == 's' || storageType == 'S' ) {
+        stride = 2;
+    }
+    else if ( storageType == 'f' || storageType == 'i' || storageType == 'I' ) {
+        stride = 4;
+    }
+    else if ( storageType == 'Z' || storageType == 'H' ) {
+        // characters up to the terminator, plus the terminator itself
+        stride = (unsigned int)strlen(pTagData) + 1;
+    }
+    else {
+        // error case
+        fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);
+        return false;
+    }
+
+    // move both the offset and the cursor
+    numBytesParsed += stride;
+    pTagData += stride;
+
+    // return success
+    return true;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAlignment.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAlignment.h
new file mode 100755
index 0000000..f469f5b
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAlignment.h
@@ -0,0 +1,203 @@
+// ***************************************************************************
+// BamAlignment.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 13 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#ifndef BAMALIGNMENT_H
+#define BAMALIGNMENT_H
+
+#include <api_global.h>
+#include <BamAux.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+// forward declare BamAlignment's friend classes
+namespace Internal {
+ class BamReaderPrivate;
+ class BamWriterPrivate;
+} // namespace Internal
+
+// BamAlignment data structure
+// explicitly labeled as 'struct' to indicate that (most of) its fields are public
+// Holds one alignment record: the public data members below, flag query/set
+// helpers that operate on AlignmentFlag, and accessors that read or rewrite
+// individual entries inside the raw TagData string.
+struct API_EXPORT BamAlignment {
+
+ // constructors & destructor
+ public:
+ BamAlignment(void);
+ BamAlignment(const BamAlignment& other);
+ ~BamAlignment(void);
+
+ // Queries against alignment flags
+ public:
+ bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate
+ bool IsFailedQC(void) const; // Returns true if this read failed quality control
+ bool IsFirstMate(void) const; // Returns true if alignment is first mate on read
+ bool IsMapped(void) const; // Returns true if alignment is mapped
+ bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped
+ bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand
+ bool IsPaired(void) const; // Returns true if alignment part of paired-end read
+ bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment
+ bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution
+ bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand
+ bool IsSecondMate(void) const; // Returns true if alignment is second mate on read
+
+ // Manipulate alignment flags
+ public:
+ void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag
+ void SetIsFailedQC(bool ok); // Sets "failed quality control" flag
+ void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag
+ void SetIsMapped(bool ok); // Sets "alignment is mapped" flag
+ void SetIsMateMapped(bool ok); // Sets "alignment's mate is mapped" flag
+ void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag
+ void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag
+ void SetIsPrimaryAlignment(bool ok); // Sets "position is primary alignment" flag
+ void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag
+ void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag
+ void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag
+
+ // legacy methods (deprecated, but available)
+ void SetIsMateUnmapped(bool ok); // Complement of IsMateMapped() flag
+ void SetIsSecondaryAlignment(bool ok); // Complement of IsPrimaryAlignment() flag
+ void SetIsUnmapped(bool ok); // Complement of IsMapped() flag
+
+ // Tag data access methods
+ public:
+ // -------------------------------------------------------------------------------------
+ // N.B. - The following tag access methods may not be used on BamAlignments fetched
+ // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in
+ // error message (to keep output clean) but will ALWAYS return false. Only user-created
+ // BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here.
+
+ // add tag data (create new TAG entry with TYPE and VALUE)
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
+ // returns true if new data added, false if error or TAG already exists
+ // N.B. - will NOT modify existing tag. Use EditTag() instead
+ // @tag - two character tag name
+ // @type - single character tag type (see SAM/BAM spec for details)
+ // @value - value to associate with tag
+ bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
+ bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
+ bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
+ bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
+
+ // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
+ // returns true if edit was successful, false if error
+ // @tag - two character tag name
+ // @type - single character tag type (see SAM/BAM spec for details)
+ // @value - new value for tag
+ bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
+ bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
+ bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
+ bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
+
+ // specific tag data access methods - these only remain for legacy support
+ // returns whether specific tag could be retrieved
+ bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (equivalent to GetTag("NM", editDistance))
+ bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (equivalent to GetTag("RG", readGroup))
+
+ // generic tag data access methods
+ // returns whether tag is found & tag type is compatible with DESTINATION
+ // @tag - two character tag name
+ // @destination - if found, tag value is stored here
+ bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings
+ bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data
+ bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data
+ bool GetTag(const std::string& tag, float& destination) const; // access floating point data
+
+ // retrieve the tag type code for TAG
+ // returns true if tag could be found and type determined
+ bool GetTagType(const std::string& tag, char& type) const;
+
+ // remove tag data
+ // returns true if removal was successful, false if error
+ // N.B. - returns false if TAG does not exist (no removal can occur)
+ // @tag - two character tag name
+ bool RemoveTag(const std::string& tag);
+
+ // Additional data access methods
+ public:
+ // calculates & returns alignment end position, based on starting position and CIGAR operations
+ // @usePadded - if true, counts inserted bases. Default is false, so that alignment end position matches the last base's position in reference
+ // @zeroBased - if true, returns 0-based coordinate; else returns 1-based. Setting this to false is useful when using BAM data along with other, half-open formats.
+ int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;
+
+ // 'internal' utility methods
+ // both walk raw tag bytes through a caller-supplied cursor (pTagData) and
+ // keep a running byte offset (numBytesParsed) in step with it
+ private:
+ static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);
+ static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);
+
+ // Data members
+ public:
+ std::string Name; // Read name
+ int32_t Length; // Query length
+ std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
+ std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
+ std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
+ std::string TagData; // Tag data (accessor methods will pull the requested information out)
+ int32_t RefID; // ID number for reference sequence
+ int32_t Position; // Position (0-based) where alignment starts
+ uint16_t Bin; // Bin in BAM file where this alignment resides
+ uint16_t MapQuality; // Mapping quality score
+ uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
+ std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
+ int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
+ int32_t MatePosition; // Position (0-based) where alignment's mate starts
+ int32_t InsertSize; // Mate-pair insert size
+
+ // Internal data, inaccessible to client code
+ // but available to BamReaderPrivate & BamWriterPrivate (declared as friends below)
+ private:
+ struct BamAlignmentSupportData {
+
+ // data members
+ std::string AllCharData;
+ uint32_t BlockLength;
+ uint32_t NumCigarOperations;
+ uint32_t QueryNameLength;
+ uint32_t QuerySequenceLength;
+ bool HasCoreOnly; // when true, every tag accessor of BamAlignment returns false
+
+ // constructor
+ // zeroes all counters and clears HasCoreOnly
+ BamAlignmentSupportData(void)
+ : BlockLength(0)
+ , NumCigarOperations(0)
+ , QueryNameLength(0)
+ , QuerySequenceLength(0)
+ , HasCoreOnly(false)
+ { }
+ };
+ BamAlignmentSupportData SupportData;
+ friend class Internal::BamReaderPrivate;
+ friend class Internal::BamWriterPrivate;
+
+ // Alignment flag query constants
+ // Use the get/set methods above instead
+ // each enumerator is a distinct single bit of AlignmentFlag
+ private:
+ enum { PAIRED = 1
+ , PROPER_PAIR = 2
+ , UNMAPPED = 4
+ , MATE_UNMAPPED = 8
+ , REVERSE = 16
+ , MATE_REVERSE = 32
+ , READ_1 = 64
+ , READ_2 = 128
+ , SECONDARY = 256
+ , QC_FAILED = 512
+ , DUPLICATE = 1024
+ };
+};
+
+// convenience typedef(s)
+typedef std::vector<BamAlignment> BamAlignmentVector;
+
+} // namespace BamTools
+
+#endif // BAMALIGNMENT_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAux.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAux.h
new file mode 100755
index 0000000..9671303
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamAux.h
@@ -0,0 +1,227 @@
+// ***************************************************************************
+// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic constants, data structures, utilities etc.
+// used throughout the API for handling BAM files
+// ***************************************************************************
+
+#ifndef BAMAUX_H
+#define BAMAUX_H
+
+#include <api_global.h>
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// Platform-specific large-file support
+//
+// Defines ftell64()/fseek64() wrapper macros once per translation unit
+// (guarded by BAMTOOLS_LFS). When WIN32 is defined they forward to
+// _ftelli64/_fseeki64; otherwise they forward to ftello/fseeko.
+#ifndef BAMTOOLS_LFS
+#define BAMTOOLS_LFS
+ #ifdef WIN32
+ #define ftell64(a) _ftelli64(a)
+ #define fseek64(a,b,c) _fseeki64(a,b,c)
+ #else
+ #define ftell64(a) ftello(a)
+ #define fseek64(a,b,c) fseeko(a,b,c)
+ #endif
+#endif // BAMTOOLS_LFS
+
+// Platform-specific type definitions
+//
+// Under _MSC_VER the fixed-width integer names are declared by hand;
+// every other compiler takes them from <stdint.h>. The definitions are
+// emitted once per translation unit (guarded by BAMTOOLS_TYPES).
+#ifndef BAMTOOLS_TYPES
+#define BAMTOOLS_TYPES
+    #ifdef _MSC_VER
+        // 'signed char', not plain 'char': the signedness of plain char is
+        // a compiler setting, whereas int8_t must always be signed. This
+        // also matches the type a vendor <stdint.h> uses for int8_t.
+        typedef signed char int8_t;
+        typedef unsigned char uint8_t;
+        typedef short int16_t;
+        typedef unsigned short uint16_t;
+        typedef int int32_t;
+        typedef unsigned int uint32_t;
+        typedef long long int64_t;
+        typedef unsigned long long uint64_t;
+    #else
+        #include <stdint.h>
+    #endif
+#endif // BAMTOOLS_TYPES
+
+namespace BamTools {
+
+// ----------------------------------------------------------------
+// ----------------------------------------------------------------
+// BAM constants
+
+// Numeric CIGAR operation codes (0..6).
+// NOTE(review): presumably these map, in order, onto the CigarOp type
+// characters M, I, D, N, S, H, P - confirm against the BAM decoder.
+const int BAM_CMATCH = 0;
+const int BAM_CINS = 1;
+const int BAM_CDEL = 2;
+const int BAM_CREF_SKIP = 3;
+const int BAM_CSOFT_CLIP = 4;
+const int BAM_CHARD_CLIP = 5;
+const int BAM_CPAD = 6;
+// Shift width and the matching low-bit mask ((1 << 4) - 1 == 0xF).
+// NOTE(review): looks like a packed CIGAR word keeps the operation code in
+// the low 4 bits and the length above them - confirm where it is decoded.
+const int BAM_CIGAR_SHIFT = 4;
+const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);
+// NOTE(review): presumably the byte size of the fixed-length part of an
+// alignment record - verify against the reader/writer code.
+const int BAM_CORE_SIZE = 32;
+// Byte size assumed for an 'int' field in BAM data
+const int BT_SIZEOF_INT = 4;
+
+// ----------------------------------------------------------------
+// ----------------------------------------------------------------
+// Data structs & typedefs
+
+// One CIGAR operation: an operation character plus the run length it covers
+struct API_EXPORT CigarOp {
+
+    char     Type;   // operation character (one of MIDNSHP)
+    uint32_t Length; // number of bases the operation spans
+
+    // Builds an operation; called without arguments it yields a NUL type
+    // with length zero.
+    CigarOp(const char type = '\0', const uint32_t length = 0)
+        : Type(type), Length(length)
+    { }
+};
+
+// Describes one reference sequence listed in a BAM file
+struct API_EXPORT RefData {
+
+    std::string RefName;          // reference sequence name (starts out empty)
+    int32_t     RefLength;        // reference sequence length
+    bool        RefHasAlignments; // true if the BAM file holds alignments on this reference
+
+    // Builds an entry with the given length and "has alignments" flag;
+    // the name is left empty for the caller to fill in.
+    RefData(const int32_t& length = 0, bool ok = false)
+        : RefLength(length), RefHasAlignments(ok)
+    { }
+};
+typedef std::vector<RefData> RefVector;
+
+// A (sequential) genome region described by a left and a right bound.
+// Each bound is a (reference ID, position) pair; -1 marks "not specified".
+struct API_EXPORT BamRegion {
+
+    // left bound
+    int LeftRefID;
+    int LeftPosition;
+    // right bound
+    int RightRefID;
+    int RightPosition;
+
+    // Builds a region from explicit bounds; every bound defaults to -1
+    BamRegion(const int& leftID = -1,
+              const int& leftPos = -1,
+              const int& rightID = -1,
+              const int& rightPos = -1)
+        : LeftRefID(leftID), LeftPosition(leftPos)
+        , RightRefID(rightID), RightPosition(rightPos)
+    { }
+
+    // Member-wise copy of another region
+    BamRegion(const BamRegion& other)
+        : LeftRefID(other.LeftRefID), LeftPosition(other.LeftPosition)
+        , RightRefID(other.RightRefID), RightPosition(other.RightPosition)
+    { }
+
+    // Resets both bounds to "not specified"
+    void clear(void) {
+        LeftRefID = -1;
+        LeftPosition = -1;
+        RightRefID = -1;
+        RightPosition = -1;
+    }
+
+    // True when both the left reference ID and left position are non-negative
+    bool isLeftBoundSpecified(void) const {
+        return ( LeftRefID >= 0 && LeftPosition >= 0 );
+    }
+
+    // True when neither bound is specified
+    bool isNull(void) const {
+        return !( isLeftBoundSpecified() || isRightBoundSpecified() );
+    }
+
+    // True when both the right reference ID and right position are non-negative
+    bool isRightBoundSpecified(void) const {
+        return ( RightRefID >= 0 && RightPosition >= 0 );
+    }
+};
+
+// ----------------------------------------------------------------
+// ----------------------------------------------------------------
+// General utilities
+
+// Reports whether the host stores multi-byte integers most-significant byte
+// first: the first byte in memory of the 16-bit value 1 is zero only then.
+inline bool SystemIsBigEndian(void) {
+    const uint16_t probe = 0x0001;
+    const char* firstByte = reinterpret_cast<const char*>(&probe);
+    return ( firstByte[0] == 0 );
+}
+
+// swaps endianness of 16-bit value 'in place'
+//
+// The signed overload does its shifting on the unsigned bit pattern: a
+// right shift of a negative value would drag copies of the sign bit into
+// the result and corrupt the swapped high byte.
+inline void SwapEndian_16(int16_t& x) {
+    const uint16_t bits = static_cast<uint16_t>(x);
+    x = static_cast<int16_t>( static_cast<uint16_t>((bits >> 8) | (bits << 8)) );
+}
+
+// unsigned overload: exchanges the two bytes of x
+inline void SwapEndian_16(uint16_t& x) {
+    x = static_cast<uint16_t>((x >> 8) | (x << 8));
+}
+
+// swaps endianness of 32-bit value 'in-place'
+//
+// The signed overload works on the unsigned bit pattern: shifting a
+// negative int right is an arithmetic shift (sign bits flood the low byte
+// of the result), and shifting it left is not well defined.
+inline void SwapEndian_32(int32_t& x) {
+    const uint32_t bits = static_cast<uint32_t>(x);
+    x = static_cast<int32_t>( (bits >> 24) |
+                              ((bits << 8) & 0x00FF0000u) |
+                              ((bits >> 8) & 0x0000FF00u) |
+                              (bits << 24) );
+}
+
+// unsigned overload: reverses the four bytes of x
+inline void SwapEndian_32(uint32_t& x) {
+    x = ( (x >> 24) |
+          ((x << 8) & 0x00FF0000u) |
+          ((x >> 8) & 0x0000FF00u) |
+          (x << 24) );
+}
+
+// swaps endianness of 64-bit value 'in-place'
+//
+// The signed overload works on the unsigned bit pattern so that no sign
+// bits leak into the result when the input is negative.
+inline void SwapEndian_64(int64_t& x) {
+    const uint64_t bits = static_cast<uint64_t>(x);
+    x = static_cast<int64_t>( (bits >> 56) |
+                              ((bits << 40) & 0x00FF000000000000ull) |
+                              ((bits << 24) & 0x0000FF0000000000ull) |
+                              ((bits << 8)  & 0x000000FF00000000ull) |
+                              ((bits >> 8)  & 0x00000000FF000000ull) |
+                              ((bits >> 24) & 0x0000000000FF0000ull) |
+                              ((bits >> 40) & 0x000000000000FF00ull) |
+                              (bits << 56) );
+}
+
+// unsigned overload: reverses the eight bytes of x
+inline void SwapEndian_64(uint64_t& x) {
+    x = ( (x >> 56) |
+          ((x << 40) & 0x00FF000000000000ull) |
+          ((x << 24) & 0x0000FF0000000000ull) |
+          ((x << 8)  & 0x000000FF00000000ull) |
+          ((x >> 8)  & 0x00000000FF000000ull) |
+          ((x >> 24) & 0x0000000000FF0000ull) |
+          ((x >> 40) & 0x000000000000FF00ull) |
+          (x << 56) );
+}
+
+// swaps endianness of 'next 2 bytes' in a char buffer (in-place)
+//
+// The bytes are exchanged directly rather than by viewing the buffer as a
+// uint16_t: 'data' may point at any offset, and a misaligned or
+// type-punned 16-bit access is not valid on every platform.
+// 'data' must reference at least 2 readable and writable bytes.
+inline void SwapEndian_16p(char* data) {
+    const char first = data[0];
+    data[0] = data[1];
+    data[1] = first;
+}
+
+// swaps endianness of 'next 4 bytes' in a char buffer (in-place)
+//
+// The bytes are reversed directly rather than by viewing the buffer as a
+// uint32_t: 'data' may point at any offset, and a misaligned or
+// type-punned 32-bit access is not valid on every platform.
+// 'data' must reference at least 4 readable and writable bytes.
+inline void SwapEndian_32p(char* data) {
+    char held = data[0];
+    data[0] = data[3];
+    data[3] = held;
+
+    held = data[1];
+    data[1] = data[2];
+    data[2] = held;
+}
+
+// swaps endianness of 'next 8 bytes' in a char buffer (in-place)
+//
+// The bytes are reversed directly rather than by viewing the buffer as a
+// uint64_t: 'data' may point at any offset, and a misaligned or
+// type-punned 64-bit access is not valid on every platform.
+// 'data' must reference at least 8 readable and writable bytes.
+inline void SwapEndian_64p(char* data) {
+    for ( int lo = 0, hi = 7; lo < hi; ++lo, --hi ) {
+        const char held = data[lo];
+        data[lo] = data[hi];
+        data[hi] = held;
+    }
+}
+
+// Reports whether 'filename' can be opened for reading; the probe stream is
+// closed again when it goes out of scope.
+inline bool FileExists(const std::string& filename) {
+    std::ifstream probe;
+    probe.open(filename.c_str(), std::ifstream::in);
+    const bool openFailed = probe.fail();
+    return !openFailed;
+}
+
+} // namespace BamTools
+
+#endif // BAMAUX_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamIndex.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamIndex.cpp
new file mode 100755
index 0000000..eab8a69
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamIndex.cpp
@@ -0,0 +1,230 @@
+// ***************************************************************************
+// BamIndex.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index functionality - both for the default (standardized) BAM
+// index format (.bai) as well as a BamTools-specific (nonstandard) index
+// format (.bti).
+// ***************************************************************************
+
+#include <BamIndex.h>
+#include <BamReader.h>
+#include <BGZF.h>
+#include <BamStandardIndex_p.h>
+#include <BamToolsIndex_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <map>
+using namespace std;
+
+// --------------------------------------------------
+// BamIndex factory methods
+
+// returns index based on BAM filename 'stub'
+// checks first for preferred type, returns that type if found
+// (if not found, attempts to load other type(s), returns 0 if NONE found)
+//
+// ** default preferred type is BamToolsIndex ** use this anytime it exists
+//
+// Index files are looked up as <bamFilename>.bti and <bamFilename>.bai.
+// The returned object is allocated with new.
+BamIndex* BamIndex::FromBamFilename(const std::string& bamFilename,
+                                    BamTools::BgzfData* bgzf,
+                                    BamTools::BamReader* reader,
+                                    const BamIndex::PreferredIndexType& type)
+{
+    // preferred BamTools index present?
+    const bool haveBti = BamTools::FileExists(bamFilename + ".bti");
+    if ( haveBti && (type == BamIndex::BAMTOOLS) )
+        return new BamToolsIndex(bgzf, reader);
+
+    // preferred standard index present?
+    const bool haveBai = BamTools::FileExists(bamFilename + ".bai");
+    if ( haveBai && (type == BamIndex::STANDARD) )
+        return new BamStandardIndex(bgzf, reader);
+
+    // preferred type is missing: fall back to whichever index exists,
+    // trying the BamTools format first
+    if ( haveBti )
+        return new BamToolsIndex(bgzf, reader);
+    if ( haveBai )
+        return new BamStandardIndex(bgzf, reader);
+
+    // no index file of either type
+    return 0;
+}
+
+// returns index based on explicitly named index file (or 0 if not found)
+//
+// The index type is chosen from the filename's extension: ".bti" yields a
+// BamToolsIndex, ".bai" a BamStandardIndex. Returns 0 when the file cannot
+// be opened or carries neither extension. The returned object is
+// allocated with new.
+BamIndex* BamIndex::FromIndexFilename(const std::string& indexFilename,
+                                      BamTools::BgzfData* bgzf,
+                                      BamTools::BamReader* reader)
+{
+    // see if specified file exists
+    if ( !BamTools::FileExists(indexFilename) ) return 0;
+
+    const std::string bamtoolsIndexExtension(".bti");
+    const std::string standardIndexExtension(".bai");
+    const std::string::size_type nameLength = indexFilename.length();
+
+    // The extension must be a true suffix. Comparing the tail directly
+    // (instead of searching for the first occurrence) stays correct when
+    // the extension also appears earlier in the name, and the length guard
+    // keeps names shorter than the extension from ever matching.
+    const std::string::size_type btiLength = bamtoolsIndexExtension.length();
+    if ( nameLength >= btiLength &&
+         indexFilename.compare(nameLength - btiLength, btiLength, bamtoolsIndexExtension) == 0 )
+        return new BamToolsIndex(bgzf, reader);
+
+    const std::string::size_type baiLength = standardIndexExtension.length();
+    if ( nameLength >= baiLength &&
+         indexFilename.compare(nameLength - baiLength, baiLength, standardIndexExtension) == 0 )
+        return new BamStandardIndex(bgzf, reader);
+
+    // otherwise, unsupported file type
+    return 0;
+}
+
+// -------------------------------
+// BamIndex implementation
+
+// ctor
+// Stores the BGZF and reader pointers, starts in limited-caching mode with
+// no index stream open, and copies the reader's reference data when a
+// reader was supplied and is open.
+BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader)
+    : m_BGZF(bgzf)
+    , m_reader(reader)
+    , m_cacheMode(BamIndex::LimitedIndexCaching)
+    , m_indexStream(0)
+{
+    if ( m_reader == 0 ) return;
+    if ( !m_reader->IsOpen() ) return;
+    m_references = m_reader->GetReferenceData();
+}
+
+// dtor
+// Closes the index stream if one is still open
+BamIndex::~BamIndex(void) {
+    if ( !IsOpen() ) return;
+    fclose(m_indexStream);
+}
+
+// Reports whether an index stream is currently held (FILE* is non-null)
+bool BamIndex::IsOpen(void) const {
+    const bool haveStream = ( m_indexStream != 0 );
+    return haveStream;
+}
+
+// loads existing data from file into memory
+//
+// Opens 'filename' for binary reading, validates the header, reads the
+// reference data (kept in memory only when full caching is selected) and
+// then applies the current cache mode. Returns true on success. On failure
+// the stream is closed and the handle reset, so IsOpen() reports false and
+// the destructor does not close the same FILE* a second time.
+bool BamIndex::Load(const string& filename) {
+
+    // open index file, abort on error
+    if ( !OpenIndexFile(filename, "rb") ) {
+        fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
+        return false;
+    }
+
+    // check magic number, then load reference data
+    // (but only keep in memory if full caching requested)
+    const bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching );
+    if ( !LoadHeader() || !LoadAllReferences(saveInitialLoad) ) {
+        fclose(m_indexStream);
+        m_indexStream = 0; // forget the closed handle
+        return false;
+    }
+
+    // update index cache based on selected mode
+    UpdateCache();
+
+    // return success
+    return true;
+}
+
+// Opens 'filename' with the given fopen() mode string and stores the
+// resulting stream in m_indexStream; returns true if the open succeeded.
+bool BamIndex::OpenIndexFile(const string& filename, const string& mode) {
+    FILE* opened = fopen(filename.c_str(), mode.c_str());
+    m_indexStream = opened;
+    return ( opened != 0 );
+}
+
+// Seeks the index stream to DataBeginOffset() (the start of the index data,
+// past the header); returns true if the seek succeeded.
+bool BamIndex::Rewind(void) {
+    const int seekStatus = fseek64(m_indexStream, DataBeginOffset(), SEEK_SET);
+    return ( seekStatus == 0 );
+}
+
+// Switches the index caching behavior; when the mode actually changes, the
+// in-memory cache is refreshed to match.
+void BamIndex::SetCacheMode(const BamIndexCacheMode mode) {
+    // nothing to do if the requested mode is already active
+    if ( mode == m_cacheMode ) return;
+
+    m_cacheMode = mode;
+    UpdateCache();
+}
+
+// Brings the in-memory index data in line with the current cache mode.
+// Does nothing when no index file is open.
+void BamIndex::UpdateCache(void) {
+
+    if ( !IsOpen() ) return;
+
+    if ( m_cacheMode == BamIndex::FullIndexCaching ) {
+        // go back to the start of the index data and keep everything
+        Rewind();
+        LoadAllReferences(true);
+    }
+    else if ( m_cacheMode == BamIndex::LimitedIndexCaching ) {
+        if ( !HasFullDataCache() ) {
+            // nothing complete in memory: start over with the first reference
+            ClearAllData();
+            SkipToFirstReference();
+            LoadFirstReference(true);
+        }
+        else {
+            // everything is cached: trim down to the first reference
+            KeepOnlyFirstReferenceOffsets();
+        }
+    }
+    else if ( m_cacheMode == BamIndex::NoIndexCaching ) {
+        ClearAllData();
+    }
+    // any other value: nothing to do
+}
+
+// Writes the in-memory index to <bamFilename><Extension()> and then
+// re-opens that file for reading. Returns false if the file cannot be
+// opened for writing or re-opened afterwards. A failure while writing the
+// header or reference data terminates the process via exit(1).
+bool BamIndex::Write(const string& bamFilename) {
+
+    // open index file for writing
+    const string indexFilename = bamFilename + Extension();
+    if ( !OpenIndexFile(indexFilename, "wb") ) {
+        fprintf(stderr, "ERROR: Could not open file to save index.\n");
+        return false;
+    }
+
+    // header metadata first, main index data second; remember which step
+    // (if any) went wrong
+    const char* failureMessage = 0;
+    if ( !WriteHeader() )
+        failureMessage = "ERROR: There was a problem writing index metadata to new index file.\n";
+    else if ( !WriteAllReferences() )
+        failureMessage = "ERROR: There was a problem writing index data to new index file.\n";
+
+    if ( failureMessage != 0 ) {
+        fprintf(stderr, "%s", failureMessage);
+        fflush(m_indexStream);
+        fclose(m_indexStream);
+        exit(1);
+    }
+
+    // push out any buffered data and close the write handle
+    fflush(m_indexStream);
+    fclose(m_indexStream);
+
+    // re-open index file for later reading
+    if ( !OpenIndexFile(indexFilename, "rb") ) {
+        fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n");
+        return false;
+    }
+
+    return true;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamIndex.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamIndex.h
new file mode 100755
index 0000000..a1dfbfe
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamIndex.h
@@ -0,0 +1,145 @@
+// ***************************************************************************
+// BamIndex.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides basic BAM index interface
+// ***************************************************************************
+
+#ifndef BAM_INDEX_H
+#define BAM_INDEX_H
+
+#include <api_global.h>
+#include <BamAux.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class BamReader;
+class BgzfData;
+
+namespace Internal {
+ class BamStandardIndex;
+ class BamToolsIndex;
+} // namespace Internal
+
+// --------------------------------------------------
+// BamIndex base class
+//
+// Abstract base for BAM index implementations. It owns the index FILE*
+// stream and provides the shared Load()/Write()/cache-mode logic; the
+// file-format-specific steps are pure virtual and supplied by derived
+// classes. Instances are obtained through the static factory methods
+// FromBamFilename() / FromIndexFilename().
+//
+// NOTE(review): this header uses FILE and off_t but does not itself include
+// <cstdio> or <sys/types.h>; it appears to rely on another header pulling
+// them in - confirm.
+class API_EXPORT BamIndex {
+
+ // specify index-caching behavior
+ //
+ // @FullIndexCaching - store entire index file contents in memory
+ // @LimitedIndexCaching - store only index data for current reference
+ // being processed
+ // @NoIndexCaching - do not store any index data. Load as needed to
+ // calculate jump offset
+ public: enum BamIndexCacheMode { FullIndexCaching = 0
+ , LimitedIndexCaching
+ , NoIndexCaching
+ };
+
+ // ctor & dtor
+ public:
+ BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
+ virtual ~BamIndex(void);
+
+ // index interface
+ public:
+ // creates index data (in-memory) from current reader data
+ virtual bool Build(void) =0;
+ // returns supported file extension
+ virtual const std::string Extension(void) const =0;
+ // returns whether reference has alignments or no
+ virtual bool HasAlignments(const int& referenceID) const =0;
+ // attempts to use index to jump to region; returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0;
+ // loads existing data from file into memory
+ virtual bool Load(const std::string& filename);
+ // change the index caching behavior
+ virtual void SetCacheMode(const BamIndexCacheMode mode);
+ // writes in-memory index data out to file
+ // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+ virtual bool Write(const std::string& bamFilename);
+
+ // derived-classes MUST provide implementation
+ protected:
+ // clear all current index offset data in memory
+ virtual void ClearAllData(void) =0;
+ // return file position after header metadata
+ virtual const off_t DataBeginOffset(void) const =0;
+ // return true if all index data is cached
+ virtual bool HasFullDataCache(void) const =0;
+ // clears index data from all references except the first
+ virtual void KeepOnlyFirstReferenceOffsets(void) =0;
+ // load index data for all references, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ virtual bool LoadAllReferences(bool saveData = true) =0;
+ // load first reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ virtual bool LoadFirstReference(bool saveData = true) =0;
+ // load header data from index file, return true if loaded OK
+ virtual bool LoadHeader(void) =0;
+ // position file pointer to first reference begin, return true if skipped OK
+ virtual bool SkipToFirstReference(void) =0;
+ // write index reference data
+ virtual bool WriteAllReferences(void) =0;
+ // write index header data
+ virtual bool WriteHeader(void) =0;
+
+ // internal methods
+ protected:
+ // rewind index file to beginning of index data, return true if rewound OK
+ bool Rewind(void);
+
+ private:
+ // return true if FILE* is open
+ bool IsOpen(void) const;
+ // opens index file according to requested mode, return true if opened OK
+ bool OpenIndexFile(const std::string& filename, const std::string& mode);
+ // updates in-memory cache of index data, depending on current cache mode
+ void UpdateCache(void);
+
+ // factory methods for returning proper BamIndex-derived type based on available index files
+ public:
+
+ // returns index based on BAM filename 'stub'
+ // checks first for preferred type, returns that type if found
+ // (if not found, attempts to load other type(s), returns 0 if NONE found)
+ //
+ // ** default preferred type is BamToolsIndex ** use this anytime it exists
+ enum PreferredIndexType { BAMTOOLS = 0, STANDARD };
+ static BamIndex* FromBamFilename(const std::string& bamFilename,
+ BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader,
+ const BamIndex::PreferredIndexType& type = BamIndex::BAMTOOLS);
+
+ // returns index based on explicitly named index file (or 0 if not found)
+ static BamIndex* FromIndexFilename(const std::string& indexFilename,
+ BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader);
+
+ // data members
+ protected:
+ // BGZF handle and reader passed to the constructor (stored as given)
+ BamTools::BgzfData* m_BGZF;
+ BamTools::BamReader* m_reader;
+ // reference data copied from the reader at construction, if it was open
+ BamTools::RefVector m_references;
+ // current caching behavior (LimitedIndexCaching after construction)
+ BamIndex::BamIndexCacheMode m_cacheMode;
+ // index file stream; 0 while no index file is open
+ FILE* m_indexStream;
+
+
+ // friends
+ friend class Internal::BamStandardIndex;
+ friend class Internal::BamToolsIndex;
+};
+
+} // namespace BamTools
+
+#endif // BAM_INDEX_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamMultiReader.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamMultiReader.cpp
new file mode 100755
index 0000000..15b8fb2
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamMultiReader.cpp
@@ -0,0 +1,450 @@
+// ***************************************************************************
+// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Uses BGZF routines that were adapted from the bgzf.c code developed at the
+// Broad Institute.
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files.
+//
+// This functionality allows applications to work on very large sets of files
+// without requiring intermediate merge, sort, and index steps for each file
+// subset. It also improves the performance of our merge system as it
+// precludes the need to sort merged files.
+// ***************************************************************************
+
+#include <BamMultiReader.h>
+#include <BGZF.h>
+using namespace BamTools;
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <vector>
+using namespace std;
+
+// -----------------------------------------------------
+// BamMultiReader implementation
+// -----------------------------------------------------
+
+// constructor
+// Starts with no readers attached; the current reference ID and left
+// position both begin at zero.
+BamMultiReader::BamMultiReader(void)
+    : CurrentRefID(0), CurrentLeft(0)
+{
+}
+
+// destructor
+// Releases every reader and alignment buffer by delegating to Close()
+BamMultiReader::~BamMultiReader(void) {
+    this->Close();
+}
+
+// close the BAM files
+//
+// Closes and deletes every reader together with its alignment buffer, then
+// empties both bookkeeping containers. The alignment index must be emptied
+// as well: its entries point at the objects deleted here, and
+// HasOpenReaders()/GetNextAlignment() would otherwise keep using them.
+void BamMultiReader::Close(void) {
+
+    // drop the index first so nothing refers to the objects freed below
+    alignments.clear();
+
+    // close all BAM readers and clean up pointers
+    vector<pair<BamReader*, BamAlignment*> >::iterator readerIter = readers.begin();
+    vector<pair<BamReader*, BamAlignment*> >::iterator readerEnd = readers.end();
+    for ( ; readerIter != readerEnd; ++readerIter) {
+
+        BamReader* reader = readerIter->first;
+        BamAlignment* alignment = readerIter->second;
+
+        // close the reader, then release it and its alignment buffer
+        if ( reader ) reader->Close();
+        delete reader;
+        delete alignment;
+    }
+
+    // clear out the container
+    readers.clear();
+}
+
+// Asks every reader to create its index file (".bai" when useStandardIndex
+// is set, ".bti" otherwise). Every reader is attempted even after a
+// failure; returns true only if all of them succeeded.
+bool BamMultiReader::CreateIndexes(bool useStandardIndex) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+
+    bool allSucceeded = true;
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry ) {
+        const bool created = entry->first->CreateIndex(useStandardIndex);
+        if ( !created ) allSucceeded = false;
+    }
+    return allSucceeded;
+}
+
+// Forwards the requested index caching mode to every reader
+void BamMultiReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry )
+        entry->first->SetIndexCacheMode(mode);
+}
+
+// Debugging aid: prints one "refID:position filename" line to stderr for
+// each entry in the alignment index.
+void BamMultiReader::DumpAlignmentIndex(void) {
+    AlignmentIndex::const_iterator entry = alignments.begin();
+    const AlignmentIndex::const_iterator stop = alignments.end();
+    while ( entry != stop ) {
+        cerr << entry->first.first << ":" << entry->first.second
+             << " " << entry->second.first->GetFilename() << endl;
+        ++entry;
+    }
+}
+
+// Builds one merged header for all BAM files in the multireader.
+//
+// @HD and @SQ lines are taken from the first reader only. @RG lines are
+// taken from every reader but emitted once per read-group ID. An ID that
+// repeats inside a single file's header produces a warning on stderr; an
+// ID repeated across different files is skipped silently.
+const string BamMultiReader::GetHeaderText(void) const {
+
+    typedef vector<pair<BamReader*, BamAlignment*> >::const_iterator ReaderIter;
+
+    string merged = "";
+    map<string, bool> seenReadGroups; // read-group IDs emitted so far (all files)
+
+    for ( ReaderIter rs = readers.begin(); rs != readers.end(); ++rs ) {
+
+        BamReader* reader = rs->first;
+        const string headerText = reader->GetHeaderText();
+        if ( headerText.empty() ) continue;
+
+        const bool isFirstFile = ( rs == readers.begin() );
+        map<string, bool> seenInThisFile; // IDs first emitted while reading this file
+
+        // walk the header one line at a time
+        stringstream headerStream(headerText);
+        string headerLine;
+        while ( getline(headerStream, headerLine) ) {
+
+            if ( headerLine.empty() ) continue;
+
+            // HD & SQ entries come from the first file only
+            if ( isFirstFile && ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0 ) ) {
+                merged.append(headerLine.c_str());
+                merged.append(1, '\n');
+            }
+
+            // everything below concerns RG entries
+            if ( headerLine.find("@RG") != 0 ) continue;
+
+            // pull the value of the first tab-separated "ID:<value>" field
+            string readGroup;
+            stringstream fields(headerLine);
+            string field;
+            while ( std::getline(fields, field, '\t') ) {
+                stringstream fieldStream(field);
+                string tag;
+                std::getline(fieldStream, tag, ':');
+                if ( tag != "ID" ) continue;
+                std::getline(fieldStream, readGroup, ':');
+                break;
+            }
+
+            if ( seenReadGroups.find(readGroup) == seenReadGroups.end() ) {
+                // first sighting of this read group: emit and remember it
+                merged.append(headerLine.c_str());
+                merged.append(1, '\n');
+                seenReadGroups[readGroup] = true;
+                seenInThisFile[readGroup] = true;
+            }
+            else if ( seenInThisFile.find(readGroup) != seenInThisFile.end() ) {
+                // repeated within one file - worth a warning; repeats across
+                // files are expected when merging and stay silent
+                cerr << "WARNING: duplicate @RG tag " << readGroup
+                     << " entry in header of " << reader->GetFilename() << endl;
+            }
+        }
+    }
+
+    // return merged header text
+    return merged;
+}
+
+// Hands out the lowest-sorted pending alignment across all files.
+// Returns false once no reader has an alignment queued.
+bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
+
+    // an empty index means every file is exhausted
+    if ( !HasOpenReaders() ) return false;
+
+    // keep the current reference ID in step with the front of the index
+    UpdateReferenceID();
+
+    // the front entry holds the lowest alignment and the reader it came from
+    AlignmentIndex::iterator lowest = alignments.begin();
+    BamReader* sourceReader = lowest->second.first;
+    BamAlignment* pending = lowest->second.second;
+
+    // copy it out to the caller, then retire its index entry
+    nextAlignment = BamAlignment(*pending);
+    alignments.erase(lowest);
+
+    // refill from the same reader; if it has nothing left, it simply drops
+    // out of the index
+    if ( sourceReader->GetNextAlignment(*pending) ) {
+        alignments.insert(make_pair(make_pair(pending->RefID, pending->Position),
+                                    make_pair(sourceReader, pending)));
+    }
+
+    return true;
+}
+
+// Same as GetNextAlignment(), but refills from the reader with
+// GetNextAlignmentCore(), i.e. without parsing the alignment's character
+// data. Returns false once no reader has an alignment queued.
+bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
+
+    // an empty index means every file is exhausted
+    if ( !HasOpenReaders() ) return false;
+
+    // keep the current reference ID in step with the front of the index
+    UpdateReferenceID();
+
+    // the front entry holds the lowest alignment and the reader it came from
+    AlignmentIndex::iterator lowest = alignments.begin();
+    BamReader* sourceReader = lowest->second.first;
+    BamAlignment* pending = lowest->second.second;
+
+    // copy it out to the caller, then retire its index entry
+    nextAlignment = BamAlignment(*pending);
+    alignments.erase(lowest);
+
+    // refill from the same reader; if it has nothing left, it simply drops
+    // out of the index
+    if ( sourceReader->GetNextAlignmentCore(*pending) ) {
+        alignments.insert(make_pair(make_pair(pending->RefID, pending->Position),
+                                    make_pair(sourceReader, pending)));
+    }
+
+    return true;
+}
+
+// ---------------------------------------------------------------------------------------
+//
+// NB: The following GetReferenceX() functions assume that we have identical
+// references for all BAM files. We enforce this by invoking the
+// validation function (ValidateReaders) to verify that our reference data
+// is the same across all files on Open, so we will not encounter a situation
+// in which there is a mismatch and we are still live.
+//
+// ---------------------------------------------------------------------------------------
+
+// Number of reference sequences, as reported by the first reader
+const int BamMultiReader::GetReferenceCount(void) const {
+    const BamReader* firstReader = readers.front().first;
+    return firstReader->GetReferenceCount();
+}
+
+// Reference sequence entries, as reported by the first reader
+const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
+    const BamReader* firstReader = readers.front().first;
+    return firstReader->GetReferenceData();
+}
+
+// Looks up the reference ID for 'refName' using the first reader
+const int BamMultiReader::GetReferenceID(const string& refName) const {
+    const BamReader* firstReader = readers.front().first;
+    return firstReader->GetReferenceID(refName);
+}
+
+// ---------------------------------------------------------------------------------------
+
+// True while at least one reader still has an alignment queued in the index
+bool BamMultiReader::HasOpenReaders() {
+    return !alignments.empty();
+}
+
+// Reports whether ALL underlying readers have an index loaded, which tells
+// the caller whether Jump() or SetRegion() are possible. Null reader
+// entries are ignored; with no readers at all the answer is true.
+bool BamMultiReader::IsIndexLoaded(void) const {
+    bool everyReaderIndexed = true;
+    for ( size_t i = 0; i < readers.size(); ++i ) {
+        const BamReader* reader = readers[i].first;
+        if ( reader == 0 ) continue;
+        if ( !reader->IsIndexLoaded() ) everyReaderIndexed = false;
+    }
+    return everyReaderIndexed;
+}
+
+// Jumps every reader to (refID, position) and rebuilds the alignment index.
+// A reader that cannot jump is fatal: an error is printed and the process
+// exits with status 1, so a normal return is always 'true'.
+bool BamMultiReader::Jump(int refID, int position) {
+
+    //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
+    CurrentRefID = refID;
+    CurrentLeft = position;
+
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry ) {
+        BamReader* reader = entry->first;
+        if ( reader->Jump(refID, position) ) continue;
+
+        cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl;
+        exit(1);
+    }
+
+    // every reader jumped: pull in the first alignment from each
+    UpdateAlignments();
+    return true;
+}
+
+// opens BAM files
+//
+// Opens one BamReader per filename and primes each with its first
+// alignment (core-only when coreMode is set). A file that opens but yields
+// no alignment is skipped with a warning - unless it is the only file, in
+// which case Open() fails. A file that cannot be opened makes Open() fail
+// immediately; readers opened before it stay registered and are released
+// by Close(). Returns true once all files have been processed.
+//
+// The reader (and alignment buffer) of a file that is rejected are freed
+// here, since nothing else holds a pointer to them.
+bool BamMultiReader::Open(const vector<string>& filenames, bool openIndexes, bool coreMode, bool preferStandardIndex) {
+
+    // for filename in filenames
+    fileNames = filenames; // save filenames in our multireader
+    for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
+
+        const string& filename = *it;
+        BamReader* reader = new BamReader;
+
+        // TODO; any further error handling when the open fails ??
+        if ( !reader->Open(filename, "", openIndexes, preferStandardIndex) ) {
+            delete reader; // not registered anywhere, so release it here
+            return false;
+        }
+
+        // file opened ok, check that it can be read
+        BamAlignment* alignment = new BamAlignment;
+        const bool fileOK = ( coreMode ? reader->GetNextAlignmentCore(*alignment)
+                                       : reader->GetNextAlignment(*alignment) );
+
+        if ( fileOK ) {
+            readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
+            alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                        make_pair(reader, alignment)));
+            continue;
+        }
+
+        cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
+
+        // the ignored file is never registered, so close and free it now
+        reader->Close();
+        delete alignment;
+        delete reader;
+
+        // if only file available & could not be read, return failure
+        if ( filenames.size() == 1 ) return false;
+    }
+
+    // files opened ok, at least one alignment could be read,
+    // now need to check that all files use same reference data
+    ValidateReaders();
+    return true;
+}
+
+// Prints the filename of every registered reader to stdout, one per line
+void BamMultiReader::PrintFilenames(void) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry )
+        cout << entry->first->GetFilename() << endl;
+}
+
+// Rewinds every reader to the beginning of its alignment data. All readers
+// are attempted even after a failure; returns true only if every one of
+// them rewound successfully.
+bool BamMultiReader::Rewind(void) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+
+    bool allRewound = true;
+    for ( ReaderIter entry = readers.begin(); entry != readers.end(); ++entry ) {
+        const bool rewound = entry->first->Rewind();
+        if ( !rewound ) allRewound = false;
+    }
+    return allRewound;
+}
+
+// Convenience overload: packs the four bounds into a BamRegion and
+// delegates to SetRegion(const BamRegion&).
+bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) {
+    return SetRegion( BamRegion(leftRefID, leftPosition, rightRefID, rightPosition) );
+}
+
+// stores 'region', applies it to every reader, then refreshes the alignment cache
+// a reader that cannot jump is reported on stderr but otherwise tolerated,
+// so this method always returns true
+bool BamMultiReader::SetRegion(const BamRegion& region) {
+
+    Region = region;
+
+    // NB: a failed SetRegion on a single reader effectively means
+    // "no alignments here" for that file, so the failure is accepted
+    // rather than tracked; UpdateAlignments() below sorts out which
+    // readers still have data.
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+    for (ReaderIter entry = readers.begin(); entry != readers.end(); ++entry) {
+        BamReader* reader = entry->first;
+        if (reader->SetRegion(region))
+            continue;
+        cerr << "ERROR: could not jump " << reader->GetFilename() << " to "
+             << region.LeftRefID << ":" << region.LeftPosition
+             << ".." << region.RightRefID << ":" << region.RightPosition << endl;
+    }
+
+    UpdateAlignments();
+    return true;
+}
+
+// rebuilds the alignment cache: drops all cached entries, pulls the next
+// alignment from each reader and indexes it by (reference ID, position)
+void BamMultiReader::UpdateAlignments(void) {
+    alignments.clear();
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIter;
+    for (ReaderIter entry = readers.begin(); entry != readers.end(); ++entry) {
+        BamReader* reader = entry->first;
+        BamAlignment* alignment = entry->second;
+
+        // a failed read is assumed to mean end of region / EOF for this reader,
+        // which therefore contributes nothing to the cache
+        if (!reader->GetNextAlignment(*alignment))
+            continue;
+
+        alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                    make_pair(reader, alignment)));
+    }
+}
+
+// updates the reference id stored in the BamMultiReader
+// to reflect the current state of the readers
+//
+// The alignment cache is ordered by (reference ID, position), so its first
+// entry always carries the lowest reference ID among the readers that still
+// have data; CurrentRefID is set to that value.
+void BamMultiReader::UpdateReferenceID(void) {
+
+    // no cached alignments (every reader exhausted): begin() == end() and must
+    // not be dereferenced, so leave CurrentRefID untouched
+    if (alignments.empty())
+        return;
+
+    // assign directly instead of counting upwards one id at a time: the result
+    // is the same when the lowest id is ahead of CurrentRefID, and it also
+    // terminates when the lowest id is *behind* it (e.g. after a rewind)
+    CurrentRefID = alignments.begin()->second.second->RefID;
+}
+
+// ValidateReaders checks that all the readers point to BAM files representing
+// alignments against the same set of reference sequences, and that the
+// sequences are identically ordered. If these checks fail the operation of
+// the multireader is undefined, so we force program exit.
+//
+// With no readers at all there is nothing to compare and the function
+// returns immediately.
+void BamMultiReader::ValidateReaders(void) const {
+
+    // front() on an empty vector is undefined behavior; this can happen when
+    // several files were requested but none yielded a first alignment
+    if (readers.empty())
+        return;
+
+    // the first reader serves as the yardstick for all others; its reference
+    // data is bound by reference (GetReferenceData() returns a const reference)
+    // to avoid copying the whole vector
+    const BamReader* firstReader = readers.front().first;
+    const int firstRefCount = firstReader->GetReferenceCount();
+    const BamTools::RefVector& firstRefData = firstReader->GetReferenceData();
+
+    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
+        const BamReader* reader = it->first;
+        const BamTools::RefVector& currentRefData = reader->GetReferenceData();
+
+        // both sets must have the same number of references
+        if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
+            cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
+                 << " expected " << firstRefCount
+                 << " reference sequences but only found " << reader->GetReferenceCount() << endl;
+            exit(1);
+        }
+
+        // sizes are equal (checked above), so both iterators can advance in lockstep;
+        // every reference must match in name and length, in the same order
+        BamTools::RefVector::const_iterator f = firstRefData.begin();
+        BamTools::RefVector::const_iterator c = currentRefData.begin();
+        for ( ; f != firstRefData.end(); ++f, ++c) {
+            if (f->RefName == c->RefName && f->RefLength == c->RefLength)
+                continue;
+
+            cerr << "ERROR: mismatched references found in " << reader->GetFilename()
+                 << " expected: " << endl;
+            for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
+                cerr << a->RefName << " " << a->RefLength << endl;
+            cerr << "but found: " << endl;
+            for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
+                cerr << a->RefName << " " << a->RefLength << endl;
+            exit(1);
+        }
+    }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamMultiReader.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamMultiReader.h
new file mode 100755
index 0000000..e3c05cc
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamMultiReader.h
@@ -0,0 +1,136 @@
+// ***************************************************************************
+// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files
+// ***************************************************************************
+
+#ifndef BAMMULTIREADER_H
+#define BAMMULTIREADER_H
+
+#include <api_global.h>
+#include <BamReader.h>
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+
+namespace BamTools {
+
+// index mapping reference/position pairings to bamreaders and their alignments
+typedef std::multimap<std::pair<int, int>, std::pair<BamReader*, BamAlignment*> > AlignmentIndex;
+
+// Reads several BAM files through a single interface. Each file is handled by
+// its own BamReader, paired with a BamAlignment that holds that reader's most
+// recently read record; those pairs are additionally kept in an index keyed by
+// (reference ID, position).
+// NOTE(review): CurrentRefID, CurrentLeft and Region are public data members,
+// so callers can modify them directly.
+class API_EXPORT BamMultiReader {
+
+ // constructor / destructor
+ public:
+ BamMultiReader(void);
+ ~BamMultiReader(void);
+
+ // public interface
+ public:
+
+ // positioning
+ int CurrentRefID;
+ int CurrentLeft;
+
+ // region under analysis, specified using SetRegion
+ BamRegion Region;
+
+ // ----------------------
+ // BAM file operations
+ // ----------------------
+
+ // close BAM files
+ void Close(void);
+
+ // opens BAM files (and optional BAM index files, if provided)
+ // @openIndexes - triggers index opening, useful for suppressing
+ // error messages during merging of files in which we may not have
+ // indexes.
+ // @coreMode - setup our first alignments using GetNextAlignmentCore();
+ // also useful for merging
+ // @preferStandardIndex - look for standard BAM index ".bai" first. If false,
+ // will look for BamTools index ".bti".
+ bool Open(const std::vector<std::string>& filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false);
+
+ // returns whether underlying BAM readers ALL have an index loaded
+ // this is useful to indicate whether Jump() or SetRegion() are possible
+ bool IsIndexLoaded(void) const;
+
+ // performs random-access jump to reference, position
+ bool Jump(int refID, int position = 0);
+
+ // sets the target region
+ // NOTE: the implementation reports a reader that cannot jump on stderr,
+ // refreshes the alignment cache and always returns true
+ bool SetRegion(const BamRegion& region);
+ bool SetRegion(const int&, const int&, const int&, const int&); // convenience function to above
+
+ // returns file pointers to beginning of alignments
+ // (true only if every underlying reader rewound successfully)
+ bool Rewind(void);
+
+ // ----------------------
+ // access alignment data
+ // ----------------------
+ // updates the reference id marker to match the lower limit of our readers
+ void UpdateReferenceID(void);
+
+ // retrieves next available alignment (returns success/fail) from all files
+ bool GetNextAlignment(BamAlignment&);
+ // retrieves next available alignment (returns success/fail) from all files
+ // and populates the support data with information about the alignment
+ // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT
+ bool GetNextAlignmentCore(BamAlignment&);
+ // ... should this be private?
+ bool HasOpenReaders(void);
+
+ // ----------------------
+ // access auxiliary data
+ // ----------------------
+
+ // returns unified SAM header text for all files
+ const std::string GetHeaderText(void) const;
+ // returns number of reference sequences
+ const int GetReferenceCount(void) const;
+ // returns vector of reference objects
+ const BamTools::RefVector GetReferenceData(void) const;
+ // returns reference id (used for BamMultiReader::Jump()) for the given reference name
+ const int GetReferenceID(const std::string& refName) const;
+ // validates that we have a congruent set of BAM files that are aligned against the same reference sequences
+ // NOTE: a mismatch in reference count, names or lengths is reported on
+ // stderr and terminates the program via exit(1)
+ void ValidateReaders() const;
+
+ // ----------------------
+ // BAM index operations
+ // ----------------------
+
+ // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai")
+ bool CreateIndexes(bool useStandardIndex = true);
+
+ // sets the index caching mode for the readers
+ void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode);
+
+ //const int GetReferenceID(const string& refName) const;
+
+ // utility
+ void PrintFilenames(void); // writes each reader's filename to stdout
+ void DumpAlignmentIndex(void);
+ void UpdateAlignments(void); // updates our alignment cache
+
+ // private implementation
+ private:
+
+ // the set of readers and alignments which we operate on, maintained throughout the life of this class
+ // (each reader is paired with the alignment object it reads into)
+ std::vector<std::pair<BamReader*, BamAlignment*> > readers;
+
+ // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment
+ // when a reader reaches EOF, its entry is removed from this index
+ AlignmentIndex alignments;
+
+ // filenames as passed to Open()
+ std::vector<std::string> fileNames;
+};
+
+} // namespace BamTools
+
+#endif // BAMMULTIREADER_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader.cpp
new file mode 100755
index 0000000..70339a6
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader.cpp
@@ -0,0 +1,66 @@
+// ***************************************************************************
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#include <BamReader.h>
+#include <BamReader_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <vector>
+using namespace std;
+
+// constructor
+// creates the private implementation object, handing it a back-pointer
+// to this reader
+BamReader::BamReader(void)
+    : d(new BamReaderPrivate(this))
+{ }
+
+// destructor
+// destroys the private implementation object created by the constructor,
+// then nulls the pointer
+BamReader::~BamReader(void) {
+ delete d;
+ d = 0;
+}
+
+// ----------------------
+// file operations
+// (each call is forwarded to the private implementation 'd')
+// ----------------------
+
+// closes the BAM file
+void BamReader::Close(void) {
+    d->Close();
+}
+
+// returns the private implementation's HasIndex flag
+bool BamReader::HasIndex(void) const {
+    return d->HasIndex;
+}
+
+// deprecated synonym for HasIndex()
+bool BamReader::IsIndexLoaded(void) const {
+    return HasIndex();
+}
+
+// returns whether the underlying BGZF stream is open
+bool BamReader::IsOpen(void) const {
+    return d->mBGZF.IsOpen;
+}
+
+// random-access jump: implemented as a region built from (refID, position) only
+bool BamReader::Jump(int refID, int position) {
+    return d->SetRegion( BamRegion(refID, position) );
+}
+
+// opens the BAM file (and, optionally, an index file)
+bool BamReader::Open(const std::string& filename,
+                     const std::string& indexFilename,
+                     const bool lookForIndex,
+                     const bool preferStandardIndex)
+{
+    return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex);
+}
+
+// returns file pointer to beginning of alignments
+bool BamReader::Rewind(void) {
+    return d->Rewind();
+}
+
+// sets the region of interest
+bool BamReader::SetRegion(const BamRegion& region) {
+    return d->SetRegion(region);
+}
+
+// convenience overload: builds the BamRegion from its four bounds
+bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {
+    return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) );
+}
+
+// ----------------------
+// access alignment data
+// ----------------------
+
+// retrieves the next alignment (forwarded to the private implementation)
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
+    return d->GetNextAlignment(bAlignment);
+}
+
+// retrieves the next alignment's core data only (forwarded to the private implementation)
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) {
+    return d->GetNextAlignmentCore(bAlignment);
+}
+
+// ----------------------
+// access auxiliary data
+// ----------------------
+
+// returns the SAM header text
+const string BamReader::GetHeaderText(void) const {
+    return d->GetHeaderText();
+}
+
+// returns the number of reference sequences
+int BamReader::GetReferenceCount(void) const {
+    return d->References.size();
+}
+
+// returns a reference to the reader's own reference data (no copy is made)
+const RefVector& BamReader::GetReferenceData(void) const {
+    return d->References;
+}
+
+// returns the reference id for the given reference name
+int BamReader::GetReferenceID(const string& refName) const {
+    return d->GetReferenceID(refName);
+}
+
+// returns the name of the file associated with this reader
+const std::string BamReader::GetFilename(void) const {
+    return d->Filename;
+}
+
+// ----------------------
+// index operations
+// ----------------------
+
+// creates an index for the BAM file (forwarded to the private implementation)
+bool BamReader::CreateIndex(bool useStandardIndex) {
+    return d->CreateIndex(useStandardIndex);
+}
+
+// changes the index caching behavior (forwarded to the private implementation)
+void BamReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
+    d->SetIndexCacheMode(mode);
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader.h
new file mode 100755
index 0000000..b5d9a26
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader.h
@@ -0,0 +1,130 @@
+// ***************************************************************************
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#ifndef BAMREADER_H
+#define BAMREADER_H
+
+#include <api_global.h>
+#include <BamAlignment.h>
+#include <BamIndex.h>
+#include <string>
+
+namespace BamTools {
+
+namespace Internal {
+ class BamReaderPrivate;
+} // namespace Internal
+
+// Reader for a single BAM file. The class itself only holds a pointer to a
+// private implementation object (Internal::BamReaderPrivate); the constructor
+// allocates it and the destructor deletes it.
+// NOTE(review): no copy constructor or assignment operator is declared here,
+// so a copied BamReader would share (and eventually delete twice) the same
+// implementation pointer - avoid copying instances.
+class API_EXPORT BamReader {
+
+ // constructor / destructor
+ public:
+ BamReader(void);
+ ~BamReader(void);
+
+ // public interface
+ public:
+
+ // ----------------------
+ // BAM file operations
+ // ----------------------
+
+ // close BAM file
+ void Close(void);
+ // returns whether reader is open for reading or not
+ bool IsOpen(void) const;
+ // performs random-access jump using (reference, position) as a left-bound
+ // (implemented as SetRegion() with only a left bound)
+ bool Jump(int refID, int position = 0);
+ // opens BAM file (and optional BAM index file, if provided)
+ // @lookForIndex - if no indexFilename provided, look in BAM file's directory for an existing index file
+ // default behavior is to skip index file search if no index filename given
+ // @preferStandardIndex - if true, give priority in index file searching to standard BAM index (*.bai)
+ // default behavior is to prefer the BamToolsIndex (*.bti) if both are available
+ bool Open(const std::string& filename,
+ const std::string& indexFilename = "",
+ const bool lookForIndex = false,
+ const bool preferStandardIndex = false);
+ // returns file pointer to beginning of alignments
+ bool Rewind(void);
+ // sets a region of interest (with left & right bound reference/position)
+ // returns success/failure of seeking to left bound of region
+ bool SetRegion(const BamRegion& region);
+ bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound);
+
+ // ----------------------
+ // access alignment data
+ // ----------------------
+
+ // retrieves next available alignment (returns success/fail)
+ bool GetNextAlignment(BamAlignment& bAlignment);
+ // retrieves next available alignment core data (returns success/fail)
+ // ** DOES NOT parse any character data (read name, bases, qualities, tag data) **
+ // useful for operations requiring ONLY aligner-related information
+ // (refId/position, alignment flags, CIGAR, mapQuality, etc)
+ bool GetNextAlignmentCore(BamAlignment& bAlignment);
+
+ // ----------------------
+ // access auxiliary data
+ // ----------------------
+
+ // returns SAM header text
+ const std::string GetHeaderText(void) const;
+ // returns number of reference sequences
+ int GetReferenceCount(void) const;
+ // returns vector of reference objects
+ // (a reference to the reader's own data, not a copy)
+ const BamTools::RefVector& GetReferenceData(void) const;
+ // returns reference id (used for BamReader::Jump()) for the given reference name
+ int GetReferenceID(const std::string& refName) const;
+ // returns the name of the file associated with this BamReader
+ const std::string GetFilename(void) const;
+
+ // ----------------------
+ // BAM index operations
+ // ----------------------
+
+ // creates index for BAM file, saves to file
+ // default behavior is to create the BAM standard index (".bai")
+ // set flag to false to create the BamTools-specific index (".bti")
+ bool CreateIndex(bool useStandardIndex = true);
+ // returns whether index data is available for reading
+ // (e.g. if true, BamReader should be able to seek to a region)
+ bool HasIndex(void) const;
+ // change the index caching behavior
+ // default BamReader/Index mode is LimitedIndexCaching
+ // @mode - can be either FullIndexCaching, LimitedIndexCaching,
+ // or NoIndexCaching. See BamIndex.h for more details
+ void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode);
+
+ // deprecated methods
+ public:
+
+ // deprecated (but still available): prefer HasIndex() instead
+ //
+ // Deprecated purely for API semantic clarity - HasIndex() should be clearer
+ // than IsIndexLoaded() in light of the new caching modes that may clear the
+ // index data from memory, but leave the index file open for later random access
+ // seeks.
+ //
+ // For example, what would (IsIndexLoaded() == true) mean when cacheMode has been
+ // explicitly set to NoIndexCaching? This is confusing at best, misleading about
+ // current memory behavior at worst.
+ //
+ // returns whether index data is available
+ // (e.g. if true, BamReader should be able to seek to a region)
+ bool IsIndexLoaded(void) const;
+
+ // private implementation
+ private:
+ // allocated in the constructor, deleted in the destructor
+ Internal::BamReaderPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMREADER_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader_p.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader_p.cpp
new file mode 100755
index 0000000..f319a1e
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader_p.cpp
@@ -0,0 +1,729 @@
+// ***************************************************************************
+// BamReader_p.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#include <BamReader.h>
+#include <BGZF.h>
+#include <BamReader_p.h>
+#include <BamStandardIndex_p.h>
+#include <BamToolsIndex_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <vector>
+using namespace std;
+
+// constructor
+// starts out with empty header text, no index, limited index caching and the
+// lookup tables used to decode packed bases and CIGAR operations;
+// 'parent' is the public BamReader that owns this object
+BamReaderPrivate::BamReaderPrivate(BamReader* parent) :
+    HeaderText(""),
+    Index(0),
+    HasIndex(false),
+    AlignmentsBeginOffset(0),
+    IndexCacheMode(BamIndex::LimitedIndexCaching),
+    HasAlignmentsInRegion(true),
+    Parent(parent),
+    DNA_LOOKUP("=ACMGRSVTWYHKDBN"),
+    CIGAR_LOOKUP("MIDNSHP")
+{
+    // byte order of the host is determined once, up front
+    IsBigEndian = SystemIsBigEndian();
+}
+
+// destructor
+// delegates all cleanup to Close(), which closes the BGZF stream and clears
+// the index, header text and region
+BamReaderPrivate::~BamReaderPrivate(void) {
+ Close();
+}
+
+// adjusts requested region if necessary (depending on where data actually begins)
+//
+// Walks the references from the region's left bound towards its right bound
+// (or the last known reference when no right bound is given) and records in
+// HasAlignmentsInRegion whether the index reports alignments on any of them.
+// If the first reference with data is not the requested left reference, the
+// region's left bound is moved to the start of that reference.
+// Does nothing when no index is available.
+void BamReaderPrivate::AdjustRegion(BamRegion& region) {
+
+    // without an index there is nothing to consult
+    if ( Index == 0 ) return;
+
+    // last reference that still belongs to the region
+    const int lastRefId = ( region.isRightBoundSpecified() ? region.RightRefID : References.size() - 1 );
+
+    // find the first reference in [left, last] that has alignments
+    HasAlignmentsInRegion = false;
+    int refId = region.LeftRefID;
+    for ( ; refId <= lastRefId; ++refId ) {
+        HasAlignmentsInRegion = Index->HasAlignments(refId);
+        if ( HasAlignmentsInRegion ) break;
+    }
+
+    // no data on any reference in the region: leave the region untouched
+    if ( !HasAlignmentsInRegion ) return;
+
+    // data starts on the requested left reference: leave the region as-is
+    if ( refId == region.LeftRefID ) return;
+
+    // otherwise start at the beginning of the first reference that had data
+    region.LeftRefID = refId;
+    region.LeftPosition = 0;
+}
+
+// fills out character data for BamAlignment data
+//
+// Decodes the raw bytes in bAlignment.SupportData.AllCharData into the
+// alignment's Name, QueryBases, Qualities, AlignedBases and TagData fields,
+// then clears the HasCoreOnly flag. The buffer is treated as, in order:
+// read name, CIGAR (4 bytes per op), packed bases (2 per byte), qualities
+// (1 byte per base), tag data.
+// Always returns true; an unknown CIGAR op or (on big-endian hosts) an unknown
+// tag value type terminates the program via exit(1).
+bool BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {
+
+ // calculate character lengths/offsets
+ // (offsets are relative to the start of AllCharData)
+ // NOTE(review): these are unsigned; if a record's sizes were inconsistent the
+ // subtractions would wrap around - tagDataLength is only used when hasTagData
+ // is true, i.e. when tagDataOffset < dataLength
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
+ const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);
+ const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;
+ const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;
+ const unsigned int tagDataLength = dataLength - tagDataOffset;
+
+ // check offsets to see what char data exists
+ const bool hasSeqData = ( seqDataOffset < dataLength );
+ const bool hasQualData = ( qualDataOffset < dataLength );
+ const bool hasTagData = ( tagDataOffset < dataLength );
+
+ // set up char buffers
+ // (tagData is non-const because the big-endian path below swaps bytes in place)
+ const char* allCharData = bAlignment.SupportData.AllCharData.data();
+ const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 );
+ const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 );
+ char* tagData = ( hasTagData ? (((char*)allCharData) + tagDataOffset) : (char*)0 );
+
+ // store alignment name (relies on null char in name as terminator)
+ bAlignment.Name.assign((const char*)(allCharData));
+
+ // save query sequence
+ // each byte holds two 4-bit base codes, high nibble first; the code indexes DNA_LOOKUP
+ bAlignment.QueryBases.clear();
+ if ( hasSeqData ) {
+ bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
+ char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
+ bAlignment.QueryBases.append(1, singleBase);
+ }
+ }
+
+ // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
+ bAlignment.Qualities.clear();
+ if ( hasQualData ) {
+ bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
+ char singleQuality = (char)(qualData[i]+33);
+ bAlignment.Qualities.append(1, singleQuality);
+ }
+ }
+
+ // if QueryBases is empty (and this is a allowed case)
+ if ( bAlignment.QueryBases.empty() )
+ bAlignment.AlignedBases = bAlignment.QueryBases;
+
+ // if QueryBases contains data, then build AlignedBases using CIGAR data
+ else {
+
+ // resize AlignedBases
+ bAlignment.AlignedBases.clear();
+ bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);
+
+ // iterate over CigarOps
+ // k is the read offset into QueryBases of the next unconsumed base
+ int k = 0;
+ vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();
+ vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();
+ for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+
+ const CigarOp& op = (*cigarIter);
+ switch(op.Type) {
+
+ // NOTE(review): substr() throws std::out_of_range if k exceeds the length
+ // of QueryBases, which would require a CIGAR inconsistent with the sequence
+ case ('M') :
+ case ('I') :
+ bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases
+ // fall through
+ // (deliberate: 'M' and 'I' must also advance k, which the 'S' case does)
+
+ case ('S') :
+ k += op.Length; // for 'S' - soft clip, skip over query bases
+ break;
+
+ case ('D') :
+ bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character
+ break;
+
+ case ('P') :
+ bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character
+ break;
+
+ case ('N') :
+ bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence
+ break;
+
+ case ('H') :
+ break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op
+
+ default:
+ fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here
+ exit(1);
+ }
+ }
+ }
+
+ // save tag data
+ bAlignment.TagData.clear();
+ if ( hasTagData ) {
+ // on big-endian hosts, walk the tags and byte-swap every multi-byte numeric
+ // value in place (inside AllCharData) before the tag block is copied out
+ if ( IsBigEndian ) {
+ int i = 0;
+ while ( (unsigned int)i < tagDataLength ) {
+
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
+ ++i; // skip value type
+
+ // advance i past the value; its width depends on the value type
+ // NOTE(review): any other type code (e.g. the 'B' array type of later
+ // SAM/BAM specifications - TODO confirm) lands in the default branch
+ switch (type) {
+
+ case('A') :
+ case('C') :
+ ++i;
+ break;
+
+ case('S') :
+ SwapEndian_16p(&tagData[i]);
+ i += sizeof(uint16_t);
+ break;
+
+ case('F') :
+ case('I') :
+ SwapEndian_32p(&tagData[i]);
+ i += sizeof(uint32_t);
+ break;
+
+ case('D') :
+ SwapEndian_64p(&tagData[i]);
+ i += sizeof(uint64_t);
+ break;
+
+ case('H') :
+ case('Z') :
+ while (tagData[i]) { ++i; }
+ ++i; // increment one more for null terminator
+ break;
+
+ default :
+ fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+ exit(1);
+ }
+ }
+ }
+
+ // store tagData in alignment
+ bAlignment.TagData.resize(tagDataLength);
+ memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);
+ }
+
+ // clear the core-only flag
+ bAlignment.SupportData.HasCoreOnly = false;
+
+ // return success
+ return true;
+}
+
+// clear index data structure
+// deletes the current index object (if any), nulls the pointer and resets
+// the HasIndex flag
+void BamReaderPrivate::ClearIndex(void) {
+ delete Index; // deleting a null pointer is a no-op
+ Index = 0;
+ HasIndex = false;
+}
+
+// closes the BAM file
+// releases, in order: the BGZF file stream, the index, the cached header
+// text and the current region
+void BamReaderPrivate::Close(void) {
+    mBGZF.Close();        // BGZF file stream
+    ClearIndex();         // index data
+    HeaderText.clear();   // header data
+    Region.clear();       // region flags
+}
+
+// creates index for BAM file, saves to file
+// default behavior is to create the BAM standard index (".bai")
+// set flag to false to create the BamTools-specific index (".bti")
+//
+// Any previously held index is discarded first. Returns true only if both
+// building the index and writing it to file succeeded; the write is
+// attempted regardless of the build result.
+bool BamReaderPrivate::CreateIndex(bool useStandardIndex) {
+
+    // start from a clean slate
+    ClearIndex();
+
+    // instantiate the requested index flavor
+    if ( useStandardIndex ) {
+        Index = new BamStandardIndex(&mBGZF, Parent);
+    } else {
+        Index = new BamToolsIndex(&mBGZF, Parent);
+    }
+
+    // full caching is used while the index is built and written
+    Index->SetCacheMode(BamIndex::FullIndexCaching);
+
+    // build the index; HasIndex reflects the build result only
+    const bool built = Index->Build();
+    HasIndex = built;
+
+    // mark empty references
+    MarkReferences();
+
+    // attempt to save index data to file
+    const bool written = Index->Write(Filename);
+
+    // restore the client's desired index cache mode
+    Index->SetCacheMode(IndexCacheMode);
+
+    return built && written;
+}
+
+// returns a copy of the header text cached by LoadHeaderData()
+// (empty if no header is currently held)
+const string BamReaderPrivate::GetHeaderText(void) const {
+    return HeaderText;
+}
+
+// get next alignment (from specified region, if given)
+// reads the core data first and, if that succeeds, decodes the character
+// data as well; returns false when no further alignment is available
+bool BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {
+
+    // no valid alignment found
+    if ( !GetNextAlignmentCore(bAlignment) )
+        return false;
+
+    // valid alignment found: parse char data and report its success/failure
+    return BuildCharData(bAlignment);
+}
+
+// retrieves next available alignment core data (returns success/fail)
+// ** DOES NOT parse any character data (read name, bases, qualities, tag data)
+//    these can be accessed, if necessary, from the supportData
+// useful for operations requiring ONLY positional or other alignment-related information
+//
+// When a region with a left bound is set, alignments lying before the region
+// are skipped; the call fails as soon as an alignment past the region is seen
+// or no further alignment can be loaded.
+bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {
+
+    // a region is set but is known to contain no alignments
+    if ( !( Region.isNull() || HasAlignmentsInRegion ) )
+        return false;
+
+    // no valid alignment available
+    if ( !LoadNextAlignment(bAlignment) )
+        return false;
+
+    // only core data has been read so far
+    bAlignment.SupportData.HasCoreOnly = true;
+
+    // without (at least) a left boundary every alignment qualifies
+    if ( !Region.isLeftBoundSpecified() )
+        return true;
+
+    // keep reading until an alignment overlaps the region
+    for ( ; ; ) {
+
+        // classify the current alignment as before / within / after the region
+        const BamReaderPrivate::RegionState state = IsOverlap(bAlignment);
+
+        // past the region: no (further) read available within it
+        if ( state == AFTER_REGION ) return false;
+
+        // overlaps the region: success
+        if ( state == WITHIN_REGION ) return true;
+
+        // still before the region: try the next alignment (failure is likely EOF)
+        if ( !LoadNextAlignment(bAlignment) ) return false;
+    }
+}
+
+// returns RefID for given RefName (returns References.size() if not found)
+//
+// The RefID is the position of the first entry in References whose RefName
+// equals refName. The reference list is scanned directly; no temporary copy
+// of the names is built.
+int BamReaderPrivate::GetReferenceID(const string& refName) const {
+
+    int refID = 0;
+    RefVector::const_iterator refIter = References.begin();
+    RefVector::const_iterator refEnd  = References.end();
+    for ( ; refIter != refEnd; ++refIter, ++refID ) {
+        if ( refIter->RefName == refName )
+            return refID;
+    }
+
+    // not found: refID has been advanced once per entry, i.e. equals References.size()
+    return refID;
+}
+
+// returns region state - whether alignment ends before, overlaps, or starts after currently specified region
+// this *internal* method should ONLY be called when (at least) IsLeftBoundSpecified == true
+BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {
+
+    // alignment is on a reference sequence before the left bound
+    if ( bAlignment.RefID < Region.LeftRefID )
+        return BEFORE_REGION;
+
+    // alignment is on the left bound reference
+    if ( bAlignment.RefID == Region.LeftRefID ) {
+
+        // starts before the left boundary: within the region only if it
+        // extends far enough to reach that boundary
+        if ( bAlignment.Position < Region.LeftPosition )
+            return ( bAlignment.GetEndPosition() >= Region.LeftPosition ) ? WITHIN_REGION : BEFORE_REGION;
+
+        // starts at or after the left boundary: after the region only if a
+        // right boundary exists on this same reference and the alignment
+        // starts past it
+        const bool startsPastRightBound = Region.isRightBoundSpecified() &&
+                                          Region.LeftRefID == Region.RightRefID &&
+                                          bAlignment.Position > Region.RightPosition;
+        return startsPastRightBound ? AFTER_REGION : WITHIN_REGION;
+    }
+
+    // from here on: alignment is on a reference after the left bound reference
+
+    // no right boundary, so everything beyond the left bound is in the region
+    if ( !Region.isRightBoundSpecified() )
+        return WITHIN_REGION;
+
+    // on a reference strictly between the two boundaries
+    if ( bAlignment.RefID < Region.RightRefID )
+        return WITHIN_REGION;
+
+    // on a reference after the right boundary
+    if ( bAlignment.RefID > Region.RightRefID )
+        return AFTER_REGION;
+
+    // on the right bound reference: must start before or at the right boundary
+    return ( bAlignment.Position <= Region.RightPosition ) ? WITHIN_REGION : AFTER_REGION;
+}
+
+// load BAM header data
+//
+// Reads, from the current position of the BGZF stream: the 4-byte magic
+// ("BAM\1"), the 4-byte header text length, and the header text itself, which
+// is stored in HeaderText (up to its first null character).
+// Any short read or a wrong magic value is reported on stderr and terminates
+// the program via exit(1).
+void BamReaderPrivate::LoadHeaderData(void) {
+
+    // check to see if proper BAM header
+    char buffer[4];
+    if (mBGZF.Read(buffer, 4) != 4) {
+        fprintf(stderr, "Could not read header type\n");
+        exit(1);
+    }
+
+    if (strncmp(buffer, "BAM\001", 4)) {
+        fprintf(stderr, "wrong header type!\n");
+        exit(1);
+    }
+
+    // get BAM header text length
+    // (a short read would leave stale bytes in 'buffer' and yield a bogus length)
+    if (mBGZF.Read(buffer, 4) != 4) {
+        fprintf(stderr, "Could not read header text length\n");
+        exit(1);
+    }
+    unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);
+    if ( IsBigEndian ) SwapEndian_32(headerTextLength);
+
+    // get BAM header text
+    // the buffer is zero-filled and one byte longer than the text, so it is
+    // always null-terminated; vector releases it automatically and reports
+    // allocation failure by exception instead of handing back a null pointer
+    vector<char> headerText(static_cast<size_t>(headerTextLength) + 1, '\0');
+    if (mBGZF.Read(&headerText[0], headerTextLength) != static_cast<int>(headerTextLength)) {
+        fprintf(stderr, "Could not read header text\n");
+        exit(1);
+    }
+    HeaderText = (const char*)&headerText[0];
+}
+
+// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail
+//
+// Any index currently held is discarded first. When no index filename has been
+// set, an index is looked up from the BAM filename (honoring
+// preferStandardIndex) and IndexFilename is derived from the index type found;
+// otherwise the client-provided IndexFilename is used.
+// NOTE(review): the lookForIndex parameter is not consulted in this function.
+bool BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) {
+
+    // clear out any existing index data
+    ClearIndex();
+
+    // no index filename provided means we have to look for available index files
+    const bool searchByBamFilename = IndexFilename.empty();
+
+    if ( searchByBamFilename ) {
+        // pick the index flavor to look for first
+        BamIndex::PreferredIndexType type = BamIndex::BAMTOOLS;
+        if ( preferStandardIndex ) type = BamIndex::STANDARD;
+        Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type);
+    } else {
+        // use the index file named by the client
+        Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent);
+    }
+
+    // no index object could be created
+    if ( Index == 0 ) return false;
+
+    // remember which index file goes with the index type that was found
+    if ( searchByBamFilename )
+        IndexFilename = Filename + Index->Extension();
+
+    // set cache mode for BamIndex
+    Index->SetCacheMode(IndexCacheMode);
+
+    // load the index data from file
+    HasIndex = Index->Load(IndexFilename);
+
+    // mark empty references
+    MarkReferences();
+
+    // return index status
+    return HasIndex;
+}
+
+// Reads the alignment record under the file pointer into 'bAlignment'.
+//
+// Fills the fixed-size core fields, stores the record's variable-length bytes
+// in SupportData.AllCharData and decodes the CIGAR operations into CigarData.
+// Returns false when no complete, self-consistent record could be read: short
+// read, zero or undersized block length, allocation failure, or a CIGAR array
+// that does not fit inside the record.
+bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
+
+    // read in the 'block length' value, make sure it's not zero
+    // (a short read is reported as failure instead of decoding an unfilled buffer)
+    char buffer[4];
+    if ( mBGZF.Read(buffer, 4) != 4 ) return false;
+    bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);
+    if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }
+    if ( bAlignment.SupportData.BlockLength == 0 ) return false;
+
+    // a block shorter than the fixed-size core would make 'dataLength' below wrap around
+    if ( bAlignment.SupportData.BlockLength < (unsigned int)BAM_CORE_SIZE ) return false;
+
+    // read in core alignment data, make sure the right size of data was read
+    char x[BAM_CORE_SIZE];
+    if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false;
+
+    if ( IsBigEndian ) {
+        for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) )
+            SwapEndian_32p(&x[i]);
+    }
+
+    // set BamAlignment 'core' and 'support' data
+    bAlignment.RefID    = BgzfData::UnpackSignedInt(&x[0]);
+    bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);
+
+    // bytes 8-11 pack bin (upper 16 bits), map quality and query-name length
+    unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);
+    bAlignment.Bin        = tempValue >> 16;
+    bAlignment.MapQuality = tempValue >> 8 & 0xff;
+    bAlignment.SupportData.QueryNameLength = tempValue & 0xff;
+
+    // bytes 12-15 pack the flag (upper 16 bits) and the number of CIGAR operations
+    tempValue = BgzfData::UnpackUnsignedInt(&x[12]);
+    bAlignment.AlignmentFlag = tempValue >> 16;
+    bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;
+
+    bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);
+    bAlignment.MateRefID    = BgzfData::UnpackSignedInt(&x[20]);
+    bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);
+    bAlignment.InsertSize   = BgzfData::UnpackSignedInt(&x[28]);
+
+    // set BamAlignment length
+    bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;
+
+    // read in character data - make sure proper data size was read
+    // one spare byte is allocated so that a zero-length record still gets a valid pointer
+    bool readCharDataOK = false;
+    const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
+    char* allCharData = (char*)calloc((size_t)dataLength + 1, sizeof(char));
+    if ( allCharData == 0 ) return false;
+
+    if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) {
+
+        // the CIGAR array starts right after the query name; refuse records whose
+        // declared name length / op count would reach beyond the bytes just read
+        const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;
+        const unsigned int numCigarOps     = bAlignment.SupportData.NumCigarOperations;
+        const bool cigarFits = ( cigarDataOffset <= dataLength ) &&
+                               ( numCigarOps <= (dataLength - cigarDataOffset) / sizeof(uint32_t) );
+
+        if ( cigarFits ) {
+
+            // store 'allCharData' in supportData structure
+            bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);
+
+            // set success flag
+            readCharDataOK = true;
+
+            // save CIGAR ops
+            // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
+            // even when GetNextAlignmentCore() is called
+            uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
+            CigarOp op;
+            bAlignment.CigarData.clear();
+            bAlignment.CigarData.reserve(numCigarOps);
+            for (unsigned int i = 0; i < numCigarOps; ++i) {
+
+                // swap if necessary
+                if ( IsBigEndian ) SwapEndian_32(cigarData[i]);
+
+                // build CigarOp structure
+                // NOTE(review): the lookup index ranges over 0..BAM_CIGAR_MASK; confirm that
+                // CIGAR_LOOKUP is at least that long
+                op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
+                op.Type   = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
+
+                // save CigarOp
+                bAlignment.CigarData.push_back(op);
+            }
+        }
+    }
+
+    free(allCharData);
+    return readCharDataOK;
+}
+
+// Loads the reference sequence dictionary that follows the header text.
+//
+// Reads the reference count, then for each reference its name length, name and
+// sequence length, appending one RefData entry per reference to References.
+// Loading stops (with a message on stderr) at the first record that cannot be
+// read completely; entries loaded up to that point are kept.
+void BamReaderPrivate::LoadReferenceData(void) {
+
+    // get number of reference sequences
+    char buffer[4];
+    if ( mBGZF.Read(buffer, 4) != 4 ) {
+        fprintf(stderr, "Could not read reference count\n");
+        return;
+    }
+    unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);
+    if ( IsBigEndian ) SwapEndian_32(numberRefSeqs);
+    if ( numberRefSeqs == 0 ) return;
+    References.reserve((int)numberRefSeqs);
+
+    // iterate over all references in header
+    for (unsigned int i = 0; i != numberRefSeqs; ++i) {
+
+        // get length of reference name
+        if ( mBGZF.Read(buffer, 4) != 4 ) {
+            fprintf(stderr, "Could not read reference data\n");
+            return;
+        }
+        unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);
+        if ( IsBigEndian ) SwapEndian_32(refNameLength);
+
+        // zero-filled and one byte longer than the name (same scheme as the header text
+        // buffer in LoadHeaderData), so the C-string conversion below always finds a
+        // terminator, even for a zero-length or unterminated name
+        char* refName = (char*)calloc((size_t)refNameLength + 1, 1);
+        if ( refName == 0 ) {
+            fprintf(stderr, "Could not allocate memory for reference name\n");
+            return;
+        }
+
+        // get reference name and reference sequence length
+        const bool nameOk   = ( mBGZF.Read(refName, refNameLength) == (signed int)refNameLength );
+        const bool lengthOk = nameOk && ( mBGZF.Read(buffer, 4) == 4 );
+        if ( !lengthOk ) {
+            fprintf(stderr, "Could not read reference data\n");
+            free(refName);
+            return;
+        }
+        int refLength = BgzfData::UnpackSignedInt(buffer);
+        if ( IsBigEndian ) SwapEndian_32(refLength);
+
+        // store data for reference
+        RefData aReference;
+        aReference.RefName   = (string)((const char*)refName);
+        aReference.RefLength = refLength;
+        References.push_back(aReference);
+
+        // clean up calloc-ed temp variable
+        free(refName);
+    }
+}
+
+// Copies the index's per-reference "has alignments" status into References.
+// Does nothing when no index is loaded.
+void BamReaderPrivate::MarkReferences(void) {
+
+    if ( HasIndex ) {
+
+        // ask the index about every reference known to the reader
+        const int referenceCount = (int)References.size();
+        for ( int refId = 0; refId < referenceCount; ++refId )
+            References.at(refId).RefHasAlignments = Index->HasAlignments(refId);
+    }
+}
+
+// Opens the BAM file and, when requested, its index.
+//
+// Stores both filenames, opens the BGZF stream for reading, loads header text
+// and reference data, and records the file offset of the first alignment.
+// An index is loaded when the client named an index file or asked for one to
+// be looked up; in that case the result of LoadIndex() is returned. Without
+// either, the file is usable for sequential access and true is returned.
+bool BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) {
+
+    // remember what we were asked to open
+    Filename      = filename;
+    IndexFilename = indexFilename;
+
+    // nothing else can work without the BGZF stream
+    if ( !mBGZF.Open(filename, "rb") ) return false;
+
+    // header text first, reference dictionary second
+    LoadHeaderData();
+    LoadReferenceData();
+
+    // alignments start right here
+    AlignmentsBeginOffset = mBGZF.Tell();
+
+    // an index is wanted if the client supplied its filename or asked for a lookup
+    const bool indexWanted = lookForIndex || !IndexFilename.empty();
+    if ( !indexWanted ) return true;
+
+    // success of the open now depends on the index
+    return LoadIndex(lookForIndex, preferStandardIndex);
+}
+
+// Moves the file pointer back to the first alignment and drops any region.
+//
+// Returns false if the seek fails or if the first alignment cannot be read;
+// otherwise the region is cleared, HasAlignmentsInRegion is set and the result
+// of seeking back to the first alignment is returned.
+bool BamReaderPrivate::Rewind(void) {
+
+    if ( mBGZF.Seek(AlignmentsBeginOffset) ) {
+
+        // probe: the first alignment must be readable
+        BamAlignment firstAlignment;
+        if ( LoadNextAlignment(firstAlignment) ) {
+
+            // back to "no region" defaults
+            Region.clear();
+            HasAlignmentsInRegion = true;
+
+            // the probe moved the file pointer, so seek once more
+            return mBGZF.Seek(AlignmentsBeginOffset);
+        }
+    }
+
+    // could not seek, or nothing readable at the first alignment
+    return false;
+}
+
+// Stores the requested index caching mode and forwards it to the index object, if one exists.
+void BamReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
+    IndexCacheMode = mode;
+    if ( Index != 0 )
+        Index->SetCacheMode(mode);
+}
+
+// Restricts subsequent reads to 'region' by asking the index to jump there.
+//
+// Returns false when no index is loaded or when the index cannot perform the
+// jump at all. Returns true (and stores the adjusted region) both when the
+// jump succeeded and when it is already known that the region holds no data.
+bool BamReaderPrivate::SetRegion(const BamRegion& region) {
+
+    // Forget any previous region first.
+    //
+    // N.B. - with no Region set, GetNextAlignmentCore() simply returns the next
+    // alignment without doing any overlap checking of its own. That leaves the
+    // index free to call GetNextAlignmentCore() and do the overlap checking itself
+    // (without exposing LoadNextAlignment() in the public API).
+    Region.clear();
+
+    // a jump is impossible without an index
+    if ( !HasIndex ) return false;
+
+    // work on a copy that reflects where data actually begins
+    BamRegion target(region);
+    AdjustRegion(target);
+
+    // Only jump when there may be data in the region.
+    //
+    // "No data" is not an error: the reader just remembers it for later reads,
+    // which is useful in a multi-reader setting where some BAM files lack data
+    // in regions that others cover.
+    //
+    // Jump() itself may update HasAlignmentsInRegion, e.g. for a region beyond
+    // the last alignment of a reference; later GetNextAlignment[Core]() calls
+    // then simply return false. A false return from Jump() means the jump
+    // could not be performed at all (invalid index, unknown reference, etc).
+    if ( HasAlignmentsInRegion && !Index->Jump(target, &HasAlignmentsInRegion) )
+        return false;
+
+    // keep the adjusted region for overlap checks
+    Region = target;
+    return true;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader_p.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader_p.h
new file mode 100755
index 0000000..8011a1f
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamReader_p.h
@@ -0,0 +1,137 @@
+// ***************************************************************************
+// BamReader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#ifndef BAMREADER_P_H
+#define BAMREADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <BamAlignment.h>
+#include <BamIndex.h>
+#include <BGZF.h>
+#include <string>
+
+namespace BamTools {
+
+class BamReader;
+
+namespace Internal {
+
+class BamReaderPrivate { // reading operations and state for BAM files; keeps a pointer to its parent BamReader
+
+ // position of an alignment relative to the current Region (return type of IsOverlap())
+ public: enum RegionState { BEFORE_REGION = 0
+ , WITHIN_REGION
+ , AFTER_REGION
+ };
+
+ // ctor & dtor
+ public:
+ BamReaderPrivate(BamReader* parent);
+ ~BamReaderPrivate(void);
+
+ // 'public' interface to BamReader
+ public:
+
+ // file operations
+ void Close(void);
+ bool Open(const std::string& filename,
+ const std::string& indexFilename,
+ const bool lookForIndex,
+ const bool preferStandardIndex); // opens BGZF file, loads header & references, optionally loads the index
+ bool Rewind(void); // seeks back to AlignmentsBeginOffset and clears Region
+ bool SetRegion(const BamRegion& region); // returns false when no index is loaded or the index cannot jump
+
+ // access alignment data
+ bool GetNextAlignment(BamAlignment& bAlignment);
+ bool GetNextAlignmentCore(BamAlignment& bAlignment);
+
+ // access auxiliary data
+ const std::string GetHeaderText(void) const;
+ int GetReferenceID(const std::string& refName) const;
+
+ // index operations
+ bool CreateIndex(bool useStandardIndex);
+ void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); // stores mode and forwards it to Index when one exists
+
+ // 'internal' methods
+ public:
+
+ // ---------------------------------------
+ // reading alignments and auxiliary data
+
+ // adjusts requested region if necessary (depending on where data actually begins)
+ void AdjustRegion(BamRegion& region);
+ // fills out character data for BamAlignment data
+ bool BuildCharData(BamAlignment& bAlignment);
+ // checks to see if alignment overlaps current region
+ RegionState IsOverlap(BamAlignment& bAlignment);
+ // retrieves header text from BAM file; exits the process if the BAM magic bytes are missing or unreadable
+ void LoadHeaderData(void);
+ // retrieves BAM alignment under file pointer (core fields, raw character data and CIGAR ops)
+ bool LoadNextAlignment(BamAlignment& bAlignment);
+ // builds reference data structure from BAM file
+ void LoadReferenceData(void);
+ // mark references with 'HasAlignments' status (no-op when HasIndex is false)
+ void MarkReferences(void);
+
+ // ---------------------------------
+ // index file handling
+
+ // clear out internal index data structure
+ void ClearIndex(void);
+ // loads index from BAM index file; located from Filename when IndexFilename is empty
+ bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex);
+
+ // data members
+ public:
+
+ // general file data
+ BgzfData mBGZF; // BGZF stream all BAM data is read from
+ std::string HeaderText; // set by LoadHeaderData()
+ BamIndex* Index; // compared against 0 before use in SetIndexCacheMode() and LoadIndex()
+ RefVector References; // filled by LoadReferenceData()
+ bool HasIndex; // result of Index->Load() in LoadIndex()
+ int64_t AlignmentsBeginOffset; // mBGZF.Tell() right after header & reference data, set in Open()
+ std::string Filename; // BAM filename passed to Open()
+ std::string IndexFilename; // from Open(); filled in by LoadIndex() when it was empty
+
+// Internal::BamHeader* m_header;
+
+ // index caching mode (applied to Index in LoadIndex() and SetIndexCacheMode())
+ BamIndex::BamIndexCacheMode IndexCacheMode;
+
+ // system data (when true, multi-byte values read from the file are byte-swapped)
+ bool IsBigEndian;
+
+ // user-specified region values
+ BamRegion Region;
+ bool HasAlignmentsInRegion; // reset to true by Rewind(); may be changed by Index->Jump() in SetRegion()
+
+ // parent BamReader (handed to the BamIndex factory functions in LoadIndex())
+ BamReader* Parent;
+
+ // BAM character constants
+ const char* DNA_LOOKUP;
+ const char* CIGAR_LOOKUP; // indexed with the masked CIGAR op code in LoadNextAlignment()
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMREADER_P_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamStandardIndex_p.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamStandardIndex_p.cpp
new file mode 100755
index 0000000..af9d093
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamStandardIndex_p.cpp
@@ -0,0 +1,910 @@
+// ***************************************************************************
+// BamStandardIndex.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the standardized BAM index format (".bai")
+// ***************************************************************************
+
+#include <BamAlignment.h>
+#include <BamReader.h>
+#include <BGZF.h>
+#include <BamStandardIndex_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <map>
+using namespace std;
+
+// Builds an index object on top of the given BGZF stream and reader (both are
+// passed on to the BamIndex base). Starts with a zero data offset and no full
+// data cache, and records the byte order reported by SystemIsBigEndian().
+BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader)
+    : BamIndex(bgzf, reader),
+      m_dataBeginOffset(0),
+      m_hasFullDataCache(false)
+{
+    // consulted by the Load*() methods to decide whether values need byte-swapping
+    this->m_isBigEndian = BamTools::SystemIsBigEndian();
+}
+
+// Clears the cached bin and linear-offset data of every reference.
+BamStandardIndex::~BamStandardIndex(void)
+{
+    this->ClearAllData();
+}
+
+// Calculates the index bins that overlap 'region'.
+//
+// @region                 region of interest; only its left reference is considered
+// @isRightBoundSpecified  when true and both bounds lie on the same reference,
+//                         the region's right position is the upper limit;
+//                         otherwise the end of the left reference is used
+// @bins                   caller-provided array of MAX_BIN entries receiving the bin ids
+//
+// Returns the number of bin ids written to 'bins' (always at least 1, for bin 0).
+// Coordinates are limited to the 2^29 - 1 maximum of the binning scheme and no
+// more than MAX_BIN entries are ever written.
+int BamStandardIndex::BinsFromRegion(const BamRegion& region,
+                                     const bool isRightBoundSpecified,
+                                     uint16_t bins[MAX_BIN])
+{
+    // largest coordinate addressable by the five-level binning scheme
+    const uint32_t maxCoordinate = (1u << 29) - 1;
+
+    // get region boundaries
+    uint32_t begin = (unsigned int)region.LeftPosition;
+    uint32_t end;
+
+    // if right bound specified AND left&right bounds are on same reference
+    // OK to use right bound position
+    if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
+        end = (unsigned int)region.RightPosition;
+
+    // otherwise, use end of left bound reference as cutoff
+    else
+        end = (unsigned int)m_references.at(region.LeftRefID).RefLength - 1;
+
+    // keep both bounds inside the addressable range; without this a zero-length
+    // reference (end wraps to 0xffffffff) or an oversized right bound would make
+    // the loops below run far past the end of 'bins'
+    if ( begin > maxCoordinate ) begin = maxCoordinate;
+    if ( end   > maxCoordinate ) end   = maxCoordinate;
+
+    // initialize list, bin '0' always a valid bin
+    int i = 0;
+    bins[i++] = 0;
+
+    // get rest of bins that contain this region, one level of the hierarchy per loop
+    // (the 'i < MAX_BIN' test is a final guard for the caller's array)
+    unsigned int k;
+    for (k =    1 + (begin>>26); k <=    1 + (end>>26) && i < MAX_BIN; ++k) { bins[i++] = k; }
+    for (k =    9 + (begin>>23); k <=    9 + (end>>23) && i < MAX_BIN; ++k) { bins[i++] = k; }
+    for (k =   73 + (begin>>20); k <=   73 + (end>>20) && i < MAX_BIN; ++k) { bins[i++] = k; }
+    for (k =  585 + (begin>>17); k <=  585 + (end>>17) && i < MAX_BIN; ++k) { bins[i++] = k; }
+    for (k = 4681 + (begin>>14); k <= 4681 + (end>>14) && i < MAX_BIN; ++k) { bins[i++] = k; }
+
+    // return number of bins stored
+    return i;
+}
+
+// creates index data (in-memory) from current reader data; returns false if the reader/BGZF stream is unusable, if an alignment's reference has no entry in m_indexData, or if the final Rewind() fails; exits the process on unsorted input or non-advancing BGZF offsets
+bool BamStandardIndex::Build(void) {
+
+ // be sure reader & BGZF file are valid & open for reading
+ if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
+ return false;
+
+ // move file pointer to beginning of alignments (return value is not checked here)
+ m_reader->Rewind();
+
+ // get reference count, reserve index space
+ const int numReferences = (int)m_references.size();
+ m_indexData.clear();
+ m_hasFullDataCache = false;
+ SetReferenceCount(numReferences);
+
+ // sets default constant for bin, ID, offset, coordinate variables (stored into the int32_t variables below as well)
+ const uint32_t defaultValue = 0xffffffffu;
+
+ // bin data: saveBin = bin of the chunk currently being accumulated, lastBin = bin of the previous alignment
+ uint32_t saveBin(defaultValue);
+ uint32_t lastBin(defaultValue);
+
+ // reference ID data: saveRefID belongs to saveBin, lastRefID to the previous alignment
+ int32_t saveRefID(defaultValue);
+ int32_t lastRefID(defaultValue);
+
+ // offset data: saveOffset = start of the current chunk, lastOffset = file offset before the alignment just read
+ uint64_t saveOffset = m_BGZF->Tell();
+ uint64_t lastOffset = saveOffset;
+
+ // coordinate data (position of the previous alignment, used for the sort-order check)
+ int32_t lastCoordinate = defaultValue;
+
+ BamAlignment bAlignment;
+ while ( m_reader->GetNextAlignmentCore(bAlignment) ) {
+
+ // change of chromosome, save ID, reset bin
+ if ( lastRefID != bAlignment.RefID ) {
+ lastRefID = bAlignment.RefID;
+ lastBin = defaultValue;
+ }
+
+ // if lastCoordinate greater than BAM position - file not sorted properly (only checked within one reference)
+ else if ( lastCoordinate > bAlignment.Position ) {
+ fprintf(stderr, "BAM file not properly sorted:\n");
+ fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(),
+ lastCoordinate, bAlignment.Position, bAlignment.RefID); // NOTE(review): Name may be empty if GetNextAlignmentCore() does not fill character data - confirm
+ exit(1);
+ }
+
+ // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
+ if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
+
+ // save linear offset entry (matched to BAM entry refID)
+ BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID);
+ if ( indexIter == m_indexData.end() ) return false; // error
+ ReferenceIndex& refIndex = (*indexIter).second;
+ LinearOffsetVector& offsets = refIndex.Offsets;
+ SaveLinearOffset(offsets, bAlignment, lastOffset);
+ }
+
+ // if current BamAlignment bin != lastBin, "then possibly write the binning index"
+ if ( bAlignment.Bin != lastBin ) {
+
+ // if not first time through
+ if ( saveBin != defaultValue ) {
+
+ // save Bam bin entry (chunk [saveOffset, lastOffset) for the bin that just ended)
+ BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
+ if ( indexIter == m_indexData.end() ) return false; // error
+ ReferenceIndex& refIndex = (*indexIter).second;
+ BamBinMap& binMap = refIndex.Bins;
+ SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
+ }
+
+ // update saveOffset
+ saveOffset = lastOffset;
+
+ // update bin values
+ saveBin = bAlignment.Bin;
+ lastBin = bAlignment.Bin;
+
+ // update saveRefID
+ saveRefID = bAlignment.RefID;
+
+ // if invalid RefID, break out (remaining alignments are not indexed)
+ if ( saveRefID < 0 ) break;
+ }
+
+ // make sure that current file pointer is beyond lastOffset
+ if ( m_BGZF->Tell() <= (int64_t)lastOffset ) {
+ fprintf(stderr, "Error in BGZF offsets.\n");
+ exit(1);
+ }
+
+ // update lastOffset
+ lastOffset = m_BGZF->Tell();
+
+ // update lastCoordinate
+ lastCoordinate = bAlignment.Position;
+ }
+
+ // save any leftover BAM data (as long as refID is valid)
+ if ( saveRefID >= 0 ) {
+ // save Bam bin entry
+ BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
+ if ( indexIter == m_indexData.end() ) return false; // error
+ ReferenceIndex& refIndex = (*indexIter).second;
+ BamBinMap& binMap = refIndex.Bins;
+ SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
+ }
+
+ // simplify index by merging chunks
+ MergeChunks();
+
+ // iterate through references in index
+ // sort offsets in linear offset vector
+ BamStandardIndexData::iterator indexIter = m_indexData.begin();
+ BamStandardIndexData::iterator indexEnd = m_indexData.end();
+ for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { // 'i' is not used in the loop body
+
+ // get reference index data
+ ReferenceIndex& refIndex = (*indexIter).second;
+ LinearOffsetVector& offsets = refIndex.Offsets;
+
+ // sort linear offsets
+ sort(offsets.begin(), offsets.end());
+ }
+
+ // rewind file pointer to beginning of alignments, return success/fail
+ return m_reader->Rewind();
+}
+
+// Reads the 4-byte magic number from the index stream and checks it.
+//
+// Returns true only if all four bytes were read and equal "BAI\1". Otherwise an
+// error is printed to stderr, the index stream is closed and false is returned.
+bool BamStandardIndex::CheckMagicNumber(void) {
+
+    // read in magic number
+    char magic[4];
+    const size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream);
+
+    // compare to expected value
+    // the count is tested first so that 'magic' is only inspected once it has been filled completely
+    if ( (elementsRead != 4) || (strncmp(magic, "BAI\1", 4) != 0) ) {
+        fprintf(stderr, "Problem with index file - invalid format.\n");
+        fclose(m_indexStream);
+        return false;
+    }
+
+    // magic number is present and correct
+    return true;
+}
+
+// Clears the cached bin and linear-offset data of every reference entry
+// (the entries themselves stay in the index).
+void BamStandardIndex::ClearAllData(void) {
+    BamStandardIndexData::const_iterator entry = m_indexData.begin();
+    const BamStandardIndexData::const_iterator lastEntry = m_indexData.end();
+    while ( entry != lastEntry ) {
+        ClearReferenceOffsets(entry->first);
+        ++entry;
+    }
+}
+
+// Clears the cached bins and linear offsets of reference 'refId'.
+// An unknown refId is ignored; otherwise the full-cache flag is reset as well.
+void BamStandardIndex::ClearReferenceOffsets(const int& refId) {
+
+    BamStandardIndexData::iterator found = m_indexData.find(refId);
+    if ( found != m_indexData.end() ) {
+
+        // drop both kinds of offset data for this reference
+        ReferenceIndex& cached = found->second;
+        cached.Bins.clear();
+        cached.Offsets.clear();
+
+        // at least one reference is no longer cached
+        m_hasFullDataCache = false;
+    }
+}
+
+// Returns the index-file position recorded by LoadHeader() (zero until then).
+const off_t BamStandardIndex::DataBeginOffset(void) const
+{
+    return this->m_dataBeginOffset;
+}
+
+// Calculates the candidate file offsets for a given region.
+//
+// @region                 region to look up (its left reference selects the index entry)
+// @isRightBoundSpecified  forwarded to BinsFromRegion()
+// @offsets                receives the start offsets of all chunks that may hold
+//                         overlapping alignments; sorted before returning
+// @hasAlignmentsInRegion  set to true if at least one offset is in 'offsets'
+//
+// Returns false if the left reference is unknown to the index or its data
+// cannot be loaded from the index file, true otherwise.
+bool BamStandardIndex::GetOffsets(const BamRegion& region,
+                                  const bool isRightBoundSpecified,
+                                  vector<int64_t>& offsets,
+                                  bool* hasAlignmentsInRegion)
+{
+    // return false if leftBound refID is not found in index data
+    if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
+        return false;
+
+    // load index data for region if not already cached
+    if ( !IsDataLoaded(region.LeftRefID) ) {
+        bool loadedOk = true;
+        loadedOk &= SkipToReference(region.LeftRefID);
+        loadedOk &= LoadReference(region.LeftRefID);
+        if ( !loadedOk ) return false;
+    }
+
+    // get bins for this reference
+    BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
+    if ( indexIter == m_indexData.end() ) return false; // error
+    const ReferenceIndex& refIndex = (*indexIter).second;
+    const BamBinMap& binMap = refIndex.Bins;
+
+    // calculate which bins overlap this region
+    // the scratch array is owned by a vector so that it is released on every
+    // path out of this function
+    vector<uint16_t> bins((size_t)MAX_BIN, (uint16_t)0);
+    const int numBins = BinsFromRegion(region, isRightBoundSpecified, &bins[0]);
+
+    // get minimum offset to consider
+    // (positions beyond the linear index fall back to 0, i.e. no lower limit)
+    const LinearOffsetVector& linearOffsets = refIndex.Offsets;
+    const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() )
+                               ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
+
+    // store all alignment 'chunk' starts (file offsets) for bins in this region
+    for ( int i = 0; i < numBins; ++i ) {
+
+        const uint16_t binKey = bins[i];
+        map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
+        if ( binIter == binMap.end() ) continue;
+
+        // iterate over chunks
+        const ChunkVector& chunks = (*binIter).second;
+        std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
+        std::vector<Chunk>::const_iterator chunksEnd  = chunks.end();
+        for ( ; chunksIter != chunksEnd; ++chunksIter) {
+
+            // if valid chunk found, store its file offset
+            const Chunk& chunk = (*chunksIter);
+            if ( chunk.Stop > minOffset )
+                offsets.push_back( chunk.Start );
+        }
+    }
+
+    // sort the offsets before returning
+    sort(offsets.begin(), offsets.end());
+
+    // set flag & return success
+    *hasAlignmentsInRegion = (offsets.size() != 0 );
+
+    // if cache mode set to none, dump the data we just loaded
+    if (m_cacheMode == BamIndex::NoIndexCaching )
+        ClearReferenceOffsets(region.LeftRefID);
+
+    // return success
+    return true;
+}
+
+// Reports the HasAlignments flag stored for reference 'refId';
+// a reference without an index entry counts as having none.
+bool BamStandardIndex::HasAlignments(const int& refId) const {
+    const BamStandardIndexData::const_iterator found = m_indexData.find(refId);
+    if ( found != m_indexData.end() )
+        return found->second.HasAlignments;
+    return false; // error
+}
+
+// Reports the full-cache flag (set by LoadAllReferences(), reset whenever reference data is cleared).
+bool BamStandardIndex::HasFullDataCache(void) const
+{
+    return this->m_hasFullDataCache;
+}
+
+// Tells whether usable index data for reference 'refId' is in memory.
+//
+// Unknown references are never "loaded". A reference flagged as having no
+// alignments needs no offset data, so it always counts as loaded; any other
+// reference is loaded once its bin map is non-empty.
+bool BamStandardIndex::IsDataLoaded(const int& refId) const {
+
+    const BamStandardIndexData::const_iterator found = m_indexData.find(refId);
+    if ( found == m_indexData.end() )
+        return false;
+
+    const ReferenceIndex& entry = found->second;
+    return ( !entry.HasAlignments || !entry.Bins.empty() );
+}
+
+// Attempts to use the index to position the BGZF stream for 'region'.
+//
+// @region                 region to jump to
+// @hasAlignmentsInRegion  updated to reflect whether data may exist for the region
+//
+// Returns false if the reader or stream is unusable, the left reference is
+// unknown, the left position lies beyond the reference, offsets cannot be
+// calculated, or a seek fails; true otherwise.
+bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
+
+    // be sure reader & BGZF file are valid & open for reading
+    if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
+        return false;
+
+    // make sure left-bound reference is known
+    // (m_references.at() below would otherwise throw instead of reporting failure)
+    if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_references.size() )
+        return false;
+
+    // make sure left-bound position is valid
+    if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength )
+        return false;
+
+    // calculate offsets for this region
+    // if failed, print message, set flag, and return failure
+    vector<int64_t> offsets;
+    if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) {
+        fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n");
+        *hasAlignmentsInRegion = false;
+        return false;
+    }
+
+    // iterate through offsets
+    BamAlignment bAlignment;
+    bool result = true;
+    for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
+
+        // attempt seek & load first available alignment
+        // set flag to true if data exists
+        result &= m_BGZF->Seek(*o);
+        *hasAlignmentsInRegion = m_reader->GetNextAlignmentCore(bAlignment);
+
+        // if this alignment corresponds to desired position
+        // return success of seeking back to the offset before the 'current offset' (to cover overlaps)
+        if ( ((bAlignment.RefID == region.LeftRefID) &&
+              ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) ||
+             (bAlignment.RefID > region.LeftRefID) )
+        {
+            if ( o != offsets.begin() ) --o;
+            return m_BGZF->Seek(*o);
+        }
+    }
+
+    // if error in jumping, print message & set flag
+    if ( !result ) {
+        fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n");
+        *hasAlignmentsInRegion = false;
+    }
+
+    // return success/failure
+    return result;
+}
+
+// Clears index data from all references except the first one in the index.
+// Does nothing when the index holds no reference entries.
+void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) {
+
+    // begin() of an empty map must not be dereferenced
+    if ( m_indexData.empty() ) return;
+
+    BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
+    KeepOnlyReferenceOffsets((*indexBegin).first);
+}
+
+// Clears the cached offset data of every reference other than 'refId'.
+void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) {
+    for ( BamStandardIndexData::iterator entry = m_indexData.begin(), lastEntry = m_indexData.end();
+          entry != lastEntry; ++entry )
+    {
+        // the reference we were asked to keep is left untouched
+        const int candidateId = entry->first;
+        if ( candidateId == refId ) continue;
+        ClearReferenceOffsets(candidateId);
+    }
+}
+
+// Loads the index data of all references from the index file.
+// @saveData - save data in memory if true, just read & discard if false
+//
+// Returns true immediately if everything is already cached. Otherwise reads
+// the reference count followed by every reference entry; the full-cache flag
+// is set only when all entries loaded successfully and the data was saved.
+bool BamStandardIndex::LoadAllReferences(bool saveData) {
+
+    // skip if data already loaded
+    if ( m_hasFullDataCache ) return true;
+
+    // get number of reference sequences
+    // (read straight into an int, the type LoadReferenceCount() expects)
+    int numReferences = 0;
+    if ( !LoadReferenceCount(numReferences) )
+        return false;
+
+    // iterate over reference entries
+    bool loadedOk = true;
+    for ( int i = 0; i < numReferences; ++i )
+        loadedOk &= LoadReference(i, saveData);
+
+    // set flag
+    if ( loadedOk && saveData )
+        m_hasFullDataCache = true;
+
+    // return success/failure of loading references
+    return loadedOk;
+}
+
+// Loads header data from the index file, returns true if loaded OK.
+//
+// On success the file position right after the magic number is recorded in
+// m_dataBeginOffset. On failure the offset is left unchanged.
+bool BamStandardIndex::LoadHeader(void) {
+
+    // CheckMagicNumber() closes the index stream when the magic number is wrong,
+    // so the stream must not be queried after a failure
+    if ( !CheckMagicNumber() )
+        return false;
+
+    // store offset of beginning of data
+    m_dataBeginOffset = ftell64(m_indexStream);
+
+    return true;
+}
+
+// Reads one bin (its id followed by its chunk list) from the index file.
+// @saveData - save data in memory if true, just read & discard if false
+//
+// The bin is added to refEntry.Bins only when its chunks loaded correctly and
+// saving was requested. Returns true if both the id and the chunks were read.
+bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) {
+
+    // bin id comes first
+    uint32_t binId;
+    const size_t idItemsRead = fread(&binId, sizeof(binId), 1, m_indexStream);
+    if ( m_isBigEndian ) SwapEndian_32(binId);
+
+    // then the chunks belonging to that bin
+    ChunkVector binChunks;
+    const bool chunksLoaded = LoadChunks(binChunks, saveData);
+
+    // keep the bin only if asked to and its chunk list is complete
+    if ( saveData && chunksLoaded )
+        refEntry.Bins.insert( pair<uint32_t, ChunkVector>(binId, binChunks) );
+
+    return ( chunksLoaded && (idItemsRead == 1) );
+}
+
+// Reads the bin count of a reference followed by all of its bins.
+// @saveData - save data in memory if true, just read & discard if false
+//
+// Sets refEntry.HasAlignments according to the bin count. Returns false if the
+// count cannot be read (the reference is then flagged as having no alignments)
+// or as soon as one bin fails to load.
+bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) {
+
+    // get number of bins
+    // nothing may be derived from the count unless it was actually read
+    int32_t numBins = 0;
+    if ( fread(&numBins, sizeof(numBins), 1, m_indexStream) != 1 ) {
+        refEntry.HasAlignments = false;
+        return false;
+    }
+    if ( m_isBigEndian ) SwapEndian_32(numBins);
+
+    // set flag
+    refEntry.HasAlignments = ( numBins != 0 );
+
+    // iterate over bins, stopping at the first one that cannot be read
+    // (once one bin failed, the stream position no longer matches the layout)
+    bool binsOk = true;
+    for ( int i = 0; binsOk && i < numBins; ++i )
+        binsOk = LoadBin(refEntry, saveData);
+
+    // return success/failure of load
+    return binsOk;
+}
+
+// Reads one chunk (a start/stop pair of 64-bit file offsets) from the index file.
+// @saveData - append the chunk to 'chunks' if true, just read & discard if false
+//
+// Returns true if both offsets were read.
+bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) {
+
+    // start offset first, stop offset second
+    uint64_t start;
+    uint64_t stop;
+    const size_t startItemsRead = fread(&start, sizeof(start), 1, m_indexStream);
+    const size_t stopItemsRead  = fread(&stop,  sizeof(stop),  1, m_indexStream);
+
+    // byte-swap both values when m_isBigEndian is set
+    if ( m_isBigEndian ) {
+        SwapEndian_64(start);
+        SwapEndian_64(stop);
+    }
+
+    // keep the pair only on request
+    if ( saveData )
+        chunks.push_back( Chunk(start, stop) );
+
+    return ( (startItemsRead + stopItemsRead) == 2 );
+}
+
+// Reads a chunk count followed by that many chunks from the index file.
+// @saveData - store the chunks in 'chunks' if true, just read & discard if false
+//
+// The stored chunks are sorted with ChunkLessThan. Returns false if the count
+// cannot be read or as soon as one chunk fails to load.
+bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) {
+
+    // read in number of chunks
+    // nothing may be reserved or looped over unless the count was actually read
+    uint32_t numChunks = 0;
+    if ( fread(&numChunks, sizeof(numChunks), 1, m_indexStream) != 1 )
+        return false;
+    if ( m_isBigEndian ) SwapEndian_32(numChunks);
+
+    // initialize space for chunks if we're storing this data
+    if ( saveData ) chunks.reserve(numChunks);
+
+    // iterate over chunks, stopping at the first one that cannot be read
+    bool chunksOk = true;
+    for ( int i = 0; chunksOk && i < (int)numChunks; ++i )
+        chunksOk = LoadChunk(chunks, saveData);
+
+    // sort chunk vector
+    sort( chunks.begin(), chunks.end(), ChunkLessThan );
+
+    // return success/failure of load
+    return chunksOk;
+}
+
+// Reads the linear offsets of one reference from the index file.
+// @saveData - save data in memory if true, just read & discard if false
+//
+// Reads the 32-bit offset count followed by that many 64-bit offsets. When
+// saving, the (sorted) offsets read so far replace refEntry.Offsets. Returns
+// true only if the count and every offset were read; an unreadable or negative
+// count leaves refEntry untouched and returns false.
+bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) {
+
+    // read in number of linear offsets
+    int32_t numLinearOffsets = 0;
+    if ( fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream) != 1 )
+        return false;
+    if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
+
+    // a negative count cannot be valid and must never reach reserve()
+    if ( numLinearOffsets < 0 ) return false;
+
+    // set up destination vector (if we're saving the data)
+    LinearOffsetVector linearOffsets;
+    if ( saveData ) linearOffsets.reserve(numLinearOffsets);
+
+    // iterate over linear offsets, stopping at the first one that cannot be read
+    uint64_t linearOffset = 0;
+    int32_t offsetsRead = 0;
+    for ( ; offsetsRead < numLinearOffsets; ++offsetsRead ) {
+        if ( fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream) != 1 ) break;
+        if ( m_isBigEndian ) SwapEndian_64(linearOffset);
+        if ( saveData ) linearOffsets.push_back(linearOffset);
+    }
+
+    // sort linear offsets
+    sort ( linearOffsets.begin(), linearOffsets.end() );
+
+    // save in reference index entry if desired
+    if ( saveData ) refEntry.Offsets = linearOffsets;
+
+    // return success/failure of load
+    return ( offsetsRead == numLinearOffsets );
+}
+
+// load first reference from file, return true if loaded OK
+// @saveData - save data in memory if true, just read & discard if false
+bool BamStandardIndex::LoadFirstReference(bool saveData) {
+    // begin() of an empty map must not be dereferenced - report failure instead
+    if ( m_indexData.empty() ) return false;
+    BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
+    return LoadReference((*indexBegin).first, saveData);
+}
+
+// reads the bins and then the linear offsets of one reference from the index file
+// returns true only if both parts loaded OK (both are always attempted)
+// @refId    - reference whose entry receives the data; the entry is created if missing
+// @saveData - save data in memory if true, just read & discard if false
+bool BamStandardIndex::LoadReference(const int& refId, bool saveData) {
+
+    // make sure an entry exists for this reference
+    if ( m_indexData.find(refId) == m_indexData.end() ) {
+        ReferenceIndex blankEntry;
+        blankEntry.HasAlignments = false;
+        m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, blankEntry) );
+    }
+    ReferenceIndex& target = (*m_indexData.find(refId)).second;
+
+    // bins come first in the file, linear offsets second
+    const bool binsOk    = LoadBins(target, saveData);
+    const bool offsetsOk = LoadLinearOffsets(target, saveData);
+    return ( binsOk && offsetsOk );
+}
+
+// reads the number of references from the index file, return true if loaded OK
+// @numReferences - receives the count (byte-swapped on big-endian hosts)
+bool BamStandardIndex::LoadReferenceCount(int& numReferences) {
+    const size_t itemsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
+    if ( m_isBigEndian )
+        SwapEndian_32(numReferences);
+    return ( itemsRead == 1 );
+}
+
+// merges 'alignment chunks' in every BAM bin of every reference (used for index building)
+// a chunk is folded into the previously kept chunk when that chunk's Stop and the new
+// chunk's Start are equal after both are shifted right by 16 bits
+void BamStandardIndex::MergeChunks(void) {
+
+    // walk every reference entry
+    for ( BamStandardIndexData::iterator refIter = m_indexData.begin();
+          refIter != m_indexData.end(); ++refIter )
+    {
+        BamBinMap& bins = (*refIter).second.Bins;
+
+        // walk every bin of this reference
+        for ( BamBinMap::iterator binIter = bins.begin(); binIter != bins.end(); ++binIter ) {
+
+            ChunkVector& unmerged = (*binIter).second;
+            if ( unmerged.empty() ) continue;
+
+            // seed the result with the first chunk, then fold the remaining ones in
+            ChunkVector merged;
+            merged.push_back( unmerged.front() );
+
+            ChunkVector::iterator chunkIter = unmerged.begin();
+            for ( ++chunkIter; chunkIter != unmerged.end(); ++chunkIter ) {
+
+                Chunk& lastKept  = merged.back();
+                Chunk& candidate = (*chunkIter);
+
+                if ( (lastKept.Stop >> 16) == (candidate.Start >> 16) )
+                    lastKept.Stop = candidate.Stop;     // extend the kept chunk
+                else
+                    merged.push_back(candidate);        // start a new kept chunk
+            }
+
+            // replace the bin's chunk list with the merged one
+            (*binIter).second = merged;
+        }
+    }
+}
+
+// appends Chunk(saveOffset, lastOffset) to the chunk list of bin 'saveBin'
+// the bin's list is created on first use
+void BamStandardIndex::SaveBinEntry(BamBinMap& binMap,
+                                    const uint32_t& saveBin,
+                                    const uint64_t& saveOffset,
+                                    const uint64_t& lastOffset)
+{
+    const Chunk chunk(saveOffset, lastOffset);
+
+    BamBinMap::iterator found = binMap.find(saveBin);
+    if ( found != binMap.end() ) {
+        // bin already known - just append
+        (*found).second.push_back(chunk);
+        return;
+    }
+
+    // first chunk seen for this bin
+    binMap.insert( pair<uint32_t, ChunkVector>(saveBin, ChunkVector(1, chunk)) );
+}
+
+// saves linear offset entry for index
+// slots are addressed by (position >> BAM_LIDX_SHIFT); every slot after the one of
+// bAlignment.Position, up to the slot of (GetEndPosition() - 1), receives 'lastOffset'
+// unless it already holds a non-zero value
+void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets,
+                                        const BamAlignment& bAlignment,
+                                        const uint64_t& lastOffset)
+{
+    const int firstSlot = bAlignment.Position >> BAM_LIDX_SHIFT;
+    const int lastSlot  = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
+
+    // grow (zero-filled) so that 'lastSlot' is addressable
+    const int currentSize  = offsets.size();
+    const int requiredSize = lastSlot + 1;
+    if ( requiredSize > currentSize )
+        offsets.resize(requiredSize, 0);
+
+    // fill the still-empty slots
+    int slot = firstSlot + 1;
+    while ( slot <= lastSlot ) {
+        if ( offsets[slot] == 0 )
+            offsets[slot] = lastOffset;
+        ++slot;
+    }
+}
+
+// initializes index data structure to hold @count references
+// (entries 0 .. count-1 are created as needed and marked as having no alignments)
+void BamStandardIndex::SetReferenceCount(const int& count) {
+    int refId = 0;
+    while ( refId < count ) {
+        m_indexData[refId].HasAlignments = false;
+        ++refId;
+    }
+}
+
+// position file pointer to first reference begin, return true if skipped OK
+bool BamStandardIndex::SkipToFirstReference(void) {
+    // begin() of an empty map must not be dereferenced - report failure instead
+    if ( m_indexData.empty() ) return false;
+    BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
+    return SkipToReference( (*indexBegin).first );
+}
+
+// position file pointer to desired reference begin, return true if skipped OK
+// @refId - reference to stop in front of; rejected when negative or larger than the
+//          reference count stored in the index file
+bool BamStandardIndex::SkipToReference(const int& refId) {
+
+    // attempt rewind
+    if ( !Rewind() ) return false;
+
+    // read in number of references
+    uint32_t numReferences;
+    size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
+    if ( elementsRead != 1 ) return false;
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+
+    // counting up from 0 can never reach a negative target, and a target beyond the
+    // stored count would make the loop below read past the end of the index data
+    if ( refId < 0 || (uint32_t)refId > numReferences ) return false;
+
+    // read & discard every reference in front of the target, stop at the first failure
+    for ( int currentRefId = 0; currentRefId != refId; ++currentRefId ) {
+        if ( !LoadReference(currentRefId, false) ) return false;
+    }
+
+    // return success
+    return true;
+}
+
+// writes the 4-byte magic number to the new index file, return true if written OK
+// also records the stream position that follows it as the start of the index data
+bool BamStandardIndex::WriteHeader(void) {
+    const size_t bytesWritten = fwrite("BAI\1", sizeof(char), 4, m_indexStream);
+    m_dataBeginOffset = ftell64(m_indexStream);
+    return ( bytesWritten == 4 );
+}
+
+// write index data for all references to new index file, return true if written OK
+// layout: reference count, then each reference as written by WriteReference()
+bool BamStandardIndex::WriteAllReferences(void) {
+
+    // reference count goes first
+    int32_t refCount = m_indexData.size();
+    if ( m_isBigEndian ) SwapEndian_32(refCount);
+    const bool countOk = ( fwrite(&refCount, sizeof(refCount), 1, m_indexStream) == 1 );
+
+    // then every reference, in map order; all are attempted even after a failure
+    bool refsOk = true;
+    for ( BamStandardIndexData::const_iterator refIter = m_indexData.begin();
+          refIter != m_indexData.end(); ++refIter )
+    {
+        refsOk &= WriteReference( (*refIter).second );
+    }
+
+    return ( countOk && refsOk );
+}
+
+// write index data for bin to new index file, return true if written OK
+// layout: the bin ID, then the bin's chunk list as written by WriteChunks()
+bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) {
+
+    // bin ID (swapped copy on big-endian hosts)
+    uint32_t idOut = binId;
+    if ( m_isBigEndian ) SwapEndian_32(idOut);
+    const bool idOk = ( fwrite(&idOut, sizeof(idOut), 1, m_indexStream) == 1 );
+
+    // chunk list
+    const bool chunksOk = WriteChunks(chunks);
+
+    return ( idOk && chunksOk );
+}
+
+// write index data for bins to new index file, return true if written OK
+// layout: bin count, then each bin as written by WriteBin()
+bool BamStandardIndex::WriteBins(const BamBinMap& bins) {
+
+    // bin count goes first
+    int32_t binTotal = bins.size();
+    if ( m_isBigEndian ) SwapEndian_32(binTotal);
+    const bool countOk = ( fwrite(&binTotal, sizeof(binTotal), 1, m_indexStream) == 1 );
+
+    // then every bin, in map order; all are attempted even after a failure
+    bool binsOk = true;
+    for ( BamBinMap::const_iterator binIter = bins.begin(); binIter != bins.end(); ++binIter )
+        binsOk &= WriteBin( (*binIter).first, (*binIter).second );
+
+    return ( countOk && binsOk );
+}
+
+// write one chunk (its Start, then its Stop) to new index file, return true if written OK
+bool BamStandardIndex::WriteChunk(const Chunk& chunk) {
+
+    // work on local copies so the caller's chunk is never byte-swapped
+    uint64_t fields[2] = { chunk.Start, chunk.Stop };
+    if ( m_isBigEndian ) {
+        SwapEndian_64(fields[0]);
+        SwapEndian_64(fields[1]);
+    }
+
+    // one fwrite per field, Start first
+    size_t fieldsWritten = 0;
+    for ( int k = 0; k < 2; ++k )
+        fieldsWritten += fwrite(&fields[k], sizeof(fields[k]), 1, m_indexStream);
+
+    return ( fieldsWritten == 2 );
+}
+
+// write a chunk list to new index file, return true if written OK
+// layout: chunk count, then each chunk as written by WriteChunk()
+bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) {
+
+    // chunk count goes first
+    int32_t chunkTotal = chunks.size();
+    if ( m_isBigEndian ) SwapEndian_32(chunkTotal);
+    const bool countOk = ( fwrite(&chunkTotal, sizeof(chunkTotal), 1, m_indexStream) == 1 );
+
+    // then every chunk, in vector order; all are attempted even after a failure
+    bool chunksOk = true;
+    for ( ChunkVector::const_iterator it = chunks.begin(); it != chunks.end(); ++it )
+        chunksOk &= WriteChunk( (*it) );
+
+    return ( countOk && chunksOk );
+}
+
+// write index data for linear offsets entry to new index file, return true if written OK
+// layout: offset count, then each 64-bit offset in vector order
+bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) {
+
+    size_t elementsWritten = 0;
+
+    // write number of linear offsets
+    // only a copy is byte-swapped: the host-order count is still needed for the final check
+    const int32_t offsetCount = offsets.size();
+    int32_t offsetCountOut = offsetCount;
+    if ( m_isBigEndian ) SwapEndian_32(offsetCountOut);
+    elementsWritten += fwrite(&offsetCountOut, sizeof(offsetCountOut), 1, m_indexStream);
+
+    // iterate over linear offsets
+    LinearOffsetVector::const_iterator offsetIter = offsets.begin();
+    LinearOffsetVector::const_iterator offsetEnd  = offsets.end();
+    for ( ; offsetIter != offsetEnd; ++offsetIter ) {
+
+        // write linear offset
+        uint64_t linearOffset = (*offsetIter);
+        if ( m_isBigEndian ) SwapEndian_64(linearOffset);
+        elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream);
+    }
+
+    // success means the count plus every offset was written
+    return ( elementsWritten == (size_t)offsetCount + 1 );
+}
+
+// write index data for a single reference to new index file, return true if written OK
+// the reference's bins are written first, its linear offsets second (both always attempted)
+bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) {
+    const bool binsOk    = WriteBins(refEntry.Bins);
+    const bool offsetsOk = WriteLinearOffsets(refEntry.Offsets);
+    return ( binsOk && offsetsOk );
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamStandardIndex_p.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamStandardIndex_p.h
new file mode 100755
index 0000000..4a40ac0
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamStandardIndex_p.h
@@ -0,0 +1,213 @@
+// ***************************************************************************
+// BamStandardIndex.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the standardized BAM index format (".bai")
+// ***************************************************************************
+
+#ifndef BAM_STANDARD_INDEX_FORMAT_H
+#define BAM_STANDARD_INDEX_FORMAT_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <BamAux.h>
+#include <BamIndex.h>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class BamAlignment;
+
+namespace Internal {
+
+// BAM index constants (shared by the declarations below and by the .cpp implementation)
+const int MAX_BIN = 37450; // =(8^6-1)/7+1; also the length of the 'bins' array taken by BinsFromRegion()
+const int BAM_LIDX_SHIFT = 14; // alignment positions are shifted right by this many bits to pick a linear-offset slot
+
+// --------------------------------------------------
+// BamStandardIndex data structures & typedefs
+struct Chunk {
+
+    // the two offsets that delimit the chunk
+    uint64_t Start;
+    uint64_t Stop;
+
+    // both offsets default to 0 when omitted
+    Chunk(const uint64_t& start = 0, const uint64_t& stop = 0)
+        : Start(start), Stop(stop)
+    { }
+};
+
+// ordering predicate for sorting chunks: compares by Start only, Stop is ignored
+inline
+bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
+    return ( rhs.Start > lhs.Start );
+}
+
+typedef std::vector<Chunk> ChunkVector; // chunk list of a single bin
+typedef std::map<uint32_t, ChunkVector> BamBinMap; // bin ID -> chunks saved for that bin
+typedef std::vector<uint64_t> LinearOffsetVector; // slot index is (position >> BAM_LIDX_SHIFT)
+
+struct ReferenceIndex {
+
+    // index data kept for one reference
+    BamBinMap Bins;
+    LinearOffsetVector Offsets;
+    bool HasAlignments;
+
+    // every argument is optional; the defaults give empty containers and HasAlignments == false
+    ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
+                   const LinearOffsetVector& offsets = LinearOffsetVector(),
+                   const bool hasAlignments = false)
+        : Bins(binMap), Offsets(offsets), HasAlignments(hasAlignments)
+    { }
+};
+
+typedef std::map<int32_t, ReferenceIndex> BamStandardIndexData; // reference ID -> index data of that reference
+
+class BamStandardIndex : public BamIndex {
+ // BamIndex implementation for the standardized BAM index format (".bai")
+ // ctor & dtor
+ public:
+ BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
+ ~BamStandardIndex(void);
+
+ // interface (implements BamIndex virtual methods)
+ public:
+ // creates index data (in-memory) from current reader data
+ bool Build(void);
+ // returns supported file extension
+ const std::string Extension(void) const { return std::string(".bai"); }
+ // returns whether reference has alignments or not
+ bool HasAlignments(const int& referenceID) const;
+ // attempts to use index to jump to region; returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+ public:
+ // clear all current index offset data in memory
+ void ClearAllData(void);
+ // return file position after header metadata
+ const off_t DataBeginOffset(void) const;
+ // return true if all index data is cached
+ bool HasFullDataCache(void) const;
+ // clears index data from all references except the first
+ void KeepOnlyFirstReferenceOffsets(void);
+ // load index data for all references, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadAllReferences(bool saveData = true);
+ // load first reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadFirstReference(bool saveData = true);
+ // load header data from index file, return true if loaded OK
+ bool LoadHeader(void);
+ // position file pointer to first reference begin, return true if skipped OK
+ bool SkipToFirstReference(void);
+ // write index data for all references (reference count first), return true if written OK
+ bool WriteAllReferences(void);
+ // write index header (magic number), return true if written OK
+ bool WriteHeader(void);
+
+ // 'internal' methods
+ public:
+
+ // -----------------------
+ // index file operations
+
+ // check index file magic number, return true if OK
+ bool CheckMagicNumber(void);
+ // check index file version, return true if OK
+ bool CheckVersion(void);
+ // load a single index bin entry from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
+ bool LoadBins(ReferenceIndex& refEntry, bool saveData = true); // NOTE(review): presumably loads every bin of the reference - confirm against the .cpp
+ // load a single chunk entry (LoadChunk) or a counted chunk list that is then sorted (LoadChunks)
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadChunk(ChunkVector& chunks, bool saveData = true);
+ bool LoadChunks(ChunkVector& chunks, bool saveData = true);
+ // load the linear offsets (count, then each offset) of a reference, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
+ // load a single reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadReference(const int& refId, bool saveData = true);
+ // loads number of references, return true if loaded OK
+ bool LoadReferenceCount(int& numReferences);
+ // position file pointer to desired reference begin, return true if skipped OK
+ bool SkipToReference(const int& refId);
+ // write index data for bin to new index file
+ bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
+ // write index data for bins to new index file
+ bool WriteBins(const BamBinMap& bins);
+ // write index data for chunk entry to new index file
+ bool WriteChunk(const Chunk& chunk);
+ // write index data for a chunk list (chunk count, then each chunk) to new index file
+ bool WriteChunks(const ChunkVector& chunks);
+ // write index data for linear offsets entry to new index file
+ bool WriteLinearOffsets(const LinearOffsetVector& offsets);
+ // write index data for a single reference to new index file
+ bool WriteReference(const ReferenceIndex& refEntry);
+
+ // -----------------------
+ // index data operations
+
+ // calculate bins that overlap region
+ int BinsFromRegion(const BamRegion& region,
+ const bool isRightBoundSpecified,
+ uint16_t bins[MAX_BIN]);
+ // clear all index offset data for desired reference
+ void ClearReferenceOffsets(const int& refId);
+ // calculates offset(s) for a given region
+ bool GetOffsets(const BamRegion& region,
+ const bool isRightBoundSpecified,
+ std::vector<int64_t>& offsets,
+ bool* hasAlignmentsInRegion);
+ // returns true if index cache has data for desired reference
+ bool IsDataLoaded(const int& refId) const;
+ // clears index data from all references except the one specified
+ void KeepOnlyReferenceOffsets(const int& refId);
+ // simplifies index by merging 'chunks'
+ void MergeChunks(void);
+ // saves BAM bin entry for index
+ void SaveBinEntry(BamBinMap& binMap,
+ const uint32_t& saveBin,
+ const uint64_t& saveOffset,
+ const uint64_t& lastOffset);
+ // saves linear offset entry for index
+ void SaveLinearOffset(LinearOffsetVector& offsets,
+ const BamAlignment& bAlignment,
+ const uint64_t& lastOffset);
+ // initializes index data structure to hold @count references
+ void SetReferenceCount(const int& count);
+
+ // data members
+ private:
+ // m_indexData is keyed by reference ID; m_dataBeginOffset is the file position after the header
+ BamStandardIndexData m_indexData;
+ off_t m_dataBeginOffset;
+ bool m_hasFullDataCache;
+ bool m_isBigEndian;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAM_STANDARD_INDEX_FORMAT_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamToolsIndex_p.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamToolsIndex_p.cpp
new file mode 100755
index 0000000..1728b62
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamToolsIndex_p.cpp
@@ -0,0 +1,577 @@
+// ***************************************************************************
+// BamToolsIndex.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the BamTools index format (".bti")
+// ***************************************************************************
+
+#include <BamAlignment.h>
+#include <BamReader.h>
+#include <BGZF.h>
+#include <BamToolsIndex_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <map>
+using namespace std;
+
+BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader) // ctor: hands both pointers to the BamIndex base
+ : BamIndex(bgzf, reader)
+ , m_blockSize(1000) // alignments per index block when building; LoadHeader() replaces it with the value stored in a file
+ , m_dataBeginOffset(0)
+ , m_hasFullDataCache(false)
+ , m_inputVersion(0)
+ , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
+{
+ m_isBigEndian = BamTools::SystemIsBigEndian(); // controls the byte swapping done when index fields are read
+}
+
+// dtor - clears the cached offsets of every reference via ClearAllData()
+BamToolsIndex::~BamToolsIndex(void) {
+ ClearAllData();
+}
+
+// creates index data (in-memory) from current reader data
+// alignments are grouped into blocks of up to m_blockSize alignments on one reference; each
+// block is stored as a BamToolsIndexEntry(max end position, start offset, start position)
+// returns false if the reader/BGZF file is unusable or if either rewind fails
+bool BamToolsIndex::Build(void) {
+
+    // be sure reader & BGZF file are valid & open for reading
+    if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
+        return false;
+
+    // move file pointer to beginning of alignments
+    if ( !m_reader->Rewind() ) return false;
+
+    // initialize index data structure with space for all references
+    const int numReferences = (int)m_references.size();
+    m_indexData.clear();
+    m_hasFullDataCache = false;
+    SetReferenceCount(numReferences);
+
+    // set up counters and markers
+    int32_t currentBlockCount      = 0;
+    int64_t currentAlignmentOffset = m_BGZF->Tell();
+    int32_t blockRefId             = 0;
+    int32_t blockMaxEndPosition    = 0;
+    int64_t blockStartOffset       = currentAlignmentOffset;
+    int32_t blockStartPosition     = -1;
+
+    // plow through alignments, storing index entries
+    BamAlignment al;
+    while ( m_reader->GetNextAlignmentCore(al) ) {
+
+        // if block contains data (not the first time through) AND alignment is on a new reference
+        if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
+
+            // store previous data
+            BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+            SaveOffsetEntry(blockRefId, entry);
+
+            // initialize new block for current alignment's reference
+            currentBlockCount   = 0;
+            blockMaxEndPosition = al.GetEndPosition();
+            blockStartOffset    = currentAlignmentOffset;
+        }
+
+        // if beginning of block, save first alignment's refID & position
+        if ( currentBlockCount == 0 ) {
+            blockRefId         = al.RefID;
+            blockStartPosition = al.Position;
+        }
+
+        // increment block counter
+        ++currentBlockCount;
+
+        // check end position
+        int32_t alignmentEndPosition = al.GetEndPosition();
+        if ( alignmentEndPosition > blockMaxEndPosition )
+            blockMaxEndPosition = alignmentEndPosition;
+
+        // if block is full, get offset for next block, reset currentBlockCount
+        if ( currentBlockCount == m_blockSize ) {
+            BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+            SaveOffsetEntry(blockRefId, entry);
+            blockStartOffset  = m_BGZF->Tell();
+            currentBlockCount = 0;
+        }
+
+        // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
+        // necessary because we won't know if this next alignment is on a new reference until we actually read it
+        currentAlignmentOffset = m_BGZF->Tell();
+    }
+
+    // store final block, but only if it holds alignments: with no input at all, or when the
+    // last block was just flushed as full, there is nothing left to describe and saving an
+    // entry anyway would add a bogus block (and mark its reference as having alignments)
+    if ( currentBlockCount > 0 ) {
+        BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+        SaveOffsetEntry(blockRefId, entry);
+    }
+
+    // set flag
+    m_hasFullDataCache = true;
+
+    // return success/failure of rewind
+    return m_reader->Rewind();
+}
+
+// reads the first 4 bytes at the current stream position and compares them with "BTI\1"
+// returns true on a match; a mismatch is reported on stderr, a short read is not
+bool BamToolsIndex::CheckMagicNumber(void) {
+
+    char magic[4];
+    if ( fread(magic, 1, 4, m_indexStream) != 4 )
+        return false;
+
+    const bool magicOk = ( strncmp(magic, "BTI\1", 4) == 0 );
+    if ( !magicOk )
+        fprintf(stderr, "Problem with index file - invalid format.\n");
+    return magicOk;
+}
+
+// reads the version field into m_inputVersion and decides whether it can be used
+// returns true if OK; every rejected version (except a failed read) is explained on stderr
+bool BamToolsIndex::CheckVersion(void) {
+
+    // read version from file
+    if ( fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream) != 1 )
+        return false;
+    if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
+
+    // negative or zero is never valid
+    if ( m_inputVersion <= 0 ) {
+        fprintf(stderr, "Problem with index file - invalid version.\n");
+        return false;
+    }
+
+    // newer than this version of bamtools can write, hence newer than it can read
+    if ( m_inputVersion > m_outputVersion ) {
+        fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n");
+        fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
+        return false;
+    }
+
+    // ------------------------------------------------------------------
+    // deprecated, unsupported versions
+    // (typically whose format did not accommodate a particular bug fix)
+
+    if ( (Version)m_inputVersion == BTI_1_0 ) {
+        fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
+        fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
+        return false;
+    }
+
+    if ( (Version)m_inputVersion == BTI_1_1 ) {
+        fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
+        fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
+        return false;
+    }
+
+    // anything that got this far is usable
+    return true;
+}
+
+// clear all current index offset data in memory
+// (the reference entries themselves stay in the map; only their offset lists are emptied)
+void BamToolsIndex::ClearAllData(void) {
+    for ( BamToolsIndexData::const_iterator refIter = m_indexData.begin();
+          refIter != m_indexData.end(); ++refIter )
+    {
+        ClearReferenceOffsets( (*refIter).first );
+    }
+}
+
+// empties the offset list of reference 'refId' and drops the full-cache flag
+// does nothing at all when the reference is unknown
+void BamToolsIndex::ClearReferenceOffsets(const int& refId) {
+    BamToolsIndexData::iterator found = m_indexData.find(refId);
+    if ( found == m_indexData.end() )
+        return;
+    (*found).second.Offsets.clear();
+    m_hasFullDataCache = false;
+}
+
+// return file position after header metadata (the value recorded by LoadHeader(); 0 until then)
+const off_t BamToolsIndex::DataBeginOffset(void) const {
+ return m_dataBeginOffset;
+}
+
+// calculate BAM file offset for desired region
+// return true if no error (*NOT* equivalent to "has alignments or valid offset")
+// check @hasAlignmentsInRegion to determine this status
+// @region - target region (only LeftRefID & LeftPosition are consulted)
+// @offset - resulting seek target
+// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
+// N.B. - ignores isRightBoundSpecified
+bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
+
+    // unknown reference -> error
+    if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
+        return false;
+
+    // pull this reference's entries in from file when they are not cached yet
+    // (the load is attempted even if the skip failed)
+    if ( !IsDataLoaded(region.LeftRefID) ) {
+        const bool skippedOk = SkipToReference(region.LeftRefID);
+        const bool loadedOk  = LoadReference(region.LeftRefID);
+        if ( !(skippedOk && loadedOk) ) return false;
+    }
+
+    // localize the entries of this reference (& sanity check that data actually exists)
+    BamToolsIndexData::const_iterator refIter = m_indexData.find(region.LeftRefID);
+    if ( refIter == m_indexData.end() ) return false;
+    const vector<BamToolsIndexEntry>& entries = (*refIter).second.Offsets;
+    if ( entries.empty() ) return false;
+
+    // -------------------------------------------------------
+    // calculate nearest index to jump to
+
+    // start from the first block, then keep moving the seek target forward past every
+    // block whose MaxEndPosition lies in front of the region's left bound
+    offset = entries.front().StartOffset;
+    vector<BamToolsIndexEntry>::const_iterator entryIter = entries.begin();
+    while ( entryIter != entries.end() && (*entryIter).MaxEndPosition < region.LeftPosition ) {
+        offset = (*entryIter).StartOffset;
+        ++entryIter;
+    }
+
+    // a block reaching the left bound was found unless the walk ran off the end
+    *hasAlignmentsInRegion = ( entryIter != entries.end() );
+
+    // if cache mode set to none, dump the data we just loaded
+    if ( m_cacheMode == BamIndex::NoIndexCaching )
+        ClearReferenceOffsets(region.LeftRefID);
+
+    // return success
+    return true;
+}
+
+// returns the HasAlignments flag stored for reference 'refId'
+// an unknown reference counts as having no alignments
+bool BamToolsIndex::HasAlignments(const int& refId) const {
+    BamToolsIndexData::const_iterator found = m_indexData.find(refId);
+    return ( found != m_indexData.end() ) && (*found).second.HasAlignments;
+}
+
+// return true if all index data is cached (flag set by Build() / a saving LoadAllReferences(), reset by ClearReferenceOffsets())
+bool BamToolsIndex::HasFullDataCache(void) const {
+ return m_hasFullDataCache;
+}
+
+// returns true if index cache has data for desired reference
+// - unknown reference: false
+// - reference flagged as having no alignments: true (there is nothing to load)
+// - otherwise: true exactly when its offset list is non-empty
+bool BamToolsIndex::IsDataLoaded(const int& refId) const {
+
+    BamToolsIndexData::const_iterator found = m_indexData.find(refId);
+    if ( found == m_indexData.end() )
+        return false;
+
+    const BamToolsReferenceEntry& ref = (*found).second;
+    return ( !ref.HasAlignments || !ref.Offsets.empty() );
+}
+
+// attempts to use index to jump to region; returns success/fail
+// @region                - target region (only its left bound is validated here)
+// @hasAlignmentsInRegion - cleared first, then set by GetOffset(); must not be null
+bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
+
+    // clear flag
+    *hasAlignmentsInRegion = false;
+
+    // check valid BamReader state
+    if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) {
+        fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
+        return false;
+    }
+
+    // a reference ID outside the known references cannot be looked up by at() below;
+    // treat it as a failed jump instead
+    if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_references.size() )
+        return false;
+
+    // make sure left-bound position is valid
+    if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength )
+        return false;
+
+    // calculate nearest offset to jump to
+    int64_t offset;
+    if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
+        fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
+        return false;
+    }
+
+    // return success/failure of seek
+    return m_BGZF->Seek(offset);
+}
+
+// clears index data from all references except the first
+void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) {
+    // with no references there is nothing to keep or clear (and begin() must not be dereferenced)
+    if ( m_indexData.empty() ) return;
+    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
+    KeepOnlyReferenceOffsets( (*indexBegin).first );
+}
+
+// empties the offset list of every reference other than 'refId'
+void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) {
+    for ( BamToolsIndexData::iterator refIter = m_indexData.begin();
+          refIter != m_indexData.end(); ++refIter )
+    {
+        const int currentId = (*refIter).first;
+        if ( currentId == refId ) continue;     // this is the one to keep
+        ClearReferenceOffsets(currentId);
+    }
+}
+
+// load index data for all references, return true if loaded OK
+// @saveData - save data in memory if true, just read & discard if false
+// returns true straight away when the full-cache flag is already set
+bool BamToolsIndex::LoadAllReferences(bool saveData) {
+
+    // nothing to do if everything is cached
+    if ( m_hasFullDataCache ) return true;
+
+    // the reference count precedes the reference data
+    int32_t refCount;
+    if ( !LoadReferenceCount(refCount) ) return false;
+
+    // load references 0 .. refCount-1; all are attempted even after a failure
+    bool allOk = true;
+    int refId = 0;
+    while ( refId < refCount ) {
+        allOk &= LoadReference(refId, saveData);
+        ++refId;
+    }
+
+    // the cache only counts as full when the data was kept and nothing failed
+    if ( allOk && saveData )
+        m_hasFullDataCache = true;
+
+    return allOk;
+}
+
+// load header data from index file, return true if loaded OK
+// reads, in order: magic number, BTI version, block size; then records the stream
+// position that follows as the start of the index data
+bool BamToolsIndex::LoadHeader(void) {
+
+    // magic number first, version second (version is not read if the magic number is bad)
+    if ( !CheckMagicNumber() || !CheckVersion() )
+        return false;
+
+    // block size
+    if ( fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream) != 1 )
+        return false;
+    if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
+
+    // store offset of beginning of data
+    m_dataBeginOffset = ftell64(m_indexStream);
+
+    return true;
+}
+
+// load a single index entry from file, return true if loaded OK
+// an entry is three fields, read in this order: MaxEndPosition, StartOffset, StartPosition
+// @refId    - reference the entry is filed under when saving
+// @saveData - save data in memory if true, just read & discard if false
+bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) {
+
+    // read the three fields; all three reads are attempted before the count is checked
+    BamToolsIndexEntry entry;
+    size_t fieldsRead = 0;
+    fieldsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream);
+    fieldsRead += fread(&entry.StartOffset,    sizeof(entry.StartOffset),    1, m_indexStream);
+    fieldsRead += fread(&entry.StartPosition,  sizeof(entry.StartPosition),  1, m_indexStream);
+
+    const bool complete = ( fieldsRead == 3 );
+    if ( !complete ) {
+        cerr << "Error reading index entry. Expected 3 elements, read in: " << fieldsRead << endl;
+        return false;
+    }
+
+    // swap endian-ness if necessary
+    if ( m_isBigEndian ) {
+        SwapEndian_32(entry.MaxEndPosition);
+        SwapEndian_64(entry.StartOffset);
+        SwapEndian_32(entry.StartPosition);
+    }
+
+    // keep the entry only when asked to
+    if ( saveData )
+        SaveOffsetEntry(refId, entry);
+
+    return true;
+}
+
+// load the first reference held in the index data, return true if loaded OK
+// returns false (without touching the file) if the index data holds no references
+// @saveData - save data in memory if true, just read & discard if false
+bool BamToolsIndex::LoadFirstReference(bool saveData) {
+
+    // dereferencing begin() of an empty map is undefined behavior
+    if ( m_indexData.empty() )
+        return false;
+
+    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
+    return LoadReference( (*indexBegin).first, saveData );
+}
+
+// load a single reference from file, return true if loaded OK
+// (the offset count and every one of its index entries must be read successfully)
+// @refId    - reference the loaded entries belong to
+// @saveData - save data in memory if true, just read & discard if false
+bool BamToolsIndex::LoadReference(const int& refId, bool saveData) {
+
+    // read in number of offsets for this reference
+    uint32_t numOffsets;
+    size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
+    if ( elementsRead != 1 ) return false;
+    if ( m_isBigEndian ) SwapEndian_32(numOffsets);
+
+    // initialize offsets container for this reference
+    // (done even when discarding, so HasAlignments is still recorded)
+    SetOffsetCount(refId, (int)numOffsets);
+
+    // iterate over offset entries, stopping at the first entry that cannot be
+    // read: the stream position is no longer meaningful after a short read
+    for ( uint32_t j = 0; j < numOffsets; ++j ) {
+        if ( !LoadIndexEntry(refId, saveData) )
+            return false;
+    }
+
+    // every entry was read
+    return true;
+}
+
+// loads number of references, return true if loaded OK
+// @numReferences - receives the count; left untouched if the read fails
+bool BamToolsIndex::LoadReferenceCount(int& numReferences) {
+
+    // read reference count
+    const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
+
+    // bail out before swapping: on a failed read @numReferences still holds
+    // whatever the caller passed in (possibly uninitialized)
+    if ( elementsRead != 1 )
+        return false;
+
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+
+    return true;
+}
+
+// marks reference @refId as having alignments and appends @entry to its
+// in-memory offset list
+void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
+
+    // map operator[] creates the reference entry if it is not present yet
+    BamToolsReferenceEntry& target = m_indexData[refId];
+
+    target.HasAlignments = true;
+    target.Offsets.push_back(entry);
+}
+
+// pre-allocates size for offset vector and records whether the reference
+// has any offsets at all
+// @refId       - reference whose offset vector is prepared (created if absent)
+// @offsetCount - expected number of offsets; nothing is reserved if <= 0
+void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) {
+    BamToolsReferenceEntry& refEntry = m_indexData[refId];
+
+    // LoadReference passes a uint32_t count cast to int, so an oversized count
+    // arrives negative; converted to size_t it would become an enormous
+    // reservation request
+    if ( offsetCount > 0 )
+        refEntry.Offsets.reserve(offsetCount);
+
+    refEntry.HasAlignments = ( offsetCount > 0 );
+}
+
+// makes sure the index data holds an entry for each reference id in
+// [0, @count) and flags each of them as having no alignments
+void BamToolsIndex::SetReferenceCount(const int& count) {
+    int refId = 0;
+    while ( refId < count ) {
+        // map operator[] inserts the entry when it does not exist yet
+        BamToolsReferenceEntry& refEntry = m_indexData[refId];
+        refEntry.HasAlignments = false;
+        ++refId;
+    }
+}
+
+// position file pointer to first reference begin, return true if skipped OK
+// returns false (without touching the file) if the index data holds no references
+bool BamToolsIndex::SkipToFirstReference(void) {
+
+    // dereferencing begin() of an empty map is undefined behavior
+    if ( m_indexData.empty() )
+        return false;
+
+    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
+    return SkipToReference( (*indexBegin).first );
+}
+
+// position file pointer to desired reference begin, return true if skipped OK
+// @refId - reference to stop in front of; must lie in [0, number of references
+//          stored in the file), otherwise false is returned
+bool BamToolsIndex::SkipToReference(const int& refId) {
+
+    // attempt rewind
+    if ( !Rewind() ) return false;
+
+    // read in number of references
+    int32_t numReferences;
+    size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
+    if ( elementsRead != 1 ) return false;
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+
+    // a reference id the file does not contain can never be reached; without
+    // this check a negative or too-large id would keep reading past the data
+    if ( refId < 0 || refId >= numReferences )
+        return false;
+
+    // read & discard every reference that precedes the requested one,
+    // giving up at the first failure (the stream position is unreliable after it)
+    for ( int currentRefId = 0; currentRefId < refId; ++currentRefId ) {
+        if ( !LoadReference(currentRefId, false) )
+            return false;
+    }
+
+    // stream now sits at the beginning of @refId's data
+    return true;
+}
+
+// writes the index header (magic number, format version, block size) and
+// records the stream position that follows it; returns true if all of it
+// was written
+bool BamToolsIndex::WriteHeader(void) {
+
+    // BTI 'magic number', written as 4 single-byte elements
+    const size_t magicWritten = fwrite("BTI\1", 1, 4, m_indexStream);
+
+    // BTI format version, byte-swapped when m_isBigEndian is set
+    int32_t versionField = (int32_t)m_outputVersion;
+    if ( m_isBigEndian )
+        SwapEndian_32(versionField);
+    const size_t versionWritten = fwrite(&versionField, sizeof(versionField), 1, m_indexStream);
+
+    // block size, written from a copy so m_blockSize itself is never swapped
+    int32_t blockSizeField = m_blockSize;
+    if ( m_isBigEndian )
+        SwapEndian_32(blockSizeField);
+    const size_t blockSizeWritten = fwrite(&blockSizeField, sizeof(blockSizeField), 1, m_indexStream);
+
+    // remember where the data following the header begins
+    m_dataBeginOffset = ftell64(m_indexStream);
+
+    // 4 magic bytes + 1 version field + 1 block size field
+    return ( (magicWritten + versionWritten + blockSizeWritten) == 6 );
+}
+
+// writes the reference count followed by the index data of every reference
+// held in memory; returns true only if every write succeeded
+bool BamToolsIndex::WriteAllReferences(void) {
+
+    // reference count goes first, byte-swapped when m_isBigEndian is set
+    int32_t referenceCount = (int32_t)m_indexData.size();
+    if ( m_isBigEndian )
+        SwapEndian_32(referenceCount);
+    const bool countOk = ( fwrite(&referenceCount, sizeof(referenceCount), 1, m_indexStream) == 1 );
+
+    // references are written in map order; all of them are attempted even
+    // after an earlier failure
+    bool allRefsOk = true;
+    for ( BamToolsIndexData::const_iterator it = m_indexData.begin(); it != m_indexData.end(); ++it ) {
+        if ( !WriteReferenceEntry(it->second) )
+            allRefsOk = false;
+    }
+
+    return ( countOk && allRefsOk );
+}
+
+// writes one reference's offset count followed by each of its offset
+// entries; returns true only if every write succeeded
+bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {
+
+    const vector<BamToolsIndexEntry>& offsets = refEntry.Offsets;
+
+    // offset count goes first, byte-swapped when m_isBigEndian is set
+    uint32_t offsetCount = offsets.size();
+    if ( m_isBigEndian )
+        SwapEndian_32(offsetCount);
+    const bool countOk = ( fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream) == 1 );
+
+    // the entries are still written even if the count write failed
+    bool allEntriesOk = true;
+    for ( size_t i = 0; i < offsets.size(); ++i ) {
+        if ( !WriteIndexEntry(offsets[i]) )
+            allEntriesOk = false;
+    }
+
+    return ( countOk && allEntriesOk );
+}
+
+// writes a single index offset entry (max end position, start offset,
+// start position); returns true if all three fields were written
+bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) {
+
+    // work on a copy so the in-memory entry is never byte-swapped
+    BamToolsIndexEntry outgoing(entry);
+    if ( m_isBigEndian ) {
+        SwapEndian_32(outgoing.MaxEndPosition);
+        SwapEndian_64(outgoing.StartOffset);
+        SwapEndian_32(outgoing.StartPosition);
+    }
+
+    // fields go out in this order: MaxEndPosition, StartOffset, StartPosition
+    const size_t maxEndWritten        = fwrite(&outgoing.MaxEndPosition, sizeof(outgoing.MaxEndPosition), 1, m_indexStream);
+    const size_t startOffsetWritten   = fwrite(&outgoing.StartOffset, sizeof(outgoing.StartOffset), 1, m_indexStream);
+    const size_t startPositionWritten = fwrite(&outgoing.StartPosition, sizeof(outgoing.StartPosition), 1, m_indexStream);
+
+    return ( (maxEndWritten + startOffsetWritten + startPositionWritten) == 3 );
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamToolsIndex_p.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamToolsIndex_p.h
new file mode 100755
index 0000000..3305fb6
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamToolsIndex_p.h
@@ -0,0 +1,192 @@
+// ***************************************************************************
+// BamToolsIndex_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the BamTools index format (".bti")
+// ***************************************************************************
+
+#ifndef BAMTOOLS_INDEX_FORMAT_H
+#define BAMTOOLS_INDEX_FORMAT_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <BamAux.h>
+#include <BamIndex.h>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+namespace Internal {
+
+// one offset record of the BamTools index: a maximum end position, a start
+// offset and a start position
+struct BamToolsIndexEntry {
+
+    // data members
+    int32_t MaxEndPosition;
+    int64_t StartOffset;
+    int32_t StartPosition;
+
+    // ctor - every field defaults to zero when not supplied
+    BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
+                       const int64_t& startOffset    = 0,
+                       const int32_t& startPosition  = 0)
+    {
+        MaxEndPosition = maxEndPosition;
+        StartOffset    = startOffset;
+        StartPosition  = startPosition;
+    }
+};
+
+// index data kept for one reference: a "has alignments" flag plus the
+// list of offset entries stored for it
+struct BamToolsReferenceEntry {
+
+    // data members
+    bool HasAlignments;
+    std::vector<BamToolsIndexEntry> Offsets;
+
+    // ctor - starts out with no alignments and an empty offset list
+    BamToolsReferenceEntry(void)
+    {
+        HasAlignments = false;
+    }
+};
+
+// the actual index data structure
+// maps a reference id (int key) to the index data held for that reference;
+// being a std::map, iteration visits references in ascending id order
+typedef std::map<int, BamToolsReferenceEntry> BamToolsIndexData;
+
+// BamIndex implementation for the BamTools index format (".bti").
+// Holds per-reference lists of index offset entries (BamToolsIndexData) and
+// declares the operations to build, load, query and write them.
+class BamToolsIndex : public BamIndex {
+
+ // keep a list of any supported versions here
+ // (might be useful later to handle any 'legacy' versions if the format changes)
+ // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
+ //
+ // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
+ //
+ // if ( indexVersion >= BTI_1_2 )
+ // do something new
+ // else
+ // do the old thing
+ enum Version { BTI_1_0 = 1
+ , BTI_1_1
+ , BTI_1_2
+ };
+
+
+ // ctor & dtor
+ public:
+ BamToolsIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
+ ~BamToolsIndex(void);
+
+ // interface (implements BamIndex virtual methods)
+ public:
+ // creates index data (in-memory) from current reader data
+ bool Build(void);
+ // returns supported file extension
+ const std::string Extension(void) const { return std::string(".bti"); }
+ // returns whether reference has alignments or no
+ bool HasAlignments(const int& referenceID) const;
+ // attempts to use index to jump to region; returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+ public:
+ // clear all current index offset data in memory
+ void ClearAllData(void);
+ // return file position after header metadata
+ const off_t DataBeginOffset(void) const;
+ // return true if all index data is cached
+ bool HasFullDataCache(void) const;
+ // clears index data from all references except the first
+ void KeepOnlyFirstReferenceOffsets(void);
+ // load index data for all references, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadAllReferences(bool saveData = true);
+ // load first reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadFirstReference(bool saveData = true);
+ // load header data from index file, return true if loaded OK
+ bool LoadHeader(void);
+ // position file pointer to first reference begin, return true if skipped OK
+ bool SkipToFirstReference(void);
+ // write index reference data
+ bool WriteAllReferences(void);
+ // write index header data
+ bool WriteHeader(void);
+
+ // 'internal' methods
+ // NOTE(review): declared under "public:" although labelled internal - confirm whether that is intentional
+ public:
+
+ // -----------------------
+ // index file operations
+
+ // check index file magic number, return true if OK
+ bool CheckMagicNumber(void);
+ // check index file version, return true if OK
+ bool CheckVersion(void);
+ // return true if FILE* is open
+ bool IsOpen(void) const;
+ // load a single index entry from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadIndexEntry(const int& refId, bool saveData = true);
+ // load a single reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadReference(const int& refId, bool saveData = true);
+ // loads number of references, return true if loaded OK
+ bool LoadReferenceCount(int& numReferences);
+ // position file pointer to desired reference begin, return true if skipped OK
+ bool SkipToReference(const int& refId);
+ // write current reference index data to new index file
+ bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry);
+ // write current index offset entry to new index file
+ bool WriteIndexEntry(const BamToolsIndexEntry& entry);
+
+ // -----------------------
+ // index data operations
+
+ // clear all index offset data for desired reference
+ void ClearReferenceOffsets(const int& refId);
+ // calculate BAM file offset for desired region
+ // return true if no error (*NOT* equivalent to "has alignments or valid offset")
+ // check @hasAlignmentsInRegion to determine this status
+ // @region - target region
+ // @offset - resulting seek target
+ // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
+ bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
+ // returns true if index cache has data for desired reference
+ bool IsDataLoaded(const int& refId) const;
+ // clears index data from all references except the one specified
+ void KeepOnlyReferenceOffsets(const int& refId);
+ // saves an index offset entry in memory
+ void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry);
+ // pre-allocates size for offset vector
+ void SetOffsetCount(const int& refId, const int& offsetCount);
+ // initializes index data structure to hold @count references
+ void SetReferenceCount(const int& count);
+
+ // data members
+ private:
+ // block size as read by LoadHeader() / written by WriteHeader()
+ int32_t m_blockSize;
+ // in-memory index: reference id -> offset entries for that reference
+ BamToolsIndexData m_indexData;
+ // index-stream position recorded right after the header is read or written
+ off_t m_dataBeginOffset;
+ // set by LoadAllReferences() once every reference was loaded and kept in memory
+ bool m_hasFullDataCache;
+ // when true, multi-byte fields are byte-swapped on every read and write
+ bool m_isBigEndian;
+ int32_t m_inputVersion; // Version is serialized as int
+ // format version that WriteHeader() stores in newly written index files
+ Version m_outputVersion;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMTOOLS_INDEX_FORMAT_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter.cpp
new file mode 100755
index 0000000..f168a2f
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter.cpp
@@ -0,0 +1,47 @@
+// ***************************************************************************
+// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include <BamWriter.h>
+#include <BamWriter_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+using namespace std;
+
+// constructor - allocates the private implementation object that all
+// other methods forward to
+BamWriter::BamWriter(void)
+    : d(new BamWriterPrivate)
+{ }
+
+// destructor - releases the private implementation object and clears the
+// pointer to it
+BamWriter::~BamWriter(void) {
+    if ( d != 0 )
+        delete d;   // deleting a null pointer would be a no-op anyway
+    d = 0;
+}
+
+// closes the alignment archive (forwards to the private implementation)
+void BamWriter::Close(void) {
+    BamWriterPrivate& impl = *d;
+    impl.Close();
+}
+
+// opens the alignment archive (forwards to the private implementation)
+// returns whatever the private implementation's Open() reports
+bool BamWriter::Open(const string& filename,
+                     const string& samHeader,
+                     const RefVector& referenceSequences,
+                     bool isWriteUncompressed)
+{
+    const bool opened = d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);
+    return opened;
+}
+
+// saves the alignment to the alignment archive
+// (forwards to the private implementation)
+void BamWriter::SaveAlignment(const BamAlignment& al) {
+    BamWriterPrivate& impl = *d;
+    impl.SaveAlignment(al);
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter.h
new file mode 100755
index 0000000..bcbdddd
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter.h
@@ -0,0 +1,50 @@
+// ***************************************************************************
+// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#ifndef BAMWRITER_H
+#define BAMWRITER_H
+
+#include <api_global.h>
+#include <BamAlignment.h>
+#include <string>
+
+namespace BamTools {
+
+namespace Internal {
+ class BamWriterPrivate;
+} // namespace Internal
+
+// Public writer class for producing BAM files.
+// Holds only a pointer to Internal::BamWriterPrivate, which is forward
+// declared in this header.
+class API_EXPORT BamWriter {
+
+ // constructor/destructor
+ public:
+ BamWriter(void);
+ ~BamWriter(void);
+
+ // public interface
+ public:
+ // closes the alignment archive
+ void Close(void);
+ // opens the alignment archive
+ // @filename           - file to write
+ // @samHeader          - SAM header text
+ // @referenceSequences - reference sequence entries
+ // @writeUncompressed  - defaults to false
+ // returns true on success
+ bool Open(const std::string& filename,
+ const std::string& samHeader,
+ const BamTools::RefVector& referenceSequences,
+ bool writeUncompressed = false);
+ // saves the alignment to the alignment archive
+ void SaveAlignment(const BamTools::BamAlignment& al);
+
+ // private implementation
+ // NOTE(review): this is a raw pointer and the class declares no copy
+ // constructor or assignment operator, so a copied BamWriter would share
+ // the same pointer - treat instances as non-copyable
+ private:
+ Internal::BamWriterPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMWRITER_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter_p.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter_p.cpp
new file mode 100755
index 0000000..bc3beb0
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter_p.cpp
@@ -0,0 +1,379 @@
+// ***************************************************************************
+// BamWriter_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include <BamAlignment.h>
+#include <BamWriter_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+using namespace std;
+
+// ctor - caches the result of SystemIsBigEndian() for later byte-swap decisions
+BamWriterPrivate::BamWriterPrivate(void)
+    : IsBigEndian(SystemIsBigEndian())
+{ }
+
+// dtor - closes the BGZF stream via Close()
+BamWriterPrivate::~BamWriterPrivate(void) {
+    Close();
+}
+
+// closes the alignment archive by closing the underlying BGZF stream
+void BamWriterPrivate::Close(void) {
+    BgzfData& stream = mBGZF;
+    stream.Close();
+}
+
+// calculates minimum bin for a BAM alignment interval
+// @end is decremented before use; the first level (finest first) at which
+// @begin and the decremented @end agree after shifting determines the bin
+const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
+
+    // right-shift applied at each level and the bin number each level starts at
+    static const int levelShift[]  = {   14,  17, 20, 23, 26 };
+    static const int levelOffset[] = { 4681, 585, 73,  9,  1 };
+    static const int numLevels     = 5;
+
+    const int last = end - 1;
+    for ( int level = 0; level < numLevels; ++level ) {
+        const int shift = levelShift[level];
+        if ( (begin >> shift) == (last >> shift) )
+            return levelOffset[level] + (begin >> shift);
+    }
+
+    // no level matched: fall back to bin 0
+    return 0;
+}
+
+// creates a packed cigar string from the supplied cigar operations
+// @cigarOperations - operations to pack, in order
+// @packedCigar     - receives BT_SIZEOF_INT bytes per operation, each holding
+//                    (Length << BAM_CIGAR_SHIFT | op code) in host byte order
+// an unknown operation type prints an error and terminates the process
+void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
+
+    // initialize
+    const unsigned int numCigarOperations = cigarOperations.size();
+    packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
+
+    // pack the cigar data into the string
+    // (copied in through the string's own writable storage: data() returns a
+    //  pointer to const, and a char buffer need not be aligned for unsigned int)
+    unsigned int packedOffset = 0;
+
+    unsigned int cigarOp;
+    vector<CigarOp>::const_iterator coIter;
+    for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
+
+        switch(coIter->Type) {
+            case 'M':
+                cigarOp = BAM_CMATCH;
+                break;
+            case 'I':
+                cigarOp = BAM_CINS;
+                break;
+            case 'D':
+                cigarOp = BAM_CDEL;
+                break;
+            case 'N':
+                cigarOp = BAM_CREF_SKIP;
+                break;
+            case 'S':
+                cigarOp = BAM_CSOFT_CLIP;
+                break;
+            case 'H':
+                cigarOp = BAM_CHARD_CLIP;
+                break;
+            case 'P':
+                cigarOp = BAM_CPAD;
+                break;
+            default:
+                fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
+                exit(1);
+        }
+
+        const unsigned int packedOp = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
+        memcpy(&packedCigar[packedOffset], &packedOp, BT_SIZEOF_INT);
+        packedOffset += BT_SIZEOF_INT;
+    }
+}
+
+// encodes the supplied query sequence into 4-bit notation
+// @query        - bases to encode; only {=, A, C, G, T, N} are accepted
+// @encodedQuery - receives two bases per byte (first base in the high nibble);
+//                 for an odd number of bases the last low nibble is left zero
+// any other character (including an embedded NUL) prints an error and
+// terminates the process
+void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
+
+    // prepare the encoded query string: one byte per pair of bases, rounded up
+    const unsigned int queryLen = query.size();
+    const unsigned int encodedQueryLen = (queryLen / 2) + (queryLen % 2);
+    encodedQuery.resize(encodedQueryLen);
+
+    // walk exactly size() characters rather than scanning for a terminating
+    // NUL, so the encoded length always agrees with the query length
+    for ( unsigned int i = 0; i < queryLen; ++i ) {
+
+        const char base = query[i];
+        unsigned char nucleotideCode;
+
+        switch(base) {
+
+            case '=':
+                nucleotideCode = 0;
+                break;
+
+            case 'A':
+                nucleotideCode = 1;
+                break;
+
+            case 'C':
+                nucleotideCode = 2;
+                break;
+
+            case 'G':
+                nucleotideCode = 4;
+                break;
+
+            case 'T':
+                nucleotideCode = 8;
+                break;
+
+            case 'N':
+                nucleotideCode = 15;
+                break;
+
+            default:
+                fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", base);
+                exit(1);
+        }
+
+        // pack the nucleotide code: even positions fill the high nibble
+        // (overwriting the byte), odd positions OR into the low nibble
+        if ( (i % 2) == 0 )
+            encodedQuery[i / 2] = nucleotideCode << 4;
+        else
+            encodedQuery[i / 2] |= nucleotideCode;
+    }
+}
+
+// opens the alignment archive and writes the BAM header
+// @filename            - file to open for writing
+// @samHeader           - SAM header text (may be empty)
+// @referenceSequences  - entries of the sequence dictionary (name + length)
+// @isWriteUncompressed - passed through to the BGZF stream's Open()
+// returns false if the BGZF stream could not be opened, true otherwise
+bool BamWriterPrivate::Open(const string& filename,
+                            const string& samHeader,
+                            const RefVector& referenceSequences,
+                            bool isWriteUncompressed)
+{
+    // open the BGZF file for writing, return failure if error
+    if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
+        return false;
+
+    // ================
+    // write the header
+    // ================
+
+    // write the BAM signature
+    const unsigned char SIGNATURE_LENGTH = 4;
+    const char* BAM_SIGNATURE = "BAM\1";
+    mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
+
+    // write the SAM header text length
+    // (only the copy that goes to the file is byte-swapped; the native value
+    //  is still needed as the number of bytes to write below)
+    const uint32_t samHeaderLen = samHeader.size();
+    uint32_t samHeaderLenField = samHeaderLen;
+    if (IsBigEndian) SwapEndian_32(samHeaderLenField);
+    mBGZF.Write((char*)&samHeaderLenField, BT_SIZEOF_INT);
+
+    // write the SAM header text
+    if(samHeaderLen > 0)
+        mBGZF.Write(samHeader.data(), samHeaderLen);
+
+    // write the number of reference sequences
+    uint32_t numReferenceSequences = referenceSequences.size();
+    if (IsBigEndian) SwapEndian_32(numReferenceSequences);
+    mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
+
+    // =============================
+    // write the sequence dictionary
+    // =============================
+
+    RefVector::const_iterator rsIter;
+    for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); ++rsIter) {
+
+        // write the reference sequence name length (includes the null terminator)
+        const uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
+        uint32_t referenceSequenceNameLenField = referenceSequenceNameLen;
+        if (IsBigEndian) SwapEndian_32(referenceSequenceNameLenField);
+        mBGZF.Write((char*)&referenceSequenceNameLenField, BT_SIZEOF_INT);
+
+        // write the reference sequence name, using the native (unswapped) length
+        mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
+
+        // write the reference sequence length
+        int32_t referenceLength = rsIter->RefLength;
+        if (IsBigEndian) SwapEndian_32(referenceLength);
+        mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
+    }
+
+    // return success
+    return true;
+}
+
+// saves the alignment to the alignment archive
+//
+// writes one alignment record: the block size, the 8-word BAM core, then the
+// character data (name, packed cigar, encoded bases, qualities, tags).
+// Multi-byte values are byte-swapped when IsBigEndian is set.
+// @al is only read; the quality and tag conversions work on local copies.
+void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
+
+    // if BamAlignment contains only the core data and a raw char data buffer
+    // (as a result of BamReader::GetNextAlignmentCore())
+    if ( al.SupportData.HasCoreOnly ) {
+
+        // write the block size
+        unsigned int blockSize = al.SupportData.BlockLength;
+        if (IsBigEndian) SwapEndian_32(blockSize);
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+        // assign the BAM core data
+        uint32_t buffer[8];
+        buffer[0] = al.RefID;
+        buffer[1] = al.Position;
+        buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
+        buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
+        buffer[4] = al.SupportData.QuerySequenceLength;
+        buffer[5] = al.MateRefID;
+        buffer[6] = al.MatePosition;
+        buffer[7] = al.InsertSize;
+
+        // swap BAM core endian-ness, if necessary
+        if ( IsBigEndian ) {
+            for ( int i = 0; i < 8; ++i )
+                SwapEndian_32(buffer[i]);
+        }
+
+        // write the BAM core
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+        // write the raw char data
+        mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
+    }
+
+    // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
+    // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
+    else {
+
+        // calculate char lengths
+        const unsigned int nameLength         = al.Name.size() + 1;
+        const unsigned int numCigarOperations = al.CigarData.size();
+        const unsigned int queryLength        = al.QueryBases.size();
+        const unsigned int tagDataLength      = al.TagData.size();
+
+        // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
+        // force calculation of Bin before storing
+        const int endPosition = al.GetEndPosition();
+        const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
+
+        // create our packed cigar string
+        string packedCigar;
+        CreatePackedCigar(al.CigarData, packedCigar);
+        const unsigned int packedCigarLength = packedCigar.size();
+
+        // encode the query
+        string encodedQuery;
+        EncodeQuerySequence(al.QueryBases, encodedQuery);
+        const unsigned int encodedQueryLength = encodedQuery.size();
+
+        // write the block size
+        const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
+        unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
+        if (IsBigEndian) SwapEndian_32(blockSize);
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+        // assign the BAM core data
+        uint32_t buffer[8];
+        buffer[0] = al.RefID;
+        buffer[1] = al.Position;
+        buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
+        buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+        buffer[4] = queryLength;
+        buffer[5] = al.MateRefID;
+        buffer[6] = al.MatePosition;
+        buffer[7] = al.InsertSize;
+
+        // swap BAM core endian-ness, if necessary
+        if ( IsBigEndian ) {
+            for ( int i = 0; i < 8; ++i )
+                SwapEndian_32(buffer[i]);
+        }
+
+        // write the BAM core
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+        // write the query name
+        mBGZF.Write(al.Name.c_str(), nameLength);
+
+        // write the packed cigar
+        if ( IsBigEndian ) {
+
+            // swap a copy, one whole packed operation (BT_SIZEOF_INT bytes) at a
+            // time; stepping byte-by-byte would re-swap overlapping words and
+            // run past the end of the buffer
+            string cigarData(packedCigar);
+            for ( unsigned int i = 0; i + BT_SIZEOF_INT <= packedCigarLength; i += BT_SIZEOF_INT )
+                SwapEndian_32p(&cigarData[i]);
+
+            mBGZF.Write(cigarData.data(), packedCigarLength);
+        }
+        else
+            mBGZF.Write(packedCigar.data(), packedCigarLength);
+
+        // write the encoded query sequence
+        mBGZF.Write(encodedQuery.data(), encodedQueryLength);
+
+        // write the base qualities
+        // the -33 conversion is applied to a private copy so the caller's
+        // alignment keeps its qualities unchanged
+        string baseQualities(al.Qualities);
+        const unsigned int numQualities = ( baseQualities.size() < queryLength
+                                            ? (unsigned int)baseQualities.size()
+                                            : queryLength );
+        for ( unsigned int i = 0; i < numQualities; ++i )
+            baseQualities[i] -= 33;
+        // exactly queryLength bytes were promised in the block size; any
+        // qualities that were not supplied are written as 0xFF
+        baseQualities.resize(queryLength, (char)0xFF);
+        mBGZF.Write(baseQualities.data(), queryLength);
+
+        // write the tag data
+        if ( IsBigEndian ) {
+
+            // swap multi-byte tag values in a private copy
+            string tagData(al.TagData);
+
+            unsigned int i = 0;
+            while ( i < tagDataLength ) {
+
+                // each tag needs a 2-byte name plus a 1-byte value type
+                if ( tagDataLength - i < 3 ) break;
+
+                // lower & upper case letters have same meaning
+                const uint8_t type = toupper((unsigned char)tagData[i + 2]);
+                i += 3; // skip tag name (e.g. "RG", "NM", etc) and value type
+
+                // width in bytes of a fixed-size value
+                unsigned int width = 0;
+                switch (type) {
+
+                    case('A') :
+                    case('C') :
+                        width = 1;
+                        break;
+
+                    case('S') :
+                        width = 2; // sizeof(uint16_t)
+                        break;
+
+                    case('F') :
+                    case('I') :
+                        width = 4; // sizeof(uint32_t)
+                        break;
+
+                    case('D') :
+                        width = 8; // sizeof(uint64_t)
+                        break;
+
+                    case('H') :
+                    case('Z') :
+                        // null-terminated text: nothing to swap, just skip it
+                        while ( i < tagDataLength && tagData[i] ) { ++i; }
+                        ++i; // increment one more for null terminator
+                        continue;
+
+                    default :
+                        fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+                        exit(1);
+                }
+
+                // stop at a value that is cut short instead of swapping past the end
+                if ( tagDataLength - i < width ) break;
+
+                if      ( width == 2 ) SwapEndian_16p(&tagData[i]);
+                else if ( width == 4 ) SwapEndian_32p(&tagData[i]);
+                else if ( width == 8 ) SwapEndian_64p(&tagData[i]);
+                i += width;
+            }
+
+            mBGZF.Write(tagData.data(), tagDataLength);
+        }
+        else
+            mBGZF.Write(al.TagData.data(), tagDataLength);
+    }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter_p.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter_p.h
new file mode 100755
index 0000000..f738da7
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/BamWriter_p.h
@@ -0,0 +1,63 @@
+// ***************************************************************************
+// BamWriter_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#ifndef BAMWRITER_P_H
+#define BAMWRITER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <BamAux.h>
+#include <BGZF.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+// Private implementation behind BamWriter: owns the BGZF output stream and
+// performs the actual encoding and writing of the BAM header and alignments.
+class BamWriterPrivate {
+
+ // ctor & dtor
+ public:
+ BamWriterPrivate(void);
+ ~BamWriterPrivate(void);
+
+ // "public" interface to BamWriter
+ public:
+ // closes the alignment archive
+ void Close(void);
+ // opens the alignment archive and writes the BAM header, returns true on success
+ bool Open(const std::string& filename,
+ const std::string& samHeader,
+ const BamTools::RefVector& referenceSequences,
+ bool isWriteUncompressed);
+ // saves the alignment to the alignment archive
+ void SaveAlignment(const BamAlignment& al);
+
+ // internal methods
+ public:
+ // calculates minimum bin for a BAM alignment interval
+ const unsigned int CalculateMinimumBin(const int begin, int end) const;
+ // packs @cigarOperations into @packedCigar
+ void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations, std::string& packedCigar);
+ // encodes @query into 4-bit notation, stored in @encodedQuery
+ void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);
+
+ // data members
+ public:
+ // BGZF stream that all output is written to
+ BgzfData mBGZF;
+ // set from SystemIsBigEndian() in the ctor; when true, multi-byte values are byte-swapped before writing
+ bool IsBigEndian;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMWRITER_P_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/Makevars.in b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/Makevars.in
new file mode 100755
index 0000000..1cf255a
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/Makevars.in
@@ -0,0 +1,4 @@
+PKG_LIBS=@LIBS@ -lz
+PKG_CFLAGS=-I./ -D_FASTMAP -DMAQ_LONGREADS @HAVE_LIBBZ2@
+PKG_CXXFLAGS=-I./ -D_FASTMAP -DMAQ_LONGREADS @HAVE_LIBBZ2@
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/api_global.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/api_global.h
new file mode 100755
index 0000000..24f72f2
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/api_global.h
@@ -0,0 +1,22 @@
+// ***************************************************************************
+// api_global.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides macros for exporting & importing BamTools API library symbols
+// ***************************************************************************
+
+#ifndef API_GLOBAL_H
+#define API_GLOBAL_H
+
+#include "bamtools_global.h"
+
+#ifdef BAMTOOLS_API_LIBRARY
+# define API_EXPORT BAMTOOLS_LIBRARY_EXPORT
+#else
+# define API_EXPORT BAMTOOLS_LIBRARY_IMPORT
+#endif
+
+#endif // API_GLOBAL_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bamread.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bamread.cpp
new file mode 100755
index 0000000..ff79c41
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bamread.cpp
@@ -0,0 +1,222 @@
+#include "pc.h"
+#include "config.h"
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <strstream>
+#include <algorithm>
+#include <string>
+#include <functional>
+#include <utility>
+#include <ext/hash_map>
+#include <boost/tokenizer.hpp>
+
+#include "BamAlignment.h"
+#include "BamAux.h" /* RefVector/RefData */
+#include "BamReader.h"
+
+
+extern "C" {
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+// Comparator that orders two ints by absolute value; the sign is ignored.
+class lessAbsoluteValue {
+public:
+ bool operator()(int lhs, int rhs) const {
+ const int lhsMagnitude=abs(lhs);
+ return lhsMagnitude < abs(rhs);
+ }
+};
+
+
+
+
+
+//#define DEBUG 1
+
+extern "C" {
+
+
+ // Reads the primary, mapped alignments of a BAM file and returns them to R grouped by reference sequence.
+ SEXP read_bam(SEXP filename,SEXP read_tag_names_R) {
+ // filename: path of the BAM file; read_tag_names_R: integer flag, non-zero to also return the read names
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+ const char* fname=CHAR(asChar(filename));
+ int read_names=*(INTEGER(read_tag_names_R)); // added to vector lengths below, so it is expected to be 0 or 1
+#ifdef DEBUG
+ Rprintf("fname=%s\n",fname);
+#endif
+
+ // main data vectors: one inner vector per reference sequence, all indexed by cind
+ // chr - pos
+ vector< vector<int> > pos; // signed positions; negative values mark reverse-strand reads
+ vector< vector<int> > posnm; // number of mismatches
+ vector< vector<string> > tagnames; // read names; only filled when read_names is set
+
+ // chromosome map: reference name -> index into cnames/pos/posnm/tagnames
+ hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+ vector<string> cnames;
+
+ // NOTE(review): the tokenizer typedef and the two separators below are not used in this function
+ typedef boost::tokenizer<boost::char_separator<char> > tokType;
+ boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+ boost::char_separator<char> sep2(",");
+
+ BamTools::BamReader bamf;
+ // on open failure only a message is printed; the code below then builds and returns an empty list
+ if (!bamf.Open(fname)) {
+ cout << "ERROR: failed to open BAM file '" << fname << "'" << endl;
+ } else {
+
+ Rprintf("opened %s\n",fname);
+ BamTools::RefVector refs = bamf.GetReferenceData();
+ BamTools::BamAlignment al;
+
+ int fcount=0;
+ while (bamf.GetNextAlignment(al)) {
+ if (!al.IsMapped() || !al.IsPrimaryAlignment()) { // skip unmapped and non-primary alignments
+ continue;
+ }
+ // forward reads are stored at Position; reverse reads at -(Position+Length)
+ string tagname=al.Name; // NOTE(review): unused, al.Name is pushed directly further down
+ string chr=refs[al.RefID].RefName;
+ int fpos=(int) (al.Position + (al.IsReverseStrand() ? al.Length : 0));
+ if(al.IsReverseStrand()) { fpos=-1*fpos; }
+ // nm takes the edit distance when GetEditDistance() succeeds and stays 0 otherwise
+ uint32_t nms;
+ int nm=0;
+ if (al.GetEditDistance(nms)) {
+ nm=nms;
+ }
+
+
+ // determine the chromosome index
+ hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+ int cind=-1;
+ if(li==cind_map.end()) {
+ // register new chromosome
+ cind=cnames.size();
+ cnames.push_back(chr);
+ cind_map[chr]=cind;
+ // allocate new pos vector
+ pos.push_back(vector<int>());
+ posnm.push_back(vector<int>());
+ if(read_names) {
+ tagnames.push_back(vector<string>());
+ }
+#ifdef DEBUG
+ Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+ } else {
+ cind=li->second;
+ }
+ fcount++;
+ (pos[cind]).push_back(fpos);
+ (posnm[cind]).push_back(nm);
+ if(read_names) {
+ (tagnames[cind]).push_back(al.Name);
+ }
+#ifdef DEBUG
+ Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d",chr.c_str(),cind,fpos,nm);
+ if(fcount>30) {
+ break;
+ }
+#endif
+
+ }
+ bamf.Close();
+
+ Rprintf("done. read %d fragments\n",fcount);
+ }
+
+
+
+ // construct output structures
+ SEXP chnames;
+ int np=0; // number of protections
+ PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+ for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+ SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+ }
+ np++;
+
+ // sort
+ //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+ // sort(csi->begin(), csi->end(), lessAbsoluteValue());
+ //}
+ // ans: one element per reference, each a named list "t" (positions), "n" (mismatches) and optionally "s" (names)
+ SEXP ans;
+ PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+ vector<vector<int> >::const_iterator nsi;
+ vector<vector<string> >::const_iterator ssi;
+ for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+ nsi=posnm.begin()+(csi-pos.begin());
+ // NOTE(review): every iteration adds 4-5 protections that are only released at the end - confirm this fits R's protect stack for many references
+ SEXP dv,dnames_R;
+ PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+ SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+ SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+ if(read_names) {
+ SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+ }
+
+
+
+ SEXP tv,nv,sv;
+ PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
+ PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
+ if(read_names) {
+ PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
+ }
+ int* i_tv=INTEGER(tv);
+ int* i_nv=INTEGER(nv);
+ // copy positions and mismatch counts in parallel; both vectors were filled in lock-step above
+ int i=0;
+ vector<int>::const_iterator ini=nsi->begin();
+ for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+ i_tv[i]=*pi;
+ i_nv[i]=*ini++;
+ i++;
+ }
+ if(read_names) {
+ int i=0;
+ ssi=tagnames.begin()+(csi-pos.begin());
+ for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+ SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+ i++;
+ }
+ }
+ PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
+ SET_VECTOR_ELT(dv, 0, tv);
+ SET_VECTOR_ELT(dv, 1, nv);
+ if(read_names) {
+ SET_VECTOR_ELT(dv, 2, sv);
+ }
+ setAttrib(dv, R_NamesSymbol, dnames_R);
+
+ SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+ }
+ // name the top-level list by reference sequence
+ setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+ Rprintf("unprotecting %d elements\n",np);
+#endif
+
+ UNPROTECT(np);
+ return(ans);
+}
+
+
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bamtools_global.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bamtools_global.h
new file mode 100755
index 0000000..be7e034
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bamtools_global.h
@@ -0,0 +1,32 @@
+// ***************************************************************************
+// bamtools_global.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic definitions for exporting & importing library symbols
+// ***************************************************************************
+
+#ifndef BAMTOOLS_GLOBAL_H
+#define BAMTOOLS_GLOBAL_H
+
+// BAMTOOLS_LIBRARY_EXPORT: dllexport when WIN32 is defined, default symbol visibility otherwise (unless already defined)
+#ifndef BAMTOOLS_LIBRARY_EXPORT
+# if defined(WIN32)
+# define BAMTOOLS_LIBRARY_EXPORT __declspec(dllexport)
+# else
+# define BAMTOOLS_LIBRARY_EXPORT __attribute__((visibility("default")))
+# endif
+#endif // BAMTOOLS_LIBRARY_EXPORT
+
+// BAMTOOLS_LIBRARY_IMPORT: dllimport when WIN32 is defined, empty otherwise (unless already defined)
+#ifndef BAMTOOLS_LIBRARY_IMPORT
+# if defined(WIN32)
+# define BAMTOOLS_LIBRARY_IMPORT __declspec(dllimport)
+# else
+# define BAMTOOLS_LIBRARY_IMPORT
+# endif
+#endif // BAMTOOLS_LIBRARY_IMPORT
+
+#endif // BAMTOOLS_GLOBAL_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bed2vector.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bed2vector.cpp
new file mode 100755
index 0000000..8380d33
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/bed2vector.cpp
@@ -0,0 +1,2628 @@
+#include "pc.h"
+#include "config.h"
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <strstream>
+#include <algorithm>
+#include <string>
+#include <functional>
+#include <utility>
+#include <ext/hash_map>
+#include <boost/tokenizer.hpp>
+
+#ifdef HAVE_LIBBZ2
+#include <bzlib.h>
+#endif
+
+extern "C" {
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+// Comparator that orders two ints by absolute value; the sign is ignored.
+class lessAbsoluteValue {
+public:
+ bool operator()(int lhs, int rhs) const {
+ const int lhsMagnitude=abs(lhs);
+ return lhsMagnitude < abs(rhs);
+ }
+};
+
+
+
+#ifdef HAVE_LIBBZ2
+int get_bzline(BZFILE* b,string& line) { // appends bytes from b to line up to (not including) '\n'; returns the last bzerror
+ char c;
+ int nBuf; // receives the byte count of each read; never inspected
+ int bzerror=BZ_OK;
+ // one byte per call: BZ_OK is returned when a newline was consumed, any other code ends the loop
+ while(bzerror == BZ_OK) {
+ nBuf=BZ2_bzRead(&bzerror, b, &c, 1);
+ if(bzerror==BZ_OK) {
+ if(c=='\n') {
+ return bzerror;
+ } else {
+ line+=c;
+ }
+ } // NOTE(review): a byte delivered together with a non-BZ_OK status (e.g. BZ_STREAM_END) is dropped here - confirm against bzlib
+ }
+ return bzerror;
+}
+
+int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) {
+ line="";
+ if(bz2file) {
+ int bzerror=get_bzline(b,line);
+ if(bzerror==BZ_OK) {
+ return(1);
+ } else {
+ if(bzerror!=BZ_STREAM_END) {
+ cerr<<"encountered BZERROR="<<bzerror<<endl;
+ }
+ return(0);
+ }
+ } else {
+ char *cline=NULL;
+ size_t n;
+ if(getline(&cline,&n,f) != -1) {
+ if(cline) {
+ cline[strlen(cline)-1]='\0';
+ line+=cline;
+ free(cline);
+ }
+ return(1);
+ } else {
+ return(0);
+ }
+ }
+}
+#endif
+
+
+/**
+ * Read .bed data into a list of per-chromosome vectors holding the 5' positions, with the sign
+ * of each position corresponding to the strand.
+ */
+
+//#define DEBUG 1
+
+extern "C" {
+SEXP read_bed_ends(SEXP filename) {
+ // filename: path of the BED file. Returns an R list named by chromosome; each element is an integer vector of signed positions.
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+ const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+ Rprintf("fname=%s\n",fname);
+#endif
+
+ // main data vector
+ // chr - pos
+ vector< vector<int> > pos;
+
+ // chromosome map
+ hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+ vector<string> cnames;
+
+ typedef boost::tokenizer<boost::char_separator<char> > tokType;
+ boost::char_separator<char> sep(" \t");
+
+
+ ifstream bed_file(fname);
+
+#ifdef DEBUG
+ Rprintf("opened %s\n",fname);
+#endif
+ // NOTE(review): the stream state is never checked, so this message is printed even if the open failed
+ Rprintf("opened %s\n",fname);
+
+ // read in bed line
+ string line;
+
+ int fcount=0;
+ while(getline(bed_file,line)) {
+
+#ifdef DEBUG
+ Rprintf("line: %s\n",line.c_str());
+#endif
+ // NOTE(review): only the first and the fourth token are checked against tok.end(); lines with fewer than 3, or with 4-5, columns step the iterator past the end
+ // columns: chr, start, end, then optionally two skipped fields and the strand; "-" stores -end, anything else stores start
+ tokType tok(line, sep);
+ tokType::iterator sit=tok.begin();
+ if(sit!=tok.end()) {
+ string chr=*sit++; //chr=chr.substr(3,strlen(chr.c_str()));
+ string str_start=*sit++;
+ int fstart=atoi(str_start.c_str());
+ string str_end=*sit++;
+ int fend=atoi(str_end.c_str());
+ int fpos=fstart;
+ if(sit!=tok.end()) {
+ string u0=*sit++;
+ string nfield=*sit++;
+ string strand=*sit++;
+ if(strand=="-") {
+ fpos=-1*fend;
+ }
+ }
+
+ // determine the chromosome index
+ hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+ int cind=-1;
+ if(li==cind_map.end()) {
+ // register new chromosome
+ cind=cnames.size();
+ cnames.push_back(chr);
+ cind_map[chr]=cind;
+ // allocate new pos vector
+ pos.push_back(vector<int>());
+#ifdef DEBUG
+ Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+ } else {
+ cind=li->second;
+ }
+ fcount++;
+ (pos[cind]).push_back(fpos);
+#ifdef DEBUG
+ Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos);
+ if(fcount>30) {
+ break;
+ }
+#endif
+
+ }
+ }
+ bed_file.close();
+
+
+#ifdef DEBUG
+ Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+ Rprintf("done. read %d fragments\n",fcount);
+
+ // construct output structures
+ SEXP chnames;
+ int np=0; // number of protections
+ PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+ for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+ SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+ }
+ np++;
+
+ // sort each chromosome's positions by absolute value, so both strands interleave by coordinate
+ for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+ sort(csi->begin(), csi->end(), lessAbsoluteValue());
+ }
+ // NOTE(review): one protection per chromosome is held until the final UNPROTECT - confirm this fits R's protect stack for many chromosomes
+ SEXP ans;
+ PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+ for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+ SEXP nv;
+ PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
+ int* i_nv=INTEGER(nv);
+ int i=0;
+ for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+ i_nv[i++]=*pi;
+ }
+ SET_VECTOR_ELT(ans, csi-pos.begin(), nv);
+ }
+ // name the list elements by chromosome
+ setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+ Rprintf("unprotecting %d elements\n",np);
+#endif
+
+ UNPROTECT(np);
+ return(ans);
+}
+
+
+// Reads a whitespace-separated alignment file and returns, per chromosome, positions ("t"), mismatch counts ("n") and lengths ("l").
+SEXP read_meland_old(SEXP filename) {
+
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+ const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+ Rprintf("fname=%s\n",fname);
+#endif
+
+ // main data vector
+ // chr - pos
+ vector< vector<int> > pos;
+ vector< vector<int> > posnm; // number of mismatches
+ vector< vector<int> > poslen; // length
+
+ // chromosome map
+ hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+ vector<string> cnames;
+
+
+ typedef boost::tokenizer<boost::char_separator<char> > tokType;
+ boost::char_separator<char> sep(" \t");
+
+
+ ifstream bed_file(fname);
+ // NOTE(review): the stream state is never checked, so this message is printed even if the open failed
+ Rprintf("opened %s\n",fname);
+
+ // read in bed line
+ string line;
+
+ int fcount=0;
+ while(getline(bed_file,line)) {
+
+#ifdef DEBUG
+ Rprintf("line: %s\n",line.c_str());
+#endif
+ // tokens used: 3rd = match code ("U<n>", n mismatches), 7th = length, 8th = chromosome, 9th = position
+ // NOTE(review): the iterator is advanced without end checks, so shorter lines step past tok.end()
+ tokType tok(line, sep);
+ tokType::iterator sit=tok.begin();
+ if(sit!=tok.end()) {
+ sit++; sit++;
+ string str_nm=*sit++;
+ int nm=0;
+ if(str_nm[0]=='U') {
+ nm=atoi((str_nm.c_str()+1));
+ } else { // records whose match code does not start with 'U' are skipped
+ continue;
+ }
+ sit++; sit++; sit++;
+ string str_len=*sit++;
+ int len=atoi(str_len.c_str());
+ string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); // drops the first three characters of the name; substr throws if it is shorter than 3
+ string str_pos=*sit++;
+ int fpos=atoi(str_pos.c_str());
+
+ // determine the chromosome index
+ hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+ int cind=-1;
+ if(li==cind_map.end()) {
+ // register new chromosome
+ cind=cnames.size();
+ cnames.push_back(chr);
+ cind_map[chr]=cind;
+ // allocate new pos vector
+ pos.push_back(vector<int>());
+ posnm.push_back(vector<int>());
+ poslen.push_back(vector<int>());
+#ifdef DEBUG
+ Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+ } else {
+ cind=li->second;
+ }
+ fcount++;
+ (pos[cind]).push_back(fpos);
+ (posnm[cind]).push_back(nm);
+ (poslen[cind]).push_back(len);
+#ifdef DEBUG
+ Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+ if(fcount>30) {
+ break;
+ }
+#endif
+
+ }
+ }
+ bed_file.close();
+
+
+#ifdef DEBUG
+ Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+ Rprintf("done. read %d fragments\n",fcount);
+
+ // construct output structures
+ SEXP chnames;
+ int np=0; // number of protections
+ PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+ for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+ SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+ }
+ np++;
+
+ // sort
+ //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+ // sort(csi->begin(), csi->end(), lessAbsoluteValue());
+ //}
+ // ans: one element per chromosome, each a named list of three integer vectors "t", "n", "l"
+ SEXP ans;
+ PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+ vector<vector<int> >::const_iterator nsi,lsi;
+ for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+ nsi=posnm.begin()+(csi-pos.begin());
+ lsi=poslen.begin()+(csi-pos.begin());
+ // NOTE(review): five protections per chromosome are held until the final UNPROTECT - confirm this fits R's protect stack
+ SEXP dv,dnames_R;
+ PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
+ SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+ SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+ SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+
+
+
+ SEXP tv,nv,lv;
+ PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
+ PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
+ PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
+ int* i_tv=INTEGER(tv);
+ int* i_nv=INTEGER(nv);
+ int* i_lv=INTEGER(lv);
+ // the three source vectors were filled in lock-step, so they are walked in parallel
+ int i=0;
+ vector<int>::const_iterator ini=nsi->begin();
+ vector<int>::const_iterator ili=lsi->begin();
+ for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+ i_tv[i]=*pi;
+ i_nv[i]=*ini++;
+ i_lv[i]=*ili++;
+ i++;
+ }
+ PROTECT(dv = allocVector(VECSXP, 3)); np++;
+ SET_VECTOR_ELT(dv, 0, tv);
+ SET_VECTOR_ELT(dv, 1, nv);
+ SET_VECTOR_ELT(dv, 2, lv);
+ setAttrib(dv, R_NamesSymbol, dnames_R);
+
+ SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+ }
+ // name the top-level list by chromosome
+ setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+ Rprintf("unprotecting %d elements\n",np);
+#endif
+
+ UNPROTECT(np);
+ return(ans);
+}
+
+
+ int get_a_line(FILE *f,string& line) {
+ line="";
+ char cline[1024];
+ if(fgets(cline,1024,f)) {
+ line+=cline;
+ return(1);
+ } else {
+ return(0);
+ }
+ }
+
+ // Reads a whitespace-separated alignment file; returns per chromosome positions ("t"), mismatches ("n"), lengths ("l") and optionally read names ("s").
+ SEXP read_meland(SEXP filename,SEXP read_tag_names_R) {
+ // filename: path of the input file; read_tag_names_R: integer flag, non-zero to also return the read names
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+ const char* fname=CHAR(asChar(filename));
+ int read_names=*(INTEGER(read_tag_names_R)); // added to vector lengths below, so it is expected to be 0 or 1
+#ifdef DEBUG
+ Rprintf("fname=%s\n",fname);
+#endif
+
+ // main data vector
+ // chr - pos
+ vector< vector<int> > pos;
+ vector< vector<int> > posnm; // number of mismatches
+ vector< vector<int> > poslen; // length
+ vector< vector<string> > tagnames;
+
+ // chromosome map
+ hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+ vector<string> cnames;
+
+
+ typedef boost::tokenizer<boost::char_separator<char> > tokType;
+ boost::char_separator<char> sep(" \t");
+
+
+ FILE *f=fopen(fname,"rb");
+ if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+ // a failed open must not reach fgets/fclose with a NULL stream; every use of f below is guarded and an empty list is returned
+ if (f) { Rprintf("opened %s\n",fname); }
+
+
+ // read in bed line
+ string line;
+ int fcount=0;
+ while(f && get_a_line(f,line)) {
+
+#ifdef DEBUG
+ Rprintf("line: %s\n",line.c_str());
+#endif
+ // tokens used: 1st = read name, 3rd = match code ("U<n>", n mismatches), 7th = length, 8th = chromosome, 9th = position
+ // NOTE(review): the iterator is advanced without end checks, so shorter lines step past tok.end()
+ tokType tok(line, sep);
+ tokType::iterator sit=tok.begin();
+ if(sit!=tok.end()) {
+ string tagname=*sit++;
+ sit++;
+ string str_nm=*sit++;
+ int nm=0;
+ if(str_nm[0]=='U') {
+ nm=atoi((str_nm.c_str()+1));
+ } else { // records whose match code does not start with 'U' are skipped
+ continue;
+ }
+ sit++; sit++; sit++;
+ string str_len=*sit++;
+ int len=atoi(str_len.c_str());
+ string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); // drops the first three characters of the name; substr throws if it is shorter than 3
+ string str_pos=*sit++;
+ int fpos=atoi(str_pos.c_str());
+
+ // determine the chromosome index
+ hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+ int cind=-1;
+ if(li==cind_map.end()) {
+ // register new chromosome
+ cind=cnames.size();
+ cnames.push_back(chr);
+ cind_map[chr]=cind;
+ // allocate new pos vector
+ pos.push_back(vector<int>());
+ posnm.push_back(vector<int>());
+ poslen.push_back(vector<int>());
+ if(read_names) {
+ tagnames.push_back(vector<string>());
+ }
+#ifdef DEBUG
+ Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+ } else {
+ cind=li->second;
+ }
+ fcount++;
+ (pos[cind]).push_back(fpos);
+ (posnm[cind]).push_back(nm);
+ (poslen[cind]).push_back(len);
+ if(read_names) {
+ (tagnames[cind]).push_back(tagname);
+ }
+#ifdef DEBUG
+ Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+ if(fcount>30) {
+ break;
+ }
+#endif
+
+ }
+ }
+ if(f) { fclose(f); }
+
+
+#ifdef DEBUG
+ Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+ Rprintf("done. read %d fragments\n",fcount);
+
+ // construct output structures
+ SEXP chnames;
+ int np=0; // number of protections
+ PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+ for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+ SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+ }
+ np++;
+
+ // sort
+ //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+ // sort(csi->begin(), csi->end(), lessAbsoluteValue());
+ //}
+ // ans: one element per chromosome, each a named list "t", "n", "l" and optionally "s"
+ SEXP ans;
+ PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+ vector<vector<int> >::const_iterator nsi,lsi;
+ vector<vector<string> >::const_iterator ssi;
+ for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+ nsi=posnm.begin()+(csi-pos.begin());
+ lsi=poslen.begin()+(csi-pos.begin());
+ // NOTE(review): five to six protections per chromosome are held until the final UNPROTECT - confirm this fits R's protect stack
+ SEXP dv,dnames_R;
+ PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
+ SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+ SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+ SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+ if(read_names) {
+ SET_STRING_ELT(dnames_R, 3, mkChar("s"));
+ }
+
+
+
+ SEXP tv,nv,lv,sv;
+ PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
+ PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
+ PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
+ if(read_names) {
+ PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
+ }
+ int* i_tv=INTEGER(tv);
+ int* i_nv=INTEGER(nv);
+ int* i_lv=INTEGER(lv);
+ // the source vectors were filled in lock-step, so they are walked in parallel
+ int i=0;
+ vector<int>::const_iterator ini=nsi->begin();
+ vector<int>::const_iterator ili=lsi->begin();
+ for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+ i_tv[i]=*pi;
+ i_nv[i]=*ini++;
+ i_lv[i]=*ili++;
+ i++;
+ }
+ if(read_names) {
+ int i=0;
+ ssi=tagnames.begin()+(csi-pos.begin());
+ for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+ SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+ i++;
+ }
+ }
+ PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++;
+ SET_VECTOR_ELT(dv, 0, tv);
+ SET_VECTOR_ELT(dv, 1, nv);
+ SET_VECTOR_ELT(dv, 2, lv);
+ if(read_names) {
+ SET_VECTOR_ELT(dv, 3, sv);
+ }
+ setAttrib(dv, R_NamesSymbol, dnames_R);
+
+ SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+ }
+ // name the top-level list by chromosome
+ setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+ Rprintf("unprotecting %d elements\n",np);
+#endif
+
+ UNPROTECT(np);
+ return(ans);
+}
+
+
+
+// Reads a regular eland file; returns per chromosome signed positions ("t") and the first ("f") and second ("s") mismatch positions.
+SEXP read_eland_mismatches(SEXP filename) {
+
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+ const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+ Rprintf("fname=%s\n",fname);
+#endif
+
+ // main data vector
+ // chr - pos
+ vector< vector<int> > pos;
+ vector< vector<int> > mm1; // position of the first mismatch (or 0 for none)
+ vector< vector<int> > mm2; // position of the second mismatch
+
+ // chromosome map
+ hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+ vector<string> cnames;
+
+
+ typedef boost::tokenizer<boost::char_separator<char> > tokType;
+ boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+
+ FILE *f=fopen(fname,"rb");
+ if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+ // a failed open must not reach fgets/fclose with a NULL stream; every use of f below is guarded and an empty list is returned
+ if (f) { Rprintf("opened %s\n",fname); }
+
+ // read in one eland record per line
+ string line;
+ int fcount=0;
+ while(f && get_a_line(f,line)) {
+
+#ifdef DEBUG
+ Rprintf("line: %s\n",line.c_str());
+#endif
+ // tab-separated fields used: 2 = sequence, 3 = match code ("U<n>"), 7 = chromosome, 8 = position, 9 = strand, 11/12 = mismatch descriptors
+ // NOTE(review): fields up to the 10th are consumed without end checks, so shorter lines step past tok.end()
+ tokType tok(line, sep);
+ tokType::iterator sit=tok.begin();
+ if(sit!=tok.end()) {
+ sit++;
+ string seq=*sit++;
+ string str_nm=*sit++;
+ int nm=0;
+ if(str_nm[0]=='U') {
+ nm=atoi((str_nm.c_str()+1));
+ } else { // records whose match code does not start with 'U' are skipped
+ continue;
+ }
+ sit++; sit++; sit++;
+ string chr=*sit++;
+ // extract chromosome name from this
+ int chrp=chr.find("chr");
+ int pp=chr.find('.');
+ chr=chr.substr(chrp+3,pp-chrp-3); // keeps the text between "chr" and the first '.'; NOTE(review): behaviour when either is missing should be confirmed
+
+ string str_pos=*sit++;
+ int fpos=atoi(str_pos.c_str());
+
+
+ string strand=*sit++;
+ int nstrand=0;
+ if(strand=="R") { // reverse strand: anchor at the far end of the read and store negated
+ fpos=-1*(fpos+seq.size()-1);
+ nstrand=1;
+ }
+
+ sit++;
+ // each descriptor is a number followed by one trailing character; on the reverse strand the offset is mirrored
+ int nm1=0; int nm2=0;
+ if(sit!=tok.end()) {
+ string nms=*sit++;
+ nm1=atoi(nms.substr(0,nms.size()-1).c_str());
+ if(nstrand) { nm1=seq.size()-nm1+1; }
+ }
+ if(sit!=tok.end()) {
+ string nms=*sit++;
+ nm2=atoi(nms.substr(0,nms.size()-1).c_str());
+ if(nstrand) { nm2=seq.size()-nm2+1; }
+ }
+
+ // determine the chromosome index
+ hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+ int cind=-1;
+ if(li==cind_map.end()) {
+ // register new chromosome
+ cind=cnames.size();
+ cnames.push_back(chr);
+ cind_map[chr]=cind;
+ // allocate new pos vector
+ pos.push_back(vector<int>());
+ mm1.push_back(vector<int>());
+ mm2.push_back(vector<int>());
+#ifdef DEBUG
+ Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+ } else {
+ cind=li->second;
+ }
+ fcount++;
+ (pos[cind]).push_back(fpos);
+ (mm1[cind]).push_back(nm1);
+ (mm2[cind]).push_back(nm2);
+#ifdef DEBUG
+ Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2);
+ if(fcount>30) {
+ break;
+ }
+#endif
+
+ }
+ }
+ if(f) { fclose(f); }
+
+
+#ifdef DEBUG
+ Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+ Rprintf("done. read %d fragments\n",fcount);
+
+ // construct output structures
+ SEXP chnames;
+ int np=0; // number of protections
+ PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+ for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+ SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+ }
+ np++;
+
+ // sort
+ //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+ // sort(csi->begin(), csi->end(), lessAbsoluteValue());
+ //}
+ // ans: one element per chromosome, each a named list of three integer vectors "t", "f", "s"
+ SEXP ans;
+ PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+ vector<vector<int> >::const_iterator nsi,lsi;
+ for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+ nsi=mm1.begin()+(csi-pos.begin());
+ lsi=mm2.begin()+(csi-pos.begin());
+ // NOTE(review): five protections per chromosome are held until the final UNPROTECT - confirm this fits R's protect stack
+ SEXP dv,dnames_R;
+ PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
+ SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+ SET_STRING_ELT(dnames_R, 1, mkChar("f"));
+ SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+
+
+
+ SEXP tv,nv,lv;
+ PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
+ PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
+ PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
+ int* i_tv=INTEGER(tv);
+ int* i_nv=INTEGER(nv);
+ int* i_lv=INTEGER(lv);
+ // the three source vectors were filled in lock-step, so they are walked in parallel
+ int i=0;
+ vector<int>::const_iterator ini=nsi->begin();
+ vector<int>::const_iterator ili=lsi->begin();
+ for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+ i_tv[i]=*pi;
+ i_nv[i]=*ini++;
+ i_lv[i]=*ili++;
+ i++;
+ }
+ PROTECT(dv = allocVector(VECSXP, 3)); np++;
+ SET_VECTOR_ELT(dv, 0, tv);
+ SET_VECTOR_ELT(dv, 1, nv);
+ SET_VECTOR_ELT(dv, 2, lv);
+ setAttrib(dv, R_NamesSymbol, dnames_R);
+
+ SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+ }
+ // name the top-level list by chromosome
+ setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+ Rprintf("unprotecting %d elements\n",np);
+#endif
+
+ UNPROTECT(np);
+ return(ans);
+}
+
+
+ // Reads a regular eland file, moving negative-strand coordinates to the far end of the read; returns per chromosome "t", "n" and optionally "s".
+ SEXP read_eland(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
+ // read_tag_names_R: integer flag, non-zero to return read names; eland_tag_length_R: if >0, caps the read length used for the strand adjustment
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+ const char* fname=CHAR(asChar(filename));
+ int read_names=*(INTEGER(read_tag_names_R)); // added to vector lengths below, so it is expected to be 0 or 1
+ int eland_tag_length=*(INTEGER(eland_tag_length_R));
+#ifdef DEBUG
+ Rprintf("fname=%s\n",fname);
+#endif
+
+ // main data vector
+ // chr - pos
+ vector< vector<int> > pos;
+ vector< vector<int> > posnm; // number of mismatches
+ vector< vector<string> > tagnames;
+
+ // chromosome map
+ hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+ vector<string> cnames;
+
+
+ typedef boost::tokenizer<boost::char_separator<char> > tokType;
+ boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+ // on open failure only a message is printed; the else branch is skipped and an empty list is returned
+ FILE *f=fopen(fname,"rb");
+ if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+ else {
+ Rprintf("opened %s\n",fname);
+
+ // read in one eland record per line
+ string line;
+ int fcount=0;
+ while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+ Rprintf("line: %s\n",line.c_str());
+#endif
+ // tab-separated fields used: 1 = read name, 2 = sequence, 3 = match code ("U<n>"), 7 = chromosome, 8 = position, 9 = strand
+ // NOTE(review): the iterator is advanced without end checks, so shorter lines step past tok.end()
+ tokType tok(line, sep);
+ tokType::iterator sit=tok.begin();
+ if(sit!=tok.end()) {
+ string tagname=*sit++;
+ string sequence=*sit++;
+ int len=sequence.size();
+ // adjust probe length if eland length limit was specified
+ if(eland_tag_length>0 && len>eland_tag_length) {
+ len=eland_tag_length;
+ }
+ string str_nm=*sit++;
+ int nm=0;
+ if(str_nm[0]=='U') {
+ nm=atoi((str_nm.c_str()+1));
+ } else { // records whose match code does not start with 'U' are skipped
+ continue;
+ }
+ sit++; sit++; sit++;
+ string chr=*sit++;
+ string str_pos=*sit++;
+ int fpos=atoi(str_pos.c_str());
+ string str_strand=*sit++;
+ // reverse strand: anchor at the last base of the (possibly capped) read and store negated
+ if(str_strand[0]=='R') {
+ fpos=-1*(fpos+len-1);
+ }
+
+ // determine the chromosome index
+ hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+ int cind=-1;
+ if(li==cind_map.end()) {
+ // register new chromosome
+ cind=cnames.size();
+ cnames.push_back(chr);
+ cind_map[chr]=cind;
+ // allocate new pos vector
+ pos.push_back(vector<int>());
+ posnm.push_back(vector<int>());
+ if(read_names) {
+ tagnames.push_back(vector<string>());
+ }
+#ifdef DEBUG
+ Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+ } else {
+ cind=li->second;
+ }
+ fcount++;
+ (pos[cind]).push_back(fpos);
+ (posnm[cind]).push_back(nm);
+ if(read_names) {
+ (tagnames[cind]).push_back(tagname);
+ }
+#ifdef DEBUG
+ Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+ if(fcount>30) {
+ break;
+ }
+#endif
+
+ }
+ }
+ fclose(f);
+
+ Rprintf("done. read %d fragments\n",fcount);
+ }
+ // construct output structures
+ SEXP chnames;
+ int np=0; // number of protections
+ PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+ for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+ SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+ }
+ np++;
+
+ // sort
+ //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+ // sort(csi->begin(), csi->end(), lessAbsoluteValue());
+ //}
+ // ans: one element per chromosome, each a named list "t" (positions), "n" (mismatches) and optionally "s" (names)
+ SEXP ans;
+ PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+ vector<vector<int> >::const_iterator nsi;
+ vector<vector<string> >::const_iterator ssi;
+ for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+ nsi=posnm.begin()+(csi-pos.begin());
+ // NOTE(review): four to five protections per chromosome are held until the final UNPROTECT - confirm this fits R's protect stack
+ SEXP dv,dnames_R;
+ PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+ SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+ SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+ if(read_names) {
+ SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+ }
+
+
+
+ SEXP tv,nv,sv;
+ PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
+ PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
+ if(read_names) {
+ PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
+ }
+ int* i_tv=INTEGER(tv);
+ int* i_nv=INTEGER(nv);
+ // positions and mismatch counts were filled in lock-step, so they are walked in parallel
+ int i=0;
+ vector<int>::const_iterator ini=nsi->begin();
+ for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+ i_tv[i]=*pi;
+ i_nv[i]=*ini++;
+ i++;
+ }
+ if(read_names) {
+ int i=0;
+ ssi=tagnames.begin()+(csi-pos.begin());
+ for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+ SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+ i++;
+ }
+ }
+ PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
+ SET_VECTOR_ELT(dv, 0, tv);
+ SET_VECTOR_ELT(dv, 1, nv);
+ if(read_names) {
+ SET_VECTOR_ELT(dv, 2, sv);
+ }
+ setAttrib(dv, R_NamesSymbol, dnames_R);
+
+ SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+ }
+ // name the top-level list by chromosome
+ setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+ Rprintf("unprotecting %d elements\n",np);
+#endif
+
+ UNPROTECT(np);
+ return(ans);
+}
+
+
+
+ // read in extended eland files, adjusting the negative strand coordinate by sequence length
+ SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
+
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+ const char* fname=CHAR(asChar(filename));
+ int read_names=*(INTEGER(read_tag_names_R));
+ int eland_tag_length=*(INTEGER(eland_tag_length_R));
+#ifdef DEBUG
+ Rprintf("fname=%s\n",fname);
+#endif
+
+ // main data vector
+ // chr - pos
+ vector< vector<int> > pos;
+ vector< vector<int> > posnm; // number of mismatches
+ vector< vector<string> > tagnames;
+
+ // chromosome map
+ hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+ vector<string> cnames;
+
+
+ typedef boost::tokenizer<boost::char_separator<char> > tokType;
+ boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+
+ FILE *f=fopen(fname,"rb");
+ if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+ else {
+ Rprintf("opened %s\n",fname);
+
+ // read in bed line
+ string line;
+ int fcount=0;
+ while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+ Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+ tokType tok(line, sep);
+ tokType::iterator sit=tok.begin();
+ if(sit!=tok.end()) {
+ string machinename=*sit++;
+ string runnumber=*sit++;
+ string lanenumber=*sit++;
+ *sit++;
+
+ string str_x=*sit++;
+ string str_y=*sit++;
+
+ string tagname=machinename+"."+runnumber+"."+lanenumber+"."+str_x+"."+str_y;
+
+
+
+ *sit++;
+ *sit++;
+
+
+ string sequence=*sit++;
+ *sit++;
+
+ string chr=*sit++;
+ string contig=*sit++;
+ chr=chr+contig;
+
+ int len=sequence.size();
+ // adjust probe length if eland length limit was specified
+ if(eland_tag_length>0 && len>eland_tag_length) {
+ len=eland_tag_length;
+ }
+
+
+
+ string str_pos=*sit++;
+ if(str_pos.size()<1) { continue; }
+ int fpos=atoi(str_pos.c_str());
+ string str_strand=*sit++;
+
+ if(str_strand[0]=='R') {
+ fpos=-1*(fpos+len-1);
+ }
+
+ string str_nm=*sit++;
+ // count non-digit characters
+ int nm=0;
+ for(int i=0;i<str_nm.size();i++) {
+ if(!isdigit(str_nm[i])) { nm++; }
+ }
+
+ // determine the chromosome index
+ hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+ int cind=-1;
+ if(li==cind_map.end()) {
+ // register new chromosome
+ cind=cnames.size();
+ cnames.push_back(chr);
+ cind_map[chr]=cind;
+ // allocate new pos vector
+ pos.push_back(vector<int>());
+ posnm.push_back(vector<int>());
+ if(read_names) {
+ tagnames.push_back(vector<string>());
+ }
+#ifdef DEBUG
+ Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+ } else {
+ cind=li->second;
+ }
+ fcount++;
+ (pos[cind]).push_back(fpos);
+ (posnm[cind]).push_back(nm);
+ if(read_names) {
+ (tagnames[cind]).push_back(tagname);
+ }
+#ifdef DEBUG
+ Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+ if(fcount>30) {
+ break;
+ }
+#endif
+
+ }
+ }
+ fclose(f);
+
+ Rprintf("done. read %d fragments\n",fcount);
+ }
+ // construct output structures
+ SEXP chnames;
+ int np=0; // number of protections
+ PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+ for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+ SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+ }
+ np++;
+
+ // sort
+ //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+ // sort(csi->begin(), csi->end(), lessAbsoluteValue());
+ //}
+
+ SEXP ans;
+ PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+ vector<vector<int> >::const_iterator nsi;
+ vector<vector<string> >::const_iterator ssi;
+ for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+ nsi=posnm.begin()+(csi-pos.begin());
+
+ SEXP dv,dnames_R;
+ PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+ SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+ SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+ if(read_names) {
+ SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+ }
+
+
+
+ SEXP tv,nv,sv;
+ PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
+ PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
+ if(read_names) {
+ PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
+ }
+ int* i_tv=INTEGER(tv);
+ int* i_nv=INTEGER(nv);
+
+ int i=0;
+ vector<int>::const_iterator ini=nsi->begin();
+ for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+ i_tv[i]=*pi;
+ i_nv[i]=*ini++;
+ i++;
+ }
+ if(read_names) {
+ int i=0;
+ ssi=tagnames.begin()+(csi-pos.begin());
+ for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+ SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+ i++;
+ }
+ }
+ PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
+ SET_VECTOR_ELT(dv, 0, tv);
+ SET_VECTOR_ELT(dv, 1, nv);
+ if(read_names) {
+ SET_VECTOR_ELT(dv, 2, sv);
+ }
+ setAttrib(dv, R_NamesSymbol, dnames_R);
+
+ SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+ }
+
+ setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+ Rprintf("unprotecting %d elements\n",np);
+#endif
+
+ UNPROTECT(np);
+ return(ans);
+}
+
+
+ // Read an ELAND "multi" alignment file, adjusting the negative strand coordinate by
+ // sequence length.
+ //
+ // Arguments (R objects):
+ //   filename            - path of the file to read
+ //   read_tag_names_R    - integer flag; when non-zero the read names are returned too.
+ //                         It is added to the element count of the output lists, so it
+ //                         is expected to be 0 or 1.
+ //   eland_tag_length_R  - integer; when >0, read lengths above this value are capped to
+ //                         it before the negative-strand adjustment
+ //
+ // Each line is split on spaces/tabs into: read name, sequence, match spec
+ // ("n0:n1:n2", or "NM"/"QC" which are skipped) and a comma-separated match list.
+ // A read is kept only when exactly one match exists at the best mismatch level (nm).
+ //
+ // Returns a list named by chromosome; each element is a list with
+ //   t - signed positions (negative values mark reverse-strand reads)
+ //   n - number of mismatches
+ //   s - read names (only when read names were requested)
+ // If the file cannot be opened a message is printed and an empty list is returned.
+ //
+ // NOTE(review): the first four fields of a line are dereferenced without checking the
+ // token iterator against end(); lines with fewer fields are not guarded against here.
+SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
+
+#ifdef DEBUG
+  Rprintf("read_eland_muti() : start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+  int eland_tag_length=*(INTEGER(eland_tag_length_R));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep(" \t","");
+  boost::char_separator<char> comsep(",","",boost::keep_empty_tokens);
+  boost::char_separator<char> colsep(":","",boost::keep_empty_tokens);
+
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+    Rprintf("opened %s\n",fname);
+
+    // read the file line by line
+    string line;
+    int nline=0;
+    int fcount=0;
+    while(get_a_line(f,line)) {
+      nline++;
+      // chomp
+      size_t elpos = line.find_last_not_of("\n");
+      if(elpos != string::npos) {
+        line = line.substr(0, elpos+1);
+      }
+#ifdef DEBUG
+      Rprintf("line %d: %s\n",nline,line.c_str());
+#endif
+
+      tokType tok(line, sep);
+      tokType::iterator sit=tok.begin();
+      if(sit!=tok.end()) {
+        string tagname=*sit++;
+        string sequence=*sit++;
+        string mspec=*sit++;
+        // parse out match spec
+
+        if(mspec=="NM" || mspec=="QC") { continue; }
+#ifdef DEBUG
+        Rprintf("parsing out spec \"%s\" : ",mspec.c_str());
+#endif
+
+        tokType stok(mspec, colsep);
+        tokType::iterator ssit=stok.begin();
+        string str_nm0=*ssit++;
+
+        // nm becomes the lowest mismatch level to look for; the read is rejected
+        // as soon as a level reports more than one hit
+        int nm=0;
+        int nm0=atoi(str_nm0.c_str());
+        if(nm0>1) {
+#ifdef DEBUG
+          Rprintf("rejected for nm0\n");
+#endif
+          continue;
+        }
+        if(nm0==0) {
+          string str_nm1=*ssit++;
+          int nm1=atoi(str_nm1.c_str());
+          if(nm1>1) {
+#ifdef DEBUG
+            Rprintf("rejected for nm1\n");
+#endif
+            continue;
+          }
+          if(nm1==0) {
+            string str_nm2=*ssit++;
+            int nm2=atoi(str_nm2.c_str());
+            if(nm2>1) {
+#ifdef DEBUG
+              Rprintf("rejected for nm2\n");
+#endif
+              continue;
+            }
+            nm=2;
+          } else {
+            nm=1;
+          }
+        }
+
+#ifdef DEBUG
+        Rprintf("accepted (nm=%d)\n",nm);
+#endif
+        int npos=0;
+        string mpos=*sit++;
+        vector<string> mposc;
+        vector<int> mposp;
+        tokType ptok(mpos, comsep);
+        string prevchr; // chromosome of the most recent match that named one
+        for(tokType::iterator psit=ptok.begin();psit!=ptok.end();psit++) {
+          string cpos=*psit;
+          npos++;
+          int strand=1;
+          if(cpos.size()<5) {
+            Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str());
+          }
+          if(cpos.size()<2) {
+            // no room for the strand and mismatch characters; indexing them would throw
+            Rprintf("skipping\n");
+            continue;
+          }
+
+          // remember the chromosome of every match, even those at another mismatch
+          // level, so that later matches without a "chr:" prefix can inherit it
+          string chr;
+          size_t colpos=cpos.find(":");
+          if(colpos!=string::npos) {
+            chr=cpos.substr(0,colpos);
+            prevchr=chr;
+          }
+
+          // last character is the mismatch count of this match; a non-digit counts as 0
+          char lc=cpos.at(cpos.size()-1);
+          int matchnm=isdigit(static_cast<unsigned char>(lc)) ? (lc-'0') : 0;
+
+          if(matchnm==nm) {
+            switch(cpos.at(cpos.size()-2)) {
+            case 'R': strand=-1; break;
+            case 'F': strand=1; break;
+            default:
+              Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2));
+              continue; // skip this match (continues the enclosing for loop)
+            }
+            string str_pos;
+            if(colpos==string::npos) {
+              if(npos>1) {
+                chr=prevchr;
+                str_pos=cpos.substr(0,cpos.size()-2);
+              } else {
+                Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str());
+                continue;
+              }
+            } else {
+              str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos);
+            }
+#ifdef DEBUG
+            Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand);
+#endif
+            int pos=strand*atoi(str_pos.c_str());
+            mposc.push_back(chr);
+            mposp.push_back(pos);
+          }
+        }
+
+        string chr;
+        int fpos;
+        if(mposc.size()!=1) {
+          if(mposc.size()==0) {
+            Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str());
+          } else {
+            Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in \"%s\"\n",nline,static_cast<int>(mposc.size()),nm,mpos.c_str());
+          }
+          continue;
+        } else {
+          chr=*mposc.begin();
+          fpos=*mposp.begin();
+        }
+
+        int len=sequence.size();
+        // adjust probe length if eland length limit was specified
+        if(eland_tag_length>0 && len>eland_tag_length) {
+          len=eland_tag_length;
+        }
+
+        // reverse-strand reads: shift the (negated) coordinate by the read length
+        if(fpos<0) {
+          fpos=-1*(-1*fpos+len-1);
+        }
+
+        // determine the chromosome index
+        hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+        int cind=-1;
+        if(li==cind_map.end()) {
+          // register new chromosome
+          cind=cnames.size();
+          cnames.push_back(chr);
+          cind_map[chr]=cind;
+          // allocate new pos vector
+          pos.push_back(vector<int>());
+          posnm.push_back(vector<int>());
+          if(read_names) {
+            tagnames.push_back(vector<string>());
+          }
+#ifdef DEBUG
+          Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,static_cast<int>(pos.size()));
+#endif
+        } else {
+          cind=li->second;
+        }
+        fcount++;
+        (pos[cind]).push_back(fpos);
+        (posnm[cind]).push_back(nm);
+        if(read_names) {
+          (tagnames[cind]).push_back(tagname);
+        }
+#ifdef DEBUG
+        Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+        if(fcount>30) {
+          break;
+        }
+#endif
+
+      }
+    }
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+  // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections held until return
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    int nlp=0; // protections made for this chromosome only
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+
+    SEXP tv,nv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size())); nlp++;
+    PROTECT(nv=allocVector(INTSXP,csi->size())); nlp++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size())); nlp++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+        SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+        i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 2+read_names)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+    // dv and everything attached to it is now reachable from the protected `ans`;
+    // release the per-chromosome entries so the protect stack does not grow with
+    // the number of chromosomes
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+ // Read a bowtie alignment file (plain text, or bzip2 when built with HAVE_LIBBZ2 and
+ // the file name contains ".bz2"), adjusting the negative strand coordinate by
+ // sequence length.
+ //
+ // Arguments (R objects):
+ //   filename          - path of the file to read
+ //   read_tag_names_R  - integer flag; when non-zero the read names are returned too.
+ //                       It is added to the element count of the output lists, so it
+ //                       is expected to be 0 or 1.
+ //
+ // Tab-separated columns used: 1 read name, 2 strand, 3 reference name, 4 position,
+ // 5 read sequence, 8 mismatch descriptors (comma-separated; empty means no mismatch).
+ //
+ // Returns a list named by chromosome; each element is a list with
+ //   t - signed positions (negative values mark '-' strand reads)
+ //   n - number of mismatches
+ //   s - read names (only when read names were requested)
+ // If the file cannot be opened a message is printed and an empty list is returned.
+ //
+ // NOTE(review): fields are dereferenced without checking the token iterator against
+ // end(); lines with fewer than 8 fields are not guarded against here.
+ SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
+  } else {
+#ifdef HAVE_LIBBZ2
+    BZFILE* b=NULL; // stays NULL unless a .bz2 stream is opened below
+    int bzerror;
+
+    int bz2file=0;
+    if(strstr(fname,".bz2")) {
+      bz2file=1;
+      b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+      if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
+    }
+#endif
+
+    Rprintf("opened %s\n",fname);
+
+    // read the file line by line
+    string line;
+    int fcount=0;
+#ifdef HAVE_LIBBZ2
+    while(get_a_line(f,b,bz2file,line)) {
+#else
+    while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+      tokType tok(line, sep);
+      tokType::iterator sit=tok.begin();
+      if(sit!=tok.end()) {
+        string tagname=*sit++;
+        string str_strand=*sit++;
+        string chr=*sit++;
+
+        string str_pos=*sit++;
+        int fpos=atoi(str_pos.c_str());
+
+        string sequence=*sit++;
+        sit++; sit++;
+        string mm=*sit++;
+
+        int len=sequence.size();
+        if(str_strand[0]=='-') {
+          fpos=-1*(fpos+len-1);
+        }
+        // determine number of mismatches: one per comma-separated descriptor
+        int nm=0;
+        if(mm.size()>0) {
+          nm++;
+          string::size_type tp(0);
+          while(tp!=string::npos) {
+            tp = mm.find(",",tp);
+            if(tp!=string::npos) {
+              tp++;
+              ++nm;
+            }
+          }
+        }
+
+
+
+        // determine the chromosome index
+        hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+        int cind=-1;
+        if(li==cind_map.end()) {
+          // register new chromosome
+          cind=cnames.size();
+          cnames.push_back(chr);
+          cind_map[chr]=cind;
+          // allocate new pos vector
+          pos.push_back(vector<int>());
+          posnm.push_back(vector<int>());
+          if(read_names) {
+            tagnames.push_back(vector<string>());
+          }
+#ifdef DEBUG
+          Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,static_cast<int>(pos.size()));
+#endif
+        } else {
+          cind=li->second;
+        }
+        fcount++;
+        (pos[cind]).push_back(fpos);
+        (posnm[cind]).push_back(nm);
+        if(read_names) {
+          (tagnames[cind]).push_back(tagname);
+        }
+#ifdef DEBUG
+        Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+        if(fcount>30) {
+          break;
+        }
+#endif
+
+      }
+    }
+
+#ifdef HAVE_LIBBZ2
+    // only close the bzip2 handle when one was actually opened
+    if(bz2file) { BZ2_bzReadClose( &bzerror, b); }
+#endif
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+  // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections held until return
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    int nlp=0; // protections made for this chromosome only
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+
+    SEXP tv,nv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size())); nlp++;
+    PROTECT(nv=allocVector(INTSXP,csi->size())); nlp++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size())); nlp++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+        SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+        i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 2+read_names)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+    // dv and everything attached to it is now reachable from the protected `ans`;
+    // release the per-chromosome entries so the protect stack does not grow with
+    // the number of chromosomes
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+ // Read Helicos tab-separated alignment output (plain text, or bzip2 when built with
+ // HAVE_LIBBZ2 and the file name contains ".bz2").
+ //
+ // Arguments (R objects):
+ //   filename          - path of the file to read
+ //   read_tag_names_R  - integer flag; when non-zero the read names are returned too.
+ //                       It is added to the element count of the output lists, so it
+ //                       is expected to be 0 or 1.
+ //
+ // Lines starting with '#' and the header line starting with "Reference_ID" are
+ // skipped. Tab-separated columns used: 1 reference name, 2 read name, 3 start,
+ // 4 end, 5/6 tag start/end (their difference is the match length), 9 deletions,
+ // 10 insertions, 11 substitutions, 12 strand.
+ //
+ // Returns a list named by chromosome; each element is a list with
+ //   t - signed positions (start for '+' reads, negated end for '-' reads)
+ //   n - number of mismatches (deletions + insertions + substitutions)
+ //   l - match length
+ //   s - read names (only when read names were requested)
+ // If the file cannot be opened a message is printed and an empty list is returned.
+ //
+ // NOTE(review): fields are dereferenced without checking the token iterator against
+ // end(); lines with fewer than 12 fields are not guarded against here.
+ SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<int> > poslen; // length of the match
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
+  } else {
+#ifdef HAVE_LIBBZ2
+    BZFILE* b=NULL; // stays NULL unless a .bz2 stream is opened below
+    int bzerror;
+
+    int bz2file=0;
+    if(strstr(fname,".bz2")) {
+      bz2file=1;
+      b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+      if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
+    }
+#endif
+
+    Rprintf("opened %s\n",fname);
+
+    // read the file line by line
+    string line;
+    int fcount=0;
+    int nlines=0;
+#ifdef HAVE_LIBBZ2
+    while(get_a_line(f,b,bz2file,line)) {
+#else
+    while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+      nlines++;
+      // skip comments
+      if(line[0]=='#') { continue; }
+      if(line.compare(0,12,"Reference_ID")==0) {
+#ifdef DEBUG
+        Rprintf("matched header on line %d\n",nlines);
+#endif
+        continue;
+      }
+
+      tokType tok(line, sep);
+      tokType::iterator sit=tok.begin();
+      if(sit!=tok.end()) {
+        string chr=*sit++;
+        string tagname=*sit++;
+        string str_startpos=*sit++;
+        string str_endpos=*sit++;
+
+        string str_tstart=*sit++;
+        string str_tend=*sit++;
+        int len=atoi(str_tend.c_str())-atoi(str_tstart.c_str());
+
+        sit++; sit++;
+        string str_ndel=*sit++;
+        string str_nins=*sit++;
+        string str_nsub=*sit++;
+
+        string str_strand=*sit++;
+        int fpos;
+        if(str_strand[0]=='-') {
+          fpos=-1*atoi(str_endpos.c_str());
+        } else {
+          fpos=atoi(str_startpos.c_str());
+        }
+
+        // determine number of mismatches
+        int nm=atoi(str_ndel.c_str())+atoi(str_nins.c_str())+atoi(str_nsub.c_str());
+
+        // determine the chromosome index
+        hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+        int cind=-1;
+        if(li==cind_map.end()) {
+          // register new chromosome
+          cind=cnames.size();
+          cnames.push_back(chr);
+          cind_map[chr]=cind;
+          // allocate new pos vector
+          pos.push_back(vector<int>());
+          posnm.push_back(vector<int>());
+          poslen.push_back(vector<int>());
+          if(read_names) {
+            tagnames.push_back(vector<string>());
+          }
+#ifdef DEBUG
+          Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,static_cast<int>(pos.size()));
+#endif
+        } else {
+          cind=li->second;
+        }
+        fcount++;
+        (pos[cind]).push_back(fpos);
+        (posnm[cind]).push_back(nm);
+        (poslen[cind]).push_back(len);
+        if(read_names) {
+          (tagnames[cind]).push_back(tagname);
+        }
+#ifdef DEBUG
+        Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm);
+        if(fcount>30) {
+          break;
+        }
+#endif
+
+      }
+    }
+
+#ifdef HAVE_LIBBZ2
+    // only close the bzip2 handle when one was actually opened
+    if(bz2file) { BZ2_bzReadClose( &bzerror, b); }
+#endif
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+  // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections held until return
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<int> >::const_iterator lsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    lsi=poslen.begin()+(csi-pos.begin());
+    int nlp=0; // protections made for this chromosome only
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 3, mkChar("s"));
+    }
+
+    SEXP tv,nv,lv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size())); nlp++;
+    PROTECT(nv=allocVector(INTSXP,csi->size())); nlp++;
+    PROTECT(lv=allocVector(INTSXP,csi->size())); nlp++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size())); nlp++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    int* i_lv=INTEGER(lv);
+
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    vector<int>::const_iterator lni=lsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i_lv[i]=*lni++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+        SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+        i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 3+read_names)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    SET_VECTOR_ELT(dv, 2, lv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 3, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+    // dv and everything attached to it is now reachable from the protected `ans`;
+    // release the per-chromosome entries so the protect stack does not grow with
+    // the number of chromosomes
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+ // Read the text version of a maq map file, adjusting the negative strand coordinate
+ // by the read length.
+ //
+ // Arguments (R objects):
+ //   filename          - path of the file to read
+ //   read_tag_names_R  - integer flag; when non-zero the read names are returned too.
+ //                       It is added to the element count of the output lists, so it
+ //                       is expected to be 0 or 1.
+ //
+ // Tab-separated columns used: 1 read name, 2 reference name, 3 position, 4 strand,
+ // 10 number of mismatches, 14 read length.
+ //
+ // Returns a list named by chromosome; each element is a list with
+ //   t - signed positions (negative values mark '-' strand reads)
+ //   n - number of mismatches
+ //   s - read names (only when read names were requested)
+ // If the file cannot be opened a message is printed and an empty list is returned.
+ //
+ // NOTE(review): fields are dereferenced without checking the token iterator against
+ // end(); lines with fewer than 14 fields are not guarded against here.
+ SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+    Rprintf("opened %s\n",fname);
+
+    // read the file line by line
+    string line;
+    int fcount=0;
+    while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+      tokType tok(line, sep);
+      tokType::iterator sit=tok.begin();
+      if(sit!=tok.end()) {
+        string tagname=*sit++;
+        string chr=*sit++;
+        string str_pos=*sit++;
+        int fpos=atoi(str_pos.c_str());
+        string str_strand=*sit++;
+        sit++; sit++; sit++; sit++; sit++;
+        string str_nm=*sit++;
+        sit++; sit++; sit++;
+        string str_len=*sit++;
+        int nm=atoi(str_nm.c_str());
+        int len=atoi(str_len.c_str());
+
+        if(str_strand[0]=='-') {
+          fpos=-1*(fpos+len-1);
+        }
+
+        // determine the chromosome index
+        hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+        int cind=-1;
+        if(li==cind_map.end()) {
+          // register new chromosome
+          cind=cnames.size();
+          cnames.push_back(chr);
+          cind_map[chr]=cind;
+          // allocate new pos vector
+          pos.push_back(vector<int>());
+          posnm.push_back(vector<int>());
+          if(read_names) {
+            tagnames.push_back(vector<string>());
+          }
+#ifdef DEBUG
+          Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,static_cast<int>(pos.size()));
+#endif
+        } else {
+          cind=li->second;
+        }
+        fcount++;
+        (pos[cind]).push_back(fpos);
+        (posnm[cind]).push_back(nm);
+        if(read_names) {
+          (tagnames[cind]).push_back(tagname);
+        }
+#ifdef DEBUG
+        Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+        if(fcount>30) {
+          break;
+        }
+#endif
+
+      }
+    }
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+  // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections held until return
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    int nlp=0; // protections made for this chromosome only
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+
+    SEXP tv,nv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size())); nlp++;
+    PROTECT(nv=allocVector(INTSXP,csi->size())); nlp++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size())); nlp++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+        SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+        i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 2+read_names)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+    // dv and everything attached to it is now reachable from the protected `ans`;
+    // release the per-chromosome entries so the protect stack does not grow with
+    // the number of chromosomes
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+
+
+ // Read a tagAlign file.
+ //
+ // Arguments (R objects):
+ //   filename - path of the file to read
+ //
+ // Space/tab-separated columns used: 1 reference name, 2 start, 3 end, 5 score,
+ // 6 strand. The score column is stored in the "n" vector of the result.
+ //
+ // Returns a list named by chromosome; each element is a list with
+ //   t - signed positions (start for '+' reads, negated end otherwise)
+ //   n - value of the score column
+ // If the file cannot be opened a message is printed and an empty list is returned.
+ //
+ // NOTE(review): fields are dereferenced without checking the token iterator against
+ // end(); lines with fewer than 6 fields are not guarded against here.
+ SEXP read_tagalign(SEXP filename) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep(" \t");
+
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+    Rprintf("opened %s\n",fname);
+
+    // read the file line by line
+    string line;
+    int fcount=0;
+    while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+      tokType tok(line, sep);
+      tokType::iterator sit=tok.begin();
+      if(sit!=tok.end()) {
+        string chr=*sit++;
+        string str_spos=*sit++;
+        string str_epos=*sit++;
+        sit++;
+        string str_qual=*sit++;
+        string str_strand=*sit;
+
+        int fpos;
+        if(str_strand[0]=='+') {
+          fpos=atoi(str_spos.c_str());
+        } else {
+          fpos=-1*atoi(str_epos.c_str());
+        }
+        int nm=atoi(str_qual.c_str());
+
+        // determine the chromosome index
+        hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+        int cind=-1;
+        if(li==cind_map.end()) {
+          // register new chromosome
+          cind=cnames.size();
+          cnames.push_back(chr);
+          cind_map[chr]=cind;
+          // allocate new pos vector
+          pos.push_back(vector<int>());
+          posnm.push_back(vector<int>());
+#ifdef DEBUG
+          Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,static_cast<int>(pos.size()));
+#endif
+        } else {
+          cind=li->second;
+        }
+        fcount++;
+        (pos[cind]).push_back(fpos);
+        (posnm[cind]).push_back(nm);
+#ifdef DEBUG
+        Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
+        if(fcount>30) {
+          break;
+        }
+#endif
+
+      }
+    }
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+  // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections held until return
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  vector<vector<int> >::const_iterator nsi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    int nlp=0; // protections made for this chromosome only
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+
+
+    SEXP tv,nv;
+    PROTECT(tv=allocVector(INTSXP,csi->size())); nlp++;
+    PROTECT(nv=allocVector(INTSXP,csi->size())); nlp++;
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    PROTECT(dv = allocVector(VECSXP, 2)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+    // dv and everything attached to it is now reachable from the protected `ans`;
+    // release the per-chromosome entries so the protect stack does not grow with
+    // the number of chromosomes
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+
+ // Read an arachne alignment listing (plain text, or bzip2 when built with
+ // HAVE_LIBBZ2 and the file name contains ".bz2").
+ //
+ // Arguments (R objects):
+ //   filename - path of the file to read
+ //
+ // Space/tab-separated columns used: 1 reference name, 2 position (stored as-is,
+ // no strand adjustment is made here), 3 optional number of mismatches (0 when the
+ // column is absent).
+ //
+ // Returns a list named by chromosome; each element is a list with
+ //   t - positions
+ //   n - number of mismatches
+ // If the file cannot be opened a message is printed and an empty list is returned.
+ //
+ // NOTE(review): the position field is dereferenced without checking the token
+ // iterator against end(); single-field lines are not guarded against here.
+ SEXP read_arachne(SEXP filename) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep(" \t");
+
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+
+#ifdef HAVE_LIBBZ2
+    BZFILE* b=NULL; // stays NULL unless a .bz2 stream is opened below
+    int bzerror;
+
+    int bz2file=0;
+    if(strstr(fname,".bz2")) {
+      bz2file=1;
+      b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+      if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
+    }
+#endif
+
+
+    Rprintf("opened %s\n",fname);
+
+    // read the file line by line
+    string line;
+    int fcount=0;
+#ifdef HAVE_LIBBZ2
+    while(get_a_line(f,b,bz2file,line)) {
+#else
+    while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+      tokType tok(line, sep);
+      tokType::iterator sit=tok.begin();
+      if(sit!=tok.end()) {
+        string chr=*sit++;
+        string str_spos=*sit++;
+        // the mismatch column is optional
+        int nm=0;
+        if(sit!=tok.end()) {
+          string str_mm=*sit;
+          nm=atoi(str_mm.c_str());
+        }
+
+        int fpos=atoi(str_spos.c_str());
+
+
+        // determine the chromosome index
+        hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+        int cind=-1;
+        if(li==cind_map.end()) {
+          // register new chromosome
+          cind=cnames.size();
+          cnames.push_back(chr);
+          cind_map[chr]=cind;
+          // allocate new pos vector
+          pos.push_back(vector<int>());
+          posnm.push_back(vector<int>());
+#ifdef DEBUG
+          Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,static_cast<int>(pos.size()));
+#endif
+        } else {
+          cind=li->second;
+        }
+        fcount++;
+        (pos[cind]).push_back(fpos);
+        (posnm[cind]).push_back(nm);
+#ifdef DEBUG
+        Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
+        if(fcount>30) {
+          break;
+        }
+#endif
+
+      }
+    }
+#ifdef HAVE_LIBBZ2
+    // only close the bzip2 handle when one was actually opened
+    if(bz2file) { BZ2_bzReadClose( &bzerror, b); }
+#endif
+
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+  // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections held until return
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  vector<vector<int> >::const_iterator nsi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    int nlp=0; // protections made for this chromosome only
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+
+
+    SEXP tv,nv;
+    PROTECT(tv=allocVector(INTSXP,csi->size())); nlp++;
+    PROTECT(nv=allocVector(INTSXP,csi->size())); nlp++;
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    PROTECT(dv = allocVector(VECSXP, 2)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+    // dv and everything attached to it is now reachable from the protected `ans`;
+    // release the per-chromosome entries so the protect stack does not grow with
+    // the number of chromosomes
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+  // Reads an Arachne "long" alignment listing into per-chromosome tag vectors.
+  //
+  // filename - R character value; its first element is used as the path. When
+  //            compiled with HAVE_LIBBZ2, a path containing ".bz2" is read
+  //            through libbz2.
+  //
+  // Only lines whose first whitespace-separated field is "QUERY" are used.
+  // Fields, 0-based: 5 strand, 6 chromosome, 7 start, 8 end, 10 block count,
+  // followed by one (gap, length, mismatches) triple per block. A QUERY line
+  // with fewer fields than its block count requires is skipped.
+  //
+  // Returns a list named by chromosome (in order of first appearance); each
+  // element is list(t=, n=, l=) of equal-length integer vectors:
+  //   t - signed position: -end when the strand field starts with '1', else start
+  //   n - mismatch tally: sum over blocks of |gap|+mismatches, plus (nblocks-1)
+  //   l - summed block lengths
+  // If the file cannot be opened a message is printed and an empty list is returned.
+  SEXP read_arachne_long(SEXP filename) {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+    Rprintf("fname=%s\n",fname);
+#endif
+
+    // main data vectors, indexed by chromosome index
+    vector< vector<int> > pos;
+    vector< vector<int> > posnm; // number of mismatches
+    vector< vector<int> > poslen; // length of the match
+
+    // chromosome name -> index map, and names in index order
+    hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+    vector<string> cnames;
+
+    typedef boost::tokenizer<boost::char_separator<char> > tokType;
+    boost::char_separator<char> sep(" \t");
+
+    // index of the first per-block field on a QUERY line
+    const size_t first_block_field=11;
+
+    FILE *f=fopen(fname,"rb");
+    if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+    else {
+
+#ifdef HAVE_LIBBZ2
+      BZFILE* b=NULL;
+      int bzerror;
+
+      int bz2file=0;
+      if(strstr(fname,".bz2")) {
+        bz2file=1;
+        b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+        if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
+      }
+#endif
+
+      Rprintf("opened %s\n",fname);
+
+      // read in the alignment lines
+      string line;
+      int fcount=0;
+#ifdef HAVE_LIBBZ2
+      while(get_a_line(f,b,bz2file,line)) {
+#else
+      while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG
+        Rprintf("line: %s\n",line.c_str());
+#endif
+
+        tokType tok(line, sep);
+        // materialise the fields so that every access below is bounds-checked
+        const vector<string> fld(tok.begin(), tok.end());
+        if(fld.empty() || fld[0]!="QUERY") { continue; }
+        if(fld.size()<first_block_field) { continue; } // truncated record
+
+        const string& str_strand=fld[5];
+        const string& chr=fld[6];
+
+        int fpos;
+        if(str_strand[0]=='1') {
+          fpos=-1*atoi(fld[8].c_str());
+        } else {
+          fpos=atoi(fld[7].c_str());
+        }
+#ifdef DEBUG
+        Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos);
+#endif
+        int nblocks=atoi(fld[10].c_str());
+#ifdef DEBUG
+        Rprintf("nblocks=%d\n",nblocks);
+#endif
+        // each block needs three more fields; skip lines that announce more
+        // blocks than they actually carry
+        if(nblocks>0 && (fld.size()-first_block_field)/3 < (size_t) nblocks) { continue; }
+
+        // tally up the read length and the number of mismatches for all blocks
+        int len=0; int nm=0;
+        for(int i=0;i<nblocks;i++) {
+          const size_t base=first_block_field+3*((size_t) i);
+          int sgs=atoi(fld[base].c_str());
+          int slen=atoi(fld[base+1].c_str());
+          int snm=atoi(fld[base+2].c_str());
+#ifdef DEBUG
+          Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm);
+#endif
+          len+=slen;
+          nm+=abs(sgs)+snm;
+        }
+        nm+=nblocks-1;
+
+        // determine the chromosome index
+        hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+        int cind=-1;
+        if(li==cind_map.end()) {
+          // register new chromosome
+          cind=cnames.size();
+          cnames.push_back(chr);
+          cind_map[chr]=cind;
+          // allocate new per-chromosome vectors
+          pos.push_back(vector<int>());
+          posnm.push_back(vector<int>());
+          poslen.push_back(vector<int>());
+#ifdef DEBUG
+          Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int) pos.size());
+#endif
+        } else {
+          cind=li->second;
+        }
+        fcount++;
+        (pos[cind]).push_back(fpos);
+        (posnm[cind]).push_back(nm);
+        (poslen[cind]).push_back(len);
+#ifdef DEBUG
+        Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len);
+        if(fcount>30) {
+          break;
+        }
+#endif
+      }
+#ifdef HAVE_LIBBZ2
+      // only close a bz2 stream that was actually opened
+      if(bz2file && b) {
+        BZ2_bzReadClose( &bzerror, b);
+      }
+#endif
+
+      fclose(f);
+
+      Rprintf("done. read %d fragments\n",fcount);
+    }
+
+    // construct output structures
+    int np=0; // number of protections held until return
+    SEXP chnames;
+    PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+    for(size_t ci=0;ci<cnames.size();ci++) {
+      SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+    }
+
+    SEXP ans;
+    PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+    for(size_t ci=0;ci<pos.size();ci++) {
+      const vector<int>& cpos=pos[ci];
+      const vector<int>& cnm=posnm[ci];
+      const vector<int>& clen=poslen[ci];
+
+      SEXP dv,dnames_R;
+      PROTECT(dnames_R = allocVector(STRSXP, 3));
+      SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+      SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+      SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+
+      SEXP tv,nv,lv;
+      PROTECT(tv=allocVector(INTSXP,cpos.size()));
+      PROTECT(nv=allocVector(INTSXP,cpos.size()));
+      PROTECT(lv=allocVector(INTSXP,cpos.size()));
+      int* i_tv=INTEGER(tv);
+      int* i_nv=INTEGER(nv);
+      int* i_lv=INTEGER(lv);
+
+      // the three vectors are always pushed together, so they share a length
+      for(size_t i=0;i<cpos.size();i++) {
+        i_tv[i]=cpos[i];
+        i_nv[i]=cnm[i];
+        i_lv[i]=clen[i];
+      }
+      PROTECT(dv = allocVector(VECSXP, 3));
+      SET_VECTOR_ELT(dv, 0, tv);
+      SET_VECTOR_ELT(dv, 1, nv);
+      SET_VECTOR_ELT(dv, 2, lv);
+      setAttrib(dv, R_NamesSymbol, dnames_R);
+
+      SET_VECTOR_ELT(ans, ci, dv);
+      // dv and everything attached to it are now reachable from ans, so the
+      // five per-chromosome protections can be released right away; this keeps
+      // the protect stack from growing with the number of chromosomes
+      UNPROTECT(5);
+    }
+
+    setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+    Rprintf("unprotecting %d elements\n",np);
+#endif
+
+    UNPROTECT(np);
+    return(ans);
+}
+
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/cdensum.c b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/cdensum.c
new file mode 100755
index 0000000..fdf3138
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/cdensum.c
@@ -0,0 +1,144 @@
+#include <math.h>
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+
+
+#undef DEBUG 1
+
+// Accumulates Gaussian-weighted tag contributions into a stepped profile.
+// For every tag i the value tc[i]*exp(-0.5*beta^2) is ADDED to dout[j] for the
+// steps j around the tag, with beta=(j*step - offset)/bw; dout is only ever
+// incremented here, so the caller presumably passes it zero-initialised.
+// dout is npos-length output array.
+// n - number of positions in pos (and length of tc count array)
+// spos - starting position
+// pos - tag positions; tc - per-position tag counts (truncated to int below)
+// bw - bandwidth used in beta; dw - multiplier for the half-window (dw*bw*count)
+// npos - number of output steps; step - distance between output steps
+void cdensum(int *n, double *pos, double *tc, double *spos, int *bw,int *dw, int *npos, int *step,double *dout)
+{
+ int i,j;
+
+ // NOTE(review): epos is computed but not used anywhere below
+ double epos= *spos + ((double) *npos);
+ double dbw=(double) *bw;
+ for(i = 0; i< *n; i++) {
+ // size of the window to which the contributions should be added
+ // in: offset of the tag from spos, truncated to int
+ int in=(int) (pos[i]- *spos);
+ int ic=tc[i];
+ // half-window scales with the tag count at this position
+ int whs=(*dw)*(*bw)*ic;
+ // NOTE(review): both divisions below are int/int, so they truncate before
+ // floor()/ceil() see the value and those calls have no effect - confirm intended
+ int ws=(int) floor((in-whs)/(*step));
+ int we=(int) ceil((in+whs)/(*step));
+ // clamp the step range to the output array
+ if(ws<0) { ws=0; }
+ if(we>= *npos) { we= *npos -1; }
+
+ // we is exclusive here, so a clamped window stops at index npos-2
+ for(j=ws;j<we;j++) {
+ double beta=((double)(j*(*step)-in))/dbw;
+ dout[j]+=((double)ic)*exp(-0.5*beta*beta);
+ }
+ }
+}
+
+
+// Sliding-window tag counts.
+// Window i (i = 0 .. npos-1) has its left edge at spos - window_size/2 + i*window_step
+// (window_size/2 is an integer division) and spans window_size; dout[i] receives
+// the number of tags with left <= pos <= left + window_size.
+// n    - number of positions in the sorted tag array (positive only)
+// pos  - tag positions, ascending
+// spos - starting position
+// dout - npos-length output array of window tag counts
+void window_n_tags(int *n, double *pos, double *spos, int *window_size, int *window_step, int *npos, int *dout)
+{
+  const int ntags = *n;
+  int lo = 0;   /* index of the first tag not yet left of the window */
+  int hi = 0;   /* index one past the last tag inside the window */
+  int w;
+  double left = *spos - (*window_size)/2;
+
+  for (w = 0; w < *npos; w++) {
+    const double right = left + (*window_size);
+    /* take in tags up to the right edge, then drop those left of the window */
+    while (hi < ntags && pos[hi] <= right) { hi++; }
+    while (lo < ntags && pos[lo] < left) { lo++; }
+    /* tags counted in minus tags dropped */
+    dout[w] = hi - lo;
+    left += *window_step;
+  }
+}
+
+// Sliding-window tag counts, .Call flavour of window_n_tags.
+// windows are of a specified size, moved at a specified step
+// pos_R         - tag positions (positive, pre-shifted), read as doubles
+// spos_R        - starting position (double)
+// window_size_R, window_step_R, nsteps_R - integer scalars
+// returns an nsteps-length integer vector of window tag counts
+SEXP cwindow_n_tags(SEXP pos_R, SEXP spos_R, SEXP window_size_R, SEXP window_step_R, SEXP nsteps_R) {
+  double* tag_pos=REAL(pos_R);
+  int n_tags=LENGTH(pos_R);
+  int win_size=*INTEGER(window_size_R);
+  int win_step=*INTEGER(window_step_R);
+  int n_windows=*INTEGER(nsteps_R);
+  double start=*REAL(spos_R);
+  SEXP counts_R;
+
+  // allocate the return array and let the pointer-based routine fill it;
+  // it runs exactly the same window sweep
+  PROTECT(counts_R=allocVector(INTSXP,n_windows));
+  window_n_tags(&n_tags, tag_pos, &start, &win_size, &win_step, &n_windows, INTEGER(counts_R));
+  UNPROTECT(1);
+  return(counts_R);
+}
+
+// Tag counts in windows centred on given positions.
+// pos_R              - tag positions (doubles); the sweep assumes ascending order
+// ntags_R            - number of tags at each position (integers)
+// wpos_R             - window centre positions (doubles); swept in the given order
+// window_half_size_R - integer scalar, half-width of every window
+// returns an integer vector, one entry per window centre, with the summed tag
+// counts of positions p satisfying centre-half <= p <= centre+half
+SEXP cwindow_n_tags_around(SEXP pos_R, SEXP ntags_R, SEXP wpos_R, SEXP window_half_size_R) {
+  double* tag_pos=REAL(pos_R);
+  int* tag_cnt=INTEGER(ntags_R);
+  int n_tags=LENGTH(pos_R);
+  double* centre=REAL(wpos_R);
+  int n_win=LENGTH(wpos_R);
+  double half=(double) *INTEGER(window_half_size_R);
+  SEXP out_R;
+  int* out;
+  int w;
+  int first=0;   // first position not yet dropped from the window
+  int past=0;    // one past the last position added to the window
+  int total=0;   // running tag count inside the window
+
+  PROTECT(out_R=allocVector(INTSXP,n_win));
+  out=INTEGER(out_R);
+
+  for(w=0;w<n_win;w++) {
+    const double upper=centre[w]+half;
+    const double lower=centre[w]-half;
+    // pull in everything up to the right edge
+    for(;past<n_tags && tag_pos[past]<=upper;past++) {
+      total+=tag_cnt[past];
+    }
+    // release everything left of the left edge
+    for(;first<n_tags && tag_pos[first]<lower;first++) {
+      total-=tag_cnt[first];
+    }
+    out[w]=total;
+  }
+  UNPROTECT(1);
+  return(out_R);
+}
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/const.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/const.h
new file mode 100755
index 0000000..2a06313
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/const.h
@@ -0,0 +1,18 @@
+/* Shared fixed-width type aliases and lookup-table declarations. */
+#ifndef NST_CONST_H
+#define NST_CONST_H
+
+/* all 64 bits set */
+#define MAX_ULL 0xffffffffffffffffull
+
+/* Unsigned aliases named after their intended bit width.
+   NOTE(review): the widths are what the names promise; the actual sizes of
+   unsigned / unsigned short depend on the platform ABI. */
+typedef unsigned long long bit64_t;
+typedef unsigned bit32_t;
+typedef unsigned short bit16_t;
+typedef unsigned char bit8_t;
+
+/* Lookup tables defined in another translation unit (not visible here).
+   Presumably nucleotide code conversion tables ("nt4"/"nt16") - confirm
+   against the defining source before relying on their layout. */
+extern bit8_t nst_nt4_table[];
+extern bit8_t nst_nt16_table[];
+extern char *nst_nt4_rev_table;
+extern char *nst_nt16_rev_table;
+extern bit8_t nst_nt16_nt4_table[];
+extern int nst_nt16_count_table[];
+
+#endif
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqmap.c b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqmap.c
new file mode 100755
index 0000000..96b4fff
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqmap.c
@@ -0,0 +1,164 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <unistd.h>
+#include "const.h"
+#include "maqmap.h"
+
+/* Allocates a zero-initialised map header with format set to
+   MAQMAP_FORMAT_NEW. Returns NULL when the allocation fails.
+   Release the result with maq_delete_maqmap(). */
+maqmap_t *maq_new_maqmap()
+{
+    maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t));
+    if (mm == 0) return 0; /* out of memory: do not touch the header */
+    mm->format = MAQMAP_FORMAT_NEW;
+    return mm;
+}
+/* Frees a map header together with its reference names and mapped-read
+   array. Accepts NULL, and a header whose ref_name table was never
+   allocated (n_ref is ignored in that case). */
+void maq_delete_maqmap(maqmap_t *mm)
+{
+    int i;
+    if (mm == 0) return;
+    if (mm->ref_name != 0) {
+        for (i = 0; i < mm->n_ref; ++i)
+            free(mm->ref_name[i]);
+        free(mm->ref_name);
+    }
+    free(mm->mapped_reads);
+    free(mm);
+}
+/* Writes the map header to fp in this order: format (int), n_ref (int),
+   then for every reference its name length including the terminating NUL
+   (int) followed by the name bytes, and finally n_mapped_reads (bit64_t).
+   gzwrite results are not inspected. */
+void maqmap_write_header(gzFile fp, const maqmap_t *mm)
+{
+    int idx;
+    gzwrite(fp, &mm->format, sizeof(int));
+    gzwrite(fp, &mm->n_ref, sizeof(int));
+    for (idx = 0; idx != mm->n_ref; ++idx) {
+        const char *ref = mm->ref_name[idx];
+        int nbytes = strlen(ref) + 1; /* stored length includes the NUL */
+        gzwrite(fp, &nbytes, sizeof(int));
+        gzwrite(fp, ref, nbytes);
+    }
+    gzwrite(fp, &mm->n_mapped_reads, sizeof(bit64_t));
+}
+/* Reads exactly len bytes from fp into buf.
+   Returns 1 on success, 0 on a short read or a gzread error. */
+static int maqmap_read_exact(gzFile fp, void *buf, int len)
+{
+    return gzread(fp, buf, (unsigned)len) == len;
+}
+
+/* Reads a map header written by maqmap_write_header().
+   Exits with status 3 when an obsolete (positive) format id is found.
+   On a truncated or corrupt header a message is written to stderr and the
+   returned header holds only the reference names that were read completely
+   (n_ref is set to that number), so ref_name[0..n_ref-1] are always valid
+   NUL-terminated strings. Returns NULL only if the header itself cannot be
+   allocated. The caller releases the result with maq_delete_maqmap(). */
+maqmap_t *maqmap_read_header(gzFile fp)
+{
+    maqmap_t *mm;
+    int k, len, n_ref = 0;
+    mm = maq_new_maqmap();
+    if (mm == 0) return 0;
+    /* on a failed read the format keeps the value set by maq_new_maqmap();
+       the failure is then reported by the reads below */
+    gzread(fp, &mm->format, sizeof(int));
+    if (mm->format != MAQMAP_FORMAT_NEW) {
+        if (mm->format > 0) {
+            fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n");
+            exit(3);
+        }
+        assert(mm->format == MAQMAP_FORMAT_NEW);
+    }
+    if (!maqmap_read_exact(fp, &n_ref, sizeof(int)) || n_ref < 0) {
+        fprintf(stderr, "** Cannot read a valid number of reference sequences from the map header.\n");
+        n_ref = 0;
+    }
+    mm->ref_name = (char**)calloc(n_ref, sizeof(char*));
+    if (mm->ref_name == 0 && n_ref > 0) {
+        fprintf(stderr, "** Out of memory while reading the map header.\n");
+        n_ref = 0;
+    }
+    for (k = 0; k != n_ref; ++k) {
+        if (!maqmap_read_exact(fp, &len, sizeof(int)) || len <= 0) break;
+        mm->ref_name[k] = (char*)malloc(len * sizeof(char));
+        if (mm->ref_name[k] == 0) break;
+        if (!maqmap_read_exact(fp, mm->ref_name[k], len)) {
+            free(mm->ref_name[k]);
+            mm->ref_name[k] = 0;
+            break;
+        }
+        /* the stored length includes the NUL; enforce it for corrupt input */
+        mm->ref_name[k][len - 1] = '\0';
+    }
+    if (k != n_ref)
+        fprintf(stderr, "** Map header is truncated or corrupt: read %d of %d reference names.\n", k, n_ref);
+    /* expose only the names that are actually present */
+    mm->n_ref = k;
+    /* read number of mapped reads */
+    if (!maqmap_read_exact(fp, &mm->n_mapped_reads, sizeof(bit64_t)))
+        mm->n_mapped_reads = 0;
+    return mm;
+}
+
+/* mapvalidate */
+
+/* Reads the header and every record of a map stream, prints the number of
+   reference sequences, stops with a "[fatal error]" line on the first
+   truncated record, out-of-range seqid or out-of-range read size, warns
+   when the header's n_mapped_reads is non-zero and differs from the number
+   of records read, and finally prints the record count per reference. */
+static void mapvalidate_core(gzFile fpin)
+{
+    maqmap_t *m = maqmap_read_header(fpin);
+    maqmap1_t *m1, mm1;
+    bit64_t n = 0;
+    int i, l, n_ref;
+    bit64_t *cnt;
+    if (m == 0) return;
+    /* never size or index the counters with a negative count */
+    n_ref = m->n_ref < 0 ? 0 : m->n_ref;
+    m1 = &mm1;
+    cnt = (bit64_t*)calloc(n_ref, sizeof(bit64_t));
+    if (cnt == 0 && n_ref > 0) {
+        printf("[fatal error] out of memory.\n");
+        maq_delete_maqmap(m);
+        return;
+    }
+    printf("[message] number of reference sequences: %d\n", m->n_ref);
+    while ((l = maqmap_read1(fpin, m1)) != 0) {
+        if (l != sizeof(maqmap1_t)) {
+            printf("[fatal error] truncated map file.\n");
+            break;
+        }
+        ++n;
+        /* compare as unsigned: casting seqid to int would turn huge ids
+           negative and let them slip past the bound */
+        if (m1->seqid >= (bit32_t)n_ref) {
+            printf("[fatal error] maqmap1_t::seqid is invalid (%u >= %d).\n", m1->seqid, m->n_ref);
+            break;
+        }
+        ++cnt[m1->seqid];
+        if (m1->size >= MAX_READLEN - 1) {
+            printf("[faltal error] maqmap1_t::size is invalid (%d >= %d).\n", m1->size, MAX_READLEN - 1);
+            break;
+        }
+    }
+    if (m->n_mapped_reads != 0) {
+        if (m->n_mapped_reads != n) {
+            printf("[warning] maqmap1_t::n_mapped_reads is set, but not equals the real number (%llu != %llu).\n",
+                   m->n_mapped_reads, n);
+        }
+    }
+    for (i = 0; i < n_ref; ++i)
+        printf("[message] %s : %llu\n", m->ref_name[i], cnt[i]);
+    free(cnt);
+    maq_delete_maqmap(m);
+}
+
+/* mapview */
+
+/* Prints every record of a map stream to fpout as one tab-separated line:
+   name, reference name, 1-based position, strand, dist, flag, map_qual,
+   single-end quality, alt_qual, low 4 bits of info1, info2, c[0], c[1], size.
+   is_verbose adds the base string and the quality string (quality + 33);
+   is_mm adds the 8 bytes stored at seq[55..62] as one hexadecimal number.
+   Reading stops at end of file, on a read error or short record, and on a
+   record whose seqid has no reference name. */
+static void mapview_core(FILE *fpout, gzFile fpin, int is_verbose, int is_mm)
+{
+    bit32_t j, n_bases;
+    maqmap_t *m = maqmap_read_header(fpin);
+    maqmap1_t *m1, mm1;
+    if (m == 0) return;
+    m1 = &mm1;
+    /* gzread returns the byte count, 0 at end of file and -1 on error; only
+       a complete record may be printed (a bare truth test never ends on -1) */
+    while (maqmap_read1(fpin, m1) == (int)sizeof(maqmap1_t)) {
+        if (m->n_ref < 0 || m1->seqid >= (bit32_t)m->n_ref
+            || m->ref_name == 0 || m->ref_name[m1->seqid] == 0) {
+            fprintf(stderr, "[mapview_core] record refers to unknown reference id %u; stopping.\n", m1->seqid);
+            break;
+        }
+        /* the name field is fixed-size; make sure it is terminated */
+        m1->name[MAX_NAMELEN - 1] = '\0';
+        fprintf(fpout, "%s\t%s\t%d\t%c\t%d\t%u\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",
+                m1->name, m->ref_name[m1->seqid], (m1->pos>>1) + 1,
+                (m1->pos&1)? '-' : '+', m1->dist, m1->flag, m1->map_qual, (signed char)m1->seq[MAX_READLEN-1],
+                m1->alt_qual, m1->info1&0xf, m1->info2, m1->c[0], m1->c[1], m1->size);
+        if (is_verbose) {
+            /* size is an 8-bit field and may exceed the seq array in a
+               corrupt record; never read past seq */
+            n_bases = m1->size < MAX_READLEN ? m1->size : MAX_READLEN;
+            fputc('\t', fpout);
+            for (j = 0; j != n_bases; ++j) {
+                if (m1->seq[j] == 0) fputc('n', fpout);
+                else if ((m1->seq[j]&0x3f) < 27) fputc("acgt"[m1->seq[j]>>6&3], fpout);
+                else fputc("ACGT"[m1->seq[j]>>6&3], fpout);
+            }
+            fputc('\t', fpout);
+            for (j = 0; j != n_bases; ++j)
+                fputc((m1->seq[j]&0x3f) + 33, fpout);
+        }
+        if (is_mm) {
+            /* seq + 55 is not 8-byte aligned; copy instead of casting */
+            bit64_t mm_bits;
+            memcpy(&mm_bits, m1->seq + 55, sizeof(bit64_t));
+            fprintf(fpout, "\t%llx", mm_bits);
+        }
+        fputc('\n', fpout);
+    }
+    maq_delete_maqmap(m);
+}
+
+/* Command-line entry point: maq mapview [-bN] <in.map>
+   -b  print the fixed columns only (no sequence/quality strings)
+   -N  also print the 8-byte mismatch field in hexadecimal
+   "-" as input name reads from standard input.
+   Returns 0 on success, 1 on a usage error or when the input cannot be opened. */
+int ma_mapview(int argc, char *argv[])
+{
+    int c, is_verbose = 1, is_mm = 0;
+    gzFile fp;
+    while ((c = getopt(argc, argv, "bN")) >= 0) {
+        switch (c) {
+        case 'b': is_verbose = 0; break;
+        case 'N': is_mm = 1; break;
+        }
+    }
+    if (argc == optind) {
+        fprintf(stderr, "Usage: maq mapview [-bN] <in.map>\n");
+        return 1;
+    }
+    fp = (strcmp(argv[optind], "-") == 0)? gzdopen(STDIN_FILENO, "r") : gzopen(argv[optind], "r");
+    if (fp == 0) {
+        /* reading from a NULL stream would never reach end of file */
+        fprintf(stderr, "[ma_mapview] fail to open \"%s\".\n", argv[optind]);
+        return 1;
+    }
+    mapview_core(stdout, fp, is_verbose, is_mm);
+    gzclose(fp);
+    return 0;
+}
+
+/* Command-line entry point: maq mapvalidate <in.map>
+   The input name is argv[1]; "-" reads from standard input.
+   Returns 0 on success, 1 on a usage error or when the input cannot be opened. */
+int ma_mapvalidate(int argc, char *argv[])
+{
+    gzFile fp;
+    if (argc < 2) {
+        fprintf(stderr, "Usage: maq mapvalidate <in.map>\n");
+        return 1;
+    }
+    /* no options are parsed here, so test the same argument that is opened
+       (optind may have been moved by an earlier getopt run) */
+    fp = (strcmp(argv[1], "-") == 0)? gzdopen(STDIN_FILENO, "r") : gzopen(argv[1], "r");
+    if (fp == 0) {
+        fprintf(stderr, "[ma_mapvalidate] fail to open \"%s\".\n", argv[1]);
+        return 1;
+    }
+    mapvalidate_core(fp);
+    gzclose(fp);
+    return 0;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqmap.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqmap.h
new file mode 100755
index 0000000..9beba0c
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqmap.h
@@ -0,0 +1,70 @@
+/* Record and header layout of a maq .map alignment file, plus the header
+   I/O functions implemented in maqmap.c. */
+#ifndef MAQMAP_H_
+#define MAQMAP_H_
+
+/* Size of the per-read seq array; it is part of the on-disk record layout,
+   so reader and file must agree on MAQ_LONGREADS. */
+#ifdef MAQ_LONGREADS
+# define MAX_READLEN 128
+#else
+# define MAX_READLEN 64
+#endif
+
+/* size of the fixed read-name field in a record */
+#define MAX_NAMELEN 36
+/* values of maqmap_t::format */
+#define MAQMAP_FORMAT_OLD 0
+#define MAQMAP_FORMAT_NEW -1
+
+/* Bit values for maqmap1_t::flag ("status of the pair").
+   NOTE(review): meanings are inferred from the names only (pair orientation
+   FF/FR/RF/RR, paired, different chromosome, mate unmatched, Smith-Waterman)
+   - confirm against the maq documentation. */
+#define PAIRFLAG_FF 0x01
+#define PAIRFLAG_FR 0x02
+#define PAIRFLAG_RF 0x04
+#define PAIRFLAG_RR 0x08
+#define PAIRFLAG_PAIRED 0x10
+#define PAIRFLAG_DIFFCHR 0x20
+#define PAIRFLAG_NOMATCH 0x40
+#define PAIRFLAG_SW 0x80
+
+#include <string.h>
+#include <zlib.h>
+#include "const.h"
+
+/*
+ name: read name
+ size: the length of the read
+ seq: read sequence (see also below)
+ seq[MAX_READLEN-1]: single end mapping quality (equals to map_qual if not paired)
+ map_qual: the final mapping quality
+ alt_qual: the lower quality of the two ends (equals to map_qual if not paired)
+ flag: status of the pair
+ dist: offset of the mate (zero if not paired)
+ info1: mismatches in the 24bp (higher 4 bits) and mismatches (lower 4 bits)
+ info2: sum of errors of the best hit
+ c[2]: count of all 0- and 1-mismatch hits on the reference
+ */
+/* One alignment record; maqmap_read1() reads it from the file as raw bytes,
+   so the member order and sizes below are the file format. */
+typedef struct
+{
+ bit8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. */
+ bit8_t size, map_qual, info1, info2, c[2], flag, alt_qual;
+ bit32_t seqid, pos;
+ int dist;
+ char name[MAX_NAMELEN];
+} maqmap1_t;
+
+/* File header: format id, reference names (n_ref entries in ref_name), the
+   declared number of records, and an optional in-memory record array. */
+typedef struct
+{
+ int format, n_ref;
+ char **ref_name;
+ bit64_t n_mapped_reads;
+ maqmap1_t *mapped_reads;
+} maqmap_t;
+
+/* Reads one record; evaluates to the gzread result: sizeof(maqmap1_t) for a
+   complete record, 0 at end of file, a smaller positive value for a
+   truncated record and -1 on error. Compare against sizeof(maqmap1_t)
+   rather than testing for non-zero. */
+#define maqmap_read1(fp, m1) gzread((fp), (m1), sizeof(maqmap1_t))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /* allocate an empty header with format MAQMAP_FORMAT_NEW */
+ maqmap_t *maq_new_maqmap();
+ /* free a header and everything it owns; NULL is accepted */
+ void maq_delete_maqmap(maqmap_t *mm);
+ /* write format, reference names and n_mapped_reads to fp */
+ void maqmap_write_header(gzFile fp, const maqmap_t *mm);
+ /* read a header written by maqmap_write_header; caller frees the result */
+ maqmap_t *maqmap_read_header(gzFile fp);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqread.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqread.cpp
new file mode 100755
index 0000000..2c1334a
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/maqread.cpp
@@ -0,0 +1,207 @@
+#include "pc.h"
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <strstream>
+#include <algorithm>
+#include <string>
+#include <functional>
+#include <utility>
+#include <zlib.h>
+
+extern "C" {
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+#include "Rdefines.h"
+#include "maqmap.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+
+
+// Strict weak ordering of ints by magnitude: a sorts before b when |a| < |b|.
+class lessAbsoluteValue {
+public:
+  bool operator()(int a, int b) const {
+    const int mag_a = abs(a);
+    const int mag_b = abs(b);
+    return mag_b > mag_a;
+  }
+};
+
+
+
+//#define DEBUG 1
+
+extern "C" {
+
+  // Reads a binary maq map file (gzip-compressed or plain) into
+  // per-chromosome tag vectors.
+  //
+  // filename         - R character value; its first element is the path
+  // read_tag_names_R - integer/logical scalar; non-zero also returns read names
+  //
+  // Returns a list named by reference sequence (in order of first appearance);
+  // each element is list(t=, n=) or, with read names, list(t=, n=, s=):
+  //   t - signed 5' position: (pos>>1)+1 for forward hits, and
+  //       -(that position + read length - 1) when the low bit of pos is set
+  //   n - low 4 bits of info1 (mismatch count)
+  //   s - read names
+  // If the file cannot be opened a message is printed and an empty list is
+  // returned. Reading stops at end of file, at a truncated record or read
+  // error, and at a record whose reference id is not in the header.
+  SEXP read_binmaqmap(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    const char* fname=CHAR(asChar(filename));
+    // normalised to 0/1 because it is also used as the number of extra slots
+    const int read_names=(*(INTEGER(read_tag_names_R)))!=0 ? 1 : 0;
+#ifdef DEBUG
+    Rprintf("fname=%s\n",fname);
+#endif
+
+    // main data vectors, indexed by chromosome index
+    vector< vector<int> > pos;
+    vector< vector<int> > posnm; // number of mismatches
+    vector< vector<string> > tagnames;
+
+    // chromosome name -> index map, and names in index order
+    hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+    vector<string> cnames;
+
+    gzFile f=gzopen(fname,"r");
+
+    if (!f) {
+      cout<<"can't open input file \""<<fname<<"\"\n";
+    } else {
+      // the header is read only once the file is known to be open
+      maqmap_t *m = maqmap_read_header(f);
+      maqmap1_t mm1;
+      maqmap1_t *m1 = &mm1;
+
+      Rprintf("opened %s\n",fname);
+
+      int fcount=0;
+      // gzread returns the byte count, 0 at end of file and -1 on error; only
+      // complete records are used
+      while(m && maqmap_read1(f, m1)==(int) sizeof(maqmap1_t)) {
+        if(m->n_ref<0 || m1->seqid>=(bit32_t) m->n_ref
+           || m->ref_name==NULL || m->ref_name[m1->seqid]==NULL) {
+          Rprintf("record refers to unknown reference id %u; stopping\n",m1->seqid);
+          break;
+        }
+        // the name field is fixed-size; make sure it is terminated
+        m1->name[MAX_NAMELEN-1]='\0';
+        string tagname=string(m1->name);
+        string chr=string(m->ref_name[m1->seqid]);
+        int len=m1->size;
+        int fpos=(m1->pos>>1) + 1;
+        if(m1->pos&1) {
+          fpos=-1*(fpos+len-1);
+        }
+        int nm=m1->info1&0xf;
+
+#ifdef DEBUG
+        Rprintf("read in map line chr=%s tagname=%s fpos=%d, nm=%d, len=%d\n",chr.c_str(),tagname.c_str(),fpos,nm,len);
+#endif
+
+        // determine the chromosome index
+        hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+        int cind=-1;
+        if(li==cind_map.end()) {
+          // register new chromosome
+          cind=cnames.size();
+          cnames.push_back(chr);
+          cind_map[chr]=cind;
+          // allocate new per-chromosome vectors
+          pos.push_back(vector<int>());
+          posnm.push_back(vector<int>());
+          if(read_names) {
+            tagnames.push_back(vector<string>());
+          }
+#ifdef DEBUG
+          Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int) pos.size());
+#endif
+        } else {
+          cind=li->second;
+        }
+        fcount++;
+        (pos[cind]).push_back(fpos);
+        (posnm[cind]).push_back(nm);
+        if(read_names) {
+          (tagnames[cind]).push_back(tagname);
+        }
+#ifdef DEBUG
+        Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+        if(fcount>30) {
+          break;
+        }
+#endif
+      }
+      gzclose(f);
+      // the header is no longer needed once all names have been copied
+      maq_delete_maqmap(m);
+      Rprintf("done. read %d fragments\n",fcount);
+    }
+
+    // construct output structures
+    int np=0; // number of protections held until return
+    SEXP chnames;
+    PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+    for(size_t ci=0;ci<cnames.size();ci++) {
+      SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+    }
+
+    SEXP ans;
+    PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+    for(size_t ci=0;ci<pos.size();ci++) {
+      const vector<int>& cpos=pos[ci];
+      const vector<int>& cnm=posnm[ci];
+
+      SEXP dv,dnames_R;
+      PROTECT(dnames_R = allocVector(STRSXP, 2+read_names));
+      SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+      SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+      if(read_names) {
+        SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+      }
+
+      SEXP tv,nv,sv=R_NilValue;
+      PROTECT(tv=allocVector(INTSXP,cpos.size()));
+      PROTECT(nv=allocVector(INTSXP,cpos.size()));
+      if(read_names) {
+        PROTECT(sv=allocVector(STRSXP,cpos.size()));
+      }
+      int* i_tv=INTEGER(tv);
+      int* i_nv=INTEGER(nv);
+
+      // positions and mismatch counts are always pushed together
+      for(size_t i=0;i<cpos.size();i++) {
+        i_tv[i]=cpos[i];
+        i_nv[i]=cnm[i];
+      }
+      if(read_names) {
+        const vector<string>& cnamesv=tagnames[ci];
+        for(size_t i=0;i<cnamesv.size();i++) {
+          SET_STRING_ELT(sv,i,mkChar(cnamesv[i].c_str()));
+        }
+      }
+      PROTECT(dv = allocVector(VECSXP, 2+read_names));
+      SET_VECTOR_ELT(dv, 0, tv);
+      SET_VECTOR_ELT(dv, 1, nv);
+      if(read_names) {
+        SET_VECTOR_ELT(dv, 2, sv);
+      }
+      setAttrib(dv, R_NamesSymbol, dnames_R);
+
+      SET_VECTOR_ELT(ans, ci, dv);
+      // dv and everything attached to it are now reachable from ans; release
+      // dnames_R, tv, nv, dv (and sv) so the protect stack does not grow with
+      // the number of reference sequences
+      UNPROTECT(4+read_names);
+    }
+
+    setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+    Rprintf("unprotecting %d elements\n",np);
+#endif
+
+    UNPROTECT(np);
+    return(ans);
+}
+
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/pc.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/pc.h
new file mode 100755
index 0000000..8be0911
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/pc.h
@@ -0,0 +1,20 @@
+// Makes std::string usable as a key of the __gnu_cxx hash containers
+// (hash_map / hash_set) included below.
+#ifndef PC_H
+#define PC_H 1
+#include <functional>
+//#include <hash_map.h>
+#include <ext/hash_set>
+#include <ext/hash_map>
+
+
+namespace __gnu_cxx
+{
+ // hash specialisation for std::string: forwards to the const char* hash of
+ // the string's c_str().
+ // NOTE(review): presumably this means characters after an embedded NUL do
+ // not contribute to the hash value - confirm against the libstdc++ source.
+ template<> struct hash< std::string >
+ {
+ size_t operator()( const std::string& x ) const
+ {
+ return hash< const char* >()( x.c_str() );
+ }
+ };
+}
+
+#endif
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/peaks.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/peaks.cpp
new file mode 100755
index 0000000..ace6855
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/peaks.cpp
@@ -0,0 +1,804 @@
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <string>
+#include <set>
+
+extern "C" {
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+
+/**
+ * Calculate all local peaks
+ */
+
+//#define DEBUG 1
+
+extern "C" {
+  // Finds local maxima of a numeric vector.
+  //
+  // x_R        - numeric vector to scan
+  // thr_R      - numeric scalar; only values >= thr can be peaks
+  // max_span_R - integer scalar; when > 2, peaks closer than or equal to
+  //              max_span to the current candidate are merged, keeping the
+  //              higher one
+  //
+  // An interior point i is a peak when x[i] exceeds the last differing value
+  // to its left (so the right end of a plateau can qualify), is >= thr and is
+  // greater than x[i+1]. The first and last elements are never reported.
+  // Returns an integer vector of 1-based peak indices; empty input yields an
+  // empty result.
+  SEXP find_peaks(SEXP x_R,SEXP thr_R,SEXP max_span_R) {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    double* x=REAL(x_R);
+    int nx=LENGTH(x_R);
+    int max_span=*INTEGER(max_span_R);
+    double thr=REAL(thr_R)[0];
+#ifdef DEBUG
+    Rprintf("n=%d; thr=%f; max_span=%d\n",nx,thr,max_span);
+#endif
+
+    vector<int> pos;
+
+    // x[0] may only be read when the vector is non-empty
+    double pv=(nx>0) ? x[0] : 0; // last value that differed from its successor
+    double ppv=0; // previous peak value
+    int ppp=-max_span-1; // previous peak position
+
+    for(int i=1;i<(nx-1);i++) {
+      if(x[i]>pv && x[i]>=thr && x[i]>x[i+1]) {
+        if(max_span>2) {
+          if(i-ppp > max_span) {
+            // far enough from the candidate: commit it and start a new one
+            if(ppp>=0) {
+              pos.push_back(ppp);
+            }
+            ppp=i; ppv=x[i];
+          } else {
+            // within max_span of the candidate: keep whichever is higher
+            if(x[i]>ppv) {
+              ppp=i; ppv=x[i];
+            }
+          }
+        } else {
+          pos.push_back(i);
+        }
+      }
+      // pv is not advanced along a plateau
+      if(x[i]!=x[i+1]) { pv=x[i]; }
+    }
+
+    // add remaining peak
+    if(max_span>2 && ppp>=0) {
+      pos.push_back(ppp);
+    }
+
+    SEXP nv;
+    PROTECT(nv=allocVector(INTSXP,pos.size()));
+    int* i_nv=INTEGER(nv);
+    int i=0;
+    for(vector<int> ::const_iterator pi=pos.begin();pi!=pos.end();++pi) {
+      i_nv[i++]=1+(*pi); // R indices are 1-based
+    }
+
+    UNPROTECT(1);
+    return(nv);
+  }
+
+
+
+
+  /************************************************************************/
+  // given a data vector d (positive values) and a set of signed center coordinates pos,
+  // returns coordinates of data points relative to the centers
+  // size is the size of the region around the centers
+  // return: list(x=, i=) where x holds the relative coordinates (negated for
+  // centers with a negative sign) and i the 1-based indices of the centers
+  // relative to which each coordinate was calculated.
+  // NOTE(review): the sweep only makes sense if d is ascending and pos is
+  // ascending in absolute value - confirm against the R caller.
+  // Empty d or pos yields empty result vectors.
+  SEXP get_relative_coordinates(SEXP d_R,
+                                SEXP pos_R,
+                                SEXP size_R)
+  {
+    int *d, *pos;
+    int npos,nd,size;
+
+    d = INTEGER(d_R); pos = INTEGER(pos_R);
+    npos=LENGTH(pos_R); nd=LENGTH(d_R);
+    size = INTEGER(size_R)[0];
+#ifdef DEBUG
+    Rprintf("|d|=%d, |c|=%d, size=%d\n",nd,npos,size);
+#endif
+
+    vector<int> x; vector<int> xi;
+    int k=0; // current pos index
+
+    for(int i=0;i<nd;i++) {
+      // increment k until pos[k]+size>=d[i]; the bound is tested before pos[k]
+      // is read, so an empty pos is never dereferenced
+      while(k<npos && (abs(pos[k])+size) < d[i]) {
+        k++;
+#ifdef DEBUG
+        Rprintf("advancing k to %d\n",k);
+#endif
+      }
+      if(k>=npos) { break; }
+      // increment i until d[i]>=pos[k]-size
+      while(i<nd && (abs(pos[k])-size) > d[i]) {
+        i++;
+#ifdef DEBUG
+        Rprintf("advancing i to %d\n",i);
+#endif
+      }
+      if(i>=nd) { break; }
+
+      // l: one past the last center whose window starts at or before d[i]
+      int l=k;
+      while((l<npos) && ((abs(pos[l])-size) <= d[i])) {
+        l++;
+#ifdef DEBUG
+        Rprintf("advancing l to %d\n",l);
+#endif
+      }
+      for(int j=k;j<l;j++) {
+        int pd=d[i]-abs(pos[j]);
+        if(abs(pd)<=size) {
+          // record; the sign of the center flips the relative coordinate
+          if(pos[j]>0) {
+            x.push_back(pd);
+          } else {
+            x.push_back(-1*pd);
+          }
+          xi.push_back(j);
+#ifdef DEBUG
+          Rprintf("recorded i=%d, j=%d\n",i,j);
+#endif
+        } else {
+          break;
+        }
+      }
+    }
+
+    SEXP xv_R,xiv_R;
+    PROTECT(xv_R=allocVector(INTSXP,x.size()));
+    PROTECT(xiv_R=allocVector(INTSXP,x.size()));
+    int* xv=INTEGER(xv_R);
+    int* xiv=INTEGER(xiv_R);
+
+    int i=0;
+    for(vector<int> ::const_iterator pi=x.begin();pi!=x.end();++pi) {
+      xv[i++]=*pi;
+    }
+    i=0;
+    for(vector<int> ::const_iterator pi=xi.begin();pi!=xi.end();++pi) {
+      xiv[i++]=1+(*pi); // R indices are 1-based
+    }
+
+    SEXP ans_R, names_R;
+    PROTECT(names_R = allocVector(STRSXP, 2));
+    SET_STRING_ELT(names_R, 0, mkChar("x"));
+    SET_STRING_ELT(names_R, 1, mkChar("i"));
+
+    PROTECT(ans_R = allocVector(VECSXP, 2));
+    SET_VECTOR_ELT(ans_R, 0, xv_R);
+    SET_VECTOR_ELT(ans_R, 1, xiv_R);
+    setAttrib(ans_R, R_NamesSymbol, names_R);
+
+    UNPROTECT(4);
+    return(ans_R);
+  }
+
+
+  // determines a set of points within a set of fragments
+  // note: all vectors sorted in ascending order
+  // note: all vectors are integers
+  // x_R - vector of point positions
+  // se_R - vector of start and end positions
+  // fi_R - vector of signed fragment indices, parallel to se_R: a positive
+  //        value opens that fragment, a non-positive value closes fragment -value
+  // return_list_R - whether a list of fragments should be returned for each point
+  // return_unique_R - whether points in multiple fragments should be omitted
+  // return_point_counts_R - whether the number of points per fragment should
+  //        be returned instead (length(se_R)/2 counts); takes precedence over
+  //        return_list_R
+  // A boundary at position p affects only points with position > p.
+  // Without list/count output the result has one integer per point: the
+  // smallest open fragment index, or -1 when there is none (or, with
+  // return_unique, more than one). In list mode such points are left NULL.
+  SEXP points_within(SEXP x_R,SEXP se_R,SEXP fi_R,SEXP return_list_R,SEXP return_unique_R,SEXP return_point_counts_R) {
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    int* x=INTEGER(x_R);
+    int nx=LENGTH(x_R);
+    int* se=INTEGER(se_R);
+    int* fi=INTEGER(fi_R);
+    int nf=LENGTH(se_R);
+
+    int return_list=*(INTEGER(return_list_R));
+    int return_unique=*(INTEGER(return_unique_R));
+    int return_point_counts=*(INTEGER(return_point_counts_R));
+
+#ifdef DEBUG
+    Rprintf("nf=%d; nx=%d, return_list=%d, return_unique=%d, return_point_counts=%d\n",nf/2,nx,return_list,return_unique,return_point_counts);
+#endif
+    set<int> fset; // fragments open at the current point
+
+    // Decide the output mode once so that the allocation and the fill loop
+    // always agree, also when both flags are set.
+    const bool want_counts=(return_point_counts!=0);
+    const bool want_list=(!want_counts && return_list!=0);
+    const int nfrag=nf/2;
+
+    SEXP nv; int *i_nv=NULL;
+    int np=0;
+    if(want_counts) {
+      PROTECT(nv = allocVector(INTSXP, nfrag)); np++;
+      i_nv=INTEGER(nv);
+      for(int i=0;i<nfrag;i++) { i_nv[i]=0; }
+    } else if(want_list) {
+      PROTECT(nv = allocVector(VECSXP, nx)); np++;
+    } else {
+      PROTECT(nv=allocVector(INTSXP,nx)); np++;
+      i_nv=INTEGER(nv);
+    }
+
+    int j=0;
+
+    for(int i=0;i<nx;i++) {
+      // advance j over all boundaries strictly left of the point
+      while(j<nf && se[j]<x[i]) {
+        int frag=fi[j];
+        if(frag>0) { // insert
+          fset.insert(frag);
+#ifdef DEBUG
+          Rprintf("inserted frag %d, size=%d\n",frag,(int) fset.size());
+#endif
+        } else { // remove
+          fset.erase(-frag);
+#ifdef DEBUG
+          Rprintf("removed frag %d, size=%d\n",-frag,(int) fset.size());
+#endif
+        }
+        j++;
+      }
+#ifdef DEBUG
+      Rprintf("i=%d j=%d\n",i,j);
+#endif
+      if(want_list) {
+        if(fset.empty() || (return_unique && fset.size()>1)) {
+          // element stays NULL
+        } else {
+          SEXP fil_R;
+          PROTECT(fil_R=allocVector(INTSXP,fset.size()));
+          int* fil=INTEGER(fil_R);
+          int k=0;
+          for(set<int>::const_iterator ki=fset.begin();ki!=fset.end();++ki) {
+            fil[k]=*ki; k++;
+          }
+          SET_VECTOR_ELT(nv, i, fil_R);
+          UNPROTECT(1); // fil_R is now owned by nv
+        }
+      } else if(want_counts) {
+        for(set<int>::const_iterator ki=fset.begin();ki!=fset.end();++ki) {
+          // ignore fragment ids that have no slot in the count vector
+          if(*ki>=1 && *ki<=nfrag) {
+            i_nv[*ki-1]++;
+          }
+        }
+      } else {
+        if(fset.empty() || (return_unique && fset.size()>1)) {
+          i_nv[i]=-1;
+        } else {
+          i_nv[i]=*fset.begin();
+        }
+      }
+    }
+
+    UNPROTECT(np);
+    return(nv);
+  }
+
+
+  // Scans the positions cpos = spos, spos+step, ... (nsteps+1 of them, with
+  // nsteps=(epos-spos)/step) and records for each one
+  //   (1-n)*log(lambda) - lambda*(n*(cpos+1) - sx)
+  // where n and sx are the number and the sum of the tag positions p with
+  // cpos-mdist <= p <= cpos.
+  // Returns a numeric vector of length nsteps+1.
+  // return_peaks_R and min_peak_lr_R are accepted for interface compatibility,
+  // but peak extraction is not implemented: the full score vector is always
+  // returned (with a warning when return_peaks is set).
+  // Raises an R error when step is 0.
+  SEXP expuni_lr(SEXP x_R, // positions and their number (assumed sorted in ascending order)
+                 SEXP mdist_R, // max distance at which points should be considered
+                 SEXP lambda_R, // lambda value
+                 SEXP spos_R, // starting position
+                 SEXP epos_R, // ending position
+                 SEXP step_R, // step size
+                 SEXP return_peaks_R, // whether peak positions should be returned, or entire score vector
+                 SEXP min_peak_lr_R // min peak height (lr)
+                 )
+  {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    int* x=INTEGER(x_R);
+    int nx=LENGTH(x_R);
+    int mdist=INTEGER(mdist_R)[0];
+    double lambda=*(REAL(lambda_R));
+
+    int return_peaks=*(INTEGER(return_peaks_R));
+
+    int spos=*(INTEGER(spos_R));
+    int epos=*(INTEGER(epos_R));
+    int step=*(INTEGER(step_R));
+
+    // no C++ objects are alive here, so signalling an R condition is safe
+    if(step==0) {
+      error("expuni_lr: step must be non-zero");
+    }
+    if(return_peaks) {
+      warning("expuni_lr: peak extraction is not implemented; returning the full score vector");
+    }
+
+    int nsteps=(int) (epos-spos)/step;
+
+#ifdef DEBUG
+    Rprintf("n=%d; lambda=%f; mdist=%d; spos=%d; epos=%d; step=%d; nsteps=%d\n",nx,lambda,mdist,spos,epos,step,nsteps);
+#endif
+
+    // the result vector is always allocated, so it is never used uninitialised
+    // and the UNPROTECT below is always balanced
+    SEXP nv;
+    PROTECT(nv=allocVector(REALSXP,nsteps+1));
+    double *d_nv=REAL(nv);
+
+    const double log_lambda=log(lambda);
+
+    int i=0; // current index of the first point being used in the calculations
+    int j=0; // one past the index of the last point being used in the calculations
+    long long sx=0; // current sum of all positions (wide enough not to overflow)
+    int n=0; // number of points currently in the window
+
+    for(int k=0; k<=nsteps; k++) {
+      int cpos=spos+k*step;
+      // increase i until x[i]>=cpos-mdist; remove x from sx; decrement n;
+      while(i<nx && x[i]<(cpos-mdist)) {
+        n--; sx-=x[i]; i++;
+      }
+
+      // increase j until x[j]>cpos
+      while(j<nx && x[j]<=cpos) {
+        n++; sx+=x[j]; j++;
+      }
+
+      // calculate lr
+      d_nv[k]=((double)(1-n))*log_lambda-lambda*((double)(((long long) n)*(cpos+1)-sx));
+    }
+    UNPROTECT(1);
+    return(nv);
+  }
+
+
+ // allpdist: gather the pairwise differences x[j]-x[i] (j>i) that are <= max_dist.
+ //   x_R        - REAL vector of positions; the inner scan stops at the first
+ //                difference above max_dist, so a complete result presumably
+ //                requires ascending order -- confirm with callers
+ //   max_dist_R - REAL scalar, largest difference to keep
+ // Returns a REAL vector holding the retained differences in scan order.
+ SEXP allpdist(SEXP x_R,SEXP max_dist_R) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   const double* x=REAL(x_R);
+   const int nx=LENGTH(x_R);
+   const double max_dist=*REAL(max_dist_R);
+#ifdef DEBUG
+   // max_dist is a double, so it has to be printed with %f (it was printed with %d)
+   Rprintf("n=%d; max_dist=%f\n",nx,max_dist);
+#endif
+
+   vector<double> dist;
+
+   for(int i=0;i<nx;i++) {
+     for(int j=i+1;j<nx;j++) {
+       const double d=x[j]-x[i];
+#ifdef DEBUG
+       Rprintf("i=%d; j=%d; d=%f\n",i,j,d);
+#endif
+       // stop scanning partners of x[i] as soon as a difference is not within the cutoff
+       if(!(d<=max_dist)) { break; }
+       dist.push_back(d);
+     }
+   }
+
+   // copy the collected differences into a freshly allocated R vector
+   SEXP nv;
+   PROTECT(nv=allocVector(REALSXP,dist.size()));
+   double* out=REAL(nv);
+   for(vector<double>::size_type k=0;k<dist.size();k++) {
+     out[k]=dist[k];
+   }
+
+   UNPROTECT(1);
+   return(nv);
+ }
+
+ // allxpdist: cross-set variant of the pairwise distance scan -- for every x[i]
+ // collect the differences y[j]-x[i] that lie within [-max_dist, max_dist].
+ //   x_R, y_R   - REAL position vectors; the running start index into y only moves
+ //                forward, so both are presumably expected in ascending order -- confirm
+ //   max_dist_R - REAL scalar, largest absolute difference to keep
+ // Returns a REAL vector holding the retained differences in scan order.
+ SEXP allxpdist(SEXP x_R,SEXP y_R, SEXP max_dist_R) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   const double* x=REAL(x_R);
+   const double* y=REAL(y_R);
+   const int nx=LENGTH(x_R);
+   const int ny=LENGTH(y_R);
+   const double max_dist=*REAL(max_dist_R);
+#ifdef DEBUG
+   // max_dist is a double, so it has to be printed with %f (it was printed with %d)
+   Rprintf("nx=%d; ny=%d; max_dist=%f\n",nx,ny,max_dist);
+#endif
+
+   vector<double> dist;
+   int yi=0; // latest y start index
+
+   for(int i=0;i<nx;i++) {
+     // advance yi until y[yi]>=x[i]-max_dist; the bound is tested first so that
+     // y[ny] is never read once the end of y has been reached
+     while(yi<ny && y[yi]<(x[i]-max_dist)) { yi++; }
+     if(yi==ny) { break; }
+
+     for(int j=yi;j<ny;j++) {
+       const double d=y[j]-x[i];
+#ifdef DEBUG
+       Rprintf("i=%d; j=%d; d=%f\n",i,j,d);
+#endif
+       // stop scanning partners of x[i] as soon as a difference is not within the cutoff
+       if(!(d<=max_dist)) { break; }
+       dist.push_back(d);
+     }
+   }
+
+   // copy the collected differences into a freshly allocated R vector
+   SEXP nv;
+   PROTECT(nv=allocVector(REALSXP,dist.size()));
+   double* out=REAL(nv);
+   for(vector<double>::size_type k=0;k<dist.size();k++) {
+     out[k]=dist[k];
+   }
+
+   UNPROTECT(1);
+   return(nv);
+ }
+
+ // returns a vector giving for each point,
+ // number of points within a given max_dist
+ //   x_R        - REAL vector of positions; the inner scan stops at the first
+ //                difference above max_dist, so correct counts presumably
+ //                require ascending order -- confirm with callers
+ //   max_dist_R - REAL scalar cutoff
+ // The result is a REAL vector of length LENGTH(x_R); a point is not counted for itself.
+ SEXP nwithindist(SEXP x_R,SEXP max_dist_R) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   const double* x=REAL(x_R);
+   const int nx=LENGTH(x_R);
+   const double max_dist=*REAL(max_dist_R);
+
+   SEXP nv;
+   PROTECT(nv=allocVector(REALSXP,nx));
+   double* counts=REAL(nv);
+   for(int i=0;i<nx;i++) { counts[i]=0; }
+
+#ifdef DEBUG
+   // max_dist is a double, so it has to be printed with %f (it was printed with %d)
+   Rprintf("n=%d; max_dist=%f\n",nx,max_dist);
+#endif
+
+   for(int i=0;i<nx;i++) {
+     for(int j=i+1;j<nx;j++) {
+       const double d=x[j]-x[i];
+#ifdef DEBUG
+       Rprintf("i=%d; j=%d; d=%f\n",i,j,d);
+#endif
+       // stop scanning partners of x[i] as soon as a difference is not within the cutoff
+       if(!(d<=max_dist)) { break; }
+       // each close pair contributes to the count of both of its members
+       counts[i]++;
+       counts[j]++;
+     }
+   }
+
+   UNPROTECT(1);
+   return(nv);
+ }
+
+
+
+
+ // given a list of sorted chromosome signal and background vectors (unscaled), determine
+ // cluster contigs exceeding thr poisson P value, based on a whs window size,
+ // and satisfying mcs cluster size
+ //
+ //   pos_R    - REAL tag positions (the window logic assumes ascending order)
+ //   flag_R   - INTEGER per-tag index into the two window counters, so values must be 0 or 1;
+ //              cc[1] is the count tested against mintag (presumably 1=signal, 0=background -- confirm)
+ //   wsize_R  - INTEGER window size
+ //   thr_R    - REAL threshold entering the lower-bound estimate of the cc[1]/cc[0] ratio
+ //   mcs_R    - INTEGER minimal cluster size (ce-cs) for a cluster to be reported
+ //   bgm_R    - REAL value the lower bound has to reach
+ //   mintag_R - REAL minimal cc[1] count
+ //   either_R - INTEGER flag; non-zero accepts a window when either criterion holds,
+ //              zero requires both
+ // Returns list(s=, e=) holding REAL cluster start and end coordinates.
+ SEXP find_poisson_enrichment_clusters(SEXP pos_R,SEXP flag_R,SEXP wsize_R,SEXP thr_R,SEXP mcs_R,SEXP bgm_R,SEXP mintag_R,SEXP either_R) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   double* pos=REAL(pos_R);
+   int* flag=INTEGER(flag_R);
+   int nt=LENGTH(pos_R);
+
+   int mcs=*INTEGER(mcs_R);
+   int wsize=*INTEGER(wsize_R);
+   int either=*INTEGER(either_R);
+   double thr=REAL(thr_R)[0];
+   double bgm=REAL(bgm_R)[0];
+   double mintag=REAL(mintag_R)[0];
+
+#ifdef DEBUG
+   Rprintf("nt=%d; wsize=%d; thr=%f; mcs=%d; min.tag=%f; bgm=%f\n",nt,wsize,thr,mcs,mintag,bgm);
+#endif
+
+
+   vector< pair<double,double> > contigs;
+
+   // running indices (start and end)
+   int si=0;
+   int ei=0;
+
+   // current window coordinate; pos[0] is only read when there is at least one tag
+   double ws=(nt>0) ? pos[0] : 0.0;
+
+   // current window tag counts
+   int cc[2]={0,0};
+
+
+   if(nt>0) {
+     cc[flag[si]]++;
+     // increment window end
+     while(ei<(nt-1) && (pos[ei+1]-ws) <= wsize) {
+       ei++;
+       cc[flag[ei]]++;
+     }
+
+
+     // cluster start,end positions (only meaningful while inclust is set)
+     double cs=0.0,ce=0.0;
+     int inclust=0;
+
+     while(si<nt-1) {
+
+       // the end-of-data test comes first so that pos[ei+1] is never read past the last tag
+       if(ei!=(nt-1) && (pos[si+1]-ws) > (pos[ei+1] - ws - wsize)) {
+         // move end boundary
+         ei++;
+         ws=pos[ei]-wsize;
+         cc[flag[ei]]++;
+         while(ei<(nt-1) && pos[ei+1]==ws+wsize) {
+           ei++;
+           cc[flag[ei]]++;
+         }
+
+         // increment window start
+         while(si<(nt-1) && pos[si] < ws) {
+           cc[flag[si]]--;
+           si++;
+         }
+
+       } else {
+         // move up start boundary
+         ws=pos[si+1];
+         cc[flag[si]]--;
+         si++;
+         while(si<(nt-1) && pos[si+1]==ws) {
+           cc[flag[si]]--;
+           si++;
+         }
+
+         // increment window end
+         while(ei<(nt-1) && (pos[ei+1] - ws) <= wsize) {
+           ei++;
+           cc[flag[ei]]++;
+         }
+
+       }
+
+       // calculate z score: lower bound on the cc[1]/cc[0] ratio (0.5 pseudocount on each)
+       double dc0=((double)cc[0])+0.5;
+       double dc1=((double)cc[1])+0.5;
+       double rte=dc0+dc1-0.25*thr*thr;
+       double lb;
+       if(rte<=0) {
+         lb=0;
+       } else {
+         lb=(sqrt(dc1*dc0) - 0.5*thr*sqrt(rte))/(dc0 - 0.25*thr*thr);
+         if(lb<0) { lb=0; }
+         lb*=lb;
+       }
+
+       //Rprintf("%f=f(%f,%f,%f); %f=f(%f,%f,%f)\n",lb,1.0-thr,2.0*dc1,2.0*dc0,ub,thr,2.0*dc1,2.0*dc0);
+
+#ifdef DEBUG
+       //double ub=gsl_cdf_fdist_Qinv(thr,2.0*dc1,2.0*dc0)*dc1/dc0;
+       double ub=(sqrt(dc1*dc0) + 0.5*thr*sqrt(rte))/(dc0 - 0.25*thr*thr);
+       ub*=ub;
+       Rprintf("s=%d (%f); e=%d (%f); window: %f-%f; cc=[%d,%d]; lb=%f; ub=%f\n",si,pos[si],ei,pos[ei],ws,ws+wsize,cc[0],cc[1],lb,ub);
+#endif
+
+       // does the current window pass the enrichment criteria?
+       int bc=lb>=bgm && cc[1]>=mintag;
+       if(either) {
+         bc=lb>=bgm || cc[1]>=mintag;
+       }
+       if(bc) {
+         if(inclust) {
+           double nce=ws+wsize/2.0;
+           if(nce-ce > wsize/2.0) {
+             // next point is too far removed, end cluster
+             if(ce-cs >= mcs) {
+               contigs.push_back(pair<double,double>(cs,ce));
+#ifdef DEBUG
+               Rprintf("recorded cluster %f-%f\n",cs,ce);
+#endif
+             }
+             inclust=0;
+           } else {
+             ce=nce;
+           }
+         } else {
+           inclust=1;
+           cs=ws+wsize/2.0;
+           ce=cs;
+         }
+       } else {
+         if(inclust) {
+           if(ce-cs >= mcs) {
+             contigs.push_back(pair<double,double>(cs,ce));
+#ifdef DEBUG
+             Rprintf("recorded cluster %f-%f\n",cs,ce);
+#endif
+           }
+           inclust=0;
+         }
+       }
+
+     }
+
+     // flush a cluster that is still open when the tags run out
+     if(inclust) {
+       if(ce-cs >= mcs) {
+         contigs.push_back(pair<double,double>(cs,ce));
+#ifdef DEBUG
+         Rprintf("recorded cluster %f-%f\n",cs,ce);
+#endif
+       }
+       inclust=0;
+     }
+   }
+
+   SEXP cs_R,ce_R;
+   PROTECT(cs_R=allocVector(REALSXP,contigs.size()));
+   PROTECT(ce_R=allocVector(REALSXP,contigs.size()));
+   double* csa=REAL(cs_R);
+   double* cea=REAL(ce_R);
+
+   int i=0;
+   for(vector< pair<double,double> >::const_iterator ci=contigs.begin(); ci!=contigs.end();++ci) {
+     csa[i]=ci->first;
+     cea[i]=ci->second;
+     i++;
+   }
+
+   SEXP ans_R, names_R;
+   PROTECT(names_R = allocVector(STRSXP, 2));
+   SET_STRING_ELT(names_R, 0, mkChar("s"));
+   SET_STRING_ELT(names_R, 1, mkChar("e"));
+
+   PROTECT(ans_R = allocVector(VECSXP, 2));
+   SET_VECTOR_ELT(ans_R, 0, cs_R);
+   SET_VECTOR_ELT(ans_R, 1, ce_R);
+   setAttrib(ans_R, R_NamesSymbol, names_R);
+
+   UNPROTECT(4);
+   return(ans_R);
+
+ }
+
+
+ // finds intersection between a list of regions
+ // the flag has +n/-n value, corresponding to the start/end of a segment in n-th regionset
+ // max_val: 1 - report max overlapping value, -1: report min, 0 - don't look at values
+ // returns: $s, $e, ($v) lists
+ //
+ //   n_R       - INTEGER number of region sets
+ //   pos_R     - REAL boundary positions, processed in the given order
+ //   flags_R   - INTEGER boundary flags, one per position
+ //   vals_R    - REAL values, one per position (only used when max_val is non-zero)
+ //   max_val_R - INTEGER mode selector described above
+ //   union_R   - INTEGER flag: non-zero reports stretches covered by any set,
+ //               zero reports stretches covered by every set
+ // NOTE(review): flags are used as indices without validation; a flag of 0 or
+ // with magnitude above n would address outside the per-set counters.
+ SEXP region_intersection(SEXP n_R,SEXP pos_R,SEXP flags_R,SEXP vals_R,SEXP max_val_R,SEXP union_R) {
+   const int max_val=*INTEGER(max_val_R);
+   const int unionr=*INTEGER(union_R);
+   const int n=*INTEGER(n_R);
+   double* pos=REAL(pos_R);
+   int* flags=INTEGER(flags_R);
+   double* val=REAL(vals_R);
+
+#ifdef DEBUG
+   Rprintf("n=%d; npos=%d; max_val=%d\n",n,LENGTH(pos_R),max_val);
+#endif
+
+   // number of currently open segments per set; a std::vector replaces the
+   // non-standard variable-length array (an unused second array was dropped)
+   vector<int> s(n>0 ? n : 0, 0);
+
+   vector<double> starts;
+   vector<double> ends;
+   vector<double> values;
+
+   int start=-1; // index of the position that opened the current fragment, -1 if none
+   double mval=0;
+   for(int i=0;i<LENGTH(pos_R);i++) {
+     // update flags
+     int f=flags[i];
+     if(f>0) {
+       s[abs(f)-1]++;
+     } else {
+       s[abs(f)-1]--;
+     }
+
+     // track the running extreme value (max for max_val>0, min for max_val<0)
+     if(max_val!=0 && val[i]*max_val > mval*max_val) { mval=val[i]; }
+
+     // joined status
+     int all;
+     if(unionr) {
+       all=0;
+       for(int j=0;j<n;j++) { if(s[j]>0) { all=1; break;} }
+     } else {
+       all=1;
+       for(int j=0;j<n;j++) { all=all & (s[j]>0); }
+     }
+
+
+     //Rprintf("i=%d; s=[",i);
+     //for(int j=0;j<n;j++) { Rprintf("%d",s[j]); }
+     //Rprintf("]; all=%d; start=%d\n",all,start);
+
+     if(start>=0) {
+       // in fragment
+       if(!all) {
+         // end fragment
+         starts.push_back(pos[start]);
+         ends.push_back(pos[i]);
+         if(max_val!=0) { values.push_back(mval); }
+
+#ifdef DEBUG
+         // printed before start is reset, so pos[start] is still a valid access
+         Rprintf("recorded new fragment (s=%f,e=%f,v=%f);\n",pos[start],pos[i],mval);
+#endif
+         start=-1;
+       }
+     } else {
+       // should a fragment be started?
+       if(all) {
+         start=i;
+         if(max_val!=0) { mval=val[i]; }
+#ifdef DEBUG
+         Rprintf("starting new fragment (s=%f,i=%d);\n",pos[start],i);
+#endif
+       }
+     }
+   }
+   SEXP cs_R,ce_R,cv_R;
+   PROTECT(cs_R=allocVector(REALSXP,starts.size()));
+   PROTECT(ce_R=allocVector(REALSXP,ends.size()));
+
+   double* csa=REAL(cs_R);
+   int i=0;
+   for(vector<double>::const_iterator ci=starts.begin(); ci!=starts.end(); ++ci) {
+     csa[i]=*ci; i++;
+   }
+
+   csa=REAL(ce_R);
+   i=0;
+   for(vector<double>::const_iterator ci=ends.begin(); ci!=ends.end(); ++ci) {
+     csa[i]=*ci; i++;
+   }
+
+   if(max_val!=0) {
+     PROTECT(cv_R=allocVector(REALSXP,values.size()));
+     csa=REAL(cv_R);
+     i=0;
+     for(vector<double>::const_iterator ci=values.begin(); ci!=values.end(); ++ci) {
+       csa[i]=*ci; i++;
+     }
+   }
+
+   SEXP ans_R, names_R;
+   if(max_val!=0) {
+     PROTECT(names_R = allocVector(STRSXP, 3));
+     SET_STRING_ELT(names_R, 0, mkChar("s"));
+     SET_STRING_ELT(names_R, 1, mkChar("e"));
+     SET_STRING_ELT(names_R, 2, mkChar("v"));
+
+     PROTECT(ans_R = allocVector(VECSXP, 3));
+     SET_VECTOR_ELT(ans_R, 0, cs_R);
+     SET_VECTOR_ELT(ans_R, 1, ce_R);
+     SET_VECTOR_ELT(ans_R, 2, cv_R);
+   } else {
+     PROTECT(names_R = allocVector(STRSXP, 2));
+     SET_STRING_ELT(names_R, 0, mkChar("s"));
+     SET_STRING_ELT(names_R, 1, mkChar("e"));
+
+     PROTECT(ans_R = allocVector(VECSXP, 2));
+     SET_VECTOR_ELT(ans_R, 0, cs_R);
+     SET_VECTOR_ELT(ans_R, 1, ce_R);
+   }
+
+   setAttrib(ans_R, R_NamesSymbol, names_R);
+
+   // the value vector adds one more protected object
+   if(max_val!=0) {
+     UNPROTECT(5);
+   } else {
+     UNPROTECT(4);
+   }
+   return(ans_R);
+ }
+
+}
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/wdl.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/wdl.cpp
new file mode 100755
index 0000000..70ded9a
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.2/src/wdl.cpp
@@ -0,0 +1,657 @@
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <string>
+#include <set>
+
+extern "C" {
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+
+//#define DEBUG 1
+
+extern "C" {
+
+ /************************************************************************/
+ /*
+ * lwcc - calculate local window cross-correlation
+ *
+ * For every position i in [osize, n-osize-1) the flanks at distance
+ * isize..osize on either side of i are examined: x upstream is paired with
+ * y downstream (pn1/nn1) and x downstream with y upstream (pn2/nn2).
+ * Positions holding -1 are treated as masked and the pair is skipped.
+ * The value is the correlation of the paired counts scaled by a tag-count
+ * term (tagw); it is 0 when tagw is negative and -1e3 when either strand
+ * mean is not positive.
+ *
+ * Returns either a REAL vector of length n (return_peaks == 0), or
+ * list(x=, v=) with peak positions and values.
+ */
+
+ SEXP lwcc(SEXP x_R, // positive strand hist
+ SEXP y_R, // negative strand hist of the same length
+ SEXP osize_R, // outer boundary distance
+ SEXP isize_R, // inner boundary distance
+ SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned
+ SEXP min_peak_dist_R, // distance between closest peaks
+ SEXP min_peak_val_R, // min peak threshold
+ SEXP tag_weight_R, // tag weight
+ SEXP bg_subtract_R, // a flag whether to do background subtraction
+ SEXP bgp_R, // optional background hist for positive strand
+ SEXP bgn_R, // optional background hist for negative strand
+ SEXP bg_wsize_R, // window size for the background counts
+ SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference (including is cutout)
+ SEXP round_up_R // whether to round up fractional signal tag counts
+ )
+ {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+
+   int is=INTEGER(isize_R)[0];
+   int os=INTEGER(osize_R)[0];
+   double rs=((double)(2*os+1));
+   int* x=INTEGER(x_R);
+   int* y=INTEGER(y_R);
+   int n_x=LENGTH(x_R);
+
+   // background-related
+   int* bgp=INTEGER(bgp_R);
+   int* bgn=INTEGER(bgn_R);
+   int bg_whs=INTEGER(bg_wsize_R)[0];
+
+   int return_peaks=*(INTEGER(return_peaks_R));
+   double min_peak_val=*(REAL(min_peak_val_R));
+   int min_peak_dist=*(INTEGER(min_peak_dist_R));
+   double tag_weight=*(REAL(tag_weight_R));
+
+   const int round_up=*(INTEGER(round_up_R));
+   const int bg_subtract=*(INTEGER(bg_subtract_R));
+   const double bg_weight=*(REAL(bg_weight_R));
+
+   int i; // point at which the value is being calculated
+   int start=os;
+   int end=n_x-os-1;
+
+   // bg tag counts within bg window
+   int bg_pn1=0;
+   int bg_nn1=0;
+   int bg_pn2=0;
+   int bg_nn2=0;
+
+
+
+   // illustration for counting:
+   //
+   // 012345678901234567890123456789012
+   // ==========------|------==========
+   //
+   // osize=16; isize=6;
+
+
+   SEXP nv;
+   double *d_nv;
+   vector<int> ppos;
+   vector<double> pval;
+   if(!return_peaks) {
+     PROTECT(nv=allocVector(REALSXP,n_x));
+     d_nv=REAL(nv);
+     for(int i=0;i<n_x;i++) {
+       d_nv[i]=0;
+     }
+   }
+
+#ifdef DEBUG
+   Rprintf("start=%d end=%d tag_weight=%f\n", start,end,tag_weight);
+   Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]);
+#endif
+
+   int lpp=-1; // last peak position
+   double lpv=-1e3; // last peak value
+
+   double ppv=-1e3; // last value
+   double pppv=-11e-3; // value before last
+
+   int pn1,pn2,nn1,nn2;
+
+
+   if(bg_subtract) {
+     // pre-initialize background tag counts,
+     for(int i=0;i<bg_whs;i++) {
+       if(i<n_x) {
+         bg_pn2+=bgp[i];
+         bg_nn2+=bgn[i];
+       }
+     }
+   }
+
+
+   for(i=0;i<end;i++) {
+#ifdef DEBUG
+     //Rprintf("i=%d ", i);
+#endif
+
+     if(bg_subtract) {
+       // update background counts: *1 trails the current position, *2 leads it
+       int nl=i-bg_whs-1;
+
+       if(nl>=0) {
+         bg_pn1-=bgp[nl];
+         bg_nn1-=bgn[nl];
+       }
+       bg_pn1+=bgp[i];
+       bg_nn1+=bgn[i];
+
+       if(i>0) {
+         bg_pn2-=bgp[i-1];
+         bg_nn2-=bgn[i-1];
+       }
+       int nr=i+bg_whs;
+       if(nr<n_x) {
+         bg_pn2+=bgp[nr];
+         bg_nn2+=bgn[nr];
+       }
+     }
+
+     if(i >= start) {
+       // update counts, taking into account masked out regions
+       pn1=pn2=nn1=nn2=0;
+
+       for(int k=0;k<=(os-is);k++) {
+         int xp1=x[i-os+k];
+         int xp2=x[i+os-k];
+         int xn1=y[i+os-k];
+         int xn2=y[i-os+k];
+
+         if(xp1!=-1 && xn1!=-1) {
+           pn1+=xp1;
+           nn1+=xn1;
+         }
+         if(xp2!=-1 && xn2!=-1) {
+           pn2+=xp2;
+           nn2+=xn2;
+         }
+       }
+
+       // calculate the means; the negative-strand mean is taken from the
+       // negative-strand sums (it used to repeat the positive-strand sums)
+       double mp=((double)(pn1+pn2))/rs;
+       double mn=((double)(nn1+nn2))/rs;
+#ifdef DEBUG
+       Rprintf("mp=%f mn=%f\n",mp,mn);
+#endif
+       // calculate correlation
+       double varp=0;
+       double varn=0;
+       double num=0;
+       double val=-1e3;
+       if(mp>0 && mn>0) {
+         for(int k=0;k<=(os-is);k++) {
+           int xp1=x[i-os+k];
+           int xp2=x[i+os-k];
+           int xn1=y[i+os-k];
+           int xn2=y[i-os+k];
+
+
+           if(xp1!=-1 && xn1!=-1) {
+             double nnp1=((double) xp1)-mp;
+             double nnn1=((double) xn1)-mn;
+             num+=nnp1*nnn1;
+             varp+=nnp1*nnp1;
+             varn+=nnn1*nnn1;
+           }
+
+           if(xp2!=-1 && xn2!=-1) {
+             double nnp2=((double) xp2)-mp;
+             double nnn2=((double) xn2)-mn;
+             num+=nnp2*nnn2;
+             varp+=nnp2*nnp2;
+             varn+=nnn2*nnn2;
+           }
+
+         }
+         double tagw;
+         double spn1=((double)pn1)*tag_weight;
+         double snn1=((double)nn1)*tag_weight;
+         double spn2=((double)pn2)*tag_weight;
+         double snn2=((double)nn2)*tag_weight;
+         if(round_up) {
+           if(pn1>0 && spn1<1) { spn1=1.0; }
+           //if(pn2>0 && spn2<1) { spn2=1.0; }
+           if(nn1>0 && snn1<1) { snn1=1.0; }
+           //if(nn2>0 && snn2<1) { snn2=1.0; }
+         }
+
+         if(bg_subtract) {
+           // nn1 was collected downstream of i, hence the leading background count
+           spn1-=((double)bg_pn1)*bg_weight;
+           snn1-=((double)bg_nn2)*bg_weight;
+           spn2-=((double)bg_pn2)*bg_weight;
+           snn2-=((double)bg_nn1)*bg_weight;
+
+           if(spn2<0) spn2=0;
+           if(snn2<0) snn2=0;
+
+           if(spn1>0 && snn1>0) {
+             tagw=(2.0*sqrt(spn1*snn1)-(spn2+snn2+1.0));
+           } else {
+             tagw=-(spn2+snn2+1.0);
+           }
+           //cout<<"bg_pn1="<<bg_pn1<<"; bg_pn2="<<bg_pn2<<"; bg_nn1="<<bg_nn1<<"; bg_nn2="<<bg_nn2<<endl;
+         } else {
+           tagw=2.0*sqrt(spn1*snn1)-(spn2+snn2);
+         }
+
+         if(tagw<0) {
+           val=0.0;
+         } else {
+           if(num==0.0) {
+             val=0;
+           } else {
+             val=num/(sqrt(varp*varn));
+           }
+           val=val*sqrt(tagw) + tagw;
+
+         }
+         //cout<<"val="<<val<<endl;
+
+#ifdef DEBUG
+         Rprintf("pn1=%d pn2=%d nn1=%d nn2=%d tag.weight=%f tagw=%f\n",pn1,pn2,nn1,nn2,tag_weight,tagw);
+         Rprintf("tagw=%f varp=%f varn=%f num=%f cor=%f val=%f\n",tagw,varp,varn,num,num/sqrt(varp*varn),val);
+#endif
+       }
+
+
+
+       if(return_peaks) {
+         // determine if previous position was a peak
+         if(ppv>min_peak_val && ppv>val && ppv>pppv) {
+           if(lpp>0 && (i-lpp+1)>min_peak_dist) {
+             // record previous peak position
+             ppos.push_back(lpp);
+             pval.push_back(lpv);
+#ifdef DEBUG
+             Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp));
+#endif
+             lpp=i-1; lpv=ppv;
+#ifdef DEBUG
+             Rprintf("updated peak to x=%d y=%f\n",lpp,lpv);
+#endif
+           } else {
+             if(ppv>lpv) {
+               // update last peak positions
+#ifdef DEBUG
+               Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv);
+#endif
+               lpp=i-1; lpv=ppv;
+             }
+           }
+         }
+
+         // update previous values
+         if(val!=ppv) {
+           pppv=ppv; ppv=val;
+         }
+       } else {
+         d_nv[i]=val;
+       }
+     }
+   }
+
+   if(return_peaks) {
+     // record last position
+     if(lpp>0) {
+#ifdef DEBUG
+       Rprintf("recording last peak x=%d y=%f\n",lpp,lpv);
+#endif
+       ppos.push_back(lpp);
+       pval.push_back(lpv);
+     }
+
+     SEXP rpp_R,rpv_R;
+     PROTECT(rpp_R=allocVector(INTSXP,ppos.size()));
+     PROTECT(rpv_R=allocVector(REALSXP,ppos.size()));
+     int* rpp=INTEGER(rpp_R);
+     double* rpv=REAL(rpv_R);
+
+     for(vector<int>::size_type k=0;k<ppos.size();k++) {
+       rpp[k]=ppos[k];
+       rpv[k]=pval[k];
+     }
+
+     SEXP ans_R, names_R;
+     PROTECT(names_R = allocVector(STRSXP, 2));
+     SET_STRING_ELT(names_R, 0, mkChar("x"));
+     SET_STRING_ELT(names_R, 1, mkChar("v"));
+
+     PROTECT(ans_R = allocVector(VECSXP, 2));
+     SET_VECTOR_ELT(ans_R, 0, rpp_R);
+     SET_VECTOR_ELT(ans_R, 1, rpv_R);
+     setAttrib(ans_R, R_NamesSymbol, names_R);
+
+     UNPROTECT(4);
+     return(ans_R);
+   } else {
+     UNPROTECT(1);
+     return(nv);
+   }
+
+ }
+
+
+
+ /************************************************************************/
+ /*
+ * wtd - window tag difference implementation
+ *
+ * For every position i in [whs+1, n-whs-1) tag counts are collected in the
+ * whs positions before i (pn1 from x, nn1 from y) and in the whs positions
+ * starting at i (pn2 from x, nn2 from y). The score rewards pn1 and nn2 and
+ * penalizes pn2 and nn1; with direct_count it is simply the weighted
+ * pn1+nn2 sum. Background counts, when enabled, are subtracted after
+ * scaling by bg_weight.
+ *
+ * Returns either a REAL vector of length n that is zero outside the scanned
+ * range (return_peaks == 0), or list(x=, v=) with peak positions and values.
+ *
+ * NOTE(review): the ignore_masking initialization reads x[i+whs+1] for
+ * i<whs without checking against n -- confirm callers guarantee n > 2*whs.
+ */
+
+ SEXP wtd(SEXP x_R, // positive strand hist
+ SEXP y_R, // negative strand hist of the same length
+ SEXP wsize_R, // outer boundary distance
+ SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned
+ SEXP min_peak_dist_R, // distance between closest peaks
+ SEXP min_peak_val_R, // min peak threshold
+ SEXP direct_count_R, // whether tag weighting should not be done
+ SEXP tag_weight_R, // tag weight
+ SEXP ignore_masking_R, // whether to ignore masked regions
+ SEXP bg_subtract_R, // a flag whether to do background subtraction
+ SEXP bgp_R, // optional background hist for positive strand
+ SEXP bgn_R, // optional background hist for negative strand
+ SEXP bg_wsize_R, // window size for the background counts
+ SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference
+ SEXP round_up_R // whether to round up fractional signal tag counts
+ )
+ {
+
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+
+ int whs=INTEGER(wsize_R)[0]; // number of positions counted on each side of i
+ int* x=INTEGER(x_R);
+ int* y=INTEGER(y_R);
+ int n_x=LENGTH(x_R);
+
+ // background-related
+ int* bgp=INTEGER(bgp_R);
+ int* bgn=INTEGER(bgn_R);
+ int bg_whs=INTEGER(bg_wsize_R)[0];
+
+ // scalar options unpacked from their R arguments
+ const int return_peaks=*(INTEGER(return_peaks_R));
+ const int direct_count=*(INTEGER(direct_count_R));
+ const int ignore_masking=*(INTEGER(ignore_masking_R));
+ const double min_peak_val=*(REAL(min_peak_val_R));
+ const int min_peak_dist=*(INTEGER(min_peak_dist_R));
+ const double tag_weight=*(REAL(tag_weight_R));
+
+ const int round_up=*(INTEGER(round_up_R));
+ const int bg_subtract=*(INTEGER(bg_subtract_R));
+ const double bg_weight=*(REAL(bg_weight_R));
+
+ int i; // point at which the value is being calculated
+ int start=whs+1; // first scanned position
+ int end=n_x-whs-1; // one past the last scanned position
+
+ // tag counts to calculate the means
+ int pn1=0; // x counts before i
+ int pn2=0; // x counts from i on
+ int nn1=0; // y counts before i
+ int nn2=0; // y counts from i on
+
+ // bg tag counts within bg window
+ int bg_pn1=0;
+ int bg_pn2=0;
+ int bg_nn1=0;
+ int bg_nn2=0;
+
+ SEXP nv;
+ double *d_nv;
+ vector<int> ppos; // recorded peak positions
+ vector<double> pval; // recorded peak values
+ if(!return_peaks) {
+ PROTECT(nv=allocVector(REALSXP,n_x));
+ d_nv=REAL(nv);
+ for(int i=0;i<n_x;i++) {
+ d_nv[i]=0;
+ }
+ }
+
+#ifdef DEBUG
+ Rprintf("whs=%d start=%d end=%d tag_weight=%f ignore_masing=%d\n", whs, start,end,tag_weight,ignore_masking);
+ Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]);
+#endif
+
+ int lpp=-1; // last peak position
+ double lpv=-1000; // last peak value
+
+ double ppv=-1000; // last value
+ int ppl=-1; // position of the last value
+ double pppv=-1000; // value before last
+
+ // with masking ignored the counts are maintained incrementally, so seed them here
+ if(ignore_masking==1) {
+ for(int i=0;i<whs;i++) {
+ pn1+=x[i];
+ pn2+=x[i+whs+1];
+ nn1+=y[i];
+ nn2+=y[i+whs+1];
+
+ }
+ }
+
+ if(bg_subtract) {
+ // pre-initialize background tag counts,
+ for(int i=0;i<bg_whs;i++) {
+ if(i<n_x) {
+ bg_pn2+=bgp[i];
+ bg_nn2+=bgn[i];
+ }
+ }
+ // increment center of background count window to the start position
+ for(int i=0;i<start;i++) {
+ // update background counts
+ int nl=i-bg_whs-1; // element that leaves the trailing window
+
+ if(nl>=0) {
+ bg_pn1-=bgp[nl];
+ bg_nn1-=bgn[nl];
+ }
+ bg_pn1+=bgp[i];
+ bg_nn1+=bgn[i];
+
+ if(i>0) {
+ bg_pn2-=bgp[i-1];
+ bg_nn2-=bgn[i-1];
+ }
+ int nr=i+bg_whs; // element that enters the leading window
+ if(nr<n_x) {
+ bg_pn2+=bgp[nr];
+ bg_nn2+=bgn[nr];
+ }
+ }
+
+ }
+
+ // NOTE(review): the DEBUG print below reads the outer i before it has been assigned
+#ifdef DEBUG
+ Rprintf("initialization: i=%d pn1=%d, pn2=%d, nn1=%d, nn2=%d", i,pn1,pn2,nn1,nn2);
+#endif
+
+ for(i=start;i<end;i++) {
+ if(bg_subtract) {
+ // update background counts
+ int nl=i-bg_whs-1; // element that leaves the trailing window
+
+ if(nl>=0) {
+ bg_pn1-=bgp[nl];
+ bg_nn1-=bgn[nl];
+ }
+ bg_pn1+=bgp[i];
+ bg_nn1+=bgn[i];
+
+ if(i>0) {
+ bg_pn2-=bgp[i-1];
+ bg_nn2-=bgn[i-1];
+ }
+ int nr=i+bg_whs; // element that enters the leading window
+ if(nr<n_x) {
+ bg_pn2+=bgp[nr];
+ bg_nn2+=bgn[nr];
+ }
+ }
+
+ // update counts
+ if(ignore_masking==1) {
+ pn1+=x[i-1]-x[i-whs-1];
+ pn2+=x[i+whs]-x[i-1];
+ nn1+=y[i-1]-y[i-whs-1];
+ nn2+=y[i+whs]-y[i-1];
+
+ } else {
+ // masked mode: recount both windows from scratch at every position
+ pn1=pn2=nn1=nn2=0;
+
+ for(int k=0;k<whs;k++) {
+ int xp1=x[i-k-1];
+ int xp2=x[i+k];
+ int xn1=y[i-k-1];
+ int xn2=y[i+k];
+
+ // omit masked positions
+ if(xp1!=-1 && xn1!=-1 && xp2!=-1 && xn2!=-1) {
+ pn1+=xp1;
+ nn1+=xn1;
+ pn2+=xp2;
+ nn2+=xn2;
+ }
+ }
+ }
+
+ double val;
+ double spn1=((double)pn1)*tag_weight;
+ double snn1=((double)nn1)*tag_weight;
+ double spn2=((double)pn2)*tag_weight;
+ double snn2=((double)nn2)*tag_weight;
+ if(round_up) {
+ if(pn1>0 && spn1<1) { spn1=1.0; }
+ //if(pn2>0 && spn2<1) { spn2=1.0; }
+ //if(nn1>0 && snn1<1) { snn1=1.0; }
+ if(nn2>0 && snn2<1) { snn2=1.0; }
+ }
+
+ if(direct_count) {
+ val=spn1+snn2;
+ if(round_up && val<1) {
+ val=1.0;
+ }
+ if(bg_subtract) {
+ val-=((double) (bg_pn1+bg_nn2))*bg_weight;
+ }
+ } else {
+ if(bg_subtract) {
+ spn1-=((double)bg_pn1)*bg_weight;
+ snn1-=((double)bg_nn1)*bg_weight;
+ spn2-=((double)bg_pn2)*bg_weight;
+ snn2-=((double)bg_nn2)*bg_weight;
+
+ if(spn2<0) spn2=0;
+ if(snn1<0) snn1=0;
+
+ if(spn1>0 && snn2>0) {
+ val=(2.0*sqrt(spn1*snn2)-(spn2+snn1+1.0));
+ } else {
+ val=-(spn2+snn1+1.0);
+ }
+ } else {
+ val=2.0*sqrt(spn1*snn2)-(spn2+snn1+tag_weight);
+ }
+ }
+ //double val=sqrt(pn1*nn2);
+ //if(pn2>nn1) { val-=pn2; } else { val-=pn1; }
+#ifdef DEBUG
+ Rprintf("update: i=%d pn1=%d pn2=%d nn1=%d nn2=%d val=%f\n",i,pn1,pn2,nn1,nn2,val);
+#endif
+
+ if(return_peaks) {
+ // determine if previous position was a peak
+ if(ppv>min_peak_val && ppv>val && ppv>pppv) {
+ if(lpp>0 && (i-lpp+1)>min_peak_dist) {
+ // record previous peak position
+ ppos.push_back(lpp);
+ pval.push_back(lpv);
+#ifdef DEBUG
+ Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp));
+#endif
+ if(ppl!=-1 && ppl!=i-1) {
+ lpp=(int) round((ppl+i-1)/2); // middle of a plateau; NOTE(review): integer division happens before round()
+ } else {
+ lpp=i-1;
+ }
+ lpv=ppv;
+#ifdef DEBUG
+ Rprintf("updated peak to x=%d y=%f\n",lpp,lpv);
+#endif
+ } else {
+ if(ppv>lpv) {
+ // update last peak positions
+#ifdef DEBUG
+ Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv);
+#endif
+ if(ppl!=-1 && ppl!=i-1) {
+ lpp=(int) round((ppl+i-1)/2); // middle of a plateau; NOTE(review): integer division happens before round()
+ } else {
+ lpp=i-1;
+ }
+ lpv=ppv;
+ }
+ }
+ }
+
+ // update previous values
+ if(val!=ppv) {
+ pppv=ppv; ppv=val; ppl=i;
+ }
+ } else {
+ d_nv[i]=val;
+ }
+ }
+
+ if(return_peaks) {
+ // record last position
+ if(lpp>0) {
+#ifdef DEBUG
+ Rprintf("recording last peak x=%d y=%f\n",lpp,lpv);
+#endif
+ ppos.push_back(lpp);
+ pval.push_back(lpv);
+ }
+
+ SEXP rpp_R,rpv_R;
+ PROTECT(rpp_R=allocVector(INTSXP,ppos.size()));
+ PROTECT(rpv_R=allocVector(REALSXP,ppos.size()));
+ int* rpp=INTEGER(rpp_R);
+ double* rpv=REAL(rpv_R);
+
+ for(int i=0;i<ppos.size();i++) {
+ rpp[i]=ppos[i];
+ rpv[i]=pval[i];
+ }
+
+ SEXP ans_R, names_R;
+ PROTECT(names_R = allocVector(STRSXP, 2));
+ SET_STRING_ELT(names_R, 0, mkChar("x"));
+ SET_STRING_ELT(names_R, 1, mkChar("v"));
+
+ PROTECT(ans_R = allocVector(VECSXP, 2));
+ SET_VECTOR_ELT(ans_R, 0, rpp_R);
+ SET_VECTOR_ELT(ans_R, 1, rpv_R);
+ setAttrib(ans_R, R_NamesSymbol, names_R);
+
+ UNPROTECT(4);
+ return(ans_R);
+ } else {
+ UNPROTECT(1);
+ return(nv);
+ }
+
+ }
+
+
+}
+
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/DESCRIPTION b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/DESCRIPTION
new file mode 100755
index 0000000..59eeb71
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/DESCRIPTION
@@ -0,0 +1,12 @@
+Package: spp
+Type: Package
+Title: some description
+Version: 1.0
+Date: 2008-11-10
+Author: Peter K
+Depends: caTools
+Maintainer: peterK<peterk@compbio.med.harvard.edu>
+Description: Describe the package
+License: GPL-2
+LazyLoad: yes
+Packaged: Wed Nov 12 10:42:54 2008; vidhuch
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/NAMESPACE b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/NAMESPACE
new file mode 100755
index 0000000..caf30e6
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/NAMESPACE
@@ -0,0 +1,3 @@
+useDynLib(spp)
+
+exportPattern("^[^\\.]")
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/R/zroutines.R b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/R/zroutines.R
new file mode 100755
index 0000000..ece76f3
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/R/zroutines.R
@@ -0,0 +1,2501 @@
+#library(caTools)
+#dyn.load("src/bed2vector.so");
+#dyn.load("src/wdl.so");
+#dyn.load("src/peaks.so");
+#dyn.load("src/cdensum.so");
+
+
+# -------- ROUTINES FOR READING IN THE DATA FILES ------------
+# Read ELAND alignment output through one of the package's native readers.
+#  filename             : path handed to the native reader
+#  read.tag.names       : when true, tag names ($s) are returned as well
+#  fix.chromosome.names : remove ".fa" from match sequence names
+#  max.eland.tag.length : coerced to integer and passed to the native reader
+#  extended, multi      : select "read_eland_extended" / "read_eland_multi"
+#                         (multi takes precedence when both are set)
+# Returns list(tags=, quality=[, names=]); every element is a per-sequence list
+# whose entries are ordered by absolute tag position.
+read.eland.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=TRUE,max.eland.tag.length=-1,extended=FALSE,multi=FALSE) {
+  rtn <- if(read.tag.names) 1L else 0L;
+  storage.mode(max.eland.tag.length) <- "integer";
+  callfunction <- "read_eland";
+  if(extended) { callfunction <- "read_eland_extended"; };
+  if(multi) { callfunction <- "read_eland_multi"; };
+  tl <- lapply(.Call(callfunction,filename,rtn,max.eland.tag.length),function(d) {
+    # reorder every per-tag vector by absolute position
+    xo <- order(abs(d$t));
+    d$t <- d$t[xo];
+    d$n <- d$n[xo];
+    if(read.tag.names) {
+      d$s <- d$s[xo];
+    }
+    return(d);
+  });
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(tl) <- gsub("\\.fa","",names(tl))
+  }
+  # separate tags and quality
+  tags <- lapply(tl,function(d) d$t);
+  quality <- lapply(tl,function(d) d$n);
+  if(read.tag.names) {
+    return(list(tags=tags,quality=quality,names=lapply(tl,function(d) d$s)));
+  } else {
+    return(list(tags=tags,quality=quality));
+  }
+}
+
+# Read a tagAlign file through the native "read_tagalign" reader.
+#  filename             : path handed to the native reader
+#  fix.chromosome.names : remove ".fa" from sequence names
+#  fix.quality          : recode the quality column into integer ranks based on
+#                         the distinct values observed on each sequence
+# Returns list(tags=, quality=), each a per-sequence list ordered by absolute
+# tag position.
+read.tagalign.tags <- function(filename,fix.chromosome.names=TRUE,fix.quality=TRUE) {
+  tl <- lapply(.Call("read_tagalign",filename),function(d) {
+    xo <- order(abs(d$t));
+    d$t <- d$t[xo];
+    d$n <- d$n[xo];
+    #if(fix.quality) {
+    #  d$n <- 4-cut(d$n,breaks=c(0,250,500,750,1000),labels=F)
+    #}
+    if(fix.quality) { # Anshul: changed the way the quality field is processed
+      # values below 0.5 trigger a rescaling of the whole column first
+      if (min(d$n)<0.5){
+        d$n = ceiling(1000/4^d$n);
+      }
+      # bin by the distinct observed values; higher values get lower codes
+      break.vals <- unique(sort(c(0,unique(d$n))));
+      d$n <- length(break.vals)-1-cut(d$n,breaks=break.vals,labels=FALSE);
+    }
+    return(d);
+  });
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(tl) <- gsub("\\.fa","",names(tl))
+  }
+  # separate tags and quality
+  return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n)));
+}
+
+
+# Read alignments through the native "read_arachne" reader.
+#  filename             : path handed to the native reader
+#  fix.chromosome.names : remove ".fa" from sequence names
+# Returns list(tags=, quality=), each a per-sequence list ordered by absolute
+# tag position.
+read.short.arachne.tags <- function(filename,fix.chromosome.names=FALSE) {
+  tl <- lapply(.Call("read_arachne",filename),function(d) {
+    xo <- order(abs(d$t));
+    d$t <- d$t[xo];
+    d$n <- d$n[xo];
+    return(d);
+  });
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(tl) <- gsub("\\.fa","",names(tl))
+  }
+  # separate tags and quality
+  return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n)));
+}
+
+
+# Read alignments through the native "read_arachne_long" reader, which also
+# supplies a per-tag length column ($l).
+#  filename             : path handed to the native reader
+#  fix.chromosome.names : remove ".fa" from sequence names
+# Returns list(tags=, quality=, length=), each a per-sequence list ordered by
+# absolute tag position.
+read.arachne.tags <- function(filename,fix.chromosome.names=FALSE) {
+  tl <- lapply(.Call("read_arachne_long",filename),function(d) {
+    xo <- order(abs(d$t));
+    d$t <- d$t[xo];
+    d$n <- d$n[xo];
+    d$l <- d$l[xo];
+    return(d);
+  });
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(tl) <- gsub("\\.fa","",names(tl))
+  }
+  # separate tags and quality
+  return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),length=lapply(tl,function(d) d$l)));
+}
+
+# Read Bowtie alignments through the native "read_bowtie" reader.
+#  filename             : path handed to the native reader
+#  read.tag.names       : when true, tag names ($s) are returned as well
+#  fix.chromosome.names : remove ".fa" from sequence names
+# Returns list(tags=, quality=[, names=]), each a per-sequence list ordered by
+# absolute tag position.
+read.bowtie.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=FALSE) {
+  rtn <- if(read.tag.names) 1L else 0L;
+  tl <- lapply(.Call("read_bowtie",filename,rtn),function(d) {
+    xo <- order(abs(d$t));
+    d$t <- d$t[xo];
+    d$n <- d$n[xo];
+    if(read.tag.names) {
+      d$s <- d$s[xo];
+    }
+    return(d);
+  });
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(tl) <- gsub("\\.fa","",names(tl))
+  }
+  # separate tags and quality
+  tags <- lapply(tl,function(d) d$t);
+  quality <- lapply(tl,function(d) d$n);
+  if(read.tag.names) {
+    return(list(tags=tags,quality=quality,names=lapply(tl,function(d) d$s)));
+  } else {
+    return(list(tags=tags,quality=quality));
+  }
+}
+
+# Read BAM alignments through the native "read_bam" reader.
+#  filename             : path handed to the native reader
+#  read.tag.names       : when true, tag names ($s) are returned as well
+#  fix.chromosome.names : remove ".fa" from sequence names
+# Returns list(tags=, quality=[, names=]), each a per-sequence list ordered by
+# absolute tag position.
+read.bam.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=FALSE) {
+  rtn <- if(read.tag.names) 1L else 0L;
+  tl <- lapply(.Call("read_bam",filename,rtn),function(d) {
+    xo <- order(abs(d$t));
+    d$t <- d$t[xo];
+    d$n <- d$n[xo];
+    if(read.tag.names) {
+      d$s <- d$s[xo];
+    }
+    return(d);
+  });
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(tl) <- gsub("\\.fa","",names(tl))
+  }
+  # separate tags and quality
+  tags <- lapply(tl,function(d) d$t);
+  quality <- lapply(tl,function(d) d$n);
+  if(read.tag.names) {
+    return(list(tags=tags,quality=quality,names=lapply(tl,function(d) d$s)));
+  } else {
+    return(list(tags=tags,quality=quality));
+  }
+}
+
+
+# Read Helicos alignments through the native "read_helicostabf" reader, which
+# also supplies a per-tag length column ($l).
+#  filename             : path handed to the native reader
+#  read.tag.names       : when true, tag names ($s) are returned as well
+#  fix.chromosome.names : remove ".fa" from sequence names
+#  include.length.info  : accepted for compatibility; not used by this function
+# Returns list(tags=, quality=, length=[, names=]), each a per-sequence list
+# ordered by absolute tag position.
+read.helicos.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=FALSE,include.length.info=TRUE) {
+  rtn <- if(read.tag.names) 1L else 0L;
+  tl <- lapply(.Call("read_helicostabf",filename,rtn),function(d) {
+    xo <- order(abs(d$t));
+    d$t <- d$t[xo];
+    d$n <- d$n[xo];
+    d$l <- d$l[xo];
+    if(read.tag.names) {
+      d$s <- d$s[xo];
+    }
+    return(d);
+  });
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(tl) <- gsub("\\.fa","",names(tl))
+  }
+  # separate tags and quality
+  tags <- lapply(tl,function(d) d$t);
+  quality <- lapply(tl,function(d) d$n);
+  len <- lapply(tl,function(d) d$l);
+  if(read.tag.names) {
+    return(list(tags=tags,quality=quality,length=len,names=lapply(tl,function(d) d$s)));
+  } else {
+    return(list(tags=tags,quality=quality,length=len));
+  }
+}
+
+# Read MAQ map output through the native "read_maqmap" reader.
+#  filename             : path handed to the native reader
+#  read.tag.names       : when true, tag names ($s) are returned as well
+#  fix.chromosome.names : remove ".fa" from sequence names
+# Returns list(tags=, quality=[, names=]), each a per-sequence list ordered by
+# absolute tag position.
+read.maqmap.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=TRUE) {
+  rtn <- if(read.tag.names) 1L else 0L;
+  tl <- lapply(.Call("read_maqmap",filename,rtn),function(d) {
+    xo <- order(abs(d$t));
+    d$t <- d$t[xo];
+    d$n <- d$n[xo];
+    if(read.tag.names) {
+      d$s <- d$s[xo];
+    }
+    return(d);
+  });
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(tl) <- gsub("\\.fa","",names(tl))
+  }
+  # separate tags and quality
+  tags <- lapply(tl,function(d) d$t);
+  quality <- lapply(tl,function(d) d$n);
+  if(read.tag.names) {
+    return(list(tags=tags,quality=quality,names=lapply(tl,function(d) d$s)));
+  } else {
+    return(list(tags=tags,quality=quality));
+  }
+}
+
+
+# Read MAQ map output through the native "read_binmaqmap" reader.
+#  filename             : path handed to the native reader
+#  read.tag.names       : when true, tag names ($s) are returned as well
+#  fix.chromosome.names : remove ".fa" from sequence names
+# Returns list(tags=, quality=[, names=]), each a per-sequence list ordered by
+# absolute tag position.
+read.bin.maqmap.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=TRUE) {
+  rtn <- if(read.tag.names) 1L else 0L;
+  tl <- lapply(.Call("read_binmaqmap",filename,rtn),function(d) {
+    xo <- order(abs(d$t));
+    d$t <- d$t[xo];
+    d$n <- d$n[xo];
+    if(read.tag.names) {
+      d$s <- d$s[xo];
+    }
+    return(d);
+  });
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(tl) <- gsub("\\.fa","",names(tl))
+  }
+  # separate tags and quality
+  tags <- lapply(tl,function(d) d$t);
+  quality <- lapply(tl,function(d) d$n);
+  if(read.tag.names) {
+    return(list(tags=tags,quality=quality,names=lapply(tl,function(d) d$s)));
+  } else {
+    return(list(tags=tags,quality=quality));
+  }
+}
+
+
+# read in tags from an extended eland format with match length information
+# Read tags through the compiled "read_meland" routine (extended eland format
+# carrying match lengths in $l) and fold match length and $n into one quality score.
+# filename - path handed to the compiled reader
+# read.tag.names - if TRUE, read names are requested and returned as $names
+# fix.chromosome.names - if TRUE, ".fa" is stripped from the chromosome names
+# Returns a list with per-chromosome $tags, $quality and (optionally) $names.
+read.meland.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) {
+  # integer flag expected by the compiled reader
+  name.flag <- if(read.tag.names) as.integer(1) else as.integer(0)
+  raw <- .Call("read_meland",filename,name.flag)
+  per.chr <- lapply(raw,function(chr.data) {
+    # reorder every per-tag vector by absolute position
+    ord <- order(abs(chr.data$t))
+    chr.data$t <- chr.data$t[ord]
+    chr.data$n <- chr.data$n[ord]
+    chr.data$l <- chr.data$l[ord]
+    if(read.tag.names) {
+      chr.data$s <- chr.data$s[ord]
+    }
+    chr.data
+  })
+
+  if(fix.chromosome.names) {
+    # remove ".fa"
+    names(per.chr) <- gsub("\\.fa","",names(per.chr))
+  }
+  chr.names <- names(per.chr)
+  names(chr.names) <- chr.names
+  # reformulate quality scores into monotonic values:
+  # (longest match length - match length) + n/10, so lower is better
+  longest.match <- max(unlist(lapply(per.chr,function(d) max(d$l))))
+  quality <- lapply(chr.names,function(chr) {
+    chr.data <- per.chr[[chr]]
+    (longest.match-chr.data$l)+chr.data$n/10
+  })
+  result <- list(tags=lapply(per.chr,function(d) d$t),quality=quality)
+  if(read.tag.names) {
+    result <- c(result,list(names=lapply(per.chr,function(d) d$s)))
+  }
+  return(result)
+}
+
+# -------- ROUTINES FOR ASSESSING BINDING PATTERN AND SELECTING INFORMATIVE TAGS ------------
+
+# removes tag positions that have anomalously high counts on both strands
+# z - z-score used to determine anomalous bins
+# zo - z used to filter out one-strand matches
+# trim.fraction - fraction of top bins to discard when calculating overall background density
+# data - list with per-chromosome $tags and optional $quality / $names
+# bin - bin size used to tabulate tag positions
+# Returns data with anomalous tag positions removed; chromosomes left without
+# any tags are dropped from $tags, $quality and $names.
+remove.tag.anomalies <- function(data, bin=1,trim.fraction=1e-3,z=5,zo=3*z) {
+
+  # Works on one tag vector; returns either the retained tags or, with
+  # return.indecies=T, a logical vector marking the tags to keep.
+  t.remove.tag.anomalies <- function(tv,bin=1,trim.fraction=1e-3,z=5,zo=3*z,return.indecies=F) {
+    tt <- table(floor(tv/bin));
+
+    # trim value: discard the top trim.fraction of bins before estimating background
+    stt <- sort(as.numeric(tt));
+    stt <- stt[1:(length(stt)*(1-trim.fraction))];
+    mtc <- mean(stt); tcd <- sqrt(var(stt));
+
+    thr <- max(1,ceiling(mtc+z*tcd));
+    thr.o <- max(1,ceiling(mtc+zo*tcd));
+    # filter tt
+    tt <- tt[tt>=thr]
+    # get + and - tags
+    tp <- as.numeric(names(tt));
+    pti <- tp>0;
+    it <- intersect(tp[pti],(-1)*tp[!pti]);
+    # add one-strand matches
+    it <- unique(c(it,tp[tt>=thr.o]));
+    sit <- c(it,(-1)*it);
+
+    if(bin>1) {
+      # expand each flagged bin index into the positions it covers.
+      # A bin b produced by floor(tv/bin) spans b*bin ... b*bin+(bin-1), so only
+      # offsets 1..(bin-1) are added; offset `bin` would be the first position
+      # of the next bin and would remove tags that were never flagged.
+      sit <- sit*bin;
+      sit <- c(sit,unlist(lapply(seq_len(bin-1),function(i) sit+i)))
+    }
+    if(return.indecies) {
+      return(!tv %in% sit);
+    } else {
+      return(tv[!tv %in% sit]);
+    }
+  }
+
+  # per-chromosome logical "keep" vectors
+  vil <- lapply(data$tags,t.remove.tag.anomalies,return.indecies=T,bin=bin,trim.fraction=trim.fraction,z=z,zo=zo);
+  chrl <- names(data$tags); names(chrl) <- chrl;
+  data$tags <- lapply(chrl,function(chr) data$tags[[chr]][vil[[chr]]]);
+  # count tags to remove empty chromosomes
+  nt <- unlist(lapply(data$tags,length));
+  if(any(nt==0)) {
+    data$tags <- data$tags[nt!=0]
+  }
+
+  # apply the same filtering to the parallel quality / name lists
+  if(!is.null(data$quality)) {
+    data$quality <- lapply(chrl,function(chr) data$quality[[chr]][vil[[chr]]]);
+    data$quality <- data$quality[nt!=0];
+  }
+  if(!is.null(data$names)) {
+    data$names <- lapply(chrl,function(chr) data$names[[chr]][vil[[chr]]]);
+    data$names <- data$names[nt!=0];
+  }
+
+  return(data);
+}
+
+# caps or removes tag positions that are significantly higher than local background
+# tags - per-chromosome list of tag vectors
+# window.size, eliminate.fold, cap.fold, z.threshold - forwarded unchanged to
+#   filter.singular.positions.by.local.density for every chromosome
+# (previously these arguments were ignored in favour of hard-coded values equal
+#  to the defaults, so default behaviour is unchanged)
+remove.local.tag.anomalies <- function(tags,window.size=200,eliminate.fold=10,cap.fold=4,z.threshold=3) {
+  lapply(tags,filter.singular.positions.by.local.density,window.size=window.size,eliminate.fold=eliminate.fold,cap.fold=cap.fold,z.threshold=z.threshold);
+}
+
+
+
+# assess strand cross-correlation, determine peak position, determine appropriate window size
+# for binding detection.
+# data - list with per-chromosome $tags and optional $quality
+# srange, bin - shift range and bin size passed to tag.scc
+# cluster - optional cluster object; when given, clusterApplyLB is used instead of lapply
+# min.tag.count - quality bins contributing fewer tags than this are not evaluated
+# acceptance.z.score - Z-score converted (via pnorm) into the p-value cutoff for accepting a quality bin
+# remove.tag.anomalies, anomalies.z - whether to call remove.tag.anomalies() first, and with which z
+# accept.all.tags - if TRUE, quality information is ignored
+# Returns list(cross.correlation, peak, whs) plus quality.bin.acceptance when
+# quality bins were evaluated.
+get.binding.characteristics <- function(data,srange=c(50,500),bin=5,cluster=NULL,debug=F,min.tag.count=1e3,acceptance.z.score=3,remove.tag.anomalies=T,anomalies.z=5,accept.all.tags=F) {
+ # the logical argument shares its name with the function called below;
+ # R skips non-function bindings when resolving a call, so this works
+ if(remove.tag.anomalies) {
+ data <- remove.tag.anomalies(data,z=anomalies.z);
+ }
+
+ # take highest quality tag bin
+ # (the numerically smallest quality value is treated as the best bin)
+ if(!is.null(data$quality) & !accept.all.tags) {
+ min.bin <- min(unlist(lapply(data$quality,min)))
+ chrl <- names(data$tags); names(chrl) <- chrl;
+ otl <- lapply(chrl,function(chr) data$tags[[chr]][data$quality[[chr]]==min.bin]);
+ } else {
+ otl <- data$tags;
+ }
+ # remove empty chromosomes
+ otl <- otl[unlist(lapply(otl,length))!=0];
+
+
+ # calculate strand scc
+ if(!is.null(cluster)) {
+ cc <- clusterApplyLB(cluster,otl,tag.scc,srange=srange,bin=bin);
+ names(cc) <- names(otl);
+ } else {
+ cc <- lapply(otl,tag.scc,srange=srange,bin=bin);
+ }
+ # chromosome-average profile, computed by t.plotavcc with plotting disabled
+ ccl<-list(sample=cc);
+ ccl.av <- lapply(names(ccl),t.plotavcc,type='l',ccl=ccl,return.ac=T,ttl=list(sample=otl),plot=F)[[1]]
+ ccl.av <- data.frame(x=as.numeric(names(ccl.av)),y=as.numeric(ccl.av));
+
+ # find peak
+ pi <- which.max(ccl.av$y);
+
+ # determine width at third-height
+ # (height is measured relative to the last value of the profile)
+ th <- (ccl.av$y[pi]-ccl.av$y[length(ccl.av$y)])/3+ccl.av$y[length(ccl.av$y)]
+ whs <- max(ccl.av$x[ccl.av$y>=th]);
+
+ # NOTE(review): ccl.av$x is built with as.numeric() above, so whs is a double and
+ # is.integer(whs) is FALSE; this branch therefore appears to run on every call and
+ # always replaces the third-height estimate. Confirm whether !is.finite(whs) was intended.
+ if (! is.integer(whs)) { # Anshul: added this to avoid situations where whs ends up being -Inf
+ whs <- ccl.av$x[ min(c(2*pi,length(ccl.av$y))) ]
+ }
+
+ # determine acceptance of different quality bins
+
+ # calculates tag scc for the best tags, and combinations of best tag category with every other category
+ # for subsequent selection of acceptable categories
+ # (reads data, otl, ccl.av, min.bin, bin, cluster and min.tag.count from the enclosing function)
+ scc.acceptance.calc <- function() {
+
+ qr <- range(unlist(lapply(data$quality,range)))
+
+ # start with best tags
+
+ # determine half-width for scc calculations
+ pi <- which.max(ccl.av$y);
+
+ # determine width at half-height
+ th <- (ccl.av$y[pi]-ccl.av$y[length(ccl.av$y)])/2+ccl.av$y[length(ccl.av$y)]
+ lwhs <- max(ccl.av$x[ccl.av$y>=th])-ccl.av$x[pi];
+ # never narrower than 20 or 10 bins
+ lwhs <- max(c(20,bin*10,lwhs));
+ # local srange: a window centred on the peak, used only inside this helper
+ srange <- ccl.av$x[pi]+c(-lwhs,lwhs)
+
+ # calculate chromosome-average scc
+ t.scc <- function(tags) {
+ if(is.null(cluster)) {
+ cc <- lapply(tags,tag.scc,srange=srange,bin=bin);
+ } else {
+ cc <- clusterApplyLB(cluster,tags,tag.scc,srange=srange,bin=bin); names(cc) <- names(tags);
+ }
+ return(t.plotavcc(1,type='l',ccl=list(cc),ttl=list(tags),plot=F,return.ac=T))
+ }
+
+
+ # returns the chromosome-average scc for the best tags combined with the tags of
+ # quality bin `qual` (best tags alone when qual is the lowest quality value);
+ # returns NULL when the bin has fewer than min.tag.count tags
+ t.cat <- function(qual) {
+ # construct tag set
+ if(qual==qr[1]) {
+ ts <- otl;
+ } else {
+ nts <- names(otl); names(nts) <- nts;
+ # select tags
+ at <- lapply(nts,function(chr) data$tags[[chr]][data$quality[[chr]]==qual]);
+ ntags <- sum(unlist(lapply(at,length)));
+ if(ntags<min.tag.count) { return(NULL); }
+
+ # append to otl
+ ts <- lapply(nts,function(nam) c(otl[[nam]],at[[nam]]));
+ }
+
+ return(t.scc(ts));
+ }
+
+
+ # calculate cross-correlation values for each quality bin
+ ql <- sort(unique(unlist(lapply(data$quality,unique)))); names(ql) <- ql;
+
+ qccl <- lapply(ql,t.cat);
+
+ # acceptance tests
+ # the best bin (first after sorting) is always accepted; every other bin is accepted when a
+ # one-sided t-test finds its combined profile greater than the best-bin profile at the
+ # p-value implied by acceptance.z.score; bins skipped by t.cat (NULL) are rejected
+ ac <- c(T,unlist(lapply(qccl[-1],function(d) if(is.null(d)) { return(F) } else { t.test(d-qccl[[as.character(min.bin)]],alternative="greater")$p.value<pnorm(acceptance.z.score,lower.tail=F) }))); names(ac) <- names(qccl);
+ return(list(informative.bins=ac,quality.cc=qccl))
+ }
+
+ # quality-bin acceptance is only evaluated when quality data is present and in use
+ if(accept.all.tags | is.null(data$quality)) {
+ return(list(cross.correlation=ccl.av,peak=list(x=ccl.av$x[pi],y=ccl.av$y[pi]),whs=whs))
+ } else {
+ acc <- scc.acceptance.calc();
+ return(list(cross.correlation=ccl.av,peak=list(x=ccl.av$x[pi],y=ccl.av$y[pi]),whs=whs,quality.bin.acceptance=acc));
+ }
+
+}
+
+
+# select a set of informative tags based on the pre-calculated binding characteristics
+# data - list with per-chromosome $tags and $quality
+# binding.characteristics - list carrying $quality.bin.acceptance$informative.bins,
+#   a named logical vector whose names are quality values
+# Returns a per-chromosome list of the tags whose quality falls in an accepted bin;
+# all tags are returned when no acceptance information is available.
+select.informative.tags <- function(data,binding.characteristics=NULL) {
+  if(is.null(binding.characteristics)) {
+    return(data$tags)
+  }
+  acceptance <- binding.characteristics$quality.bin.acceptance
+  if(is.null(acceptance)) {
+    cat("binding characteristics doesn't contain quality selection info, accepting all tags\n")
+    return(data$tags)
+  }
+
+  # names of the quality bins flagged as informative
+  bin.flags <- acceptance$informative.bins
+  accepted.bins <- names(bin.flags)[bin.flags]
+
+  chr.names <- names(data$tags)
+  names(chr.names) <- chr.names
+  lapply(chr.names,function(chr) {
+    # qualities are matched as strings because bin names are character
+    keep <- as.character(data$quality[[chr]]) %in% accepted.bins
+    data$tags[[chr]][keep]
+  })
+}
+
+# -------- ROUTINES FOR CALLING BINDING POSITIONS ------------
+
+# determine binding positions
+# signal.data - IP tag lists
+# control.data - input tag lists
+# e.value - desired E-value threshold (either E-value or FDR threshold must be provided)
+# fdr - desired FDR threshold
+# min.dist - minimal distance between detected positions
+# tag.count.whs - size of the window to be used to estimate confidence interval of the peak fold enrichment ratios
+# enrichment.z - Z-score defining the desired confidence level for enrichment interval estimates
+# enrichment.background.scales - define how many times larger should be the window for estimating background
+# tag density when evaluating peak enrichment confidence intervals.
+# If multiple values are given, multiple independent interval estimates will be
+# calculated.
+# tec.filter - whether to mask out the regions that exhibit significant background enrichment
+# tec.window.size, tec.z - window size and Z-score for masking out significant background enrichment regions
+#
+# If the control.data is not provided, the method will assess significance of the determined binding positions
+# based on the randomizations of the original data. The following parameters control such randomizations:
+# n.randomizations - number of randomizations to be performed
+# shuffle.window - size of the bin that defines the tags that are kept together during randomization.
+# value of 0 means that all tags are shuffled independently
+#
+# Binding detection methods:
+# tag.wtd - default method.
+# must specify parameter "whs", which is the half-size of the window used to calculate binding scores
+# tag.lwcc - LWCC method;
+# must specify whs - a size of the window used to calculate binding scores
+# can specify isize (default=15bp) - size of the internal window that is masked out
+# Calls binding positions via lwcc.prediction, then annotates them with tag
+# counts and enrichment estimates.
+# f - fraction of signal tags to keep (subsampled when f<1)
+# mle.filter, min.mle.threshold - when mle.filter is TRUE, only positions whose
+#   "enr.mle" columns all exceed min.mle.threshold are kept
+# Returns the lwcc.prediction result extended with $f, $n, $n.bg (when control
+# data is given), enrichment estimates and $whs.
+find.binding.positions <- function(signal.data,f=1,e.value=NULL,fdr=NULL, masked.data=NULL,control.data=NULL,whs=200,min.dist=200,window.size=4e7,cluster=NULL,debug=T,n.randomizations=3,shuffle.window=1,min.thr=2,topN=NULL, tag.count.whs=100, enrichment.z=2, method=tag.wtd, tec.filter=T,tec.window.size=1e4,tec.z=5,tec.masking.window.size=tec.window.size, tec.poisson.z=5,tec.poisson.ratio=5, tec=NULL, n.control.samples=1, enrichment.scale.down.control=F, enrichment.background.scales=c(1,5,10), use.randomized.controls=F, background.density.scaling=T, mle.filter=F, min.mle.threshold=1, ...) {
+
+  if(f<1) {
+    if(debug) { cat("subsampling signal ... "); }
+    signal.data <- lapply(signal.data,function(x) sample(x,length(x)*f))
+    if(debug) { cat("done\n"); }
+  }
+
+
+  if(!is.null(control.data) & !use.randomized.controls) {
+    # limit both control and signal data to a common set of chromosomes
+    chrl <- intersect(names(signal.data),names(control.data));
+    signal.data <- signal.data[chrl];
+    control.data <- control.data[chrl];
+    control <- list(control.data);
+  } else {
+    control <- NULL;
+  }
+
+  prd <- lwcc.prediction(signal.data,min.dist=min.dist,whs=whs,window.size=window.size,e.value=e.value,fdr=fdr,debug=debug,n.randomizations=n.randomizations,shuffle.window=shuffle.window,min.thr=min.thr,cluster=cluster,method=method,bg.tl=control.data,mask.tl=masked.data, topN=topN, control=control,tec.filter=tec.filter,tec.z=tec.z,tec.window.size=tec.window.size, tec.masking.window.size=tec.masking.window.size, tec.poisson.z=tec.poisson.z,tec.poisson.ratio=tec.poisson.ratio, background.density.scaling=background.density.scaling, ...);
+
+  # add tag counts
+  chrl <- names(prd$npl); names(chrl) <- chrl;
+  prd$npl <- lapply(chrl,function(chr) {
+    pd <- prd$npl[[chr]];
+    pd$nt <- points.within(abs(signal.data[[chr]]),pd$x-tag.count.whs,pd$x+tag.count.whs,return.point.counts=T);
+    return(pd);
+  });
+  prd$f <- f;
+  prd$n <- sum(unlist(lapply(signal.data,length)));
+  if(!is.null(control.data)) {
+    prd$n.bg <- sum(unlist(lapply(control.data,length)));
+  }
+
+  # calculate enrichment ratios
+  prd <- calculate.enrichment.estimates(prd,signal.data,control.data=control.data,fraction=1,tag.count.whs=tag.count.whs,z=enrichment.z,scale.down.control=enrichment.scale.down.control,background.scales=enrichment.background.scales);
+
+  # The filter is applied whenever there is at least one chromosome and at least
+  # one "enr.mle" column. The previous ">1" guards silently skipped filtering for
+  # single-chromosome results and for a single background scale.
+  if(mle.filter && !is.null(prd$npl) && length(prd$npl)>0) {
+    mle.columns <- grep("enr.mle",colnames(prd$npl[[1]]));
+    if(length(mle.columns)>0) {
+      # drop=FALSE keeps a two-dimensional object for apply() even with one column
+      prd$npl <- lapply(prd$npl,function(d) d[apply(d[,mle.columns,drop=FALSE],1,function(x) all(x>min.mle.threshold)),])
+    }
+  }
+
+  prd$whs <- whs;
+
+  return(prd);
+}
+
+
+
+# -------- ROUTINES FOR WRITING OUT TAG DENSITY AND ENRICHMENT PROFILES ------------
+# calculate smoothed tag density, optionally subtracting the background
+# signal.tags, control.tags - per-chromosome tag vector lists
+# bandwidth, step - passed to densum as bw and step
+# bg.weight - multiplier applied to the control density before subtraction;
+#   when NULL and control.tags is given it is derived with dataset.density.ratio.
+#   An explicitly supplied value is now respected (it used to be overwritten).
+# tag.shift - added to tag positions before taking absolute values
+# rngl - optional per-chromosome list of ranges; defaults to the range of the shifted signal tags
+# scale.by.dataset.size - if TRUE, densities are multiplied by dataset.density.size(...)/1e6
+# Returns a per-chromosome list of data.frames with columns x and y.
+get.smoothed.tag.density <- function(signal.tags,control.tags=NULL,bandwidth=150,bg.weight=NULL,tag.shift=146/2,step=round(bandwidth/3),background.density.scaling=T,rngl=NULL,scale.by.dataset.size=F) {
+  chrl <- names(signal.tags); names(chrl) <- chrl;
+
+  # only estimate the background weight when the caller did not provide one
+  if(!is.null(control.tags) && is.null(bg.weight)) {
+    bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling);
+  }
+
+  if(scale.by.dataset.size) {
+    den.scaling <- dataset.density.size(signal.tags,background.density.scaling=background.density.scaling)/1e6;
+  } else {
+    den.scaling <- 1;
+  }
+
+  lapply(chrl,function(chr) {
+    ad <- abs(signal.tags[[chr]]+tag.shift);
+    rng <- NULL;
+    if(!is.null(rngl)) {
+      rng <- rngl[[chr]];
+    }
+    if(is.null(rng)) {
+      rng <- range(ad);
+    }
+
+    ds <- densum(ad,bw=bandwidth,from=rng[1],to=rng[2],return.x=T,step=step);
+    if(!is.null(control.tags)) {
+      if(!is.null(control.tags[[chr]])) {
+        # subtract the weighted control density evaluated over the same range
+        bsd <- densum(abs(control.tags[[chr]]+tag.shift),bw=bandwidth,from=rng[1],to=rng[2],return.x=F,step=step);
+        ds$y <- ds$y-bsd*bg.weight;
+      }
+    }
+    return(data.frame(x=seq(ds$x[1],ds$x[2],by=step),y=den.scaling*ds$y))
+  })
+}
+
+# get smoothed maximum likelihood estimate of the log2 signal to control enrichment ratio
+# signal.tags, control.tags - per-chromosome tag vector lists
+# pseudocount - added to both densities before taking log2
+# bg.weight - control scaling; derived with dataset.density.ratio when NULL
+# ... - forwarded to get.smoothed.tag.density
+# Returns (invisibly) a per-chromosome list of data.frames whose y column is
+# log2(signal+pseudocount) - log2(control+pseudocount) - log2(bg.weight).
+get.smoothed.enrichment.mle <- function(signal.tags, control.tags, tag.shift=146/2, background.density.scaling=F, pseudocount=1,bg.weight=NULL, ... ) {
+  # determine common range
+  shared.chrs <- intersect(names(signal.tags),names(control.tags))
+  names(shared.chrs) <- shared.chrs
+  common.ranges <- lapply(shared.chrs,function(chr) {
+    signal.range <- range(abs(signal.tags[[chr]]+tag.shift))
+    control.range <- range(abs(control.tags[[chr]]+tag.shift))
+    range(c(signal.range,control.range))
+  })
+  # both densities are evaluated over the same per-chromosome ranges
+  signal.density <- get.smoothed.tag.density(signal.tags, rngl=common.ranges, ..., scale.by.dataset.size=F)
+  control.density <- get.smoothed.tag.density(control.tags, rngl=common.ranges, ..., scale.by.dataset.size=F)
+  if(is.null(bg.weight)) {
+    bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling)
+  }
+  invisible(lapply(shared.chrs,function(chr) {
+    profile <- signal.density[[chr]]
+    profile$y <- log2(profile$y+pseudocount) - log2(control.density[[chr]]$y+pseudocount) - log2(bg.weight)
+    profile
+  }))
+}
+
+
+# returns a conservative upper/lower bound profile (log2) given signal tag list, background tag list and window scales
+# ftl, btl - signal and background per-chromosome tag lists
+# fws, bwsl - signal window size and background window sizes passed to mbs.enrichment.bounds
+# posl - optional per-chromosome list of positions; when given, only chromosomes
+#   named in it are processed and its positions become the x column
+# return.mle - if TRUE, log2 mle / lb / ub columns are added to the output
+# Returns a per-chromosome list of data.frames; y is log2 of the lower bound where
+# the lower bound exceeds 1, log2 of the upper bound where that is below 1, else 0.
+get.conservative.fold.enrichment.profile <- function(ftl,btl,fws,bwsl=c(1,5,25,50)*fws,step=50,tag.shift=146/2,alpha=0.05,use.most.informative.scale=F,quick.calculation=T,background.density.scaling=T,bg.weight=NULL,posl=NULL,return.mle=F) {
+  # include only chromosomes with more than 2 reads
+  ftl <- ftl[unlist(lapply(ftl,length))>2]
+  chr.names <- names(ftl)
+  names(chr.names) <- chr.names
+  if(!is.null(posl)) {
+    chr.names <- chr.names[chr.names %in% names(posl)]
+  }
+  # calculate background tag ratio
+  if(is.null(bg.weight)) {
+    bg.weight <- dataset.density.ratio(ftl,btl,background.density.scaling=background.density.scaling)
+  }
+  lapply(chr.names,function(chr) {
+    background.pos <- if(is.null(btl[[chr]])) c() else abs(btl[[chr]]+tag.shift)
+    signal.pos <- abs(ftl[[chr]]+tag.shift)
+    if(is.null(posl)) {
+      bounds <- mbs.enrichment.bounds(signal.pos,background.pos,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha)
+    } else {
+      bounds <- mbs.enrichment.bounds(signal.pos,background.pos,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha,pos=posl[[chr]])
+    }
+    # compose profile showing lower bound for enriched, upper bound for depleted regions
+    profile <- rep(1,length(bounds$mle))
+    enriched <- which(!is.na(bounds$lb) & bounds$lb>1)
+    profile[enriched] <- bounds$lb[enriched]
+    depleted <- which(!is.na(bounds$ub) & bounds$ub<1)
+    profile[depleted] <- bounds$ub[depleted]
+    profile <- log2(profile)
+    # x coordinates: caller-supplied positions, or the regular grid described by bounds$x
+    if(is.null(posl)) {
+      coords <- seq(bounds$x$s,bounds$x$e,by=bounds$x$step)
+    } else {
+      coords <- posl[[chr]]
+    }
+    if(return.mle) {
+      out <- data.frame(x=coords,y=profile,mle=log2(bounds$mle),lb=log2(bounds$lb),ub=log2(bounds$ub))
+    } else {
+      out <- data.frame(x=coords,y=profile)
+    }
+    return(out)
+  })
+}
+
+
+# write a per-chromosome $x/$y data structure into a wig file
+# dat - per-chromosome list of structures with $x and $y
+# fname - output file; the first chromosome writes the header, later ones append
+# feature - passed through to write.probe.wig
+# threshold - accepted for interface compatibility; not used by this function
+# zip - if TRUE, the file is compressed with the external `zip` program and the
+#   uncompressed file is removed only when compression succeeded
+# Returns the name of the file that was left on disk.
+writewig <- function(dat,fname,feature,threshold=5,zip=F) {
+  chrl <- names(dat); names(chrl) <- chrl;
+  invisible(lapply(chrl,function(chr) {
+    bdiff <- dat[[chr]];
+    # indices of positions with a non-missing value
+    ind <- which(!is.na(bdiff$y[seq_along(bdiff$x)]));
+    header <- chr==chrl[1];
+    write.probe.wig(chr,bdiff$x[ind],bdiff$y[ind],fname,append=!header,feature=feature,header=header);
+  }))
+  if(zip) {
+    zf <- paste(fname,"zip",sep=".");
+    # shQuote protects file names containing quotes or other shell metacharacters
+    status <- system(paste("zip",shQuote(zf),shQuote(fname)));
+    if(status!=0) {
+      # keep the uncompressed file rather than deleting the only copy
+      warning("zip failed (status ",status,"); keeping uncompressed file ",fname);
+      return(fname);
+    }
+    file.remove(fname);
+    return(zf);
+  } else {
+    return(fname);
+  }
+}
+
+
+
+# -------- ROUTINES FOR ANALYZING SATURATION PROPERTIES ------------
+
+# PUBLIC
+# calculate minimal saturation enrichment ratios (MSER)
+# signal.data, control.data - per-chromosome tag lists
+# n.chains - number of subsampling chains generated when `chains` is NULL
+# chains - optional precomputed chains; skips the subsampling step
+# cluster - optional cluster object used with clusterApplyLB
+# return.chains - if TRUE, returns list(mser=..., chains=...)
+# ... - forwarded to get.subsample.chain.calls
+# Returns the MSER values: the raw mser.chain.interpolation output when n.steps>1,
+# otherwise the unlisted $me element of each of its entries.
+get.mser <- function(signal.data,control.data,n.chains=5,step.size=1e5, chains=NULL, cluster=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), n.steps=1, ...) {
+  if(is.null(chains)) {
+    chain.ids <- c(1:n.chains)
+    names(chain.ids) <- chain.ids
+    if(!is.null(cluster)) {
+      chains <- clusterApplyLB(cluster,chain.ids,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...)
+      # clusterApplyLB results are named explicitly after the chain ids
+      names(chains) <- chain.ids
+    } else {
+      chains <- lapply(chain.ids,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...)
+    }
+  }
+  interpolated <- mser.chain.interpolation(chains=chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=F)
+  msers <- if(n.steps>1) interpolated else unlist(lapply(interpolated,function(d) d$me))
+  if(!return.chains) {
+    return(msers)
+  }
+  return(list(mser=msers,chains=chains))
+}
+
+# PUBLIC
+# interpolate MSER dependency on tag counts
+# signal.data, control.data - per-chromosome tag lists
+# target.fold.enrichment - fold enrichment for which the required tag count is predicted
+# chains - optional precomputed subsampling chains
+# return.chains - if TRUE, returns list(interpolation=..., chains=...)
+# ... - forwarded to get.mser
+# Returns, per background scale, list(prediction, log10.fit) from a linear fit of
+# log10(me-1) against log10(n); prediction is NA when any me equals 1.
+get.mser.interpolation <- function(signal.data,control.data,target.fold.enrichment=5,n.chains=10,n.steps=6,step.size=1e5, chains=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), excluded.steps=c(seq(2,n.steps-2)), ...) {
+  # test.agreement was previously passed under the misspelled name "test.agrement",
+  # so it never reached get.mser's own argument and leaked into `...` instead
+  msers <- get.mser(signal.data,control.data,n.chains=n.chains,n.steps=n.steps,step.size=step.size,chains=chains,test.agreement=test.agreement,return.chains=T,enrichment.background.scales=enrichment.background.scales,excluded.steps=excluded.steps, ...);
+
+  # adjust sizes in case a subset of chromosomes was used
+  mser <- mser.chain.interpolation(chains=msers$chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=T);
+  sr <- sum(unlist(lapply(signal.data,length)))/mser[[1]][[1]]$n[1];
+
+  # Subsampling each chain requires removing a fraction of each chromosome's
+  # tag list. To get the exact step.size, this often leaves chromosomes with
+  # a non-integer number of tags. The non-integer values are floored, so each
+  # chr can contribute at most 0.999.. <= 1 error to the step.size.
+  floor.error <- length(msers$chains[[1]][[1]]$npl)
+  intpn <- lapply(mser,function(ms) {
+    lmvo <- do.call(rbind,ms)
+    lmvo$n <- lmvo$n*sr;
+    # Don't select rows corresponding to excluded.steps
+    # Keep in mind that nd values are negative.
+    lmvo <- lmvo[lmvo$nd <= (lmvo$nd[1] + floor.error) & lmvo$nd >= (lmvo$nd[1] - floor.error),];
+    lmvo <- na.omit(lmvo);
+    # me==1 would give log10(0) below, so no prediction is possible
+    if(any(lmvo$me==1)) {
+      return(list(prediction=NA));
+    }
+    lmvo$n <- log10(lmvo$n); lmvo$me <- log10(lmvo$me-1)
+    # linear fit in log-log space, then solve for the n giving the target enrichment
+    emvf <- lm(me ~ n,data=lmvo);
+    tfe <- (log10(target.fold.enrichment-1)-coef(emvf)[[1]])/coef(emvf)[[2]];
+    tfen <- 10^tfe;
+    return(list(prediction=tfen,log10.fit=emvf));
+  })
+
+  if(return.chains) {
+    return(list(interpolation=intpn,chains=msers$chains))
+  } else {
+    return(intpn);
+  }
+}
+
+
+# output binding detection results to a text file
+# the file will contain a table with each row corresponding
+# to a detected position, with the following columns:
+# chr - chromosome or target sequence
+# pos - position of detected binding site on the chromosome/sequence
+# score - a score reflecting magnitude of the binding
+# Evalue - E-value corresponding to the peak magnitude
+# FDR - FDR corresponding to the peak magnitude
+# enrichment.lb - lower bound of the fold-enrichment ratio
+# enrichment.mle - maximum likelihood estimate of the fold-enrichment ratio
+# results - binding call structure with per-chromosome data.frames in $npl and
+#   the threshold type in $thr$type
+# filename - output path; overwritten
+# For results$thr$type=="topN" the rows carry no Evalue/FDR columns, so the header
+# omits them as well (it previously listed 7 names above 5-column rows).
+# Returns (invisibly) the list produced while writing each chromosome.
+output.binding.results <- function(results,filename) {
+  top.n <- isTRUE(results$thr$type=="topN");
+  if(top.n) {
+    header <- "chr\tpos\tscore\tenrichment.lb\tenrichment.mle";
+  } else {
+    header <- "chr\tpos\tscore\tEvalue\tFDR\tenrichment.lb\tenrichment.mle";
+  }
+  write(file=filename,header,append=F);
+  chrl <- names(results$npl); names(chrl) <- chrl;
+  invisible(lapply(chrl,function(chr) {
+    d <- results$npl[[chr]];
+    # chromosomes without calls contribute no rows
+    if(dim(d)[1]>0) {
+      if(top.n) {
+        od <- cbind(rep(chr,dim(d)[1]),subset(d,select=c(x,y,enr,enr.mle)))
+      } else {
+        od <- cbind(rep(chr,dim(d)[1]),subset(d,select=c(x,y,evalue,fdr,enr,enr.mle)))
+      }
+      write.table(od,file=filename,col.names=F,row.names=F,sep="\t",append=T,quote=F)
+    }
+  }))
+}
+
+
+# -------- LOW-LEVEL ROUTINES ------------
+
+# calculates tag strand cross-correlation for a range of shifts (on positive strand)
+# tags - signed tag positions (sign encodes strand)
+# srange - c(min,max) shift range; bin - bin size applied to positions and shifts
+# tt - optional precomputed table of binned signed positions
+# llim - bins with counts of at least llim times the mean count are discarded (NULL disables)
+# Returns a numeric vector of correlations named by shift (in original units).
+tag.scc <- function(tags,srange=c(50,250),bin=1,tt=NULL,llim=10) {
+  if(is.null(tt)) {
+    tt <- table(sign(tags)*as.integer(floor(abs(tags)/bin+0.5)));
+  }
+  if(!is.null(llim)) { l <- mean(tt); tt <- tt[tt<llim*l] }
+  tc <- as.integer(names(tt));
+  tt <- as.numeric(tt);
+
+  pti <- which(tc>0)
+  nti <- which(tc<0);
+
+  # binned positions (negative strand flipped to positive coordinates) and their counts
+  ptc <- tc[pti];
+  ntc <- (-1)*tc[nti];
+
+  ptv <- tt[pti];
+  ntv <- tt[nti];
+
+  trng <- range(c(range(ptc),range(ntc)))
+  l <- diff(trng)+1;
+  rm(tc,tt);
+
+  # per-position means over the whole span, then centre the non-empty positions
+  mp <- sum(ptv)*bin/l; mn <- sum(ntv)*bin/l;
+  ptv <- ptv-mp; ntv <- ntv-mn;
+  ss <- sqrt((sum(ptv*ptv)+(l-length(ptv))*mp^2) * (sum(ntv*ntv)+(l-length(ntv))*mn^2));
+
+  t.cor <- function(s) {
+    smi <- match(ptc+s,ntc);
+    p.matched <- !is.na(smi);
+    # logical mask of matched negative-strand positions; a mask is used instead of
+    # ntv[-na.omit(smi)] because negative indexing with a zero-length index selects
+    # nothing, which dropped the whole unmatched-negative term when no position matched
+    n.matched <- rep(FALSE,length(ntv));
+    n.matched[smi[p.matched]] <- TRUE;
+    # four contributions: both strands present, only +, only -, neither
+    return((sum(ptv[p.matched]*ntv[smi[p.matched]]) -
+            mn*sum(ptv[!p.matched]) -
+            mp*sum(ntv[!n.matched]) +
+            mp*mn*(l-length(ptv)-length(ntv)+sum(p.matched)))/ss);
+  }
+  shifts <- floor(seq(srange[1],srange[2],by=bin)/bin+0.5);
+  scc <- unlist(lapply(shifts,t.cor)); names(scc) <- shifts*bin;
+  return(scc);
+}
+
+
+# plot tag cross-correlation
+# ac - a cross-correlation vector named by lag, or a list of such vectors
+#   (the first list element is drawn with plot(), the rest are overlaid with lines())
+# rmw - window passed to runmean; rescale - rescale overlaid profiles to the range of the first
+# min.peak.x - if given, the maximum is searched only at lags above this value
+# plot.147, plot.grid, plot.max - optional vertical guide lines
+# Returns the lag of the maximum when plot.max is TRUE.
+t.plotcc <- function(ac, lab=c(10,5,7), ylab="correlation", xlab="lag", pch=19, grid.i=c(-5:5), grid.s=10, type='b', plot.grid=F, cols=c(1,2,4,"orange",8,"pink"), min.peak.x=NULL, xlim=NULL, plot.147=F, plot.max=T, rmw=1, rescale=F, legendx="right", ltys=rep(1,length(ac)), ...) {
+  if(is.list(ac)) {
+    cols <- cols[1:length(ac)];
+
+    if(!is.null(xlim)) {
+      vx <- as.numeric(names(ac[[1]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]);
+      ac[[1]] <- (ac[[1]])[vx];
+    } else {
+      xlim <- range(as.numeric(names(ac[[1]])));
+    }
+
+
+    plot(as.numeric(names(ac[[1]])),runmean(ac[[1]],rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, col=cols[1], xlim=xlim, lty=ltys[1], ...);
+    if(length(ac)>1) {
+      for(i in seq(2,length(ac))) {
+        irng <- range(ac[[i]]);
+        vx <- as.numeric(names(ac[[i]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]);
+        if(rescale) {
+          lines(as.numeric(names(ac[[i]])[vx]),runmean((ac[[i]][vx]-irng[1])/diff(irng)*diff(range(ac[[1]]))+min(ac[[1]]),rmw),col=cols[i],lty=ltys[i]);
+        } else {
+          lines(as.numeric(names(ac[[i]]))[vx],runmean(ac[[i]][vx],rmw),col=cols[i],lty=ltys[i]);
+        }
+      }
+    }
+    if(is.null(min.peak.x)) {
+      m <- as.numeric(names(ac[[1]])[which.max(ac[[1]])]);
+    } else {
+      sac <- (ac[[1]])[which(as.numeric(names(ac[[1]]))>min.peak.x)]
+      m <- as.numeric(names(sac)[which.max(sac)]);
+    }
+    legend(x="topright",bty="n",legend=c(names(ac)),col=cols,lty=ltys)
+  } else {
+    if(!is.null(xlim)) {
+      vx <- as.numeric(names(ac));
+      vx <- which(vx>=xlim[1] & vx<=xlim[2]);
+      ac <- ac[vx];
+    } else {
+      xlim <- range(as.numeric(names(ac)));
+    }
+
+    # lags are stored as character names; convert explicitly, as the list branch does
+    plot(as.numeric(names(ac)),runmean(ac,rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, xlim=xlim, ...);
+    if(is.null(min.peak.x)) {
+      m <- as.numeric(names(ac)[which.max(ac)]);
+    } else {
+      # numeric comparison: comparing the character names directly would coerce
+      # min.peak.x to a string and compare lexicographically
+      sac <- ac[which(as.numeric(names(ac))>min.peak.x)]
+      m <- as.numeric(names(sac)[which.max(sac)]);
+    }
+  }
+  if(plot.147) {
+    abline(v=147,lty=2,col=8);
+  }
+  if(plot.grid) {
+    abline(v=m+grid.i*grid.s,lty=3,col="pink");
+  }
+  if(plot.max) {
+    abline(v=m,lty=2,col=2);
+    legend(x=legendx,bty="n",legend=c(paste("max at ",m,"bp",sep="")));
+    return(m);
+  }
+}
+
+ # plot chromosome-average cross-correlation
+ # ci - index or name selecting one entry of ccl / ttl
+ # ccl - list of per-chromosome cross-correlation lists; ttl - matching list of tag lists
+ # return.ac - return the averaged profile; plot - draw it with t.plotcc
+ # Chromosomes are weighted by their tag counts; chromosomes whose profile contains
+ # NA are left out. NOTE(review): weights are not renormalised after that exclusion.
+ # A single chromosome is returned as is, an empty list gives c().
+ t.plotavcc <- function(ci, main=paste(ci,"chromosome average"), ccl=tl.cc, return.ac=F, ttl=tl, plot=T, ... ) {
+   cc <- ccl[[ci]];
+   if(length(cc)==1) { return(cc[[1]]) };
+   if(length(cc)==0) { return(c()) };
+   ac <- do.call(rbind,cc);
+   # omit NA chromosomes
+   ina <- apply(ac,1,function(d) any(is.na(d)));
+
+   tags <- ttl[[ci]];
+   avw <- unlist(lapply(tags,length)); avw <- avw/sum(avw);
+   # drop=FALSE keeps ac a matrix when a single chromosome remains; without it the
+   # subset collapses to a vector and the apply() below fails
+   ac <- ac[!ina,,drop=FALSE]; avw <- avw[!ina];
+   ac <- apply(ac,2,function(x) sum(x*avw));
+   if(plot) {
+     m <- t.plotcc(ac, main=main, ...);
+     if(!return.ac) { return(m) }
+   }
+   if(return.ac) { return(ac) }
+ }
+
+ # Plots the cross-correlation of every chromosome in ccl[[ci]] on a grid with ncol columns.
+ # ... - forwarded to t.plotcc
+ t.plotchrcc <- function(ci,ncol=4, ccl=tl.cc, ... ) {
+   cc <- ccl[[ci]];
+   # round the row count up so the grid has room for every chromosome even when
+   # their number is not a multiple of ncol (and is never zero rows)
+   par(mfrow = c(ceiling(length(cc)/ncol),ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
+   lapply(names(cc),function(ch) { t.plotcc(cc[[ch]],main=paste(ci,": chr",ch,sep=""), ...) })
+ }
+
+ # Plots tag-count-weighted chromosome averages for several cross-correlation sets at once.
+ # ci - index or name selecting one entry of ccl / rtl
+ # ccl - list whose selected entry is a list of per-chromosome profile lists
+ # rtl - list whose selected entry holds the per-chromosome tags used as weights
+ # ... - forwarded to t.plotcc
+ t.plotavccl <- function(ci, ccl=tl.ccl, main=paste(ci,"chromosome average"), rtl=tl, ... ) {
+   profile.sets <- ccl[[ci]]
+   # chromosome set is taken from the first profile list
+   chr.names <- names(profile.sets[[1]])
+   names(chr.names) <- chr.names
+   tag.counts <- unlist(lapply(rtl[[ci]][chr.names],length))
+   chr.weights <- tag.counts/sum(tag.counts)
+   averaged <- lapply(profile.sets,function(per.chr) {
+     # rows are chromosomes, columns are lags
+     stacked <- do.call(rbind,per.chr)
+     apply(stacked,2,function(lag.values) sum(lag.values*chr.weights))
+   })
+   t.plotcc(averaged, main=main, ...)
+ }
+
+ # Plots, for every chromosome, the overlaid cross-correlation profiles of all sets in ccl[[ci]].
+ # ... - forwarded to t.plotcc
+ t.plotchrccl <- function(ci,ccl=tl.ccl,ncol=4, ... ) {
+   # cc was previously used without being defined inside this function
+   cc <- ccl[[ci]];
+   # round the row count up so every chromosome gets a panel
+   par(mfrow = c(ceiling(length(cc[[1]])/ncol),ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
+   lapply(names(cc[[1]]),function(ch) { t.plotcc(lapply(cc,function(x) x[[ch]]),main=paste(ci,": chr",ch,sep=""), ...) })
+ }
+
+
+
+# Computes per-chromosome strand cross-correlation with tag.scc and plots the
+# chromosome average via t.plotavcc.
+# tl - per-chromosome tag list; srange - shift range (also used as xlim)
+# cluster - optional cluster object used with clusterApplyLB
+# Returns (invisibly) the value returned by t.plotavcc.
+show.scc <- function(tl,srange,cluster=NULL) {
+  if(is.null(cluster)) {
+    cc <- lapply(tl,tag.scc,srange=srange)
+  } else {
+    cc <- clusterApplyLB(cluster,tl,tag.scc,srange=srange)
+    names(cc) <- names(tl)
+  }
+  # single-panel layout
+  par(mfrow = c(1,1), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
+  ccl <- list(sample=cc)
+  averaged <- lapply(names(ccl),t.plotavcc,type='l',ccl=ccl,xlim=srange,return.ac=F,ttl=list(sample=tl),main="")
+  invisible(averaged[[1]])
+}
+
+# find regions of significant tag enrichment
+# signal.data, control.data - per-chromosome tag lists
+# window.size, z.thr, mcs, poisson.z, poisson.ratio, either, tag.shift - passed to
+#   tag.enrichment.clusters (as wsize, thr, mcs, min.tag.count.z, min.tag.count.ratio, either, tag.shift)
+# multiplier - extra factor applied to bg.weight
+# masking.window.size - every reported region is widened by half of this on each side
+# bg.weight - derived with dataset.density.ratio when NULL
+# Returns (invisibly) a per-chromosome list of cluster structures with adjusted $s and $e.
+find.significantly.enriched.regions <- function(signal.data,control.data,window.size=500,multiplier=1,z.thr=3,mcs=0,debug=F,background.density.scaling=T,masking.window.size=window.size,poisson.z=0,poisson.ratio=4,either=F,tag.shift=146/2,bg.weight=NULL) {
+  if(is.null(bg.weight)) {
+    bg.weight <- dataset.density.ratio(signal.data,control.data,background.density.scaling=background.density.scaling)
+  }
+
+  if(debug) {
+    cat("bg.weight=",bg.weight,"\n")
+  }
+  chr.names <- names(signal.data)
+  names(chr.names) <- chr.names
+  half.mask <- masking.window.size/2
+  invisible(lapply(chr.names,function(chr) {
+    clusters <- tag.enrichment.clusters(signal.data[[chr]],control.data[[chr]],bg.weight=bg.weight*multiplier,thr=z.thr,wsize=window.size,mcs=mcs,min.tag.count.z=poisson.z,min.tag.count.ratio=poisson.ratio,either=either,tag.shift=tag.shift)
+    # pad both ends of every region
+    clusters$s <- clusters$s-half.mask
+    clusters$e <- clusters$e+half.mask
+    clusters
+  }))
+}
+
+
+# given tag position vectors, find contigs of significant enrichment of signal over background
+# thr - z score threshold
+# mcs - minimal cluster size
+# bg.weight - fraction by which background counts should be multiplied
+# min.tag.count.z will impose a poisson constraint based on randomized signal in parallel of background constraint (0 - no constraint)
+# signal, background - signed tag position vectors
+# wsize - window size; tag.shift - added to positions before taking absolute values
+# tag.av.den - average signal tag density; estimated from `signal` when NULL and needed
+# min.tag.count.thr - minimal tag count threshold handed to the compiled routine.
+#   When min.tag.count.z>0 it is replaced by a Poisson quantile based on
+#   min.tag.count.ratio*tag.av.den*wsize; otherwise the supplied value (default 0)
+#   is used (it used to be reset to 0 unconditionally).
+# either - coerced to integer and passed to the compiled routine
+# Returns the result of the compiled "find_poisson_enrichment_clusters" routine.
+tag.enrichment.clusters <- function(signal,background,wsize=200,thr=3,mcs=1,bg.weight=1,min.tag.count.z=0,tag.av.den=NULL,min.tag.count.thr=0,min.tag.count.ratio=4,either=F,tag.shift=146/2) {
+  if(min.tag.count.z>0) {
+    # the density estimate is only needed for the Poisson threshold
+    if(is.null(tag.av.den)) {
+      tag.av.den <- length(signal)/diff(range(abs(signal)));
+    }
+    min.tag.count.thr <- qpois(pnorm(min.tag.count.z,lower.tail=F),min.tag.count.ratio*tag.av.den*wsize,lower.tail=F)
+  }
+
+  # make up combined position, flag vectors (flag 1 = signal, 0 = background),
+  # sorted by position
+  pv <- abs(c(signal,background)+tag.shift);
+  fv <- c(rep(1,length(signal)),rep(0,length(background)));
+  po <- order(pv);
+  pv <- pv[po];
+  fv <- fv[po];
+
+  # coerce every argument to the storage mode used in the .Call below
+  storage.mode(wsize) <- storage.mode(mcs) <- storage.mode(fv) <- "integer";
+  storage.mode(thr) <- storage.mode(pv) <- "double";
+  storage.mode(bg.weight) <- "double";
+  storage.mode(min.tag.count.thr) <- "double";
+  either <- as.integer(either);
+
+  z <- .Call("find_poisson_enrichment_clusters",pv,fv,wsize,thr,mcs,bg.weight,min.tag.count.thr,either)
+  return(z);
+}
+
+
+
+
+
+# estimates threshold, calculates predictions on complete data and randomized data
+# input: tvl
+# control - a list of control tag datasets
+# no randomization is done if control is supplied
+# return.rtp - return randomized tag peaks - do not fit thresholds or do actual predictions
+# topN - use min threshold to do a run, return topN peaks from entire genome
+# threshold - specify a user-defined threshold
+# Predicts binding positions on the signal tag list tvl and determines the score threshold.
+# If neither threshold nor topN is given, peaks are called both on the real data and on
+# control datasets (the supplied control list, or n.randomizations randomized copies of tvl),
+# E-values and FDRs are derived with get.eval.fdr.vectors(), and the threshold is taken from
+# e.value when it is specified and from fdr otherwise.
+# If threshold or topN is given, peaks are called directly with threshold (or min.thr) and,
+# for topN, restricted to scores strictly above the topN-th highest score.
+# When bg.tl is supplied and tec.filter=T, calls falling into the regions reported by
+# find.significantly.enriched.regions() are excluded.
+# Extra arguments (...) are passed on to window.call.mirror.binding().
+# Returns list(npl=<per-chromosome peak tables>, thr=<threshold description>), extended with
+# the core.data entries (return.core.data=T) or control.predictions (return.control.predictions=T).
+lwcc.prediction <- function(tvl,e.value=NULL, fdr=0.01, chrl=names(tvl), min.thr=0, n.randomizations=1, shuffle.window=1, debug=T, predict.on.random=F, shuffle.both.strands=T,strand.shuffle.only=F, return.rtp=F, control=NULL, print.level=0, threshold=NULL, topN=NULL, bg.tl=NULL, tec.filter=T, tec.window.size=1e3,tec.z=3, tec.masking.window.size=tec.window.size, tec.poisson.z=3,tec.poisson.ratio=4, bg.reverse=T, return.control.predictions=F, return.core.data=F, background.density.scaling=T, ... ) {
+
+ control.predictions <- NULL;
+ core.data <- list();
+
+ # tec (background exclusion regions) is only defined when a background is supplied and tec.filter=T
+ if(!is.null(bg.tl) & tec.filter) {
+ if(debug) { cat("finding background exclusion regions ... "); }
+ tec <- find.significantly.enriched.regions(bg.tl,tvl,window.size=tec.window.size,z.thr=tec.z,masking.window.size=tec.masking.window.size,poisson.z=tec.poisson.z,poisson.ratio=tec.poisson.ratio,background.density.scaling=background.density.scaling,either=T);
+ if(return.core.data) {
+ core.data <- c(core.data,list(tec=tec));
+ }
+ if(debug) { cat("done\n"); }
+ }
+
+
+ if(is.null(threshold) & is.null(topN)) { # threshold determination is needed
+ # generate control predictions
+ if(!is.null(control)) {
+ if(debug) { cat("determining peaks on provided",length(control),"control datasets:\n"); }
+ if(!is.null(bg.tl)) {
+ if(bg.reverse) {
+ # the signal itself serves as the background when calling peaks on the control
+ if(debug) { cat("using reversed signal for FDR calculations\n"); }
+ rbg.tl <- tvl;
+ } else {
+ if(debug) { cat("generating randomized (within chromosome) background ... "); }
+ rbg.tl <- lapply(bg.tl,function(d) {
+ if(length(d)<2) { return(d); }
+ rng <- range(abs(d));
+ rd <- round(runif(length(d),rng[1],rng[2]));
+ # assign the negative sign to as many random positions as there were negative tags
+ nrd <- sample(1:length(rd),length(which(d<0)));
+ rd[nrd] <- rd[nrd]*(-1);
+ return(rd);
+ })
+ if(debug) { cat("done\n"); }
+ }
+ } else {
+ rbg.tl <- NULL;
+ }
+ n.randomizations <- length(control);
+ #signal.size <- sum(unlist(lapply(tvl,length)));
+ rtp <- lapply(control,function(d) {
+ # calculate tag.weight
+ #tag.weight <- sum(unlist(lapply(tvl,length)))/sum(unlist(lapply(d,length)));
+ tag.weight <- dataset.density.ratio(tvl,d,background.density.scaling=background.density.scaling);
+ #cat("tag.weight=",tag.weight," ");
+ return(window.call.mirror.binding(d,min.thr=min.thr, tag.weight=tag.weight,bg.tl=rbg.tl, debug=debug, round.up=T,background.density.scaling=background.density.scaling, ...));
+ #return(window.call.mirror.binding(d,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist,cluster=cluster))
+ });
+ if(return.core.data) {
+ core.data <- c(core.data,list(rtp.unfiltered=rtp));
+ }
+ # same guard as for the real-data filtering below: tec does not exist without bg.tl
+ if(!is.null(bg.tl) & tec.filter) {
+ if(debug) { cat("excluding systematic background anomalies ... "); }
+ rtp <- lapply(rtp,filter.binding.sites,tec,exclude=T);
+ if(debug) { cat("done\n"); }
+ }
+ } else {
+ if(debug) { cat("determining peaks on ",n.randomizations,"randomized datasets:\n"); }
+ rtp <- lapply(1:n.randomizations,function(i) {
+ rd <- generate.randomized.data(tvl,shuffle.window=shuffle.window,shuffle.both.strands=shuffle.both.strands,strand.shuffle.only=strand.shuffle.only);
+ return(window.call.mirror.binding(rd,min.thr=min.thr,bg.tl=bg.tl, debug=debug, ...));
+ #return(window.call.mirror.binding(rd,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist))
+ });
+ }
+ if(return.control.predictions) {
+ control.predictions <- rtp;
+ }
+ rtp <- do.call(rbind,lapply(rtp,function(d) do.call(rbind,d))); # merge tables
+
+ # generate real data predictions
+ if(debug) { cat("determining peaks on real data:\n"); }
+ npl <- window.call.mirror.binding(tvl,min.thr=min.thr,bg.tl=bg.tl, debug=debug, background.density.scaling=background.density.scaling, ...);
+ #npl <- window.call.mirror.binding(tvl,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist,cluster=cluster);
+ if(return.core.data) {
+ core.data <- c(core.data,list(npl.unfiltered=npl));
+ }
+
+ if(!is.null(bg.tl) & tec.filter) {
+ if(debug) { cat("excluding systematic background anomalies ... "); }
+ npl <- filter.binding.sites(npl,tec,exclude=T);
+ if(debug) { cat("done\n"); }
+ }
+
+ # calculate E-value and FDRs for all of the peaks
+ if(debug) { cat("calculating statistical thresholds\n"); }
+ chrl <- names(npl); names(chrl) <- chrl;
+ # flatten the per-chromosome tables into one data frame, remembering the chromosome in $chr
+ npld <- do.call(rbind,lapply(names(npl),function(chr) { k <- npl[[chr]]; if(!is.null(k) & dim(k)[1]>0) { k$chr <- rep(chr,dim(k)[1]) }; return(k) }))
+ npld <- cbind(npld,get.eval.fdr.vectors(npld$y,rtp$y));
+ # correct for n.randomizations
+ npld$fdr <- npld$fdr/n.randomizations;
+ npld$evalue <- npld$evalue/n.randomizations;
+
+ if(return.core.data) {
+ core.data <- c(core.data,list(npld=npld));
+ }
+
+ # determine actual thresholds
+ if(is.null(e.value)) {
+ if(is.null(fdr)) { fdr <- 0.01; }
+ thr <- list(root=min(npld$y[npld$fdr<=fdr]),type="FDR",fdr=fdr)
+ if(debug) { cat("FDR",fdr,"threshold=",thr$root,"\n"); }
+ } else {
+ # determine threshold based on e-value
+ thr <- list(root=min(npld$y[npld$evalue<=e.value]),type="Evalue",e.value=e.value)
+ if(debug) { cat("E-value",e.value,"threshold=",thr$root,"\n"); }
+ }
+
+
+ npld <- npld[npld$y>=thr$root,];
+ if(dim(npld)[1]>0) {
+ # split the surviving peaks back into per-chromosome tables
+ npl <- tapply(c(1:dim(npld)[1]),as.factor(npld$chr),function(ii) {df <- npld[ii,]; df$chr <- NULL; return(df) });
+ } else {
+ npl <- list();
+ }
+ } else {
+ if(is.null(threshold)) {
+ thr <- list(root=min.thr,type="minimal");
+ } else {
+ thr <- list(root=threshold,type="user specified");
+ }
+
+ cat("calling binding positions using",thr$type,"threshold (",thr$root,") :\n");
+ npl <- window.call.mirror.binding(tvl=tvl,min.thr=thr$root,bg.tl=bg.tl, debug=debug, ...);
+ if(!is.null(bg.tl) & tec.filter) {
+ if(debug) { cat("excluding systematic background anomalies ... "); }
+ npl <- filter.binding.sites(npl,tec,exclude=T);
+ if(debug) { cat("done\n"); }
+ }
+
+ if(!is.null(topN)) {
+ # determine threshold based on topN peaks
+ ay <- unlist(lapply(npl,function(d) d$y));
+ if(length(ay)>topN) {
+ thr <- list(root=sort(ay,decreasing=T)[topN],type="topN",topN=topN);
+ cat(paste("determined topN threshold :",thr$root,"\n"));
+ # NOTE(review): the comparison is strict, so peaks scoring exactly thr$root are dropped - confirm this is intended
+ npl <- lapply(npl,function(d) d[d$y>thr$root,]);
+ }
+ }
+ }
+
+ if(return.core.data) {
+ return(c(list(npl=npl,thr=thr),core.data));
+ }
+ if(return.control.predictions & !is.null(control.predictions)) {
+ return(list(npl=npl,thr=thr,control.predictions=control.predictions));
+ }
+ return(list(npl=npl,thr=thr));
+}
+
+# window tag difference method
+# Thin R wrapper around the compiled "wtd" routine.
+# x, y   - tag positions of the two strands (tag.wtd() passes positive-strand tags as x and
+#          sign-flipped negative-strand tags as y)
+# s, e   - start and end of the region to scan; count vectors cover s-whs .. e+whs
+# bg.x/bg.y     - optional background tags; bg.subtract is switched on when bg.x is non-empty
+# mask.x/mask.y - optional masked positions, recorded as -1 in the count vectors
+# step   - when >1, all coordinates and window sizes are divided by step and rounded
+# Returns data.frame(x,y) built from the $x/$v fields of the compiled result when return.peaks=T,
+# otherwise list(x=<scanned range>, y=<raw compiled result>).
+wtd <- function(x,y,s,e,whs=200,return.peaks=T,min.thr=5,min.dist=200,step=1,direct.count=F,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=1,mask.x=NULL,mask.y=NULL,ignore.masking=F, bg.whs=whs, round.up=F, ...) {
+ # masking is also ignored when no mask positions were supplied at all
+ ignore.masking <- ignore.masking | (is.null(mask.x) & is.null(mask.y));
+ if(step>1) {
+ # rescale every coordinate and size to step units (round half up)
+ x <- floor(x/step+0.5); y <- floor(y/step+0.5)
+
+ if(!is.null(bg.x)) {
+ bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5)
+ }
+
+ if(!is.null(mask.x)) {
+ mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5)
+ }
+
+
+ whs <- floor(whs/step+0.5);
+ bg.whs <- floor(bg.whs/step+0.5);
+ min.dist <- floor(min.dist/step +0.5);
+ s <- floor(s/step+0.5)
+ e <- floor(e/step+0.5)
+ }
+
+ # scale bg.weight, since within calculation they are considered independent
+ bg.weight <- bg.weight*tag.weight;
+
+ # range covered by the count vectors; index 1 corresponds to coordinate rx[1]
+ rx <- c(s-whs,e+whs);
+
+ # compile tag vectors
+ xt <- table(x);
+ xh <- integer(diff(rx)+1);
+ xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt);
+
+ yt <- table(y);
+ yh <- integer(diff(rx)+1);
+ yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt);
+
+ # compile background vectors
+ if(!is.null(bg.x) & length(bg.x)>0) {
+ bg.subtract <- 1;
+
+ bg.xt <- table(bg.x);
+ bg.xh <- integer(diff(rx)+1);
+ bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- as.integer(bg.xt);
+ rm(bg.xt);
+
+ bg.yt <- table(bg.y);
+ bg.yh <- integer(diff(rx)+1);
+ bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt);
+ rm(bg.yt);
+
+ # adjust bg.weight according to bg.whs
+ if(bg.whs!=whs) {
+ bg.weight <- bg.weight*whs/bg.whs;
+ }
+ } else {
+ bg.subtract <- 0;
+ bg.xh <- bg.yh <- c();
+ }
+
+ # record masked positions
+ if(!ignore.masking) {
+ if(!is.null(mask.x) & length(mask.x)>0) {
+ # only positions without tags, and inside the scanned range, are marked as masked (-1)
+ mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt)));
+ mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]];
+ xh[mvx-rx[1]+1] <- -1;
+ }
+
+ if(!is.null(mask.y) & length(mask.y)>0) {
+ mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt)));
+ mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]];
+ yh[mvy-rx[1]+1] <- -1;
+ }
+ }
+
+ rm(xt,yt);
+
+ if(round.up) { round.up <- 1; } else { round.up <- 0; }
+
+ # coerce every argument to the storage mode handed to the compiled routine
+ storage.mode(xh) <- storage.mode(yh) <- "integer";
+ storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer";
+ nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(bg.whs) <- "integer";
+ rp <- as.integer(return.peaks);
+ dcon <- as.integer(direct.count);
+ storage.mode(rp) <- storage.mode(min.dist) <- "integer";
+ storage.mode(min.thr) <- "double";
+ storage.mode(dcon) <- "integer";
+ storage.mode(tag.weight) <- "double";
+ storage.mode(bg.weight) <- "double";
+ storage.mode(bg.subtract) <- "integer";
+ storage.mode(round.up) <- "integer";
+ im <- as.integer(ignore.masking);
+ storage.mode(im) <- "integer";
+ z <- .Call("wtd",xh,yh,whs,rp,min.dist,min.thr,dcon,tag.weight,im,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up);
+ if(return.peaks) {
+ # translate vector offsets back to original coordinates
+ return(data.frame(x=(z$x+rx[1])*step,y=z$v));
+ } else {
+ return(list(x=rx*step,y=z));
+ }
+}
+
+
+# Splits a signed tag vector (ctv), and the optional background (bg.ctv) and mask (mask.ctv)
+# vectors, into per-strand positions within [s,e] and hands them to wtd().
+# Negative-strand positions are sign-flipped. If either strand has no tags in the window,
+# an empty result is returned without calling wtd(). Extra arguments go to wtd().
+tag.wtd <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) {
+ # strand selectors for the window [s,e]
+ plus.strand <- function(v) v[v>=s & v<=e];
+ minus.strand <- function(v) (-1)*v[v<=-s & v>=-e];
+
+ x <- plus.strand(ctv);
+ y <- minus.strand(ctv);
+
+ bg.x <- bg.y <- NULL;
+ if(!is.null(bg.ctv)) {
+ bg.x <- plus.strand(bg.ctv);
+ bg.y <- minus.strand(bg.ctv);
+ }
+
+ mask.x <- mask.y <- NULL;
+ if(!is.null(mask.ctv)) {
+ mask.x <- plus.strand(mask.ctv);
+ mask.y <- minus.strand(mask.ctv);
+ }
+
+ # regular case: both strands have tags
+ if(length(x)>0 & length(y)>0) {
+ return(wtd(x,y,s,e,return.peaks=return.peaks, bg.x=bg.x,bg.y=bg.y, mask.x=mask.x,mask.y=mask.y, ...))
+ }
+
+ # degenerate case: at least one strand is empty
+ if(return.peaks) {
+ return(data.frame(x=c(),y=c()));
+ }
+ rx <- range(c(x,y));
+ return(list(x=rx,y=numeric(diff(rx)+1)));
+}
+
+# Randomizes tag positions while keeping tags of one block together.
+# window.size==0 : every tag gets an independent uniform position (truncated to integer)
+# window.size==1 : every distinct position is redrawn uniformly, multiplicity is kept
+# otherwise      : blocks of window.size are moved to random block positions, offsets within a block are kept
+# The input is returned unchanged (with a warning) when there are fewer than 3 tags or the
+# tag range does not exceed window.size.
+# note: all coordinates should be positive
+tag.block.shuffle <- function(tags,window.size=100) {
+ n.tags <- length(tags);
+ if(n.tags<3) {
+ warning("too few tags for shuffling");
+ return(tags);
+ }
+ span <- range(tags);
+ width <- diff(span);
+ if(width<=window.size) {
+ warning(paste("tag range (",width,") is smaller than shuffle window size"));
+ return(tags);
+ }
+
+ if(window.size==0) {
+ return(as.integer(runif(n.tags,min=span[1],max=span[2])))
+ }
+ if(window.size==1) {
+ multiplicity <- table(tags);
+ return(rep(runif(length(multiplicity),min=span[1],max=span[2]),as.integer(multiplicity)))
+ }
+
+ # block index and within-block offset of every tag
+ block <- tags %/% window.size;
+ offset <- tags %% window.size;
+
+ # draw a new block index for every distinct block
+ distinct.blocks <- unique(block);
+ block.span <- range(distinct.blocks);
+ new.blocks <- as.integer(runif(length(distinct.blocks),min=block.span[1],max=block.span[2]));
+ return(new.blocks[match(block,distinct.blocks)]*window.size+offset);
+}
+
+
+# calculate window cross-correlation
+# Thin R wrapper around the compiled "lwcc" routine.
+# x, y   - tag positions of the two strands (tag.lwcc() passes positive-strand tags as x and
+#          sign-flipped negative-strand tags as y)
+# s, e   - start and end of the region to scan; count vectors cover s-whs .. e+whs
+# bg.x/bg.y     - optional background tags; bg.subtract is switched on when bg.x is non-empty
+# mask.x/mask.y - optional masked positions, recorded as -1 in the count vectors
+# step   - when >1, all coordinates and window sizes are divided by step and rounded
+# Returns data.frame(x,y) built from the $x/$v fields of the compiled result when return.peaks=T,
+# otherwise list(x=<scanned range>, y=<raw compiled result>).
+lwcc <- function(x,y,s,e,whs=100,isize=20,return.peaks=T,min.thr=1,min.dist=100,step=1,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=NULL,mask.x=NULL,mask.y=NULL,bg.whs=whs,round.up=F) {
+ if(step>1) {
+ # rescale every coordinate and size to step units (round half up)
+ x <- floor(x/step+0.5); y <- floor(y/step+0.5)
+
+ if(!is.null(bg.x)) {
+ bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5)
+ }
+
+ if(!is.null(mask.x)) {
+ mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5)
+ }
+
+ whs <- floor(whs/step+0.5);
+ bg.whs <- floor(bg.whs/step+0.5);
+ isize <- floor(isize/step+0.5);
+ min.dist <- floor(min.dist/step +0.5);
+ s <- floor(s/step+0.5)
+ e <- floor(e/step+0.5)
+ }
+
+ # scale bg.weight, since within calculation they are considered independent
+ # (with the default bg.weight=NULL this yields a zero-length numeric)
+ bg.weight <- bg.weight*tag.weight;
+
+
+ # range covered by the count vectors; index 1 corresponds to coordinate rx[1]
+ rx <- c(s-whs,e+whs);
+ xt <- table(x);
+ xh <- integer(diff(rx)+1);
+ xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt);
+
+ yt <- table(y);
+
+ yh <- integer(diff(rx)+1);
+ yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt);
+
+ # compile background vectors
+ if(!is.null(bg.x) & length(bg.x)>0) {
+ bg.subtract <- 1;
+
+ bg.xt <- table(bg.x);
+ bg.xh <- integer(diff(rx)+1);
+ bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- as.integer(bg.xt);
+ rm(bg.xt);
+
+ bg.yt <- table(bg.y);
+ bg.yh <- integer(diff(rx)+1);
+ bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt);
+ rm(bg.yt);
+
+ # adjust bg.weight according to bg.whs
+ bg.weight <- bg.weight*(whs-isize)/bg.whs;
+ } else {
+ bg.subtract <- 0;
+ bg.xh <- bg.yh <- c();
+ }
+
+ # record masked positions
+ if(!is.null(mask.x) & length(mask.x)>0) {
+ # only positions without tags, and inside the scanned range, are marked as masked (-1)
+ mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt)));
+ mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]];
+
+ xh[mvx-rx[1]+1] <- -1;
+ }
+
+ if(!is.null(mask.y) & length(mask.y)>0) {
+ mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt)));
+ mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]];
+ yh[mvy-rx[1]+1] <- -1;
+ }
+
+ rm(xt,yt);
+ if(round.up) { round.up <- 1; } else { round.up <- 0; }
+
+ # coerce every argument to the storage mode handed to the compiled routine
+ storage.mode(xh) <- storage.mode(yh) <- "integer";
+ storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer";
+ nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(isize) <- storage.mode(bg.whs) <- "integer";
+ rp <- as.integer(return.peaks);
+ storage.mode(rp) <- storage.mode(min.dist) <- "integer";
+ storage.mode(min.thr) <- "double";
+ storage.mode(tag.weight) <- "double";
+ storage.mode(bg.weight) <- "double";
+ storage.mode(bg.subtract) <- "integer";
+ storage.mode(round.up) <- "integer";
+
+ # allocate return arrays
+ #cc <- numeric(nx); storage.mode(cc) <- "double";
+ z <- .Call("lwcc",xh,yh,whs,isize,rp,min.dist,min.thr,tag.weight,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up);
+ if(return.peaks) {
+ # translate vector offsets back to original coordinates
+ return(data.frame(x=(z$x+rx[1])*step,y=z$v));
+ } else {
+ return(list(x=rx*step,y=z));
+ }
+}
+
+
+# Prepares per-strand tag positions within [s,e] from the signed vectors ctv, bg.ctv and
+# mask.ctv (negative-strand positions are sign-flipped) and runs lwcc() on them.
+# When one of the strands has no tags in the window an empty result is returned instead.
+# Extra arguments are passed to lwcc().
+tag.lwcc <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) {
+ # returns list(pos=..., neg=...) for a signed tag vector, or NULLs when the vector is NULL
+ by.strand <- function(v) {
+ if(is.null(v)) { return(list(pos=NULL,neg=NULL)); }
+ list(pos=v[v>=s & v<=e],neg=(-1)*v[v<=-s & v>=-e]);
+ }
+
+ sig <- by.strand(ctv);
+ x <- sig$pos;
+ y <- sig$neg;
+
+ bg <- by.strand(bg.ctv);
+ bg.x <- bg$pos;
+ bg.y <- bg$neg;
+
+ msk <- by.strand(mask.ctv);
+ mask.x <- msk$pos;
+ mask.y <- msk$neg;
+
+ have.both.strands <- !(length(x)==0 | length(y) ==0);
+ if(have.both.strands) {
+ return(lwcc(x,y, s,e,return.peaks=return.peaks, bg.x=bg.x,bg.y=bg.y, mask.x=mask.x,mask.y=mask.y, ...))
+ }
+ if(!return.peaks) {
+ rx <- range(c(x,y));
+ return(list(x=rx,y=numeric(diff(rx)+1)));
+ }
+ return(data.frame(x=c(),y=c()));
+}
+
+# Calls binding positions chromosome by chromosome.
+# For every chromosome of tvl the signal, background (bg.tl) and mask (mask.tl) tag vectors are
+# bundled and handed to window.chr.call.mirror.binding(), either sequentially or, when a
+# cluster is given, through clusterApplyLB(). bg.weight is obtained from dataset.density.ratio()
+# when a background is supplied and is NULL otherwise.
+# Extra parameters are passed on to window.chr.call.mirror.binding().
+# Returns a list of per-chromosome results named by chromosome.
+window.call.mirror.binding <- function(tvl,window.size=4e7, debug=T, cluster=NULL, bg.tl=NULL, mask.tl=NULL, background.density.scaling=T, ...) {
+ chrl <- names(tvl);
+
+ # determine bg.weight
+ bg.weight <- NULL;
+ if(!is.null(bg.tl)) {
+ bg.weight <- dataset.density.ratio(tvl,bg.tl,background.density.scaling=background.density.scaling);
+ }
+ if(debug) {
+ cat("bg.weight=",bg.weight," ");
+ }
+
+ names(chrl) <- chrl;
+
+ # collects signal, background and mask tags of one chromosome
+ chr.bundle <- function(chr) {
+ bg.ctv <- NULL; if(!is.null(bg.tl)) { bg.ctv <- bg.tl[[chr]]; };
+ mask.ctv <- NULL; if(!is.null(mask.tl)) { mask.ctv <- mask.tl[[chr]]; };
+ return(list(ctv=tvl[[chr]],bg.ctv=bg.ctv,mask.ctv=mask.ctv))
+ }
+
+ if(!is.null(cluster)) {
+ # parallel call: bg.ctv and mask.ctv travel inside the bundles
+ bl <- clusterApplyLB(cluster,lapply(chrl,chr.bundle),window.chr.call.mirror.binding,window.size=window.size,debug=debug, bg.weight=bg.weight, ...);
+ names(bl) <- chrl;
+ return(bl);
+ }
+
+ return(lapply(chrl,function(chr) {
+ cb <- chr.bundle(chr);
+ window.chr.call.mirror.binding(cb,window.size=window.size,chr=chr,debug=debug, bg.weight=bg.weight, bg.ctv=cb$bg.ctv, mask.ctv=cb$mask.ctv, ...);
+ }));
+}
+
+# Calls binding positions on one chromosome by scanning it in consecutive windows.
+# ctvl        - list with $ctv (signed tag positions), $bg.ctv and $mask.ctv; the bg.ctv and
+#               mask.ctv arguments are overridden by the list entries
+# window.size - length of each scanned window
+# method      - peak calling function invoked per window (tag.wtd by default); it receives
+#               s, e, ctv, return.peaks=T, bg.ctv, mask.ctv and the extra arguments (...)
+# Returns a data.frame with columns x and y taken from the first two columns of the combined
+# per-window results; an empty data.frame when fewer than two tags are available.
+window.chr.call.mirror.binding <- function(ctvl,window.size,debug=T, chr="NA", cluster=NULL, method=tag.wtd, bg.ctv=NULL, mask.ctv=NULL, ...) {
+ ctv <- ctvl$ctv; bg.ctv <- ctvl$bg.ctv; mask.ctv <- ctvl$mask.ctv;
+ if(is.null(ctv)) { return(data.frame(x=c(),y=c())) }
+ if(length(ctv)<2) { return(data.frame(x=c(),y=c())) }
+
+ # coordinate range over both strands
+ dr <- range(abs(unlist(ctv)));
+ # always scan at least one window: when all tags share one position diff(dr) is 0, and
+ # 1:0 would otherwise iterate over c(1,0), scanning a spurious extra window
+ n.windows <- max(1,ceiling(diff(dr)/window.size));
+
+
+ pinfo <- c();
+ if(debug) {
+ cat(paste("processing ",chr," in ",n.windows," steps [",sep=""));
+ }
+ for(i in seq_len(n.windows)) {
+ s <- dr[1]+(i-1)*window.size;
+ npn <- method(s=s, e=s+window.size,ctv=ctv, return.peaks=T, bg.ctv=bg.ctv, mask.ctv=mask.ctv, ... );
+ if(length(npn) > 0) { pinfo <- rbind(pinfo,npn) }
+ if(debug) {
+ cat(".");
+ }
+ }
+ if(debug) {
+ cat(paste("] done (",dim(pinfo)[1],"positions)\n"));
+ } else {
+ cat(".");
+ }
+ return(data.frame(x=pinfo[,1],y=pinfo[,2]));
+}
+
+# Generates a randomized copy of a per-chromosome list of signed tag vectors.
+# strand.shuffle.only=T  : positions are kept, every tag gets a random strand (sign)
+# shuffle.both.strands=T : positive and negative tags are each shuffled with tag.block.shuffle()
+# shuffle.both.strands=F : only positive tags are shuffled, negative tags are kept as they are
+# chrl selects the chromosomes to process. Returns the list of randomized tag vectors.
+generate.randomized.data <- function(data,shuffle.window=1,shuffle.both.strands=T,strand.shuffle.only=F,chrl=names(data)) {
+ names(chrl) <- unlist(chrl);
+ if(strand.shuffle.only) {
+ # shuffle just strand assignment, not tag positions
+ rt <- lapply(data[unlist(chrl)],function(tv) tv*sample(c(-1,1),length(tv),replace=T));
+ } else {
+ rt <- lapply(data[unlist(chrl)],function(tv) {
+ # a logical mask is used instead of tv[-which(tv>0)]: with no positive tags the negative
+ # index would be empty and would select nothing, discarding all negative-strand tags
+ pos <- !is.na(tv) & tv>0;
+ pos.tags <- tag.block.shuffle(tv[pos],window.size=shuffle.window);
+ neg.tags <- tv[!pos];
+ if(shuffle.both.strands) {
+ neg.tags <- tag.block.shuffle(neg.tags,window.size=shuffle.window);
+ }
+ return(c(pos.tags,neg.tags));
+ });
+ }
+ return(rt);
+}
+
+# determine threshold based on E value
+# for efficiency chrl should include just one or two small chromosomes
+# optional parameters are passed to window.call.mirror.binding()
+# Peaks are called on randomized copies of the chrl chromosomes of tvl (or on the supplied
+# control datasets) and the score threshold is located at which the number of such peaks
+# equals the target E value (e.value scaled by n.randomizations and by the fraction of tags
+# contained in chrl).
+# Returns the table of control peaks when return.rtp=T, list(root=...) when the bracketing
+# search runs out of iterations, and the uniroot() result otherwise.
+determine.lwcc.threshold <- function(tvl,chrl=names(tvl),e.value=100, n.randomizations=1, min.thr=1, debug=F, tol=1e-2, shuffle.window=1, shuffle.both.strands=T, return.rtp=F, control=NULL, strand.shuffle=F, ...) {
+ names(chrl) <- unlist(chrl);
+
+ # determine fraction of total tags contained in the specified nucleosomes
+ ntags <- sum(unlist(lapply(tvl,function(cv) length(cv))));
+ nctags <- sum(unlist(lapply(chrl, function(cn) length(tvl[[cn]]))));
+ # calculate actual target E value
+ if(!is.null(control)) {
+ n.randomizations <- length(control);
+ }
+ eval <- e.value*n.randomizations*nctags/ntags
+ if(eval<1) {
+ warning("specified e.value and set of chromosomes results in target e.value of less than 1");
+ eval <- 1;
+ }
+
+ if(debug) {
+ cat(paste("randomizations =",n.randomizations," chromosomes =",length(chrl),"\n"))
+ cat(paste("adjusted target eval =",eval,"\ngenerating randomized tag peaks ..."));
+ }
+
+ # get peaks on randomized tags
+ if(is.null(control)) {
+ rtp <- data.frame(do.call(rbind,lapply(1:n.randomizations,function(i) {
+ if(strand.shuffle) {
+ # shuffle just strand assignment, not tag positions
+ rt <- lapply(tvl[unlist(chrl)],function(tv) tv*sample(c(-1,1),length(tv),replace=T));
+ } else {
+ rt <- lapply(tvl[unlist(chrl)],function(tv) {
+ # a logical mask is used instead of tv[-which(tv>0)]: with no positive tags the negative
+ # index would be empty and would select nothing, discarding all negative-strand tags
+ pos <- !is.na(tv) & tv>0;
+ pos.tags <- tag.block.shuffle(tv[pos],window.size=shuffle.window);
+ neg.tags <- tv[!pos];
+ if(shuffle.both.strands) {
+ neg.tags <- tag.block.shuffle(neg.tags,window.size=shuffle.window);
+ }
+ return(c(pos.tags,neg.tags));
+ });
+ }
+ if(debug) {
+ cat(".");
+ }
+ rl <- window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...);
+
+ return(do.call(rbind,rl))
+ #return(do.call(rbind,window.call.mirror.binding(rt,min.thr=min.thr, debug=F, whs=100,isize=10,window.size=3e7,min.dist=200)))
+ })));
+
+ } else {
+ if(debug) {
+ cat(" using provided controls ");
+ }
+ rtp <- data.frame(do.call(rbind,lapply(control,function(rt) do.call(rbind,window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...)))))
+ }
+
+ if(return.rtp) {
+ return(rtp)
+ }
+
+ if(debug) {
+ cat(" done\nfinding threshold .");
+ }
+
+ # determine range and starting value
+ rng <- c(min.thr,max(na.omit(rtp$y)))
+
+ # find E value threshold
+ # difference between the target E value and the number of control peaks at or above nthr
+ count.nucs.f <- function(nthr) {
+ return(eval-length(which(rtp$y>=nthr)));
+ }
+
+ # estimate position of the root by downward bisection iterations
+ # mvp holds the probed thresholds (most recent first), mv the matching function values
+ mv <- c(eval); mvp <- c(rng[2]); ni <- 1;
+ max.it <- 2*as.integer(log2(rng[2]/rng[1])+0.5);
+ while((ni<=max.it) & (mv[1]>=0)) {
+ np <- mvp[1]/2;
+ npv <- count.nucs.f(np);
+ mv <- c(npv,mv);
+ mvp <- c(np,mvp);
+ ni <- ni+1;
+ }
+
+
+ if(ni>max.it) {
+ # determine lowest value
+ if(debug) {
+ cat(paste("exceeded max.it (",max.it,"), returning lowest point",signif(mvp[1],4)));
+ }
+ return(list(root=mvp[1]))
+ } else {
+ # the last two probes bracket the sign change
+ rng <- mvp[1:2];
+ if(mv[2]==0) rng[2] <- mvp[3];
+ if(debug) {
+ cat(paste("bound to (",signif(rng[1],4),signif(rng[2],4),") "));
+ }
+ }
+
+ # find root on the right side
+ x <- uniroot(count.nucs.f,rng,tol=tol);
+ #x$max <- o$par;
+ #x$f.max <- (-1)*o$value;
+ if(debug) {
+ cat(paste(" done (thr=",signif(x$root,4),")\n"));
+ }
+ return(x);
+
+}
+
+
+# Determines membership of points x in the fragments given by starts fs and ends fe, using
+# the compiled "points_within" routine. Points are sorted before the call unless sorted=T,
+# and the result is put back into the input order (except with return.point.counts=T).
+# Callers in this file treat a value of -1 as "not within any fragment".
+# Returns NULL for empty input.
+points.within <- function(x,fs,fe,return.list=F,return.unique=F,sorted=F,return.point.counts=F) {
+ if(is.null(x) || length(x) < 1) { return(c()) };
+ if(!sorted) {
+ # remember where every point ends up after sorting
+ ox <- rank(x,ties="first");
+ x <- sort(x);
+ }
+
+ # fragment boundaries in increasing order; starts carry +index, ends carry -index
+ boundaries <- c(fs,fe);
+ frag.index <- seq(1:length(fs));
+ signed.index <- c(frag.index,-1*frag.index)[order(boundaries)];
+ boundaries <- sort(boundaries);
+
+ storage.mode(boundaries) <- "integer";
+ storage.mode(signed.index) <- "integer";
+ storage.mode(x) <- "integer";
+
+ # logical options are passed as 0/1 integers
+ as.flag <- function(on) { v <- if(on) 1 else 0; storage.mode(v) <- "integer"; v }
+ iu <- as.flag(return.unique);
+ il <- as.flag(return.list);
+ rpc <- as.flag(return.point.counts);
+
+ result <- .Call("points_within",x,boundaries,signed.index,il,iu,rpc);
+ if(!sorted & !return.point.counts) {
+ result <- result[ox];
+ }
+ return(result);
+}
+
+
+# determine coordinates of points x relative to signed
+# positions pos within size range
+# Unless sorted=T, x is sorted and pos is ordered by absolute value before the compiled
+# "get_relative_coordinates" routine is called, and the $i field of the result is mapped back
+# to the original order of pos. With sorted=T only the $i field is returned.
+get.relative.coordinates <- function(x,pos,size,sorted=F) {
+ if(!sorted) {
+ pos.order <- order(abs(pos));
+ x <- sort(x);
+ pos <- pos[pos.order];
+ }
+ storage.mode(size) <- "integer";
+ storage.mode(pos) <- "integer";
+ storage.mode(x) <- "integer";
+ rel <- .Call("get_relative_coordinates",x,pos,size);
+ if(sorted) {
+ return(rel$i);
+ }
+ rel$i <- pos.order[rel$i];
+ return(rel);
+}
+
+# given magnitude values for signal (x) and control (y),
+# return a data frame with $evalue and $fdr for every element of x
+# evalue is the number of control values above the signal value, plus one;
+# fdr is the pseudo-count corrected ratio of control to signal values above the signal value,
+# and is set to the minimal FDR for all values at or above the point where that minimum is reached.
+get.eval.fdr.vectors <- function(x,y) {
+ n.signal <- length(x);
+ n.control <- length(y);
+ if(n.signal==0) { return(data.frame(evalue=c(),fdr=c())) }
+ if(n.control==0) { return(data.frame(evalue=rep(0,n.signal),fdr=rep(1,n.signal))) }
+
+ signal.cdf <- ecdf(x);
+ control.cdf <- ecdf(y);
+
+ # number of control / signal values strictly above each signal value
+ control.above <- (1-control.cdf(x))*n.control;
+ signal.above <- (1-signal.cdf(x))*n.signal;
+
+ fdr <- (control.above+0.5)/(signal.above+0.5); # with pseudo-counts
+ fdr[signal.above==0] <- min(fdr); # correct for undercounts
+
+ # everything at or above the smallest value reaching the minimal FDR gets that FDR
+ lowest.fdr <- min(fdr);
+ lowest.fdr.x <- min(x[fdr==lowest.fdr]);
+ fdr[x>=lowest.fdr.x] <- lowest.fdr;
+
+ return(data.frame(evalue=(control.above+1),fdr=fdr));
+}
+
+
+# filter predictions by the tag enrichment clusters ( chr list of $s/$e dfs)
+# exclude=T removes the calls falling into the clusters, exclude=F keeps only those calls.
+# Chromosomes with a NULL or zero-length entry yield NULL; tables without rows are returned as they are.
+filter.binding.sites <- function(bd,tec,exclude=F) {
+ chrl <- names(bd); names(chrl) <- chrl;
+ lapply(chrl,function(chr) {
+ calls <- bd[[chr]];
+ if(is.null(calls) || length(calls)==0) { return(NULL) };
+ if(!(dim(calls)[1]>0)) { return(calls); }
+
+ clusters <- tec[[chr]];
+ if(!(length(clusters$s)>0)) {
+ # no clusters on this chromosome: nothing to exclude, nothing to keep
+ if(exclude) { return(calls); }
+ return(data.frame(x=c(),y=c()));
+ }
+
+ membership <- points.within(calls$x,clusters$s,clusters$e);
+ if(exclude) {
+ selected <- which(membership== -1);
+ } else {
+ selected <- which(membership> -1);
+ }
+ return(calls[selected,]);
+ });
+}
+
+
+# PUBLIC
+# generate predictions on sequential (chained) subsamples of data
+# if step.size <1, it is interpreted as a fraction and each subsequent subsample
+# is of a size (1-step.size)*N (N - size of the previous subsample);
+# otherwise the step.size is interpreted as a number of tags, and each subsample is of the size N-step.size
+# n.steps        - number of subsampling steps; when NULL it is derived from step.size
+# excluded.steps - step indices (starting at 0) for which no predictions are generated
+# test.chromosomes - optional subset of chromosomes; a tag-count step.size is scaled accordingly
+# Extra arguments are passed to find.binding.positions().
+# Returns a list of find.binding.positions() results named by step index.
+get.subsample.chain.calls <- function(signal.data,control.data,n.steps=NULL,step.size=1e6,subsample.control=F,debug=F,min.ntags=1e3, excluded.steps=c(), test.chromosomes=NULL, ... ) {
+
+ if(!is.null(test.chromosomes)) {
+ # adjust step size
+ sz <- sum(unlist(lapply(signal.data,length)))
+ signal.data <- signal.data[test.chromosomes];
+ control.data <- control.data[test.chromosomes];
+
+ if(step.size>1) {
+ step.size <- step.size*sum(unlist(lapply(signal.data,length)))/sz;
+ # cat("adjusted step.size=",step.size,"\n");
+ }
+ }
+
+ if(is.null(n.steps)) {
+ if(step.size<1) {
+ # down to 10%: every step keeps a fraction (1-step.size) of the tags, so the number
+ # of steps k solves (1-step.size)^k = 0.1
+ if(step.size<=0) {
+ # nothing is removed per step; run a single round
+ n.steps <- 0;
+ } else {
+ n.steps <- log(0.1)/log(1-step.size);
+ }
+ } else {
+ n.steps <- floor(sum(unlist(lapply(signal.data,length)))/step.size)
+ }
+ }
+ if(subsample.control & !is.null(control.data)) {
+ # normalize control to the signal size
+ if(debug) { cat("pre-subsampling control.\n"); }
+ bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length)))
+ control.data <- lapply(control.data,function(d) sample(d,length(d)*bg.weight,replace=(bg.weight>1)))
+ }
+ calls <- list();
+ callnames <- c();
+ for(i in 0:n.steps) {
+ if(debug) { cat("chained subsample step",i,":\n"); }
+ if(!i %in% excluded.steps) {
+ ans <- list(find.binding.positions(signal.data=signal.data,control.data=control.data,debug=debug, skip.control.normalization=T, ...));
+ names(ans) <- as.character(c(i));
+ calls <- c(calls,ans);
+ callnames <- c(callnames,i);
+ }
+ # subsample
+ if(step.size<1) {
+ # fraction steps
+ f <- 1-step.size;
+ } else {
+ # bin steps
+ sz <- sum(unlist(lapply(signal.data,length)));
+ f <- (sz-step.size)/sz;
+ if(f<=0) break;
+ }
+ if(debug) { cat("chained subsampling using fraction",f,".\n"); }
+ signal.data <- lapply(signal.data,function(d) sample(d,length(d)*f));
+ if(subsample.control & !is.null(control.data)) {
+ control.data <- lapply(control.data,function(d) sample(d,length(d)*f));
+ }
+ # stop once too few tags remain
+ sz <- sum(unlist(lapply(signal.data,length)));
+ if(sz<min.ntags) break;
+ }
+ names(calls) <- callnames;
+ return(calls);
+}
+
+
+# chain-subsample dataset and calculate MSER interpolation
+# chains   - precomputed list of subsample chains; when NULL, n.chains chains are generated
+#            with get.subsample.chain.calls() from signal.data/control.data (extra arguments
+#            are passed to it)
+# enrichment.background.scales - for every scale a separate result is computed; scales >1 use
+#            the enrichment field named "<enr.field>.<scale>"
+# test.agreement / agreement.distance - required agreement level and matching distance used when
+#            comparing consecutive chain steps
+# Returns a list (named by scale) of data frames with columns n and me, ordered by decreasing n;
+# me is the median (return.median=T) or trimmed mean over chains. With return.lists=T the
+# per-chain data frames (n, me, nd) are returned instead.
+mser.chain.interpolation <- function(signal.data=NULL,control.data=NULL,chains=NULL,n.chains=5,debug=F, enrichment.background.scales=c(1,5), test.agreement=0.99, agreement.distance=50, return.median=F, mean.trim=0.1, enr.field="enr", return.lists=F, ...) {
+ if(is.null(chains)) {
+ cn <- c(1:n.chains); names(cn) <- cn;
+ tf <- function(i, ...) get.subsample.chain.calls(signal.data,control.data,debug=debug, enrichment.background.scales=enrichment.background.scales, ...);
+ chains <- lapply(cn,tf,...);
+ }
+ names(enrichment.background.scales) <- enrichment.background.scales;
+ lapply(enrichment.background.scales,function(scale) {
+ actual.enr.field <- enr.field;
+ if(scale>1) {
+ actual.enr.field <- paste(actual.enr.field,scale,sep=".");
+ }
+
+ cvl <- lapply(chains,function(chain) {
+ # dataset sizes ($n of every chain element) in decreasing order, and the differences between them
+ nn <- sort(unlist(lapply(chain,function(d) d$n)),decreasing=T);
+ nd <- diff(nn);
+ nn <- nn[-length(nn)];
+ me <- lapply(c(2:length(chain)),function(i) {
+ # compare step i-1 (reference) against step i; chain[i] is a one-element list
+ sla <- t.precalculate.ref.peak.agreement(chain[[i-1]],chain[i],agreement.distance=agreement.distance,enr.field=actual.enr.field)
+ me <- t.find.min.saturated.enr(sla,thr=1-test.agreement)
+ # lowest enrichment value present in both steps, but not below 1
+ menr <- max(min(na.omit(unlist(lapply(chain[[i-1]]$npl,function(d) d[actual.enr.field])))),min(na.omit(unlist(lapply(chain[[i]]$npl,function(d) d[actual.enr.field])))),1)
+ # an estimate at or below that floor is reported as 1
+ if(me<=menr) { me <- 1; };
+ return(me);
+ })
+ data.frame(n=nn,me=unlist(me),nd=nd);
+ });
+ if(return.lists) { return(cvl) }
+ cvl <- na.omit(do.call(rbind,cvl));
+ # summarize across chains for every dataset size
+ if(return.median) {
+ tv <- tapply(cvl$me,as.factor(cvl$n),median)
+ } else {
+ tv <- tapply(cvl$me,as.factor(cvl$n),mean,trim=mean.trim);
+ }
+ df <- data.frame(n=as.numeric(names(tv)),me=as.numeric(tv));
+ return(df[order(df$n,decreasing=T),])
+ })
+}
+
+
+
+# returns agreement as a function of dataset size, possibly filtering peaks by min.enr threshold, and by max.fdr
+# Every step of a chain (from the second on) is compared against the first chain element,
+# which serves as the reference.
+# Returns a data frame with columns n (dataset size) and me (median or trimmed mean, over
+# chains, of the fraction of reference peaks matched), ordered by decreasing n.
+chain.to.reference.comparison <- function(chains,min.enr=NULL,debug=F,agreement.distance=50, return.median=F, mean.trim=0.1, enr.field="enr",max.fdr=NULL) {
+ cvl <- lapply(chains,function(chain) {
+ # filter chain by fdr
+ if(!is.null(max.fdr)) {
+ chain <- lapply(chain,function(d) { d$npl <- lapply(d$npl,function(cd) cd[cd$fdr<=max.fdr,]); return(d); });
+ }
+ nn <- sort(unlist(lapply(chain,function(d) d$n)),decreasing=T);
+ nn <- nn[-length(nn)];
+ me <- lapply(c(2:length(chain)),function(i) {
+ sla <- t.precalculate.ref.peak.agreement(chain[[1]],chain[i],agreement.distance=agreement.distance,enr.field=enr.field)
+ # calculate overlap
+ x <- lapply(sla,function(mpd) {
+ if(!is.null(min.enr)) {
+
+ # keep reference peaks with enrichment of at least min.enr (NA counts as failing)
+ me <- mpd$re >= min.enr;
+ me[is.na(me)] <- F;
+ mpd <- mpd[me,];
+ # matches whose own enrichment is below min.enr (or NA) do not count as overlap
+ ome <- mpd$oe < min.enr;
+ ome[is.na(ome)] <- T;
+ mpd$ov[ome] <- 0;
+ }
+ return(mean(mpd$ov));
+ })
+ })
+
+ data.frame(n=nn,me=unlist(me));
+ });
+
+ cvl <- na.omit(do.call(rbind,cvl));
+ # summarize across chains for every dataset size
+ if(return.median) {
+ tv <- tapply(cvl$me,as.factor(cvl$n),median)
+ } else {
+ tv <- tapply(cvl$me,as.factor(cvl$n),mean,trim=mean.trim);
+ }
+ df <- data.frame(n=as.numeric(names(tv)),me=as.numeric(tv));
+ return(df[order(df$n,decreasing=T),])
+}
+
+
+# estimates enrichment confidence interval based on 2*tag.count.whs window around each position, and a z-score (alpha/2)
+# for every entry of background.scales the enrichment is estimated using a background tag window widened by that factor;
+# fields for scales >1 carry the scale as a suffix (e.g. enr.5)
+# adds $enr (lower bound), $enr.ub (upper bound) and $enr.mle fields (plus $nbgt when control.data is given)
+# the per-position signal tag counts are read from the $nt column of binding.positions$npl
+calculate.enrichment.estimates <- function(binding.positions,signal.data=NULL,control.data=NULL,fraction=1,tag.count.whs=100,z=2,effective.genome.size=3e9,scale.down.control=F,background.scales=c(1),bg.weight=NULL) {
+ f <- fraction;
+ # upper tail probability corresponding to the z-score
+ qv <- pnorm(z,lower.tail=F);
+ cn <- names(binding.positions$npl); names(cn) <- cn;
+
+ if(is.null(control.data)) {
+ # estimate from gamma distribution
+ # expected number of signal tags per window assuming uniform coverage of the genome
+ fg.lambda <- f*sum(unlist(lapply(signal.data,length)))*2*tag.count.whs/effective.genome.size;
+ binding.positions$npl <- lapply(binding.positions$npl,function(d) {
+ d$enr <- qgamma(qv,d$nt,scale=1)/fg.lambda;
+ d$enr.ub <- qgamma(1-qv,d$nt,scale=1)/fg.lambda;
+ d$enr.mle <- d$nt/fg.lambda;
+ return(d);
+ });
+ } else {
+ # estimate using beta distribution
+ if(is.null(bg.weight)) {
+ bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length)))
+ }
+
+ if(scale.down.control) {
+ # sample down control to be the same size as true signal.data (bg.weight*f)
+ control.data <- lapply(control.data,function(d) sample(d,length(d)*bg.weight*f,replace=(f*bg.weight>1)))
+ #bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length)))
+ bg.weight <- 1/f;
+
+ }
+
+ # record the parameters used for the estimates
+ binding.positions$enrichment.bg.weight <- bg.weight;
+ binding.positions$enrichment.whs <- tag.count.whs;
+ binding.positions$enrichment.z <- z;
+
+ binding.positions$npl <- lapply(cn,function(chr) {
+ d <- binding.positions$npl[[chr]];
+
+ edf <- lapply(background.scales,function(background.width.multiplier) {
+ # factor converting the signal/background count ratio into an enrichment value
+ sig.mult <- bg.weight*f/background.width.multiplier;
+ # background tags within the (widened) window around every position
+ # NOTE(review): presumably points.within(..., return.point.counts=T) yields one count per window - confirm against the compiled routine
+ nbg <- points.within(abs(control.data[[chr]]),d$x-tag.count.whs*background.width.multiplier,d$x+tag.count.whs*background.width.multiplier,return.point.counts=T,return.unique=F);
+
+ nfg <- d$nt;
+
+
+ # Poisson ratio Bayesian LB with non-informative prior (Clopper & Pearson 1934)
+ nf <- ((nfg+0.5)/(nbg+0.5))*qf(1-qv,2*(nfg+0.5),2*(nbg+0.5),lower.tail=F)
+ nf <- nf/sig.mult;
+
+ ub <- ((nfg+0.5)/(nbg+0.5))*qf(qv,2*(nfg+0.5),2*(nbg+0.5),lower.tail=F)
+ ub <- ub/sig.mult;
+
+ mle <- (nfg+0.5)/(nbg+0.5);
+ mle <- mle/sig.mult;
+ # replace NULLs with zero-length vectors so that data.frame() below still yields all four columns
+ if(is.null(nbg)) { nbg <- numeric(0) }
+ if(is.null(nf)) { nf <- numeric(0) }
+ if(is.null(ub)) { ub <- numeric(0) }
+ if(is.null(mle)) { mle <- numeric(0) }
+ return(data.frame(nbg=nbg,lb=nf,ub=ub,mle=mle))
+ })
+
+ # rename the columns of every scale and bind them side by side
+ adf <- do.call(cbind,lapply(c(1:length(background.scales)),function(i) {
+ df <- edf[[i]];
+ cn <- c("nbgt","enr","enr.ub","enr.mle");
+ if(background.scales[i]>1) {
+ cn <- paste(cn,as.character(background.scales[i]),sep=".");
+ }
+ names(df) <- cn;
+ return(df);
+ }))
+
+ return(cbind(d,adf));
+ });
+ }
+
+ return(binding.positions);
+}
+
+
+# precalculate peak agreement of a sampling list given a reference
+# ref - reference prediction (its $npl per-chromosome peak tables are used)
+# sf  - list of sampled predictions, each with an $npl list
+# For every sampled prediction a data frame is returned with one row per reference peak:
+# ov (1 if a sampled peak lies within agreement.distance, 0 otherwise), re (enrichment of the
+# reference peak) and oe (enrichment of the matched sampled peak, NA without a match).
+t.precalculate.ref.peak.agreement <- function(ref,sf,agreement.distance=50,enr.field="enr") {
+ ref.peaks <- ref$npl;
+ chrs <- names(ref.peaks); names(chrs) <- chrs;
+
+ # agreement matrix for one chromosome of one sampled prediction
+ chr.agreement <- function(chr,sample.peaks) {
+ rp <- ref.peaks[[chr]];
+ if(dim(rp)[1]<1) { return(cbind(ov=c(),re=c(),oe=c())) };
+ sp <- sample.peaks[[chr]];
+ match.index <- points.within(rp$x,sp$x-agreement.distance,sp$x+agreement.distance);
+ match.index[match.index==-1] <- NA;
+ ref.enr <- rp[,enr.field]
+ sample.enr <- sp[,enr.field][match.index];
+ if(length(sample.enr)==0) { sample.enr <- rep(NA,length(ref.enr)); }
+ return(cbind(ov=as.integer(!is.na(match.index)),re=ref.enr,oe=sample.enr));
+ }
+
+ # for each sampling round
+ lapply(sf,function(sd) {
+ data.frame(do.call(rbind,lapply(chrs,chr.agreement,sample.peaks=sd$npl)))
+ })
+}
+
+
+# find minimal saturated enrichment given a list of replicate agreement matrices (for one fraction)
+# pal - list of agreement tables with columns ov (overlap flag), re (reference enrichment) and
+#       oe (enrichment of the matched peak), as produced by t.precalculate.ref.peak.agreement()
+# thr - tolerated disagreement; the lowest enrichment is sought at which the cumulative
+#       fraction of overlapping positions is still >= 1-thr
+# Returns the estimate from the combined replicates by default; the per-replicate estimates
+# (return.vector=T), their median (return.median=T), or the average number of reference peaks
+# per replicate at or above the estimate (return.number.of.peaks=T).
+t.find.min.saturated.enr <- function(pal,thr=0.01,plot=F,return.number.of.peaks=F,plot.individual=T,return.median=F,return.vector=F) {
+ nr <- length(pal);
+ # merge replicate data frames
+ mpd <- data.frame(do.call(rbind,pal));
+
+ # missing enrichment values are treated as infinitely high
+ mpd$re[is.na(mpd$re)] <- Inf;
+ mpd$oe[is.na(mpd$oe)] <- Inf;
+
+
+
+ # round values to 2 digits to avoid miscounting
+ mpd$re <- round(mpd$re,digits=2);
+ mpd$oe <- round(mpd$oe,digits=2);
+
+ # the lower of the two enrichment values, in decreasing order
+ me <- pmin(mpd$re,mpd$oe);
+ ome <- order(me,decreasing=T);
+ df <- data.frame(me=me[ome],ov=mpd$ov[ome]);
+ # recdf(-v)*ren gives the number of reference peaks with enrichment >= v
+ recdf <- ecdf(-mpd$re); ren <- length(mpd$re);
+
+ # collapse equal peak heights
+ xk <- tapply(df$ov,as.factor(df$me),sum); xk <- data.frame(ov=as.numeric(xk),me=as.numeric(names(xk))); xk <- xk[order(xk$me,decreasing=T),];
+
+
+ # cumulative overlap fraction among reference peaks at or above each enrichment level
+ cso <- cumsum(xk$ov)/(recdf(-xk$me)*ren);
+ cso[is.na(cso)] <- 0;
+ cso[!is.finite(cso)] <- 0;
+ # xk is sorted by decreasing enrichment, so the last qualifying index is the lowest enrichment
+ mv <- max(which(cso >= 1-thr))
+ menr <- xk$me[mv];
+
+ # same estimate for every replicate separately (without rounding or collapsing)
+ ir <- lapply(pal,function(d) {
+ d$re[is.na(d$re)] <- Inf;
+ d$oe[is.na(d$oe)] <- Inf;
+
+ me <- pmin(d$re,d$oe);
+ ome <- order(me,decreasing=T);
+ df <- data.frame(me=me[ome],ov=d$ov[ome]);
+ cso <- cumsum(df$ov)/c(1:length(df$ov));
+ mv <- max(which(cso >= 1-thr))
+ menr <- df$me[mv];
+ return(list(df=df,menr=menr));
+ });
+
+ if(plot) {
+ par(mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8);
+ plot(df$me,cumsum(df$ov)/c(1:length(df$ov)),type='l',ylab="fraction of positions overlapping with reference",xlab="minimal enrichment of binding positions",xlim=c(min(df$me),2*menr));
+ abline(h=1-thr,lty=2,col=4)
+ if(plot.individual) {
+ lapply(ir,function(d) {
+ df <- d$df;
+ lines(df$me,cumsum(df$ov)/c(1:length(df$ov)),col=8);
+ # NOTE(review): menr here resolves to the combined estimate of the enclosing function, not d$menr, although the legend labels these lines "individual MSERs" - confirm
+ abline(v=menr,col="pink",lty=3)
+ });
+ lines(df$me,cumsum(df$ov)/c(1:length(df$ov)),col=1);
+ }
+ abline(v=menr,col=2,lty=2)
+ legend(x="bottomright",lty=c(1,2,1,3,2),col=c(1,2,8,"pink",4),legend=c("combined samples","combined sample MSER","individual samples","individual MSERs","consistency threshold"));
+ }
+
+ if(return.number.of.peaks) {
+ # re-merge the original (unrounded) tables and count reference peaks at or above the estimate
+ mpd <- data.frame(do.call(rbind,pal));
+ return(length(which(!is.na(mpd$re) & mpd$re >=menr))/nr);
+ } else {
+ if(return.vector) {
+ return(unlist(lapply(ir,function(d) d$menr)));
+ }
+ if(return.median) {
+ return(median(unlist(lapply(ir,function(d) d$menr))));
+ } else {
+ return(menr);
+ }
+ }
+}
+
+
+
+# Determine the d1/d2 dataset size ratio.
+#
+# d1, d2 - per-chromosome lists of (signed) tag positions
+# background.density.scaling - if F, the plain ratio of tag counts is returned;
+#        if T, regions of significant tag enrichment (found in either dataset,
+#        padded by wsize/2 on both sides) are masked before the ratio is taken
+# min.tag.count.z, wsize, mcs - passed on to tag.enrichment.clusters()
+#
+# returns: a single number, size(d1)/size(d2)
+dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) {
+  if(!background.density.scaling) {
+    total1 <- sum(unlist(lapply(d1,length)))
+    total2 <- sum(unlist(lapply(d2,length)))
+    return(total1/total2)
+  }
+
+  pad <- wsize/2
+  # only chromosomes present in both datasets are compared
+  shared.chr <- intersect(names(d1),names(d2))
+  per.chr <- lapply(shared.chr,function(chr) {
+    tags1 <- abs(d1[[chr]])
+    tags2 <- abs(d2[[chr]])
+    cl1 <- tag.enrichment.clusters(tags1,c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F)
+    cl2 <- tag.enrichment.clusters(tags2,c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F)
+    # padded boundaries of the enriched regions of both datasets
+    mask.s <- c(cl1$s,cl2$s)-pad
+    mask.e <- c(cl1$e,cl2$e)+pad
+    # points.within() reports -1 for tags outside every masked region
+    c(length(which(points.within(tags1,mask.s,mask.e)==-1)),
+      length(which(points.within(tags2,mask.s,mask.e)==-1)))
+  })
+  ntcs <- apply(do.call(rbind,per.chr),2,sum)
+  return(ntcs[1]/ntcs[2])
+}
+
+# Effective size of a dataset, following the same logic as
+# dataset.density.ratio.
+#
+# d1 - per-chromosome list of (signed) tag positions
+# background.density.scaling - if F, the total tag count is returned; if T,
+#        only tags outside regions of significant tag enrichment (padded by
+#        wsize/2 on both sides) are counted
+# min.tag.count.z, wsize, mcs - passed on to tag.enrichment.clusters()
+dataset.density.size <- function(d1,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) {
+  if(!background.density.scaling) {
+    return(sum(unlist(lapply(d1,length))))
+  }
+
+  pad <- wsize/2
+  outside <- vapply(names(d1),function(chr) {
+    tags <- abs(d1[[chr]])
+    cl <- tag.enrichment.clusters(tags,c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F)
+    # points.within() reports -1 for tags outside every masked region
+    length(which(points.within(tags,cl$s-pad,cl$e+pad)==-1))
+  },integer(1))
+  return(sum(outside))
+}
+
+# Earlier version of dataset.density.ratio: compares background tag densities
+# (tags outside enriched regions divided by the unmasked genome length)
+# instead of raw background tag counts.
+#
+# d1, d2 - per-chromosome lists of (signed) tag positions
+# background.density.scaling - if F, the plain ratio of tag counts is returned
+# min.tag.count.z, wsize, mcs - passed on to tag.enrichment.clusters()
+old.dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) {
+  if(!background.density.scaling) {
+    total1 <- sum(unlist(lapply(d1,length)))
+    total2 <- sum(unlist(lapply(d2,length)))
+    return(total1/total2)
+  }
+
+  # per chromosome: n - tags outside enriched regions, s - span of the tag
+  # positions, m - total length of the (padded, merged) enriched regions
+  chromosome.stats <- function(tags) {
+    tags <- abs(tags)
+    cl <- tag.enrichment.clusters(tags,c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F)
+    cl$s <- cl$s-wsize/2
+    cl$e <- cl$e+wsize/2
+    # merge overlapping padded regions
+    cl <- regionset.intersection.c(list(cl),do.union=T)
+    n.outside <- length(which(points.within(tags,cl$s,cl$e)==-1))
+    c(n=n.outside,s=diff(range(tags)),m=sum(cl$e-cl$s))
+  }
+
+  st1 <- lapply(d1,chromosome.stats)
+  st2 <- lapply(d2,chromosome.stats)
+
+  # align the second dataset to the chromosome order of the first
+  st2 <- data.frame(do.call(rbind,st2[names(st1)]))
+  st1 <- data.frame(do.call(rbind,st1))
+
+  # genome size
+  genome.size <- sum(pmax(st1$s,st2$s))
+
+  density1 <- sum(st1$n)/(genome.size-sum(st1$m))
+  density2 <- sum(st2$n)/(genome.size-sum(st2$m))
+  return(density1/density2)
+}
+
+
+
+
+# calculate cumulative density based on sum of scaled gaussian curves
+# (by Michael Tolstorukov)
+#
+# vin - input vector; bw - standard deviation; dw - gaussian cutoff in stdev
+# from/to - coordinate range to cover (defaults to the range of vin, must be non-negative)
+# step - distance between consecutive output positions
+# optional match.wt.f is a function that will return weights for a tag vector
+# output - if return.x=F vector of cumulative density values corresponding to positions from..to (by step)
+# output - if return.x=T a list with $x (first and last position), $y (the cumulative density) and $step
+#
+# note: bw, dw, step and the range are coerced to integer before being handed to the C routine
+densum <- function(vin,bw=5,dw=3,match.wt.f=NULL,return.x=T,from=min(vin),to=max(vin),step=1) {
+  # construct vector of unique tags and their counts
+  tc <- table(vin[vin>=from & vin<=to]);
+  pos <- as.numeric(names(tc)); storage.mode(pos) <- "double";
+  tc <- as.numeric(tc); storage.mode(tc) <- "double";
+  n <- length(pos)
+  # weight counts
+  if(!is.null(match.wt.f)) {
+    tc <- tc*match.wt.f(pos);
+  }
+
+  rng <- c(from,to);
+  if(rng[1]<0) { stop("range extends into negative values") }
+  if(range(pos)[1]<0) { stop("position vector contains negative values") }
+
+  storage.mode(n) <- storage.mode(rng) <- storage.mode(bw) <- storage.mode(dw) <- storage.mode(step) <- "integer";
+
+  spos <- rng[1]; storage.mode(spos) <- "double";
+
+  dlength <- floor((rng[2] - rng[1])/step) + 1; # length of output array
+  if(dlength<1) { stop("zero data range") }
+  dout <- numeric(dlength); storage.mode(dout) <- "double";
+  storage.mode(dlength) <- "integer";
+  # Take the result from the list returned by .C() rather than relying on
+  # DUP=F in-place modification of dout: DUP is ignored by R >= 3.1, where
+  # the local dout would otherwise stay all zeros.
+  dout <- .C("cdensum",n,pos,tc,spos,bw,dw,dlength,step,dout=dout)$dout;
+
+  if(return.x) {
+    return(list(x=c(rng[1],rng[1]+step*(dlength-1)),y=dout,step=step))
+  } else {
+    return(dout)
+  }
+}
+
+# Count tags within a sliding window of a specified size.
+#
+# vin - tag vector (positive values, pre-shifted)
+# window.size/window.step - window characteristics
+# from/to - first/last window position; by default the tag range shrunk by
+#           half a window on each side
+# tv - optional, pre-sorted, pre-trimmed tag vector (used instead of vin)
+# return.x - if T return list($x = first and last window position,
+#            $y = counts, $step); if F return only the counts
+window.tag.count <- function(vin,window.size,window.step=1,return.x=T,from=min(vin)+floor(window.size/2),to=max(vin)-floor(window.size/2),tv=NULL) {
+  half.window <- floor(window.size/2)
+  if(is.null(tv)) {
+    # select tags with margins
+    reachable <- vin>=from-half.window-1 & vin<=to+half.window+1
+    tv <- sort(vin[reachable])
+  }
+  storage.mode(tv) <- "double"
+
+  # from/to are evaluated here at the latest, i.e. before window.size is
+  # coerced to integer below
+  nsteps <- as.integer(ceiling((to-from)/window.step))
+  window.size <- as.integer(window.size)
+  window.step <- as.integer(window.step)
+  start.pos <- as.double(from)
+
+  if(nsteps<1) { stop("zero data range") }
+  counts <- .Call("cwindow_n_tags",tv,start.pos,window.size,window.step,nsteps)
+
+  if(!return.x) { return(counts) }
+  return(list(x=c(from,from+(nsteps-1)*window.step),y=counts,step=window.step))
+}
+
+# Count tags in windows centered around specified positions.
+#
+# vin - tag vector (ignored when tc is supplied)
+# window.size - window width; half of it (rounded down) is passed to the C routine
+# pos - window centers
+# tc - optional precomputed table(vin)
+# sorted - set to T if pos is already sorted; otherwise pos is sorted for the
+#          C routine and the result is put back into the original order
+# return.x - if T return data.frame(x=pos,y=counts); if F only the counts
+window.tag.count.around <- function(vin,window.size,pos,return.x=T,tc=NULL,sorted=F) {
+  if(is.null(tc)) { tc <- table(vin) }
+  if(!sorted) {
+    # remember how to undo the sort
+    restore <- rank(pos)
+    pos <- sort(pos)
+  }
+  storage.mode(pos) <- "double"
+
+  # unique tag positions and their multiplicities
+  tag.pos <- as.double(as.integer(names(tc)))
+  tag.cnt <- as.integer(tc)
+  half.window <- as.integer(floor(window.size/2))
+
+  counts <- .Call("cwindow_n_tags_around",tag.pos,tag.cnt,pos,half.window)
+
+  if(!sorted) { counts <- counts[restore] }
+  if(!return.x) { return(counts) }
+  if(!sorted) { pos <- pos[restore] }
+  return(data.frame(x=pos,y=counts))
+}
+
+# Given a tag vector (signed), identify and clean up (either remove or cap)
+# singular positions whose tag count exceeds what the local tag density allows.
+#
+# tags - tag vector
+# window.size - width of the window used to estimate the local density
+# cap.fold - maximal fold enrichment over local density allowed for a single
+#            tag position; counts above the matching limit are capped
+# eliminate.fold - fold enrichment that, when exceeded, results in exclusion
+#            of all the tags at that position (e.g. counted as anomaly)
+# z.threshold - Z-score used to determine max allowed counts
+#
+# returns: the cleaned tag vector, ordered by absolute position
+filter.singular.positions.by.local.density <- function(tags,window.size=200,cap.fold=4,eliminate.fold=10,z.threshold=3) {
+  # nothing to estimate a density from
+  if(length(tags)<2) { return(tags) }
+
+  # tabulate tag positions
+  tab <- table(tags)
+  pos <- as.double(as.numeric(names(tab)))
+  cnt <- as.integer(tab)
+  half.window <- as.integer(floor(window.size/2))
+
+  # local density around every position, not counting the position's own
+  # tags beyond the first one
+  around <- .Call("cwindow_n_tags_around",pos,cnt,pos,half.window)
+  density <- (around-cnt+1)/window.size
+
+  # largest count tolerated at a given fold over the local density
+  tail.p <- pnorm(z.threshold,lower.tail=F)
+  count.limit <- function(fold) qpois(tail.p,density*fold,lower.tail=F)
+
+  # exclude
+  cnt[cnt>count.limit(eliminate.fold)] <- 0
+  # cap
+  cap <- count.limit(cap.fold)
+  capped <- which(cnt>cap)
+  cnt[capped] <- cap[capped]+1
+
+  # reconstruct tag vector
+  cleaned <- rep(pos,cnt)
+  return(cleaned[order(abs(cleaned))])
+}
+
+
+
+# calculates enrichment bounds using multiple background scales
+# ft - foreground tags (pre-shifted, positive)
+# bt - background tags
+# fws - foreground window size
+# bwsl - background window size list
+# step - window step
+# rng - from/to coordinates (to will be adjusted according to step)
+# alpha - significance level; bounds use the alpha/2 and 1-alpha/2 quantiles
+# calculate.upper.bound - also calculate the upper bound ($ub)
+# bg.weight - foreground/background dataset size ratio used to scale the enrichment
+# use.most.informative.scale - pick a single background scale per position
+#        (highest scaled background count for the lower bound and MLE, lowest
+#        for the upper bound) instead of combining the bounds of all scales
+# quick.calculation - use a closed-form approximation based on the normal
+#        quantile instead of F-distribution quantiles (qf)
+# pos - optional positions; if given, tags are counted in windows around these
+#        positions instead of a sliding window over rng
+#
+# returns: a list with $x ($s $e $step), $lb vector and $mle vector ($ub if calculate.upper.bound=T)
+mbs.enrichment.bounds <- function(ft,bt,fws,bwsl,step=1,rng=NULL,alpha=0.05,calculate.upper.bound=F,bg.weight=length(ft)/length(bt),use.most.informative.scale=F,quick.calculation=F,pos=NULL) {
+  # determine range
+  if(is.null(rng)) {
+    rng <- range(range(ft));
+  }
+  # foreground counts
+  if(is.null(pos)) {
+    fwc <- window.tag.count(ft,fws,window.step=step,from=rng[1],to=rng[2],return.x=T);
+  } else {
+    fwc <- window.tag.count.around(ft,fws,pos,return.x=T)
+  }
+  # pseudo-count keeps ratios and logs finite for empty windows
+  fwc$y <- fwc$y+0.5;
+
+  zal <- qnorm(alpha/2,lower.tail=F);
+
+  # background counts (one vector per background window size)
+  bt <- sort(bt);
+  if(!is.null(pos)) {
+    tc <- table(bt);
+  }
+  bgcm <- lapply(bwsl,function(bgws) {
+    if(is.null(pos)) {
+      window.tag.count(bt,bgws,window.step=step,from=rng[1],to=rng[2],return.x=F,tv=bt)+0.5;
+    } else {
+      window.tag.count.around(bt,bgws,pos,return.x=F,tc=tc)+0.5
+    }
+  })
+  if(!is.null(pos)) {
+    rm(tc);
+  }
+
+  # pick most informative scale
+  if(use.most.informative.scale) {
+    # rows: background scales, columns: positions
+    bgcm <- t(do.call(cbind,bgcm))
+    # NOTE(review): max.col() breaks ties at random by default, so the
+    # "lowest scale in case of a tie" intent below is not guaranteed
+    isi <- max.col(t((bgcm)/(bwsl/fws))) # add pseudo-counts to select lowest scale in case of a tie
+
+    # background count of the selected scale for every position
+    bgc <- c(bgcm)[isi+dim(bgcm)[1]*(c(1:length(isi))-1)]
+
+    if(quick.calculation) {
+      rte <- fwc$y+bgc-0.25*zal*zal; rte[rte<0] <- 0;
+      dn <- bgc - 0.25*zal*zal;
+      lbm=(sqrt(fwc$y*bgc) - 0.5*zal*sqrt(rte))/dn;
+      ivi <- which(lbm<0);
+      lbm <- lbm*lbm*bwsl[isi]/fws/bg.weight;
+      # degenerate cases: report no enrichment
+      lbm[rte<=0] <- 1;
+      lbm[dn<=0] <- 1;
+      lbm[ivi] <- 1;
+    } else {
+      lbm <- (fwc$y/bgc)*qf(1-alpha/2,2*fwc$y,2*bgc,lower.tail=F)*bwsl[isi]/fws/bg.weight;
+    }
+
+    mle <- fwc$y/bgc*bwsl[isi]/fws/bg.weight; mle[is.nan(mle)] <- Inf; mle[is.na(mle)] <- Inf;
+
+    rl <- list(x=list(s=fwc$x[1],e=fwc$x[2],step=fwc$step),lb=lbm,mle=mle);
+
+    if(calculate.upper.bound) {
+      isi <- max.col(t((-bgcm)/(bwsl/fws))) # add pseudo-counts to select highest scale in case of a tie
+      bgc <- c(bgcm)[isi+dim(bgcm)[1]*(c(1:length(isi))-1)]
+
+      if(quick.calculation) {
+        # rte/dn depend on the background count, so they have to be
+        # recalculated for the scale selected for the upper bound
+        rte <- fwc$y+bgc-0.25*zal*zal; rte[rte<0] <- 0;
+        dn <- bgc - 0.25*zal*zal;
+        ubm=(sqrt(fwc$y*bgc) + 0.5*zal*sqrt(rte))/dn;
+        ivi <- which(ubm<0);
+        ubm <- ubm*ubm*bwsl[isi]/fws/bg.weight;
+        # degenerate cases: report no enrichment
+        ubm[rte<=0] <- 1;
+        ubm[dn<=0] <- 1;
+        ubm[ivi] <- 1;
+      } else {
+        ubm <- (fwc$y/bgc)*qf(alpha/2,2*fwc$y,2*bgc,lower.tail=F)*bwsl[isi]/fws/bg.weight;
+      }
+      rl <- c(rl,list(ub=ubm));
+    }
+    return(rl);
+
+  } else {
+    # determine lower bounds: the smallest lower bound over all scales
+    lbm <- lapply(c(1:length(bgcm)),function(i) {
+      nbg <- bgcm[[i]];
+      if(quick.calculation) {
+        rte <- fwc$y+nbg-0.25*zal*zal; rte[rte<0] <- 0;
+        dn <- (nbg - 0.25*zal*zal);
+        lbm=(sqrt(fwc$y*nbg) - 0.5*zal*sqrt(rte))/dn;
+        ivi <- which(lbm<0);
+        lbm <- lbm*lbm*bwsl[i]/fws/bg.weight;
+        lbm[rte<=0] <- 1;
+        lbm[dn<=0] <- 1;
+        lbm[ivi] <- 1;
+        return(lbm);
+      } else {
+        return((fwc$y/nbg)*qf(1-alpha/2,2*fwc$y,2*nbg,lower.tail=F)*bwsl[i]/fws/bg.weight);
+      }
+    })
+    lbm <- do.call(pmin,lbm);
+
+    # calculate mle: the smallest estimate over all scales
+    mle <- do.call(pmin,lapply(c(1:length(bgcm)),function(i) {
+      bgc <- bgcm[[i]];
+      x <- fwc$y/bgc*bwsl[i]/fws/bg.weight; x[is.nan(x)] <- Inf; x[is.na(x)] <- Inf; return(x);
+    }))
+
+    rl <- list(x=list(s=fwc$x[1],e=fwc$x[2],step=fwc$step),lb=lbm,mle=mle);
+
+    if(calculate.upper.bound) {
+      # determine upper bound: the largest upper bound over all scales
+      ubm <- lapply(c(1:length(bgcm)),function(i) {
+        nbg <- bgcm[[i]];
+        if(quick.calculation) {
+          rte <- fwc$y+nbg-0.25*zal*zal; rte[rte<0] <- 0;
+          dn <- (nbg - 0.25*zal*zal);
+          ubm=(sqrt(fwc$y*nbg) + 0.5*zal*sqrt(rte))/dn;
+          ivi <- which(ubm<0);
+          ubm <- ubm*ubm*bwsl[i]/fws/bg.weight;
+          ubm[rte<=0] <- 1;
+          ubm[dn<=0] <- 1;
+          ubm[ivi] <- 1;
+          return(ubm);
+        } else {
+          return((fwc$y/nbg)*qf(alpha/2,2*fwc$y,2*nbg,lower.tail=F)*bwsl[i]/fws/bg.weight);
+        }
+      })
+      ubm <- do.call(pmax,ubm);
+      rl <- c(rl,list(ub=ubm));
+    }
+
+    return(rl);
+  }
+}
+
+# Write per-position values as fixed-length segments in a wiggle/bed-like
+# text file (one "chr start end value" line per position).
+#
+# chr, pos, val - chromosome name(s), segment start positions and values
+# fname - output file
+# append - append to fname instead of overwriting it
+# feature - text placed into the description field of the track line
+# probe.length - segment length; reduced (with a printed warning) when it is
+#                not smaller than the smallest gap between consecutive positions
+# header - write the track definition line before the data
+write.probe.wig <- function(chr,pos,val,fname,append=F,feature="M",probe.length=35,header=T) {
+  closest <- min(diff(pos))
+  if(closest<=probe.length) {
+    # keep neighbouring segments from overlapping
+    probe.length <- closest-1
+    cat("warning: adjusted down wig segment length to",probe.length,"\n")
+  }
+  segments <- data.frame(chr,as.integer(pos),as.integer(pos+probe.length),val)
+
+  if(header) {
+    track.line <- paste("track type=wiggle_0 name=\"Bed Format\" description=\"",feature,"\" visibility=dense color=200,100,0 altColor=0,100,200 priority=20",sep="")
+    write(track.line,file=fname,append=append)
+  }
+  # the data always goes after a freshly written header
+  write.table(segments,file=fname,col.names=F,row.names=F,quote=F,sep=" ",append=(header || append))
+}
+
+# Returns the intersection (or, with do.union=T, the union) of multiple
+# region sets, computed by the C routine "region_intersection".
+#
+# rsl - list of region sets; each needs to contain $s, $e and an optional $v column
+# max.val - passed to the C routine; forced to 0 when the sets carry no $v
+# do.union - request the union instead of the intersection
+#
+# returns: the C routine's result columns bound into a data frame
+regionset.intersection.c <- function(rsl,max.val=-1,do.union=F) {
+  # translate one region set into position/flag form:
+  # +1 marks a region start, -1 a region end
+  as.flagged <- function(rs) {
+    bounds <- c(rs$s,rs$e)
+    flags <- c(rep(c(1,-1),each=length(rs$s)))
+    by.pos <- order(bounds)
+    flagged <- data.frame(p=as.numeric(bounds[by.pos]),f=as.integer(flags[by.pos]))
+    if(!is.null(rs$v)) {
+      flagged$v <- as.numeric(c(rs$v,rs$v)[by.pos])
+    }
+    flagged
+  }
+  rfl <- lapply(rsl,as.flagged)
+
+  # pool all sets, encoding the index of the source set in the flag magnitude
+  pooled <- data.frame(do.call(rbind,lapply(1:length(rfl),function(i) {
+    one <- rfl[[i]]
+    one$f <- one$f*i
+    one
+  })))
+  pooled <- pooled[order(pooled$p),]
+
+  if(is.null(pooled$v)) { max.val <- 0 }
+  union.flag <- if(do.union) 1 else 0
+
+  res <- .Call("region_intersection",as.integer(length(rfl)),as.numeric(pooled$p),as.integer(pooled$f),as.numeric(pooled$v),as.integer(max.val),as.integer(union.flag))
+  return(data.frame(do.call(cbind,res)))
+}
+
+
+# Identify whether each binding peak falls within a larger region of
+# significant tag enrichment and, if so, record that region's boundaries.
+#
+# chip.tags, input.tags - signal and control tag data, passed on to
+#        find.significantly.enriched.regions()
+# bp - binding positions; bp$npl is a per-chromosome list of peak tables with an $x column
+# window.size, z.thr - passed on to find.significantly.enriched.regions()
+#
+# returns: bp, with $rs / $re (region start / end, NA when the peak is not
+#          inside any enriched region) added to every non-empty peak table
+add.broad.peak.regions <- function(chip.tags,input.tags,bp,window.size=500,z.thr=2) {
+  se <- find.significantly.enriched.regions(chip.tags,input.tags,window.size=window.size,z.thr=z.thr,poisson.z=0,poisson.ratio=0,either=F)
+  chrl <- names(bp$npl); names(chrl) <- chrl;
+  bnpl <- lapply(chrl,function(chr) {
+    npl <- bp$npl[[chr]];
+    # '||' short-circuits: with '|' a NULL entry evaluates to logical(0)
+    # and the if() itself fails
+    if(is.null(npl) || dim(npl)[1]<1) {
+      return(npl);
+    }
+    # for every peak, the enriched regions it was assigned to (possibly none)
+    hits <- points.within(npl$x,se[[chr]]$s,se[[chr]]$e,return.list=T);
+
+    # outer boundaries of the regions assigned to each peak
+    pm <- do.call(rbind,lapply(hits,function(rl) {
+      if(length(rl)>0) {
+        return(range(c(se[[chr]]$s[rl],se[[chr]]$e[rl])))
+      } else {
+        return(c(NA,NA));
+      }
+    }))
+
+    npl$rs <- pm[,1];
+    npl$re <- pm[,2];
+    return(npl);
+  })
+  bp$npl <- bnpl;
+  return(bp);
+}
+
+# Write out binding results in the narrowPeak format, incorporating broad
+# region boundaries ($rs/$re) if they are present. If broad region info is
+# not present, margin is used to determine region width. The default margin
+# is equal to the window half size used to call the binding peaks (bd$whs),
+# or 50 if that is not set.
+#
+# bd - binding results; bd$npl is a per-chromosome list of peak tables with
+#      $x (position), $y (score) and $fdr columns
+# fname - output file (overwritten)
+# npeaks - if not NA, only the npeaks highest-scoring peaks are written
+#
+# Peaks are written in decreasing order of score. Chromosomes without peaks
+# are skipped; if there are no peaks at all an empty file is created.
+write.narrowpeak.binding <- function(bd,fname,margin=bd$whs,npeaks=NA) { # Anshul: added npeaks option
+  if(is.null(margin)) { margin <- 50; }
+  chrl <- names(bd$npl); names(chrl) <- chrl;
+  # a chromosome without peaks would yield a malformed row in cbind() below
+  chrl <- chrl[unlist(lapply(bd$npl,function(d) length(d$x)))>0]
+  md <- do.call(rbind,lapply(chrl,function(chr) {
+    df <- bd$npl[[chr]];
+    x <- df$x;
+    rs <- df$rs; if(is.null(rs)) { rs <- rep(NA,length(x)) }
+    re <- df$re; if(is.null(re)) { re <- rep(NA,length(x)) }
+    # peaks without a broad region get a fixed margin around the peak position
+    ivi <- which(is.na(rs)); if(length(ivi)>0) {rs[ivi] <- pmax(0,x[ivi]-margin);} # Anshul: added the pmax (0, ...) to avoid negative peak starts
+    ivi <- which(is.na(re)); if(length(ivi)>0) {re[ivi] <- x[ivi]+margin;}
+    cbind(chr,rs,re,".","0",".",df$y,-1,-log10(df$fdr),x-rs) # Anshul: converted fdr to -log10
+  }))
+  if(is.null(md)) {
+    # nothing to report
+    file.create(fname);
+    return(invisible(NULL));
+  }
+  # drop=FALSE keeps a single peak as a one-row matrix
+  md <- md[order(as.numeric(md[,7]),decreasing=T),,drop=FALSE]
+  if (!is.na(npeaks)) { # Anshul: added this option to print a limited number of peaks
+    npeaks <- min(nrow(md),npeaks)
+    md <- md[seq_len(npeaks),,drop=FALSE]
+  }
+  write.table(md,file=fname,col.names=F,row.names=F,quote=F,sep="\t",append=F);
+}
+
+
+# Find broad regions of significant tag enrichment and annotate each with a
+# log2 signal/control enrichment value ($rv).
+#
+# signal.data, control.data - per-chromosome lists of (signed) tag positions
+# window.size, z.thr, ... - passed on to find.significantly.enriched.regions()
+# tag.shift - shift applied to the signed tag positions before counting
+# background.density.scaling - passed on to dataset.density.ratio(), whose
+#        result is used as the background weight
+#
+# returns (invisibly): per-chromosome list of merged regions; chromosomes
+#        without regions are passed through unchanged
+get.broad.enrichment.clusters <- function(signal.data,control.data,window.size=1e3,z.thr=3, tag.shift=146/2,background.density.scaling=F, ... ) {
+  # find significantly enriched clusters
+  bg.weight <- dataset.density.ratio(signal.data,control.data,background.density.scaling=background.density.scaling)
+  se <- find.significantly.enriched.regions(signal.data,control.data,window.size=window.size,z.thr=z.thr,tag.shift=tag.shift, bg.weight=bg.weight, ...)
+  chrl <- names(se)
+  names(chrl) <- chrl
+  se <- lapply(chrl,function(chr) {
+    regions <- se[[chr]]
+    # nothing to merge or score on this chromosome
+    if(length(regions$s)==0) { return(regions) }
+    # merge overlapping regions
+    regions <- regionset.intersection.c(list(regions,regions),do.union=T)
+    # shifted tags of each dataset falling into every merged region
+    n.signal <- points.within(abs(signal.data[[chr]]+tag.shift),regions$s,regions$e,return.point.counts=T)
+    n.control <- points.within(abs(control.data[[chr]]+tag.shift),regions$s,regions$e,return.point.counts=T)
+    regions$rv <- log2((n.signal+1)/(n.control+1)/bg.weight)
+    return(regions)
+  })
+}
+
+# Write broad enrichment regions in a broadPeak-style tab-separated format,
+# in decreasing order of the enrichment value.
+#
+# bp - per-chromosome list of region sets with $s (start), $e (end) and $rv
+#      (enrichment value) columns
+# fname - output file (overwritten)
+#
+# Chromosomes without regions are skipped; if there are no regions at all an
+# empty file is created.
+write.broadpeak.info <- function(bp,fname) {
+  chrl <- names(bp); names(chrl) <- chrl;
+  chrl <- chrl[unlist(lapply(bp,function(d) length(d$s)))>0]
+  md <- do.call(rbind,lapply(chrl,function(chr) {
+    df <- bp[[chr]];
+    cbind(chr,df$s,df$e,".","0",".",df$rv,-1,-1)
+  }))
+  if(is.null(md)) {
+    # nothing to report
+    file.create(fname);
+    return(invisible(NULL));
+  }
+  # drop=FALSE keeps a single region as a one-row matrix instead of a vector
+  md <- md[order(as.numeric(md[,7]),decreasing=T),,drop=FALSE]
+  write.table(md,file=fname,col.names=F,row.names=F,quote=F,sep="\t",append=F);
+}
+
+
+# Find runs of identical, non-zero values in a vector.
+#
+# x  - input vector
+# CL - minimal run length to report
+#
+# returns: list with $size, $begin and $end (first/last index) of every run
+#          of equal values that is at least CL long and whose value is not 0
+get.clusters2 <- function(x,CL) {
+  # indices after which the value changes
+  breaks <- which(diff(x) != 0)
+  run.begin <- c(1, breaks + 1)
+  run.end <- c(breaks, length(x))
+  run.size <- run.end - run.begin + 1
+
+  # drop runs that are too short
+  long.enough <- run.size >= CL
+  run.begin <- run.begin[long.enough]
+  run.end <- run.end[long.enough]
+  run.size <- run.size[long.enough]
+
+  # drop runs of zeros
+  non.zero <- x[run.end] != 0
+  return (list(size=run.size[non.zero],begin=run.begin[non.zero],end=run.end[non.zero]))
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/configure b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/configure
new file mode 100755
index 0000000..1cef55c
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/configure
@@ -0,0 +1,3856 @@
+#! /bin/sh
+# Guess values for system-dependent variables and create Makefiles.
+# Generated by GNU Autoconf 2.63 for SPP 1.7.
+#
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+# 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+## --------------------- ##
+## M4sh Initialization. ##
+## --------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in
+ *posix*) set -o posix ;;
+esac
+
+fi
+
+
+
+
+# PATH needs CR
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='printf %s\n'
+ as_echo_n='printf %s'
+else
+ if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+ as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+ as_echo_n='/usr/ucb/echo -n'
+ else
+ as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+ as_echo_n_body='eval
+ arg=$1;
+ case $arg in
+ *"$as_nl"*)
+ expr "X$arg" : "X\\(.*\\)$as_nl";
+ arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+ esac;
+ expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+ '
+ export as_echo_n_body
+ as_echo_n='sh -c $as_echo_n_body as_echo'
+ fi
+ export as_echo_body
+ as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ PATH_SEPARATOR=:
+ (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+ (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+ PATH_SEPARATOR=';'
+ }
+fi
+
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+ as_unset=unset
+else
+ as_unset=false
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order. Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" "" $as_nl"
+
+# Find who we are. Look in the path if we contain no directory separator.
+case $0 in
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
+IFS=$as_save_IFS
+
+ ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+ as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+ $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+ { (exit 1); exit 1; }
+fi
+
+# Work around bugs in pre-3.0 UWIN ksh.
+for as_var in ENV MAIL MAILPATH
+do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# Required to use basename.
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+
+# Name of the executable.
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+
+# CDPATH.
+$as_unset CDPATH
+
+
+if test "x$CONFIG_SHELL" = x; then
+ if (eval ":") 2>/dev/null; then
+ as_have_required=yes
+else
+ as_have_required=no
+fi
+
+ if test $as_have_required = yes && (eval ":
+(as_func_return () {
+ (exit \$1)
+}
+as_func_success () {
+ as_func_return 0
+}
+as_func_failure () {
+ as_func_return 1
+}
+as_func_ret_success () {
+ return 0
+}
+as_func_ret_failure () {
+ return 1
+}
+
+exitcode=0
+if as_func_success; then
+ :
+else
+ exitcode=1
+ echo as_func_success failed.
+fi
+
+if as_func_failure; then
+ exitcode=1
+ echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+ :
+else
+ exitcode=1
+ echo as_func_ret_success failed.
+fi
+
+if as_func_ret_failure; then
+ exitcode=1
+ echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = \"\$1\" ); then
+ :
+else
+ exitcode=1
+ echo positional parameters were not saved.
+fi
+
+test \$exitcode = 0) || { (exit 1); exit 1; }
+
+(
+ as_lineno_1=\$LINENO
+ as_lineno_2=\$LINENO
+ test \"x\$as_lineno_1\" != \"x\$as_lineno_2\" &&
+ test \"x\`expr \$as_lineno_1 + 1\`\" = \"x\$as_lineno_2\") || { (exit 1); exit 1; }
+") 2> /dev/null; then
+ :
+else
+ as_candidate_shells=
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ case $as_dir in
+ /*)
+ for as_base in sh bash ksh sh5; do
+ as_candidate_shells="$as_candidate_shells $as_dir/$as_base"
+ done;;
+ esac
+done
+IFS=$as_save_IFS
+
+
+ for as_shell in $as_candidate_shells $SHELL; do
+ # Try only shells that exist, to save several forks.
+ if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
+ { ("$as_shell") 2> /dev/null <<\_ASEOF
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in
+ *posix*) set -o posix ;;
+esac
+
+fi
+
+
+:
+_ASEOF
+}; then
+ CONFIG_SHELL=$as_shell
+ as_have_required=yes
+ if { "$as_shell" 2> /dev/null <<\_ASEOF
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in
+ *posix*) set -o posix ;;
+esac
+
+fi
+
+
+:
+(as_func_return () {
+ (exit $1)
+}
+as_func_success () {
+ as_func_return 0
+}
+as_func_failure () {
+ as_func_return 1
+}
+as_func_ret_success () {
+ return 0
+}
+as_func_ret_failure () {
+ return 1
+}
+
+exitcode=0
+if as_func_success; then
+ :
+else
+ exitcode=1
+ echo as_func_success failed.
+fi
+
+if as_func_failure; then
+ exitcode=1
+ echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+ :
+else
+ exitcode=1
+ echo as_func_ret_success failed.
+fi
+
+if as_func_ret_failure; then
+ exitcode=1
+ echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = "$1" ); then
+ :
+else
+ exitcode=1
+ echo positional parameters were not saved.
+fi
+
+test $exitcode = 0) || { (exit 1); exit 1; }
+
+(
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2") || { (exit 1); exit 1; }
+
+_ASEOF
+}; then
+ break
+fi
+
+fi
+
+ done
+
+ if test "x$CONFIG_SHELL" != x; then
+ for as_var in BASH_ENV ENV
+ do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+ done
+ export CONFIG_SHELL
+ exec "$CONFIG_SHELL" "$as_myself" ${1+"$@"}
+fi
+
+
+ if test $as_have_required = no; then
+ echo This script requires a shell more modern than all the
+ echo shells that I found on your system. Please install a
+ echo modern shell, or manually run the script under such a
+ echo shell if you do have one.
+ { (exit 1); exit 1; }
+fi
+
+
+fi
+
+fi
+
+
+
+(eval "as_func_return () {
+ (exit \$1)
+}
+as_func_success () {
+ as_func_return 0
+}
+as_func_failure () {
+ as_func_return 1
+}
+as_func_ret_success () {
+ return 0
+}
+as_func_ret_failure () {
+ return 1
+}
+
+exitcode=0
+if as_func_success; then
+ :
+else
+ exitcode=1
+ echo as_func_success failed.
+fi
+
+if as_func_failure; then
+ exitcode=1
+ echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+ :
+else
+ exitcode=1
+ echo as_func_ret_success failed.
+fi
+
+if as_func_ret_failure; then
+ exitcode=1
+ echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = \"\$1\" ); then
+ :
+else
+ exitcode=1
+ echo positional parameters were not saved.
+fi
+
+test \$exitcode = 0") || {
+ echo No shell found that supports shell functions.
+ echo Please tell bug-autoconf@gnu.org about your system,
+ echo including any error possibly output before this message.
+ echo This can help us improve future autoconf versions.
+ echo Configuration will now proceed without shell functions.
+}
+
+
+
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || {
+
+ # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+ # uniformly replaced by the line number. The first 'sed' inserts a
+ # line-number line after each line using $LINENO; the second 'sed'
+ # does the real work. The second script uses 'N' to pair each
+ # line-number line with the line containing $LINENO, and appends
+ # trailing '-' during substitution so that $LINENO is not a special
+ # case at line end.
+ # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+ # scripts with optimization help from Paolo Bonzini. Blame Lee
+ # E. McMahon (1931-1989) for sed's syntax. :-)
+ sed -n '
+ p
+ /[$]LINENO/=
+ ' <$as_myself |
+ sed '
+ s/[$]LINENO.*/&-/
+ t lineno
+ b
+ :lineno
+ N
+ :loop
+ s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+ t loop
+ s/-\n.*//
+ ' >$as_me.lineno &&
+ chmod +x "$as_me.lineno" ||
+ { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2
+ { (exit 1); exit 1; }; }
+
+ # Don't try to exec as it changes $[0], causing all sort of problems
+ # (the dirname of $[0] is not the place where we might find the
+ # original and so on. Autoconf is especially sensitive to this).
+ . "./$as_me.lineno"
+ # Exit status is that of the last command.
+ exit
+}
+
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+ as_dirname=dirname
+else
+ as_dirname=false
+fi
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in
+-n*)
+ case `echo 'x\c'` in
+ *c*) ECHO_T=' ';; # ECHO_T is single tab character.
+ *) ECHO_C='\c';;
+ esac;;
+*)
+ ECHO_N='-n';;
+esac
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+ rm -f conf$$.dir/conf$$.file
+else
+ rm -f conf$$.dir
+ mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+ if ln -s conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s='ln -s'
+ # ... but there are two gotchas:
+ # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+ # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+ # In both cases, we have to default to `cp -p'.
+ ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+ as_ln_s='cp -p'
+ elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+ else
+ as_ln_s='cp -p'
+ fi
+else
+ as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p=:
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+if test -x / >/dev/null 2>&1; then
+ as_test_x='test -x'
+else
+ if ls -dL / >/dev/null 2>&1; then
+ as_ls_L_option=L
+ else
+ as_ls_L_option=
+ fi
+ as_test_x='
+ eval sh -c '\''
+ if test -d "$1"; then
+ test -d "$1/.";
+ else
+ case $1 in
+ -*)set "./$1";;
+ esac;
+ case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in
+ ???[sx]*):;;*)false;;esac;fi
+ '\'' sh
+ '
+fi
+as_executable_p=$as_test_x
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+
+# Keep the original stdin on fd 7, read stdin from /dev/null from now on,
+# and duplicate stdout onto fd 6.
+exec 7<&0 </dev/null 6>&1
+
+# Name of the host.
+# hostname on some systems (SVR3.2, Linux) returns a bogus exit status,
+# so uname gets run too.
+# Only the first line of output is kept (sed 1q).
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_clean_files=
+ac_config_libobj_dir=.
+LIBOBJS=
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+SHELL=${CONFIG_SHELL-/bin/sh}
+
+# Identity of this package.
+PACKAGE_NAME='SPP'
+PACKAGE_TARNAME='spp'
+PACKAGE_VERSION='1.7'
+PACKAGE_STRING='SPP 1.7'
+PACKAGE_BUGREPORT=''
+
+ac_subst_vars='LTLIBOBJS
+LIBOBJS
+HAVE_LIBBZ2
+OBJEXT
+EXEEXT
+ac_ct_CC
+CPPFLAGS
+LDFLAGS
+CFLAGS
+CC
+target_alias
+host_alias
+build_alias
+LIBS
+ECHO_T
+ECHO_N
+ECHO_C
+DEFS
+mandir
+localedir
+libdir
+psdir
+pdfdir
+dvidir
+htmldir
+infodir
+docdir
+oldincludedir
+includedir
+localstatedir
+sharedstatedir
+sysconfdir
+datadir
+datarootdir
+libexecdir
+sbindir
+bindir
+program_transform_name
+prefix
+exec_prefix
+PACKAGE_BUGREPORT
+PACKAGE_STRING
+PACKAGE_VERSION
+PACKAGE_TARNAME
+PACKAGE_NAME
+PATH_SEPARATOR
+SHELL'
+ac_subst_files=''
+ac_user_opts='
+enable_option_checking
+'
+ ac_precious_vars='build_alias
+host_alias
+target_alias
+CC
+CFLAGS
+LDFLAGS
+LIBS
+CPPFLAGS'
+
+
+# Initialize some variables set by options.
+ac_init_help=
+ac_init_version=false
+ac_unrecognized_opts=
+ac_unrecognized_sep=
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+cache_file=/dev/null
+exec_prefix=NONE
+no_create=
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+verbose=
+x_includes=NONE
+x_libraries=NONE
+
+# Installation directory options.
+# These are left unexpanded so users can "make install exec_prefix=/foo"
+# and all the variables that are supposed to be based on exec_prefix
+# by default will actually change.
+# Use braces instead of parens because sh, perl, etc. also accept them.
+# (The list follows the same order as the GNU Coding Standards.)
+# The single quotes keep references such as ${prefix} literal at this point.
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datarootdir='${prefix}/share'
+datadir='${datarootdir}'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
+infodir='${datarootdir}/info'
+htmldir='${docdir}'
+dvidir='${docdir}'
+pdfdir='${docdir}'
+psdir='${docdir}'
+libdir='${exec_prefix}/lib'
+localedir='${datarootdir}/locale'
+mandir='${datarootdir}/man'
+
+# Parse the command line, one argument per iteration.
+# ac_prev names the variable still waiting for its value when an option was
+# written as two words (for example `--prefix DIR`). ac_dashdash becomes "yes"
+# once a bare `--` is seen; because it is prepended to the argument in the
+# main `case`, later arguments can no longer match the dash-option patterns.
+# Most options accept any unambiguous abbreviation, spelled out explicitly in
+# the patterns below.
+ac_prev=
+ac_dashdash=
+for ac_option
+do
+ # If the previous option needs an argument, assign it.
+ if test -n "$ac_prev"; then
+ eval $ac_prev=\$ac_option
+ ac_prev=
+ continue
+ fi
+
+ # ac_optarg is the text after the first `=`, or "yes" when there is none.
+ case $ac_option in
+ *=*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
+ *) ac_optarg=yes ;;
+ esac
+
+ # Accept the important Cygnus configure options, so we can diagnose typos.
+
+ case $ac_dashdash$ac_option in
+ --)
+ ac_dashdash=yes ;;
+
+ -bindir | --bindir | --bindi | --bind | --bin | --bi)
+ ac_prev=bindir ;;
+ -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+ bindir=$ac_optarg ;;
+
+ -build | --build | --buil | --bui | --bu)
+ ac_prev=build_alias ;;
+ -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+ build_alias=$ac_optarg ;;
+
+ -cache-file | --cache-file | --cache-fil | --cache-fi \
+ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+ ac_prev=cache_file ;;
+ -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+ cache_file=$ac_optarg ;;
+
+ --config-cache | -C)
+ cache_file=config.cache ;;
+
+ -datadir | --datadir | --datadi | --datad)
+ ac_prev=datadir ;;
+ -datadir=* | --datadir=* | --datadi=* | --datad=*)
+ datadir=$ac_optarg ;;
+
+ -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \
+ | --dataroo | --dataro | --datar)
+ ac_prev=datarootdir ;;
+ -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \
+ | --dataroot=* | --dataroo=* | --dataro=* | --datar=*)
+ datarootdir=$ac_optarg ;;
+
+ -disable-* | --disable-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2
+ { (exit 1); exit 1; }; }
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"enable_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval enable_$ac_useropt=no ;;
+
+ -docdir | --docdir | --docdi | --doc | --do)
+ ac_prev=docdir ;;
+ -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*)
+ docdir=$ac_optarg ;;
+
+ -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv)
+ ac_prev=dvidir ;;
+ -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*)
+ dvidir=$ac_optarg ;;
+
+ -enable-* | --enable-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2
+ { (exit 1); exit 1; }; }
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"enable_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval enable_$ac_useropt=\$ac_optarg ;;
+
+ -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+ | --exec | --exe | --ex)
+ ac_prev=exec_prefix ;;
+ -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+ | --exec=* | --exe=* | --ex=*)
+ exec_prefix=$ac_optarg ;;
+
+ -gas | --gas | --ga | --g)
+ # Obsolete; use --with-gas.
+ with_gas=yes ;;
+
+ -help | --help | --hel | --he | -h)
+ ac_init_help=long ;;
+ -help=r* | --help=r* | --hel=r* | --he=r* | -hr*)
+ ac_init_help=recursive ;;
+ -help=s* | --help=s* | --hel=s* | --he=s* | -hs*)
+ ac_init_help=short ;;
+
+ -host | --host | --hos | --ho)
+ ac_prev=host_alias ;;
+ -host=* | --host=* | --hos=* | --ho=*)
+ host_alias=$ac_optarg ;;
+
+ -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht)
+ ac_prev=htmldir ;;
+ -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \
+ | --ht=*)
+ htmldir=$ac_optarg ;;
+
+ -includedir | --includedir | --includedi | --included | --include \
+ | --includ | --inclu | --incl | --inc)
+ ac_prev=includedir ;;
+ -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+ | --includ=* | --inclu=* | --incl=* | --inc=*)
+ includedir=$ac_optarg ;;
+
+ -infodir | --infodir | --infodi | --infod | --info | --inf)
+ ac_prev=infodir ;;
+ -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+ infodir=$ac_optarg ;;
+
+ -libdir | --libdir | --libdi | --libd)
+ ac_prev=libdir ;;
+ -libdir=* | --libdir=* | --libdi=* | --libd=*)
+ libdir=$ac_optarg ;;
+
+ -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+ | --libexe | --libex | --libe)
+ ac_prev=libexecdir ;;
+ -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+ | --libexe=* | --libex=* | --libe=*)
+ libexecdir=$ac_optarg ;;
+
+ -localedir | --localedir | --localedi | --localed | --locale)
+ ac_prev=localedir ;;
+ -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*)
+ localedir=$ac_optarg ;;
+
+ -localstatedir | --localstatedir | --localstatedi | --localstated \
+ | --localstate | --localstat | --localsta | --localst | --locals)
+ ac_prev=localstatedir ;;
+ -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+ | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*)
+ localstatedir=$ac_optarg ;;
+
+ -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+ ac_prev=mandir ;;
+ -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+ mandir=$ac_optarg ;;
+
+ -nfp | --nfp | --nf)
+ # Obsolete; use --without-fp.
+ with_fp=no ;;
+
+ -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+ | --no-cr | --no-c | -n)
+ no_create=yes ;;
+
+ -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+ no_recursion=yes ;;
+
+ -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+ | --oldin | --oldi | --old | --ol | --o)
+ ac_prev=oldincludedir ;;
+ -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+ oldincludedir=$ac_optarg ;;
+
+ -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+ ac_prev=prefix ;;
+ -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+ prefix=$ac_optarg ;;
+
+ -program-prefix | --program-prefix | --program-prefi | --program-pref \
+ | --program-pre | --program-pr | --program-p)
+ ac_prev=program_prefix ;;
+ -program-prefix=* | --program-prefix=* | --program-prefi=* \
+ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+ program_prefix=$ac_optarg ;;
+
+ -program-suffix | --program-suffix | --program-suffi | --program-suff \
+ | --program-suf | --program-su | --program-s)
+ ac_prev=program_suffix ;;
+ -program-suffix=* | --program-suffix=* | --program-suffi=* \
+ | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+ program_suffix=$ac_optarg ;;
+
+ -program-transform-name | --program-transform-name \
+ | --program-transform-nam | --program-transform-na \
+ | --program-transform-n | --program-transform- \
+ | --program-transform | --program-transfor \
+ | --program-transfo | --program-transf \
+ | --program-trans | --program-tran \
+ | --progr-tra | --program-tr | --program-t)
+ ac_prev=program_transform_name ;;
+ -program-transform-name=* | --program-transform-name=* \
+ | --program-transform-nam=* | --program-transform-na=* \
+ | --program-transform-n=* | --program-transform-=* \
+ | --program-transform=* | --program-transfor=* \
+ | --program-transfo=* | --program-transf=* \
+ | --program-trans=* | --program-tran=* \
+ | --progr-tra=* | --program-tr=* | --program-t=*)
+ program_transform_name=$ac_optarg ;;
+
+ -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd)
+ ac_prev=pdfdir ;;
+ -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*)
+ pdfdir=$ac_optarg ;;
+
+ -psdir | --psdir | --psdi | --psd | --ps)
+ ac_prev=psdir ;;
+ -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*)
+ psdir=$ac_optarg ;;
+
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ silent=yes ;;
+
+ -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+ ac_prev=sbindir ;;
+ -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+ | --sbi=* | --sb=*)
+ sbindir=$ac_optarg ;;
+
+ -sharedstatedir | --sharedstatedir | --sharedstatedi \
+ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+ | --sharedst | --shareds | --shared | --share | --shar \
+ | --sha | --sh)
+ ac_prev=sharedstatedir ;;
+ -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+ | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+ | --sha=* | --sh=*)
+ sharedstatedir=$ac_optarg ;;
+
+ -site | --site | --sit)
+ ac_prev=site ;;
+ -site=* | --site=* | --sit=*)
+ site=$ac_optarg ;;
+
+ -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+ ac_prev=srcdir ;;
+ -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+ srcdir=$ac_optarg ;;
+
+ -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+ | --syscon | --sysco | --sysc | --sys | --sy)
+ ac_prev=sysconfdir ;;
+ -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+ sysconfdir=$ac_optarg ;;
+
+ -target | --target | --targe | --targ | --tar | --ta | --t)
+ ac_prev=target_alias ;;
+ -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+ target_alias=$ac_optarg ;;
+
+ -v | -verbose | --verbose | --verbos | --verbo | --verb)
+ verbose=yes ;;
+
+ -version | --version | --versio | --versi | --vers | -V)
+ ac_init_version=: ;;
+
+ -with-* | --with-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2
+ { (exit 1); exit 1; }; }
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"with_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval with_$ac_useropt=\$ac_optarg ;;
+
+ -without-* | --without-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2
+ { (exit 1); exit 1; }; }
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"with_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval with_$ac_useropt=no ;;
+
+ --x)
+ # Obsolete; use --with-x.
+ with_x=yes ;;
+
+ -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+ | --x-incl | --x-inc | --x-in | --x-i)
+ ac_prev=x_includes ;;
+ -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+ x_includes=$ac_optarg ;;
+
+ -x-libraries | --x-libraries | --x-librarie | --x-librari \
+ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+ ac_prev=x_libraries ;;
+ -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+ x_libraries=$ac_optarg ;;
+
+ -*) { $as_echo "$as_me: error: unrecognized option: $ac_option
+Try \`$0 --help' for more information." >&2
+ { (exit 1); exit 1; }; }
+ ;;
+
+ # A VAR=VALUE argument: validate the name, then assign and export it.
+ *=*)
+ ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null &&
+ { $as_echo "$as_me: error: invalid variable name: $ac_envvar" >&2
+ { (exit 1); exit 1; }; }
+ eval $ac_envvar=\$ac_optarg
+ export $ac_envvar ;;
+
+ *)
+ # FIXME: should be removed in autoconf 3.0.
+ $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2
+ expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+ $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2
+ : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}
+ ;;
+
+ esac
+done
+
+# ac_prev still set here means the last option on the command line was
+# missing its value; rebuild the option name for the error message.
+if test -n "$ac_prev"; then
+ ac_option=--`echo $ac_prev | sed 's/_/-/g'`
+ { $as_echo "$as_me: error: missing argument to $ac_option" >&2
+ { (exit 1); exit 1; }; }
+fi
+
+# Report --enable-*/--with-* options that are not in ac_user_opts. The value
+# of enable_option_checking selects the reaction: "no" stays silent, "fatal"
+# aborts, anything else only warns.
+if test -n "$ac_unrecognized_opts"; then
+ case $enable_option_checking in
+ no) ;;
+ fatal) { $as_echo "$as_me: error: unrecognized options: $ac_unrecognized_opts" >&2
+ { (exit 1); exit 1; }; } ;;
+ *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
+ esac
+fi
+
+# Check all directory arguments for consistency.
+# Accepted values start with `/`, `\`, `$`, or a drive prefix such as `C:\`.
+# NONE or an empty value is accepted only for variables whose name ends in
+# "prefix". Everything else is rejected with an error.
+for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
+ datadir sysconfdir sharedstatedir localstatedir includedir \
+ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
+ libdir localedir mandir
+do
+ eval ac_val=\$$ac_var
+ # Remove trailing slashes.
+ case $ac_val in
+ */ )
+ ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'`
+ eval $ac_var=\$ac_val;;
+ esac
+ # Be sure to have absolute directory names.
+ case $ac_val in
+ [\\/$]* | ?:[\\/]* ) continue;;
+ NONE | '' ) case $ac_var in *prefix ) continue;; esac;;
+ esac
+ { $as_echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2
+ { (exit 1); exit 1; }; }
+done
+
+# There might be people who depend on the old broken behavior: `$host'
+# used to hold the argument of --host etc.
+# FIXME: To remove some day.
+build=$build_alias
+host=$host_alias
+target=$target_alias
+
+# FIXME: To remove some day.
+# --host without --build gives cross_compiling=maybe plus a warning;
+# --host differing from --build gives cross_compiling=yes.
+if test "x$host_alias" != x; then
+ if test "x$build_alias" = x; then
+ cross_compiling=maybe
+ $as_echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host.
+ If a cross compiler is detected then cross compile mode will be used." >&2
+ elif test "x$build_alias" != "x$host_alias"; then
+ cross_compiling=yes
+ fi
+fi
+
+# Tool-name prefix: "$host_alias-" when --host was given, empty otherwise.
+ac_tool_prefix=
+test -n "$host_alias" && ac_tool_prefix=$host_alias-
+
+# In silent mode, discard everything written to fd 6.
+test "$silent" = yes && exec 6>/dev/null
+
+
+# Make sure `pwd` really names the working directory: the `ls -di .` output
+# taken here must equal the one taken after cd-ing to the path pwd printed.
+ac_pwd=`pwd` && test -n "$ac_pwd" &&
+ac_ls_di=`ls -di .` &&
+ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` ||
+ { $as_echo "$as_me: error: working directory cannot be determined" >&2
+ { (exit 1); exit 1; }; }
+test "X$ac_ls_di" = "X$ac_pwd_ls_di" ||
+ { $as_echo "$as_me: error: pwd does not report name of working directory" >&2
+ { (exit 1); exit 1; }; }
+
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+ ac_srcdir_defaulted=yes
+ # Try the directory containing this script, then the parent directory.
+ # The directory part of $as_myself is computed with $as_dirname, then
+ # $as_expr, then a sed script, using the first one that succeeds.
+ ac_confdir=`$as_dirname -- "$as_myself" ||
+$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_myself" : 'X\(//\)[^/]' \| \
+ X"$as_myself" : 'X\(//\)$' \| \
+ X"$as_myself" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_myself" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ srcdir=$ac_confdir
+ if test ! -r "$srcdir/$ac_unique_file"; then
+ srcdir=..
+ fi
+else
+ ac_srcdir_defaulted=no
+fi
+# Give up if the expected file is not readable in the chosen directory.
+if test ! -r "$srcdir/$ac_unique_file"; then
+ test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
+ { $as_echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2
+ { (exit 1); exit 1; }; }
+fi
+# Check that srcdir can be entered and capture its absolute path.
+ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
+ac_abs_confdir=`(
+ cd "$srcdir" && test -r "./$ac_unique_file" || { $as_echo "$as_me: error: $ac_msg" >&2
+ { (exit 1); exit 1; }; }
+ pwd)`
+# When building in place, set srcdir=.
+if test "$ac_abs_confdir" = "$ac_pwd"; then
+ srcdir=.
+fi
+# Remove unnecessary trailing slashes from srcdir.
+# Double slashes in file names in object file debugging info
+# mess up M-x gdb in Emacs.
+case $srcdir in
+*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
+esac
+# For each precious variable record whether it is set and its value, twice:
+# under ac_env_<var>_* and under ac_cv_env_<var>_*. The ac_cv_env_* copies are
+# presumably replaced when a cache file is sourced later, which is what makes
+# the later old/new comparison meaningful.
+for ac_var in $ac_precious_vars; do
+ eval ac_env_${ac_var}_set=\${${ac_var}+set}
+ eval ac_env_${ac_var}_value=\$${ac_var}
+ eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
+ eval ac_cv_env_${ac_var}_value=\$${ac_var}
+done
+
+#
+# Report the --help message.
+#
+# The first here-document is unquoted, so $0 and $ac_default_prefix are
+# expanded; the backslashes keep backquotes and `$HOME' literal.
+if test "$ac_init_help" = "long"; then
+ # Omit some internal or obsolete options to make the list less imposing.
+ # This message is too long to be a string in the A/UX 3.1 sh.
+ cat <<_ACEOF
+\`configure' configures SPP 1.7 to adapt to many kinds of systems.
+
+Usage: $0 [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE. See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+ -h, --help display this help and exit
+ --help=short display options specific to this package
+ --help=recursive display the short help of all the included packages
+ -V, --version display version information and exit
+ -q, --quiet, --silent do not print \`checking...' messages
+ --cache-file=FILE cache test results in FILE [disabled]
+ -C, --config-cache alias for \`--cache-file=config.cache'
+ -n, --no-create do not create output files
+ --srcdir=DIR find the sources in DIR [configure dir or \`..']
+
+Installation directories:
+ --prefix=PREFIX install architecture-independent files in PREFIX
+ [$ac_default_prefix]
+ --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX
+ [PREFIX]
+
+By default, \`make install' will install all the files in
+\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify
+an installation prefix other than \`$ac_default_prefix' using \`--prefix',
+for instance \`--prefix=\$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+ --bindir=DIR user executables [EPREFIX/bin]
+ --sbindir=DIR system admin executables [EPREFIX/sbin]
+ --libexecdir=DIR program executables [EPREFIX/libexec]
+ --sysconfdir=DIR read-only single-machine data [PREFIX/etc]
+ --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
+ --localstatedir=DIR modifiable single-machine data [PREFIX/var]
+ --libdir=DIR object code libraries [EPREFIX/lib]
+ --includedir=DIR C header files [PREFIX/include]
+ --oldincludedir=DIR C header files for non-gcc [/usr/include]
+ --datarootdir=DIR read-only arch.-independent data root [PREFIX/share]
+ --datadir=DIR read-only architecture-independent data [DATAROOTDIR]
+ --infodir=DIR info documentation [DATAROOTDIR/info]
+ --localedir=DIR locale-dependent data [DATAROOTDIR/locale]
+ --mandir=DIR man documentation [DATAROOTDIR/man]
+ --docdir=DIR documentation root [DATAROOTDIR/doc/spp]
+ --htmldir=DIR html documentation [DOCDIR]
+ --dvidir=DIR dvi documentation [DOCDIR]
+ --pdfdir=DIR pdf documentation [DOCDIR]
+ --psdir=DIR ps documentation [DOCDIR]
+_ACEOF
+
+ cat <<\_ACEOF
+_ACEOF
+fi
+
+# Printed for every --help variant. The "Configuration of ..." heading is
+# added only for the short and recursive forms. The here-document is quoted
+# (<<\_ACEOF), so nothing inside it is expanded.
+if test -n "$ac_init_help"; then
+ case $ac_init_help in
+ short | recursive ) echo "Configuration of SPP 1.7:";;
+ esac
+ cat <<\_ACEOF
+
+Some influential environment variables:
+ CC C compiler command
+ CFLAGS C compiler flags
+ LDFLAGS linker flags, e.g. -L<lib dir> if you have libraries in a
+ nonstandard directory <lib dir>
+ LIBS libraries to pass to the linker, e.g. -l<library>
+ CPPFLAGS C/C++/Objective C preprocessor flags, e.g. -I<include dir> if
+ you have headers in a nonstandard directory <include dir>
+
+Use these variables to override the choices made by `configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+_ACEOF
+ac_status=$?
+fi
+
+# --help=recursive: visit each directory in $ac_subdirs_all and run its own
+# configure with --help=recursive, collecting any failure in ac_status.
+if test "$ac_init_help" = "recursive"; then
+ # If there are subdirs, report their specific --help.
+ # The leading `:` is a placeholder word that is skipped immediately.
+ for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue
+ test -d "$ac_dir" ||
+ { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } ||
+ continue
+ ac_builddir=.
+
+# Derive the path from $ac_dir back up to the top build directory.
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+ ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+ # A ".." for each directory in $ac_dir_suffix.
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+ case $ac_top_builddir_sub in
+ "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+ *) ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+ esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+# Derive the matching source-directory names from $srcdir.
+case $srcdir in
+ .) # We are building in place.
+ ac_srcdir=.
+ ac_top_srcdir=$ac_top_builddir_sub
+ ac_abs_top_srcdir=$ac_pwd ;;
+ [\\/]* | ?:[\\/]* ) # Absolute name.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir
+ ac_abs_top_srcdir=$srcdir ;;
+ *) # Relative name.
+ ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_build_prefix$srcdir
+ ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+ cd "$ac_dir" || { ac_status=$?; continue; }
+ # Check for guested configure.
+ # configure.gnu takes precedence over configure when both exist.
+ if test -f "$ac_srcdir/configure.gnu"; then
+ echo &&
+ $SHELL "$ac_srcdir/configure.gnu" --help=recursive
+ elif test -f "$ac_srcdir/configure"; then
+ echo &&
+ $SHELL "$ac_srcdir/configure" --help=recursive
+ else
+ $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+ fi || ac_status=$?
+ cd "$ac_pwd" || { ac_status=$?; break; }
+ done
+fi
+
+# Any --help variant ends the run here with the status collected so far.
+test -n "$ac_init_help" && exit $ac_status
+# --version / -V: print the banner and exit.
+if $ac_init_version; then
+ cat <<\_ACEOF
+SPP configure 1.7
+generated by GNU Autoconf 2.63
+
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+_ACEOF
+ exit
+fi
+cat >config.log <<_ACEOF
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by SPP $as_me 1.7, which was
+generated by GNU Autoconf 2.63. Invocation command line was
+
+ $ $0 $@
+
+_ACEOF
+exec 5>>config.log
+{
+cat <<_ASUNAME
+## --------- ##
+## Platform. ##
+## --------- ##
+
+hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
+/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown`
+
+/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown`
+/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
+/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown`
+/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown`
+/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown`
+/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown`
+
+_ASUNAME
+
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ $as_echo "PATH: $as_dir"
+done
+IFS=$as_save_IFS
+
+} >&5
+
+cat >&5 <<_ACEOF
+
+
+## ----------- ##
+## Core tests. ##
+## ----------- ##
+
+_ACEOF
+
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Strip out --silent because we don't want to record it for future runs.
+# Also quote any args containing shell meta-characters.
+# Make two passes to allow for proper duplicate-argument suppression.
+# Pass 1 collects every kept argument in ac_configure_args0. Pass 2 builds
+# ac_configure_args and skips a self-contained option when the same text
+# occurs again later in the pass-1 list. ac_must_keep_next forces the word
+# after any other dash option to be kept, since it may be that option's value.
+ac_configure_args=
+ac_configure_args0=
+ac_configure_args1=
+ac_must_keep_next=false
+for ac_pass in 1 2
+do
+ for ac_arg
+ do
+ case $ac_arg in
+ -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ continue ;;
+ *\'*)
+ ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+ esac
+ case $ac_pass in
+ 1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;;
+ 2)
+ ac_configure_args1="$ac_configure_args1 '$ac_arg'"
+ if test $ac_must_keep_next = true; then
+ ac_must_keep_next=false # Got value, back to normal.
+ else
+ case $ac_arg in
+ *=* | --config-cache | -C | -disable-* | --disable-* \
+ | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \
+ | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \
+ | -with-* | --with-* | -without-* | --without-* | --x)
+ case "$ac_configure_args0 " in
+ "$ac_configure_args1"*" '$ac_arg' "* ) continue ;;
+ esac
+ ;;
+ -* ) ac_must_keep_next=true ;;
+ esac
+ fi
+ ac_configure_args="$ac_configure_args '$ac_arg'"
+ ;;
+ esac
+ done
+done
+# Discard the two scratch lists. If $as_unset fails and the variable is still
+# set, fall back to exporting it with an empty value.
+$as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; }
+$as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; }
+
+# When interrupted or exit'd, cleanup temporary files, and complete
+# config.log. We remove comments because anyway the quotes in there
+# would cause problems or look ugly.
+# WARNING: Use '\'' to represent an apostrophe within the trap.
+# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
+trap 'exit_status=$?
+ # Save into config.log some information that might help in debugging.
+ {
+ echo
+
+ cat <<\_ASBOX
+## ---------------- ##
+## Cache variables. ##
+## ---------------- ##
+_ASBOX
+ echo
+ # The following way of writing the cache mishandles newlines in values,
+(
+ for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do
+ eval ac_val=\$$ac_var
+ case $ac_val in #(
+ *${as_nl}*)
+ case $ac_var in #(
+ *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+ esac
+ case $ac_var in #(
+ _ | IFS | as_nl) ;; #(
+ BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+ *) $as_unset $ac_var ;;
+ esac ;;
+ esac
+ done
+ (set) 2>&1 |
+ case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #(
+ *${as_nl}ac_space=\ *)
+ sed -n \
+ "s/'\''/'\''\\\\'\'''\''/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p"
+ ;; #(
+ *)
+ sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+ ;;
+ esac |
+ sort
+)
+ echo
+
+ cat <<\_ASBOX
+## ----------------- ##
+## Output variables. ##
+## ----------------- ##
+_ASBOX
+ echo
+ for ac_var in $ac_subst_vars
+ do
+ eval ac_val=\$$ac_var
+ case $ac_val in
+ *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+ esac
+ $as_echo "$ac_var='\''$ac_val'\''"
+ done | sort
+ echo
+
+ if test -n "$ac_subst_files"; then
+ cat <<\_ASBOX
+## ------------------- ##
+## File substitutions. ##
+## ------------------- ##
+_ASBOX
+ echo
+ for ac_var in $ac_subst_files
+ do
+ eval ac_val=\$$ac_var
+ case $ac_val in
+ *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+ esac
+ $as_echo "$ac_var='\''$ac_val'\''"
+ done | sort
+ echo
+ fi
+
+ if test -s confdefs.h; then
+ cat <<\_ASBOX
+## ----------- ##
+## confdefs.h. ##
+## ----------- ##
+_ASBOX
+ echo
+ cat confdefs.h
+ echo
+ fi
+ test "$ac_signal" != 0 &&
+ $as_echo "$as_me: caught signal $ac_signal"
+ $as_echo "$as_me: exit $exit_status"
+ } >&5
+ rm -f core *.core core.conftest.* &&
+ rm -f -r conftest* confdefs* conf$$* $ac_clean_files &&
+ exit $exit_status
+' 0
+for ac_signal in 1 2 13 15; do
+ trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal
+done
+ac_signal=0
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -f -r conftest* confdefs.h
+
+# Predefined preprocessor variables.
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_NAME "$PACKAGE_NAME"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_TARNAME "$PACKAGE_TARNAME"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_VERSION "$PACKAGE_VERSION"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_STRING "$PACKAGE_STRING"
+_ACEOF
+
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
+_ACEOF
+
+
+# Let the site file select an alternate cache file if it wants to.
+# Prefer an explicitly selected file to automatically selected ones.
+# $CONFIG_SITE, when set, is the only candidate. Otherwise share/config.site
+# and etc/config.site are tried under --prefix, or under the default prefix
+# when --prefix was not given.
+ac_site_file1=NONE
+ac_site_file2=NONE
+if test -n "$CONFIG_SITE"; then
+ ac_site_file1=$CONFIG_SITE
+elif test "x$prefix" != xNONE; then
+ ac_site_file1=$prefix/share/config.site
+ ac_site_file2=$prefix/etc/config.site
+else
+ ac_site_file1=$ac_default_prefix/share/config.site
+ ac_site_file2=$ac_default_prefix/etc/config.site
+fi
+# Each readable candidate is copied into config.log with a "| " prefix and
+# then sourced into this shell.
+for ac_site_file in "$ac_site_file1" "$ac_site_file2"
+do
+ test "x$ac_site_file" = xNONE && continue
+ if test -r "$ac_site_file"; then
+ { $as_echo "$as_me:$LINENO: loading site script $ac_site_file" >&5
+$as_echo "$as_me: loading site script $ac_site_file" >&6;}
+ sed 's/^/| /' "$ac_site_file" >&5
+ . "$ac_site_file"
+ fi
+done
+
+if test -r "$cache_file"; then
+ # Some versions of bash fail to source /dev/null (or any other special
+ # file), so the cache is only sourced when it is a regular file.
+ if test -f "$cache_file"; then
+ { $as_echo "$as_me:$LINENO: loading cache $cache_file" >&5
+$as_echo "$as_me: loading cache $cache_file" >&6;}
+ case $cache_file in
+ [\\/]* | ?:[\\/]* ) . "$cache_file";;
+ *) . "./$cache_file";;
+ esac
+ fi
+else
+ { $as_echo "$as_me:$LINENO: creating cache $cache_file" >&5
+$as_echo "$as_me: creating cache $cache_file" >&6;}
+ >$cache_file
+fi
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in $ac_precious_vars; do
+ eval ac_old_set=\$ac_cv_env_${ac_var}_set
+ eval ac_new_set=\$ac_env_${ac_var}_set
+ eval ac_old_val=\$ac_cv_env_${ac_var}_value
+ eval ac_new_val=\$ac_env_${ac_var}_value
+ case $ac_old_set,$ac_new_set in
+ set,)
+ { $as_echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,set)
+ { $as_echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,);;
+ *)
+ if test "x$ac_old_val" != "x$ac_new_val"; then
+ # Differences that are only in whitespace do not lead to failure.
+ ac_old_val_w=`echo x $ac_old_val`
+ ac_new_val_w=`echo x $ac_new_val`
+ if test "$ac_old_val_w" != "$ac_new_val_w"; then
+ { $as_echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5
+$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
+ ac_cache_corrupted=:
+ else
+ { $as_echo "$as_me:$LINENO: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5
+$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;}
+ eval $ac_var=\$ac_old_val
+ fi
+ { $as_echo "$as_me:$LINENO: former value: \`$ac_old_val'" >&5
+$as_echo "$as_me: former value: \`$ac_old_val'" >&2;}
+ { $as_echo "$as_me:$LINENO: current value: \`$ac_new_val'" >&5
+$as_echo "$as_me: current value: \`$ac_new_val'" >&2;}
+ fi;;
+ esac
+ # Pass precious variables to config.status.
+ if test "$ac_new_set" = set; then
+ case $ac_new_val in
+ *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+ *) ac_arg=$ac_var=$ac_new_val ;;
+ esac
+ case " $ac_configure_args " in
+ *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy.
+ *) ac_configure_args="$ac_configure_args '$ac_arg'" ;;
+ esac
+ fi
+done
+if $ac_cache_corrupted; then
+ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+ { $as_echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5
+$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;}
+ { { $as_echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5
+$as_echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;}
+ { (exit 1); exit 1; }; }
+fi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}gcc; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_CC="${ac_tool_prefix}gcc"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:$LINENO: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_CC"; then
+ ac_ct_CC=$CC
+ # Extract the first word of "gcc", so it can be a program name with args.
+set dummy gcc; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_ac_ct_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$ac_ct_CC"; then
+ ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_ac_ct_CC="gcc"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+ { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+ if test "x$ac_ct_CC" = x; then
+ CC=""
+ else
+ case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+ CC=$ac_ct_CC
+ fi
+else
+ CC="$ac_cv_prog_CC"
+fi
+
+if test -z "$CC"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}cc; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_CC="${ac_tool_prefix}cc"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:$LINENO: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ fi
+fi
+if test -z "$CC"; then
+ # Extract the first word of "cc", so it can be a program name with args.
+set dummy cc; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+ ac_prog_rejected=no
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
+ ac_prog_rejected=yes
+ continue
+ fi
+ ac_cv_prog_CC="cc"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+if test $ac_prog_rejected = yes; then
+ # We found the bogus /usr/ucb/cc in the path, so make sure we never use it.
+ set dummy $ac_cv_prog_CC
+ shift
+ if test $# != 0; then
+ # We chose a different compiler from the bogus one.
+ # However, it has the same basename, so the bogus one would be chosen
+ # first if we set CC to just the basename; use the full file name instead.
+ shift
+ ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@"
+ fi
+fi
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:$LINENO: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$CC"; then
+ if test -n "$ac_tool_prefix"; then
+ for ac_prog in cl.exe
+ do
+ # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:$LINENO: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ test -n "$CC" && break
+ done
+fi
+if test -z "$CC"; then
+ ac_ct_CC=$CC
+ for ac_prog in cl.exe
+do
+ # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_prog_ac_ct_CC+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$ac_ct_CC"; then
+ ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+ ac_cv_prog_ac_ct_CC="$ac_prog"
+ $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+ { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+ { $as_echo "$as_me:$LINENO: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ test -n "$ac_ct_CC" && break
+done
+
+ if test "x$ac_ct_CC" = x; then
+ CC=""
+ else
+ case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+ CC=$ac_ct_CC
+ fi
+fi
+
+fi
+
+
+test -z "$CC" && { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+
+# Provide some information about the compiler.
+$as_echo "$as_me:$LINENO: checking for C compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+{ (ac_try="$ac_compiler --version >&5"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compiler --version >&5") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }
+{ (ac_try="$ac_compiler -v >&5"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compiler -v >&5") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }
+{ (ac_try="$ac_compiler -V >&5"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compiler -V >&5") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }
+
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out"
+# Try to create an executable without -o first, disregarding a.out.
+# This helps us diagnose broken compilers and gives us a first guess at
+# the executable suffix (exeext).
+{ $as_echo "$as_me:$LINENO: checking for C compiler default output file name" >&5
+$as_echo_n "checking for C compiler default output file name... " >&6; }
+ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
+
+# The possible output files:
+ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*"
+
+ac_rmfiles=
+for ac_file in $ac_files
+do
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+ * ) ac_rmfiles="$ac_rmfiles $ac_file";;
+ esac
+done
+rm -f $ac_rmfiles
+
+if { (ac_try="$ac_link_default"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_link_default") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; then
+ # Autoconf-2.13 could set the ac_cv_exeext variable to `no'.
+# So ignore a value of `no', otherwise this would lead to `EXEEXT = no'
+# in a Makefile. We should not override ac_cv_exeext if it was cached,
+# so that the user can short-circuit this test for compilers unknown to
+# Autoconf.
+for ac_file in $ac_files ''
+do
+ test -f "$ac_file" || continue
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj )
+ ;;
+ [ab].out )
+ # We found the default executable, but exeext='' is most
+ # certainly right.
+ break;;
+ *.* )
+ if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
+ then :; else
+ ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+ fi
+ # We set ac_cv_exeext here because the later test for it is not
+ # safe: cross compilers may not add the suffix if given an `-o'
+ # argument, so we may need to know it at that point already.
+ # Even if this section looks crufty: it has the advantage of
+ # actually working.
+ break;;
+ * )
+ break;;
+ esac
+done
+test "$ac_cv_exeext" = no && ac_cv_exeext=
+
+else
+ ac_file=''
+fi
+
+{ $as_echo "$as_me:$LINENO: result: $ac_file" >&5
+$as_echo "$ac_file" >&6; }
+if test -z "$ac_file"; then
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: C compiler cannot create executables
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: C compiler cannot create executables
+See \`config.log' for more details." >&2;}
+ { (exit 77); exit 77; }; }; }
+fi
+
+ac_exeext=$ac_cv_exeext
+
+# Check that the compiler produces executables we can run. If not, either
+# the compiler is broken, or we cross compile.
+{ $as_echo "$as_me:$LINENO: checking whether the C compiler works" >&5
+$as_echo_n "checking whether the C compiler works... " >&6; }
+# FIXME: These cross compiler hacks should be removed for Autoconf 3.0
+# If not cross compiling, check that we can run a simple program.
+if test "$cross_compiling" != yes; then
+ if { ac_try='./$ac_file'
+ { (case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ cross_compiling=no
+ else
+ if test "$cross_compiling" = maybe; then
+ cross_compiling=yes
+ else
+ { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: cannot run C compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot run C compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+ fi
+ fi
+fi
+{ $as_echo "$as_me:$LINENO: result: yes" >&5
+$as_echo "yes" >&6; }
+
+rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out
+ac_clean_files=$ac_clean_files_save
+# Report whether we are cross compiling, as determined by the run check
+# above; no new test is performed here.
+{ $as_echo "$as_me:$LINENO: checking whether we are cross compiling" >&5
+$as_echo_n "checking whether we are cross compiling... " >&6; }
+{ $as_echo "$as_me:$LINENO: result: $cross_compiling" >&5
+$as_echo "$cross_compiling" >&6; }
+
+{ $as_echo "$as_me:$LINENO: checking for suffix of executables" >&5
+$as_echo_n "checking for suffix of executables... " >&6; }
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; then
+ # If both `conftest.exe' and `conftest' are `present' (well, observable)
+# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will
+# work properly (i.e., refer to `conftest.exe'), while it won't with
+# `rm'.
+for ac_file in conftest.exe conftest conftest.*; do
+ test -f "$ac_file" || continue
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+ *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+ break;;
+ * ) break;;
+ esac
+done
+else
+ { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+fi
+
+rm -f conftest$ac_cv_exeext
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_exeext" >&5
+$as_echo "$ac_cv_exeext" >&6; }
+
+rm -f conftest.$ac_ext
+EXEEXT=$ac_cv_exeext
+ac_exeext=$EXEEXT
+{ $as_echo "$as_me:$LINENO: checking for suffix of object files" >&5
+$as_echo_n "checking for suffix of object files... " >&6; }
+if test "${ac_cv_objext+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.o conftest.obj
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; then
+ for ac_file in conftest.o conftest.obj conftest.*; do
+ test -f "$ac_file" || continue;
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;;
+ *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'`
+ break;;
+ esac
+done
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of object files: cannot compile
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compute suffix of object files: cannot compile
+See \`config.log' for more details." >&2;}
+ { (exit 1); exit 1; }; }; }
+fi
+
+rm -f conftest.$ac_cv_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_objext" >&5
+$as_echo "$ac_cv_objext" >&6; }
+OBJEXT=$ac_cv_objext
+ac_objext=$OBJEXT
+{ $as_echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5
+$as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
+if test "${ac_cv_c_compiler_gnu+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+#ifndef __GNUC__
+ choke me
+#endif
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_compiler_gnu=yes
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_compiler_gnu=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_c_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5
+$as_echo "$ac_cv_c_compiler_gnu" >&6; }
+if test $ac_compiler_gnu = yes; then
+ GCC=yes
+else
+ GCC=
+fi
+ac_test_CFLAGS=${CFLAGS+set}
+ac_save_CFLAGS=$CFLAGS
+{ $as_echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5
+$as_echo_n "checking whether $CC accepts -g... " >&6; }
+if test "${ac_cv_prog_cc_g+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ ac_save_c_werror_flag=$ac_c_werror_flag
+ ac_c_werror_flag=yes
+ ac_cv_prog_cc_g=no
+ CFLAGS="-g"
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_cv_prog_cc_g=yes
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ CFLAGS=""
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ :
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_c_werror_flag=$ac_save_c_werror_flag
+ CFLAGS="-g"
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_cv_prog_cc_g=yes
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ ac_c_werror_flag=$ac_save_c_werror_flag
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5
+$as_echo "$ac_cv_prog_cc_g" >&6; }
+if test "$ac_test_CFLAGS" = set; then
+ CFLAGS=$ac_save_CFLAGS
+elif test $ac_cv_prog_cc_g = yes; then
+ if test "$GCC" = yes; then
+ CFLAGS="-g -O2"
+ else
+ CFLAGS="-g"
+ fi
+else
+ if test "$GCC" = yes; then
+ CFLAGS="-O2"
+ else
+ CFLAGS=
+ fi
+fi
+{ $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5
+$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
+if test "${ac_cv_prog_cc_c89+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <stdarg.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+ char **p;
+ int i;
+{
+ return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+ char *s;
+ va_list v;
+ va_start (v,p);
+ s = g (p, va_arg (v,int));
+ va_end (v);
+ return s;
+}
+
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has
+ function prototypes and stuff, but not '\xHH' hex character constants.
+ These don't provoke an error unfortunately, instead are silently treated
+ as 'x'. The following induces an error, until -std is added to get
+ proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an
+ array size at least. It's necessary to write '\x00'==0 to get something
+ that's true only with -std. */
+int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+ inside strings and character constants. */
+#define FOO(x) 'x'
+int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+int
+main ()
+{
+return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1];
+ ;
+ return 0;
+}
+_ACEOF
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
+ -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+ CC="$ac_save_CC $ac_arg"
+ rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_cv_prog_cc_c89=$ac_arg
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext
+ test "x$ac_cv_prog_cc_c89" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+
+fi
+# End of AC_CACHE_VAL: report the C89 result and append any needed option to CC.
+case "x$ac_cv_prog_cc_c89" in
+ x)
+ { $as_echo "$as_me:$LINENO: result: none needed" >&5
+$as_echo "none needed" >&6; } ;;
+ xno)
+ { $as_echo "$as_me:$LINENO: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;;
+ *)
+ CC="$CC $ac_cv_prog_cc_c89"
+ { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5
+$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
+esac
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+{ $as_echo "$as_me:$LINENO: checking for BZ2_bzDecompressInit in -lbz2" >&5
+$as_echo_n "checking for BZ2_bzDecompressInit in -lbz2... " >&6; }
+if test "${ac_cv_lib_bz2_BZ2_bzDecompressInit+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ ac_check_lib_save_LIBS=$LIBS
+LIBS="-lbz2 $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+
+/* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char BZ2_bzDecompressInit ();
+int
+main ()
+{
+return BZ2_bzDecompressInit ();
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+ (eval "$ac_link") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest$ac_exeext && {
+ test "$cross_compiling" = yes ||
+ $as_test_x conftest$ac_exeext
+ }; then
+ ac_cv_lib_bz2_BZ2_bzDecompressInit=yes
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_cv_lib_bz2_BZ2_bzDecompressInit=no
+fi
+
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+ conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_lib_bz2_BZ2_bzDecompressInit" >&5
+$as_echo "$ac_cv_lib_bz2_BZ2_bzDecompressInit" >&6; }
+if test "x$ac_cv_lib_bz2_BZ2_bzDecompressInit" = x""yes; then
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBBZ2 1
+_ACEOF
+
+ LIBS="-lbz2 $LIBS"
+
+fi
+
+
+ac_config_files="$ac_config_files src/Makevars"
+
+cp confdefs.h src/config.h
+cat >confcache <<\_ACEOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs, see configure's option --config-cache.
+# It is not useful on other systems. If it contains results you don't
+# want to keep, you may remove or edit it.
+#
+# config.status only pays attention to the cache file if you give it
+# the --recheck option to rerun configure.
+#
+# `ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* `ac_cv_foo' will be assigned the
+# following values.
+
+_ACEOF
+
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, we kill variables containing newlines.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(
+ for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do
+ eval ac_val=\$$ac_var
+ case $ac_val in #(
+ *${as_nl}*)
+ case $ac_var in #(
+ *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+ esac
+ case $ac_var in #(
+ _ | IFS | as_nl) ;; #(
+ BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+ *) $as_unset $ac_var ;;
+ esac ;;
+ esac
+ done
+
+ (set) 2>&1 |
+ case $as_nl`(ac_space=' '; set) 2>&1` in #(
+ *${as_nl}ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote
+ # substitution turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ "s/'/'\\\\''/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+ ;; #(
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+ ;;
+ esac |
+ sort
+) |
+ sed '
+ /^ac_cv_env_/b end
+ t clear
+ :clear
+ s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+ t end
+ s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+ :end' >>confcache
+if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
+ if test -w "$cache_file"; then
+ test "x$cache_file" != "x/dev/null" &&
+ { $as_echo "$as_me:$LINENO: updating cache $cache_file" >&5
+$as_echo "$as_me: updating cache $cache_file" >&6;}
+ cat confcache >$cache_file
+ else
+ { $as_echo "$as_me:$LINENO: not updating unwritable cache $cache_file" >&5
+$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;}
+ fi
+fi
+rm -f confcache
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+# Transform confdefs.h into DEFS.
+# Protect against shell expansion while executing Makefile rules.
+# Protect against Makefile macro expansion.
+#
+# If the first sed substitution is executed (which looks for macros that
+# take arguments), then branch to the quote section. Otherwise,
+# look for a macro that doesn't take arguments.
+ac_script='
+:mline
+/\\$/{
+ N
+ s,\\\n,,
+ b mline
+}
+t clear
+:clear
+s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g
+t quote
+s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g
+t quote
+b any
+:quote
+s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g
+s/\[/\\&/g
+s/\]/\\&/g
+s/\$/$$/g
+H
+:any
+${
+ g
+ s/^\n//
+ s/\n/ /g
+ p
+}
+'
+DEFS=`sed -n "$ac_script" confdefs.h`
+
+
+ac_libobjs=
+ac_ltlibobjs=
+for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
+ # 1. Remove the extension, and $U if already installed.
+ ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
+ ac_i=`$as_echo "$ac_i" | sed "$ac_script"`
+ # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR
+ # will be set to the directory where LIBOBJS objects are built.
+ ac_libobjs="$ac_libobjs \${LIBOBJDIR}$ac_i\$U.$ac_objext"
+ ac_ltlibobjs="$ac_ltlibobjs \${LIBOBJDIR}$ac_i"'$U.lo'
+done
+LIBOBJS=$ac_libobjs
+
+LTLIBOBJS=$ac_ltlibobjs
+
+
+
+: ${CONFIG_STATUS=./config.status}
+ac_write_fail=0
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files $CONFIG_STATUS"
+{ $as_echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5
+$as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
+cat >$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+#! $SHELL
+# Generated by $as_me.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+
+debug=false
+ac_cs_recheck=false
+ac_cs_silent=false
+SHELL=\${CONFIG_SHELL-$SHELL}
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+## --------------------- ##
+## M4sh Initialization. ##
+## --------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in
+ *posix*) set -o posix ;;
+esac
+
+fi
+
+
+
+
+# PATH needs CR
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='printf %s\n'
+ as_echo_n='printf %s'
+else
+ if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+ as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+ as_echo_n='/usr/ucb/echo -n'
+ else
+ as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+ as_echo_n_body='eval
+ arg=$1;
+ case $arg in
+ *"$as_nl"*)
+ expr "X$arg" : "X\\(.*\\)$as_nl";
+ arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+ esac;
+ expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+ '
+ export as_echo_n_body
+ as_echo_n='sh -c $as_echo_n_body as_echo'
+ fi
+ export as_echo_body
+ as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ PATH_SEPARATOR=:
+ (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+ (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+ PATH_SEPARATOR=';'
+ }
+fi
+
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+ as_unset=unset
+else
+ as_unset=false
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order. Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" "" $as_nl"
+
+# Find who we are. Look in the path if we contain no directory separator.
+case $0 in
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
+IFS=$as_save_IFS
+
+ ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+ as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+ $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+ { (exit 1); exit 1; }
+fi
+
+# Work around bugs in pre-3.0 UWIN ksh.
+for as_var in ENV MAIL MAILPATH
+do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# Required to use basename.
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+
+# Name of the executable.
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+
+# CDPATH.
+$as_unset CDPATH
+
+
+
+ as_lineno_1=$LINENO
+ as_lineno_2=$LINENO
+ test "x$as_lineno_1" != "x$as_lineno_2" &&
+ test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || {
+
+ # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+ # uniformly replaced by the line number. The first 'sed' inserts a
+ # line-number line after each line using $LINENO; the second 'sed'
+ # does the real work. The second script uses 'N' to pair each
+ # line-number line with the line containing $LINENO, and appends
+ # trailing '-' during substitution so that $LINENO is not a special
+ # case at line end.
+ # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+ # scripts with optimization help from Paolo Bonzini. Blame Lee
+ # E. McMahon (1931-1989) for sed's syntax. :-)
+ sed -n '
+ p
+ /[$]LINENO/=
+ ' <$as_myself |
+ sed '
+ s/[$]LINENO.*/&-/
+ t lineno
+ b
+ :lineno
+ N
+ :loop
+ s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+ t loop
+ s/-\n.*//
+ ' >$as_me.lineno &&
+ chmod +x "$as_me.lineno" ||
+ { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2
+ { (exit 1); exit 1; }; }
+
+ # Don't try to exec as it changes $[0], causing all sort of problems
+ # (the dirname of $[0] is not the place where we might find the
+ # original and so on. Autoconf is especially sensitive to this).
+ . "./$as_me.lineno"
+ # Exit status is that of the last command.
+ exit
+}
+
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+ as_dirname=dirname
+else
+ as_dirname=false
+fi
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in
+-n*)
+ case `echo 'x\c'` in
+ *c*) ECHO_T=' ';; # ECHO_T is single tab character.
+ *) ECHO_C='\c';;
+ esac;;
+*)
+ ECHO_N='-n';;
+esac
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+ rm -f conf$$.dir/conf$$.file
+else
+ rm -f conf$$.dir
+ mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+ if ln -s conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s='ln -s'
+ # ... but there are two gotchas:
+ # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+ # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+ # In both cases, we have to default to `cp -p'.
+ ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+ as_ln_s='cp -p'
+ elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+ else
+ as_ln_s='cp -p'
+ fi
+else
+ as_ln_s='cp -p'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p=:
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+if test -x / >/dev/null 2>&1; then
+ as_test_x='test -x'
+else
+ if ls -dL / >/dev/null 2>&1; then
+ as_ls_L_option=L
+ else
+ as_ls_L_option=
+ fi
+ as_test_x='
+ eval sh -c '\''
+ if test -d "$1"; then
+ test -d "$1/.";
+ else
+ case $1 in
+ -*)set "./$1";;
+ esac;
+ case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in
+ ???[sx]*):;;*)false;;esac;fi
+ '\'' sh
+ '
+fi
+as_executable_p=$as_test_x
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+exec 6>&1
+
+# Save the log message, to keep $[0] and so on meaningful, and to
+# report actual input values of CONFIG_FILES etc. instead of their
+# values after options handling.
+ac_log="
+This file was extended by SPP $as_me 1.7, which was
+generated by GNU Autoconf 2.63. Invocation command line was
+
+ CONFIG_FILES = $CONFIG_FILES
+ CONFIG_HEADERS = $CONFIG_HEADERS
+ CONFIG_LINKS = $CONFIG_LINKS
+ CONFIG_COMMANDS = $CONFIG_COMMANDS
+ $ $0 $@
+
+on `(hostname || uname -n) 2>/dev/null | sed 1q`
+"
+
+_ACEOF
+
+case $ac_config_files in *"
+"*) set x $ac_config_files; shift; ac_config_files=$*;;
+esac
+
+
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+# Files that config.status was made for.
+config_files="$ac_config_files"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+ac_cs_usage="\
+\`$as_me' instantiates files from templates according to the
+current configuration.
+
+Usage: $0 [OPTION]... [FILE]...
+
+ -h, --help print this help, then exit
+ -V, --version print version number and configuration settings, then exit
+ -q, --quiet, --silent
+ do not print progress messages
+ -d, --debug don't remove temporary files
+ --recheck update $as_me by reconfiguring in the same conditions
+ --file=FILE[:TEMPLATE]
+ instantiate the configuration file FILE
+
+Configuration files:
+$config_files
+
+Report bugs to <bug-autoconf@gnu.org>."
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_cs_version="\\
+SPP config.status 1.7
+configured by $0, generated by GNU Autoconf 2.63,
+ with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
+
+Copyright (C) 2008 Free Software Foundation, Inc.
+This config.status script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it."
+
+ac_pwd='$ac_pwd'
+srcdir='$srcdir'
+test -n "\$AWK" || AWK=awk
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# The default lists apply if the user does not specify any file.
+ac_need_defaults=:
+while test $# != 0
+do
+ case $1 in
+ --*=*)
+ ac_option=`expr "X$1" : 'X\([^=]*\)='`
+ ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'`
+ ac_shift=:
+ ;;
+ *)
+ ac_option=$1
+ ac_optarg=$2
+ ac_shift=shift
+ ;;
+ esac
+
+ case $ac_option in
+ # Handling of the options.
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+ ac_cs_recheck=: ;;
+ --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
+ $as_echo "$ac_cs_version"; exit ;;
+ --debug | --debu | --deb | --de | --d | -d )
+ debug=: ;;
+ --file | --fil | --fi | --f )
+ $ac_shift
+ case $ac_optarg in
+ *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+ esac
+ CONFIG_FILES="$CONFIG_FILES '$ac_optarg'"
+ ac_need_defaults=false;;
+ --he | --h | --help | --hel | -h )
+ $as_echo "$ac_cs_usage"; exit ;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil | --si | --s)
+ ac_cs_silent=: ;;
+
+ # This is an error.
+ -*) { $as_echo "$as_me: error: unrecognized option: $1
+Try \`$0 --help' for more information." >&2
+ { (exit 1); exit 1; }; } ;;
+
+ *) ac_config_targets="$ac_config_targets $1"
+ ac_need_defaults=false ;;
+
+ esac
+ shift
+done
+
+ac_configure_extra_args=
+
+if $ac_cs_silent; then
+ exec 6>/dev/null
+ ac_configure_extra_args="$ac_configure_extra_args --silent"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+if \$ac_cs_recheck; then
+ set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+ shift
+ \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6
+ CONFIG_SHELL='$SHELL'
+ export CONFIG_SHELL
+ exec "\$@"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+exec 5>>config.log
+{
+ echo
+ sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
+## Running $as_me. ##
+_ASBOX
+ $as_echo "$ac_log"
+} >&5
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+
+# Handling of arguments.
+for ac_config_target in $ac_config_targets
+do
+ case $ac_config_target in
+ "src/Makevars") CONFIG_FILES="$CONFIG_FILES src/Makevars" ;;
+
+ *) { { $as_echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5
+$as_echo "$as_me: error: invalid argument: $ac_config_target" >&2;}
+ { (exit 1); exit 1; }; };;
+ esac
+done
+
+
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used. Set only those that are not.
+# We use the long form for the default assignment because of an extremely
+# bizarre bug on SunOS 4.1.3.
+if $ac_need_defaults; then
+ test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
+fi
+
+# Have a temporary directory for convenience. Make it in the build tree
+# simply because there is no reason against having it here, and in addition,
+# creating and moving files from /tmp can sometimes cause problems.
+# Hook for its removal unless debugging.
+# Note that there is a small window in which the directory will not be cleaned:
+# after its creation but before its name has been assigned to `$tmp'.
+$debug ||
+{
+ tmp=
+ trap 'exit_status=$?
+ { test -z "$tmp" || test ! -d "$tmp" || rm -fr "$tmp"; } && exit $exit_status
+' 0
+ trap '{ (exit 1); exit 1; }' 1 2 13 15
+}
+# Create a (secure) tmp directory for tmp files.
+
+{
+ tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` &&
+ test -n "$tmp" && test -d "$tmp"
+} ||
+{
+ tmp=./conf$$-$RANDOM
+ (umask 077 && mkdir "$tmp")
+} ||
+{
+ $as_echo "$as_me: cannot create a temporary directory in ." >&2
+ { (exit 1); exit 1; }
+}
+
+# Set up the scripts for CONFIG_FILES section.
+# No need to generate them if there are no CONFIG_FILES.
+# This happens for instance with `./config.status config.h'.
+if test -n "$CONFIG_FILES"; then
+
+
+ac_cr=' '
+ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null`
+if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then
+ ac_cs_awk_cr='\\r'
+else
+ ac_cs_awk_cr=$ac_cr
+fi
+
+echo 'BEGIN {' >"$tmp/subs1.awk" &&
+_ACEOF
+
+
+{
+ echo "cat >conf$$subs.awk <<_ACEOF" &&
+ echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' &&
+ echo "_ACEOF"
+} >conf$$subs.sh ||
+ { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+ { (exit 1); exit 1; }; }
+ac_delim_num=`echo "$ac_subst_vars" | grep -c '$'`
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+ . ./conf$$subs.sh ||
+ { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+ { (exit 1); exit 1; }; }
+
+ ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X`
+ if test $ac_delim_n = $ac_delim_num; then
+ break
+ elif $ac_last_try; then
+ { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+ { (exit 1); exit 1; }; }
+ else
+ ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+ fi
+done
+rm -f conf$$subs.sh
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+cat >>"\$tmp/subs1.awk" <<\\_ACAWK &&
+_ACEOF
+sed -n '
+h
+s/^/S["/; s/!.*/"]=/
+p
+g
+s/^[^!]*!//
+:repl
+t repl
+s/'"$ac_delim"'$//
+t delim
+:nl
+h
+s/\(.\{148\}\).*/\1/
+t more1
+s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/
+p
+n
+b repl
+:more1
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t nl
+:delim
+h
+s/\(.\{148\}\).*/\1/
+t more2
+s/["\\]/\\&/g; s/^/"/; s/$/"/
+p
+b
+:more2
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t delim
+' <conf$$subs.awk | sed '
+/^[^""]/{
+ N
+ s/\n//
+}
+' >>$CONFIG_STATUS || ac_write_fail=1
+rm -f conf$$subs.awk
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACAWK
+cat >>"\$tmp/subs1.awk" <<_ACAWK &&
+ for (key in S) S_is_set[key] = 1
+ FS = ""
+
+}
+{
+ line = $ 0
+ nfields = split(line, field, "@")
+ substed = 0
+ len = length(field[1])
+ for (i = 2; i < nfields; i++) {
+ key = field[i]
+ keylen = length(key)
+ if (S_is_set[key]) {
+ value = S[key]
+ line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3)
+ len += length(value) + length(field[++i])
+ substed = 1
+ } else
+ len += 1 + keylen
+ }
+
+ print line
+}
+
+_ACAWK
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then
+ sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g"
+else
+ cat
+fi < "$tmp/subs1.awk" > "$tmp/subs.awk" \
+ || { { $as_echo "$as_me:$LINENO: error: could not setup config files machinery" >&5
+$as_echo "$as_me: error: could not setup config files machinery" >&2;}
+ { (exit 1); exit 1; }; }
+_ACEOF
+
+# VPATH may cause trouble with some makes, so we remove $(srcdir),
+# ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and
+# trailing colons and then remove the whole line if VPATH becomes empty
+# (actually we leave an empty line to preserve line numbers).
+if test "x$srcdir" = x.; then
+ ac_vpsub='/^[ ]*VPATH[ ]*=/{
+s/:*\$(srcdir):*/:/
+s/:*\${srcdir}:*/:/
+s/:*@srcdir@:*/:/
+s/^\([^=]*=[ ]*\):*/\1/
+s/:*$//
+s/^[^=]*=[ ]*$//
+}'
+fi
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+fi # test -n "$CONFIG_FILES"
+
+
+eval set X " :F $CONFIG_FILES "
+shift
+for ac_tag
+do
+ case $ac_tag in
+ :[FHLC]) ac_mode=$ac_tag; continue;;
+ esac
+ case $ac_mode$ac_tag in
+ :[FHL]*:*);;
+ :L* | :C*:*) { { $as_echo "$as_me:$LINENO: error: invalid tag $ac_tag" >&5
+$as_echo "$as_me: error: invalid tag $ac_tag" >&2;}
+ { (exit 1); exit 1; }; };;
+ :[FH]-) ac_tag=-:-;;
+ :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
+ esac
+ ac_save_IFS=$IFS
+ IFS=:
+ set x $ac_tag
+ IFS=$ac_save_IFS
+ shift
+ ac_file=$1
+ shift
+
+ case $ac_mode in
+ :L) ac_source=$1;;
+ :[FH])
+ ac_file_inputs=
+ for ac_f
+ do
+ case $ac_f in
+ -) ac_f="$tmp/stdin";;
+ *) # Look for the file first in the build tree, then in the source tree
+ # (if the path is not absolute). The absolute path cannot be DOS-style,
+ # because $ac_f cannot contain `:'.
+ test -f "$ac_f" ||
+ case $ac_f in
+ [\\/$]*) false;;
+ *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
+ esac ||
+ { { $as_echo "$as_me:$LINENO: error: cannot find input file: $ac_f" >&5
+$as_echo "$as_me: error: cannot find input file: $ac_f" >&2;}
+ { (exit 1); exit 1; }; };;
+ esac
+ case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
+ ac_file_inputs="$ac_file_inputs '$ac_f'"
+ done
+
+ # Let's still pretend it is `configure' which instantiates (i.e., don't
+ # use $as_me), people would be surprised to read:
+ # /* config.h. Generated by config.status. */
+ configure_input='Generated from '`
+ $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g'
+ `' by configure.'
+ if test x"$ac_file" != x-; then
+ configure_input="$ac_file. $configure_input"
+ { $as_echo "$as_me:$LINENO: creating $ac_file" >&5
+$as_echo "$as_me: creating $ac_file" >&6;}
+ fi
+ # Neutralize special characters interpreted by sed in replacement strings.
+ case $configure_input in #(
+ *\&* | *\|* | *\\* )
+ ac_sed_conf_input=`$as_echo "$configure_input" |
+ sed 's/[\\\\&|]/\\\\&/g'`;; #(
+ *) ac_sed_conf_input=$configure_input;;
+ esac
+
+ case $ac_tag in
+ *:-:* | *:-) cat >"$tmp/stdin" \
+ || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+ { (exit 1); exit 1; }; } ;;
+ esac
+ ;;
+ esac
+
+ ac_dir=`$as_dirname -- "$ac_file" ||
+$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$ac_file" : 'X\(//\)[^/]' \| \
+ X"$ac_file" : 'X\(//\)$' \| \
+ X"$ac_file" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$ac_file" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ { as_dir="$ac_dir"
+ case $as_dir in #(
+ -*) as_dir=./$as_dir;;
+ esac
+ test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || {
+ as_dirs=
+ while :; do
+ case $as_dir in #(
+ *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+ *) as_qdir=$as_dir;;
+ esac
+ as_dirs="'$as_qdir' $as_dirs"
+ as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_dir" : 'X\(//\)[^/]' \| \
+ X"$as_dir" : 'X\(//\)$' \| \
+ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ test -d "$as_dir" && break
+ done
+ test -z "$as_dirs" || eval "mkdir $as_dirs"
+ } || test -d "$as_dir" || { { $as_echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5
+$as_echo "$as_me: error: cannot create directory $as_dir" >&2;}
+ { (exit 1); exit 1; }; }; }
+ ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+ ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+ # A ".." for each directory in $ac_dir_suffix.
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+ case $ac_top_builddir_sub in
+ "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+ *) ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+ esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+ .) # We are building in place.
+ ac_srcdir=.
+ ac_top_srcdir=$ac_top_builddir_sub
+ ac_abs_top_srcdir=$ac_pwd ;;
+ [\\/]* | ?:[\\/]* ) # Absolute name.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir
+ ac_abs_top_srcdir=$srcdir ;;
+ *) # Relative name.
+ ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_build_prefix$srcdir
+ ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+
+ case $ac_mode in
+ :F)
+ #
+ # CONFIG_FILE
+ #
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# If the template does not know about datarootdir, expand it.
+# FIXME: This hack should be removed a few years after 2.60.
+ac_datarootdir_hack=; ac_datarootdir_seen=
+
+ac_sed_dataroot='
+/datarootdir/ {
+ p
+ q
+}
+/@datadir@/p
+/@docdir@/p
+/@infodir@/p
+/@localedir@/p
+/@mandir@/p
+'
+case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in
+*datarootdir*) ac_datarootdir_seen=yes;;
+*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
+ { $as_echo "$as_me:$LINENO: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ ac_datarootdir_hack='
+ s&@datadir@&$datadir&g
+ s&@docdir@&$docdir&g
+ s&@infodir@&$infodir&g
+ s&@localedir@&$localedir&g
+ s&@mandir@&$mandir&g
+ s&\\\${datarootdir}&$datarootdir&g' ;;
+esac
+_ACEOF
+
+# Neutralize VPATH when `$srcdir' = `.'.
+# Shell code in configure.ac might set extrasub.
+# FIXME: do we really want to maintain this feature?
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_sed_extra="$ac_vpsub
+$extrasub
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s|@configure_input@|$ac_sed_conf_input|;t t
+s&@top_builddir@&$ac_top_builddir_sub&;t t
+s&@top_build_prefix@&$ac_top_build_prefix&;t t
+s&@srcdir@&$ac_srcdir&;t t
+s&@abs_srcdir@&$ac_abs_srcdir&;t t
+s&@top_srcdir@&$ac_top_srcdir&;t t
+s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t
+s&@builddir@&$ac_builddir&;t t
+s&@abs_builddir@&$ac_abs_builddir&;t t
+s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
+$ac_datarootdir_hack
+"
+eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$tmp/subs.awk" >$tmp/out \
+ || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+ { (exit 1); exit 1; }; }
+
+test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
+ { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } &&
+ { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' "$tmp/out"`; test -z "$ac_out"; } &&
+ { $as_echo "$as_me:$LINENO: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined. Please make sure it is defined." >&5
+$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined. Please make sure it is defined." >&2;}
+
+ rm -f "$tmp/stdin"
+ case $ac_file in
+ -) cat "$tmp/out" && rm -f "$tmp/out";;
+ *) rm -f "$ac_file" && mv "$tmp/out" "$ac_file";;
+ esac \
+ || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+ { (exit 1); exit 1; }; }
+ ;;
+
+
+
+ esac
+
+done # for ac_tag
+
+
+{ (exit 0); exit 0; }
+_ACEOF
+chmod +x $CONFIG_STATUS
+ac_clean_files=$ac_clean_files_save
+
+test $ac_write_fail = 0 ||
+ { { $as_echo "$as_me:$LINENO: error: write failure creating $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: write failure creating $CONFIG_STATUS" >&2;}
+ { (exit 1); exit 1; }; }
+
+
+# configure is writing to config.log, and then calls config.status.
+# config.status does its own redirection, appending to config.log.
+# Unfortunately, on DOS this fails, as config.log is still kept open
+# by configure, so config.status won't be able to write to it; its
+# output is simply discarded. So we exec the FD to /dev/null,
+# effectively closing config.log, so it can be properly (re)opened and
+# appended to by config.status. When coming back to configure, we
+# need to make the FD available again.
+if test "$no_create" != yes; then
+ ac_cs_success=:
+ ac_config_status_args=
+ test "$silent" = yes &&
+ ac_config_status_args="$ac_config_status_args --quiet"
+ exec 5>/dev/null
+ $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false
+ exec 5>>config.log
+ # Use ||, not &&, to avoid exiting from the if with $? = 1, which
+ # would make configure fail if this is the last instruction.
+ $ac_cs_success || { (exit 1); exit 1; }
+fi
+if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
+ { $as_echo "$as_me:$LINENO: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
+fi
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/configure.ac b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/configure.ac
new file mode 100755
index 0000000..db87fcd
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/configure.ac
@@ -0,0 +1,7 @@
+AC_INIT([SPP], 1.7) # package name SPP, version 1.7
+
+AC_CHECK_LIB(bz2, BZ2_bzDecompressInit) # link-test libbz2; on success defines HAVE_LIBBZ2 and prepends -lbz2 to LIBS
+AC_SUBST(HAVE_LIBBZ2) # registers HAVE_LIBBZ2 as an output variable; NOTE(review): no shell variable of that name appears to be set, so it probably substitutes as empty - confirm
+AC_CONFIG_FILES([src/Makevars]) # instantiate src/Makevars from the src/Makevars.in template
+cp confdefs.h src/config.h # runs inside configure: snapshot the #defines gathered so far as src/config.h
+AC_OUTPUT # write config.status and run it to generate the configured files
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/add.broad.peak.regions.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/add.broad.peak.regions.Rd
new file mode 100755
index 0000000..24355db
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/add.broad.peak.regions.Rd
@@ -0,0 +1,27 @@
+\name{add.broad.peak.regions}
+\alias{add.broad.peak.regions}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Add broad enrichment regions to determined peak positions }
+\description{
+ Looks for broader regions of enrichment associated with the determined
+ peak positions, adds them to the $npl data as $rs, $re columns.
+}
+\usage{
+add.broad.peak.regions(signal.tags, control.tags, binding.positions,window.size=500,z.thr=2)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output
+ of \code{\link{select.informative.tags}}) }
+ \item{control.tags}{ optional control (input) tags }
+ \item{binding.positions}{ output of find.binding.positions call }
+ \item{window.size}{ window size to be used in calculating enrichment }
+ \item{z.thr}{ Z-score corresponding to the Poisson ratio threshold
+ used to flag significantly enriched windows}
+}
+\value{
+ A structure identical to binding.positions with two additional columns
+ added (rs and re) corresponding to start and end of the associated
+ significantly enriched region. If no region was associated with a
+ particular peak, NA values are reported.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/find.binding.positions.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/find.binding.positions.Rd
new file mode 100755
index 0000000..865aa1d
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/find.binding.positions.Rd
@@ -0,0 +1,130 @@
+\name{find.binding.positions}
+\alias{find.binding.positions}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Determine significant point protein binding positions (peaks) }
+\description{
+ Given the signal and optional control (input) data, determine location of the
+ statistically significant point binding positions. If the control data
+ is not provided, the statistical significance can be assessed based on
+ tag randomization. The method also provides options for masking
+ regions exhibiting strong signals within the control data.
+}
+\usage{
+find.binding.positions(signal.data, e.value = NULL, fdr = NULL, masked.data = NULL, control.data = NULL, min.dist = 200, window.size = 4e+07, cluster = NULL, debug = T, n.randomizations = 3, shuffle.window = 1, min.thr = 0, topN = NULL, tag.count.whs = 100, enrichment.z = 2, method = tag.wtd, tec.filter = T, tec.window.size = 10000, tec.masking.window.size=tec.window.size, tec.z = 5, tec.poisson.z=5,tec.poisson.ratio=5, n.control.samples = 1, enrichment.background.scales = c(1, 5, 10), background.density.scaling = F, use.randomized.controls = F, ...)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ ~~ tag data ~~
+ \item{signal.data}{ signal tag vector list }
+ \item{control.data}{ optional control (input) tag vector list }
+
+ ~~ position stringency criteria ~~
+ \item{e.value}{ E-value defining the desired statistical significance
+ of binding positions. }
+ \item{fdr}{ FDR defining statistical significance of binding positions }
+ \item{topN}{ instead of determining statistical significance
+ thresholds, return the specified number of highest-scoring
+ positions}
+
+ ~~ other params ~~
+ \item{whs}{ window half-size that should be used for binding
+ detection (e.g. determined from cross-correlation profiles)}
+ \item{masked.data}{ optional set of coordinates that should be masked
+ (e.g. known non-unique regions) }
+ \item{min.dist}{ minimal distance that must separate detected binding
+ positions. In case multiple binding positions are detected within
+ such distance, the position with the highest score is returned. }
+ \item{window.size}{ size of the window used to segment the chromosome
+ during calculations to reduce memory usage. }
+ \item{cluster}{ optional \code{snow} cluster to parallelize the
+ processing on }
+ \item{min.thr}{ minimal score requirement for a peak }
+ \item{background.density.scaling}{ If TRUE, regions of significant tag
+ enrichment will be masked out when calculating size ratio of the
+ signal to control datasets (to estimate ratio of the background tag
+ density). If FALSE, the dataset ratio will be equal to the ratio of
+ the number of tags in each dataset.}
+
+ ~~ randomized controls ~~
+ \item{n.randomizations}{ number of tag randomizations that should be
+ performed (when the control data is not provided) }
+ \item{use.randomized.controls}{ Use randomized tag control, even if
+ \code{control.data} is supplied. }
+ \item{shuffle.window}{ during tag randomizations, tags will be split
+ into groups of \code{shuffle.window} and will be maintained
+ together throughout the randomization. }
+
+ ~~ fold-enrichment confidence intervals ~~
+ \item{tag.count.whs}{ half-size of a window used to assess fold
+ enrichment of a binding position}
+ \item{enrichment.z}{ Z-score used to define the significance level of
+ the fold-enrichment confidence intervals }
+ \item{enrichment.background.scales}{ In estimating the peak
+ fold-enrichment confidence intervals, the background tag density is
+ estimated based on windows with half-sizes of
+ \code{2*tag.count.whs*enrichment.background.scales}. }
+ \item{method}{ either \code{tag.wtd} for WTD method, or
+ \code{tag.lwcc} for MTC method}
+ \item{mle.filter}{ If turned on, will exclude predicted positions
+ whose MLE enrichment ratio (for any of the background scales) is
+ below a specified min.mle.threshold }
+ \item{min.mle.threshold}{ MLE enrichment ratio threshold that each
+ predicted position must exceed if mle.filter is turned on. }
+
+ ~~ masking regions of significant control enrichment ~~
+ \item{tec.filter}{ Whether to mask out the regions exhibiting
+ significant enrichment in the control data in doing other
+ calculations. The regions are identified using Poisson statistics
+ within sliding windows, either relative to the scaled signal (tec.z), or
+ relative to randomly-distributed expectation (tec.poisson.z).}
+ \item{tec.window.size}{ size of the window used to determine
+ significantly enriched control regions }
+ \item{tec.masking.window.size}{ size of the window used to mask
+ the area around significantly enriched control regions }
+ \item{tec.z}{ Z-score defining statistical stringency by which a given
+ window is determined to be significantly higher in the input than in
+ the signal, and masked if that is the case.}
+ \item{tec.poisson.z}{ Z-score defining statistical stringency by which a given
+ window is determined to be significantly higher than the
+ tec.poisson.ratio above the expected uniform input background. }
+ \item{tec.poisson.ratio}{ Fold ratio by which input must exceed the
+ level expected from the uniform distribution. }
+
+
+
+
+}
+\value{
+ \item{npl}{A per-chromosome list containing data frames describing
+ determined binding positions. Column description:
+ \itemize{
+ \item{x}{ position }
+ \item{y}{ score }
+ \item{evalue}{ E-value }
+ \item{fdr}{ FDR. For peaks higher than the maximum control peak,
+ the highest dataset FDR is reported }
+ \item{enr}{ lower bound of the fold-enrichment ratio confidence
+ interval. This is the estimate determined using scale of
+ 1. Estimates corresponding to higher scales are returned in other enr columns
+ with scale appearing in the name.}
+ \item{enr.mle}{ enrichment ratio maximum likelihood estimate }
+ }
+ }
+ \item{thr}{ info on the chosen statistical threshold of the peak scores}
+}
+
+\examples{
+ # find binding positions using WTD method, 200bp half-window size,
+ # control data, 1\% FDR
+ bp <-
+find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=0.01,method=tag.wtd,whs=200);
+
+ # find binding positions using MTC method, using 5 tag randomizations,
+ # keeping pairs of tag positions together (shuffle.window=2)
+ bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=0.01,method=tag.lwcc,whs=200,use.randomized.controls=T,n.randomizations=5,shuffle.window=2)
+
+ # print out the number of determined positions
+ print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks"));
+
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.binding.characteristics.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.binding.characteristics.Rd
new file mode 100755
index 0000000..ec7aca2
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.binding.characteristics.Rd
@@ -0,0 +1,58 @@
+\name{get.binding.characteristics}
+\alias{get.binding.characteristics}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Calculate characteristics of observed DNA-binding signal from
+ cross-correlation profiles }
+\description{
+ The method calculates strand cross-correlation profile to determine binding
+ peak separation distance and approximate window size that should be used
+ for binding detection. If quality scores were given for the tags, the
+ method also determines which quality bins improve the cross-correlation pattern.
+}
+\usage{
+get.binding.characteristics(data, srange = c(50, 500), bin = 5, cluster = NULL, debug = F, min.tag.count = 1000, acceptance.z.score = 3, remove.tag.anomalies = T, anomalies.z = 5,accept.all.tags=F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{data}{ Tag/quality data: output of \code{read.eland.tags} or similar function }
+ \item{srange}{ A range within which the binding peak separation is
+ expected to fall. Should be larger than probe size to avoid artifacts. }
+ \item{bin}{ Resolution (in basepairs) at which cross-correlation
+ should be calculated. bin=1 is ideal, but takes longer to calculate. }
+ \item{cluster}{ optional snow cluster for parallel processing }
+ \item{debug}{ whether to print debug messages }
+ \item{min.tag.count}{ minimal number of tags on the chromosome to be
+ considered in the cross-correlation calculations }
+ \item{acceptance.z.score}{ A Z-score used to determine if a given tag
+ quality bin provides significant improvement to the strand cross-correlation }
+ \item{remove.tag.anomalies}{ Whether to remove singular tag count peaks prior to
+ calculation. This is recommended, since such positions may distort the
+ cross-correlation profile and increase the necessary computational time. }
+ \item{anomalies.z}{ Z-score for determining if the number of tags at a
+ given position is significantly higher above background, and should be
+ considered an anomaly.}
+ \item{accept.all.tags}{ Whether tag alignment quality calculations
+ should be skipped and all available tags should be accepted in the
+ downstream analysis.}
+}
+\value{
+ \item{cross.correlation }{ Cross-correlation profile as an $x/$y data.frame}
+ \item{peak }{Position ($x) and height ($y) of automatically detected
+ cross-correlation peak.}
+ \item{whs}{ Optimized window half-size for binding detection (based
+ on the width of the cross-correlation peak) }
+ \item{quality.bin.acceptance}{ A list structure, describing the
+ effect of inclusion of different tag quality bins on
+ cross-correlation, and a resolution on which bins should be
+ considered.
+ \itemize{
+ \item{informative.bins}{ A boolean vector indicating whether the
+ inclusion of tags from the tag quality bin specified in the name
+ attribute significantly increases cross-correlation profile near
+ the peak.}
+ \item{quality.cc}{ A list giving the cross-correlation profile
+ after the inclusion of the tags from different quality bins
+ }
+ }
+ }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.broad.enrichment.clusters.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.broad.enrichment.clusters.Rd
new file mode 100755
index 0000000..1a6cff0
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.broad.enrichment.clusters.Rd
@@ -0,0 +1,27 @@
+\name{get.broad.enrichment.clusters}
+\alias{get.broad.enrichment.clusters}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Determine broad clusters of enrichment }
+\description{
+ Scan chromosomes with a pre-defined window size, comparing scaled ChIP
+ and input tag counts to see if their ratio exceeds that expected from
+ a Poisson process (normalized for dataset size).
+}
+\usage{
+get.broad.enrichment.clusters(chip.tags, input.tags, window.size=1e3,z.thr=3,tag.shift=146/2)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{chip.tags}{ foreground tag vector list }
+ \item{input.tags}{ background tag vector list }
+ \item{window.size}{ window size to be used for tag counting }
+ \item{z.thr}{ Z-score to be used as a significance threshold }
+ \item{tag.shift}{ number of base pairs by which positive and negative
+ tag coordinates should be shifted towards each other (half of binding
+ peak separation distance)}
+}
+\value{
+ A list of elements corresponding to chromosomes, with each element
+ being an $s/$e/$rv data.frame giving the starting, ending positions and the log2
+ enrichment estimate for that region.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.conservative.fold.enrichment.profile.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.conservative.fold.enrichment.profile.Rd
new file mode 100755
index 0000000..0b20432
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.conservative.fold.enrichment.profile.Rd
@@ -0,0 +1,59 @@
+\name{get.conservative.fold.enrichment.profile}
+\alias{get.conservative.fold.enrichment.profile}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Estimate minimal fold enrichment/depletion along the chromosomes }
+\description{
+ The method provides a statistical assessment of enrichment/depletion
+ along the chromosomes. To assess tag density enrichment/depletion, a
+ sliding window of a specified size (\code{fws}) is used to calculate
+ the density of the foreground tags (\code{ftl}). Multiple, typically
+ larger windows are used to estimate background tag (\code{btl}) density around the
+ same location. The densities are compared as ratios of two Poisson
+ processes to estimate lower bound of foreground enrichment, or upper
+ bound of foreground depletion. If multiple window sizes were used to
+ estimate the background tag density, the most conservative one is
+ chosen for each point.
+}
+\usage{
+get.conservative.fold.enrichment.profile(ftl, btl, fws, bwsl = c(1, 5, 25, 50) * fws, step = 50, tag.shift = 146/2, alpha = 0.05, use.most.informative.scale = F, quick.calculation = T)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{ftl}{ foreground tag vector list }
+ \item{btl}{ background tag vector list }
+ \item{fws}{ foreground window size }
+ \item{bwsl}{ background window scales. The size(s) of background windows
+ will be \code{fws*bwsl}. }
+ \item{step}{ spacing between positions at which the
+ enrichment/depletion is evaluated }
+ \item{tag.shift}{ number of basepairs by which positive and negative
+ tag coordinates should be shifted towards each other (half of binding
+ peak separation distance)}
+ \item{alpha}{ desired level of statistical significance }
+ \item{use.most.informative.scale}{ for each position, instead of
+ evaluating enrichment ratio bounds for all background window scales,
+ choose the one with the highest observed density to speed up the calculations}
+ \item{quick.calculation}{ Use square root transformation method
+ instead of a Bayesian method. This speeds up the calculation
+ considerably and is turned on by default. }
+ \item{background.density.scaling}{ If TRUE, regions of significant tag
+ enrichment will be masked out when calculating size ratio of the
+ signal to control datasets (to estimate ratio of the background tag
+ density). If FALSE, the dataset ratio will be equal to the ratio of
+ the number of tags in each dataset.}
+}
+\value{
+ A list of elements corresponding to chromosomes, with each element
+ being an $x/$y data.frame giving the position and the log2
+ conservative estimate of enrichment/depletion fold ratios around that
+ position.
+ Use \code{\link{writewig}} to output the structure to a WIG
+ file.
+}
+\references{ R.M.Price, D.G. Bonett "Estimating the ratio of two Poisson
+ rates", Comp. Stat & Data Anal. 32(2000) 345}
+\seealso{ \code{\link{get.smoothed.tag.density}} }
+\examples{
+ enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=2*binding.characteristics$whs,step=100,alpha=0.01);
+ writewig(enrichment.estimates,"example.enrichment.estimates.wig","Example conservative fold-enrichment/depletion estimates shown on log2 scale");
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.mser.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.mser.Rd
new file mode 100755
index 0000000..cf60fe8
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.mser.Rd
@@ -0,0 +1,46 @@
+\name{get.mser}
+\alias{get.mser}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Calculate minimal saturated enrichment fold ratio }
+\description{
+ Determine if the dataset has reached absolute saturation, or otherwise
+ find minimal fold enrichment ratio above which the detection of peaks
+ has stabilized enough to meet the saturation criteria.
+}
+\usage{
+get.mser(signal.data, control.data, n.chains = 5, step.size = 1e+05, chains = NULL, cluster = NULL, test.agreement = 0.99, return.chains = F, enrichment.background.scales = c(1), n.steps = 1, ...)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.data}{ signal tag vector list }
+ \item{control.data}{ control tag vector list }
+ \item{n.chains}{ number of dataset subsamples to use }
+ \item{step.size}{ subsampling step describing the saturation
+ criteria. The criteria requires the set of detected binding sites to
+ be stable (as described by the \code{test.agreement} param) when the
+ number of tags in the dataset is reduced by \code{step.size}. The
+ value can either be an integer above one, in which case it specifies a fixed
+ number of tags, or a real value below one, in which case it
+ specifies the fraction of tags that should be removed (e.g. 0.1 will
+ remove 10\% of tags).
+ }
+ \item{test.agreement}{ Fraction of the detected peaks that should
+ agree between the full and subsampled datasets. }
+ \item{chains}{ optional parameter, giving pre-calculated chains }
+ \item{cluster}{ optional \code{snow} cluster to parallelize processing }
+
+ \item{return.chains}{ whether subsampled dataset results should be returned as
+ well }
+ \item{enrichment.background.scales}{ one or multiple window scales at
+ which the background tag density should be assessed. See
+ \code{enrichment.background.scales} in
+ \code{\link{find.binding.positions}}. If multiple scales are provided,
+ multiple MSER estimates will be returned.}
+ \item{\dots}{ additional parameters should be the same as those passed
+ to the \code{\link{find.binding.positions}}}
+}
+\value{
+ A single, or multiple (if multiple \code{enrichment.background.scales} were
+ provided) MSER value. A value of 1 or very close to it implies that
+ the dataset has reached absolute saturation based on the given criteria.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.mser.interpolation.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.mser.interpolation.Rd
new file mode 100755
index 0000000..e10b81e
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.mser.interpolation.Rd
@@ -0,0 +1,56 @@
+\name{get.mser.interpolation}
+\alias{get.mser.interpolation}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Interpolate MSER dependency on the tag count }
+\description{
+ MSER generally decreases with increasing sequencing depth. This
+ function interpolates the dependency of MSER on tag counts as a
+ log-log linear function. The log-log fit is used to estimate the depth
+ of sequencing required to reach desired \code{target.fold.enrichment}.
+}
+\usage{
+get.mser.interpolation(signal.data, control.data, target.fold.enrichment = 5, n.chains = 10, n.steps = 6, step.size = 1e+05, chains = NULL, test.agreement = 0.99, return.chains = F, enrichment.background.scales = c(1), excluded.steps = c(seq(2, n.steps - 2)), ...)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.data}{ signal chromosome tag vector list }
+ \item{control.data}{ control chromosome tag vector list }
+ \item{target.fold.enrichment}{ target MSER for which the depth should
+ be estimated}
+ \item{n.steps}{ number of steps in each subset chain. }
+ \item{step.size}{ Either number of tags or fraction of the dataset
+ size, see \code{step.size} parameter for \code{\link{get.mser}}. }
+ \item{test.agreement}{ Fraction of the detected peaks that should
+ agree between the full and subsampled datasets. See \code{test.agreement} parameter for \code{\link{get.mser}}}
+ \item{n.chains}{ number of random subset chains }
+ \item{chains}{ optional structure of pre-calculated chains
+ (e.g. generated by an earlier call with \code{return.chains=T}).}
+
+ \item{return.chains}{ whether to return peak predictions calculated on
+ random chains. These can be passed back using \code{chains} argument
+ to skip subsampling/prediction steps, and just recalculate the depth
+ estimate for a different MSER.}
+ \item{enrichment.background.scales}{ see \code{enrichment.background.scales} parameter for \code{\link{get.mser}} }
+ \item{excluded.steps}{ Intermediate subsampling steps that should be excluded from
+ the chains to speed up the calculation. By default, all intermediate
+ steps except for first two and last two are skipped. Adding
+ intermediate steps improves interpolation at the expense of
+ computational time.}
+ \item{\dots}{ additional parameters are passed to \code{\link{get.mser}} }
+}
+\details{
+ To simulate sequencing growth, the method calculates peak predictions
+ on random chains. Each chain is produced by sequential random
+ subsampling of the original data. The number of steps in the chain
+ indicates how many times the random subsampling will be performed.
+}
+\value{
+ Normally returns a list, specifying for each background scale:
+ \item{prediction}{estimated sequencing depth required to reach
+ specified target MSER}
+ \item{log10.fit}{linear fit model, a result of \code{lm()} call}
+
+ If \code{return.chains=T}, the above structure is returned under
+ \code{interpolation} field, along with \code{chains} field containing
+ results of \code{\link{find.binding.positions}} calls on subsampled chains.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.smoothed.enrichment.mle.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.smoothed.enrichment.mle.Rd
new file mode 100755
index 0000000..fe80329
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.smoothed.enrichment.mle.Rd
@@ -0,0 +1,35 @@
+\name{get.smoothed.enrichment.mle}
+\alias{get.smoothed.enrichment.mle}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Calculate chromosome-wide profiles of smoothed enrichment estimate }
+\description{
+ Given signal and control tag positions, the method calculates log2
+ signal to control enrichment estimates (maximum likelihood) for each
+ chromosome, based on the smoothed tag density profile (see \link{get.smoothed.tag.density}).
+}
+\usage{
+get.smoothed.enrichment.mle(signal.tags, control.tags, bandwidth = 150,tag.shift = 146/2, step = 50)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output
+ of \code{\link{select.informative.tags}}) }
+ \item{control.tags}{ control (input) tags }
+ \item{pseudocount}{ pseudocount value to be added to tag density -
+ defaults to 1 }
+ other parameters (such as bandwidth, step.size and tag.shift) are
+ passed to \link{get.smoothed.tag.density} - see appropriate reference
+ for details.
+}
+\value{
+ A list of elements corresponding to chromosomes, with each element
+ being an $x/$y data.frame giving the position and associated
+ log2 signal/control enrichment estimate.
+}
+\seealso{ \code{\link{writewig}} }
+\examples{
+ # get smoothed enrichment estimate profile using 500bp bandwidth at
+ # 50bp steps
+ smoothed.M <- get.smoothed.enrichment.mle(chip.data,bandwidth=500,step=50);
+ writewig(smoothed.M,"example.smoothedM.wig","Example smoothed log2 intensity ratio estimate");
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.smoothed.tag.density.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.smoothed.tag.density.Rd
new file mode 100755
index 0000000..9807249
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/get.smoothed.tag.density.Rd
@@ -0,0 +1,45 @@
+\name{get.smoothed.tag.density}
+\alias{get.smoothed.tag.density}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Calculate chromosome-wide profiles of smoothed tag density }
+\description{
+ Given tag positions, the method calculates for each chromosome a tag
+ density profile, smoothed by the Gaussian kernel. If the optional
+ control tags are provided, the difference between ChIP and control tag
+ density is returned.
+}
+\usage{
+get.smoothed.tag.density(signal.tags, control.tags = NULL, bandwidth = 150, bg.weight = NULL, tag.shift = 146/2, step = round(bandwidth/3))
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output
+ of \code{\link{select.informative.tags}}) }
+ \item{control.tags}{ optional control (input) tags }
+ \item{bandwidth}{ standard deviation of the Gaussian kernel }
+ \item{bg.weight}{ optional weight by which the background density
+ should be multiplied for scaling. If not supplied, the weight is
+ calculated based on the ratio of the reduced ChIP to input dataset sizes. }
+ \item{tag.shift}{ Distance by which the positive and negative strand
+ tags should be shifted towards each other. This
+ normally corresponds to the half of the cross-correlation peak
+ position (e.g. \code{get.binding.characteristics()}$peak$x/2) }
+ \item{step}{ The distance between the regularly spaced points for
+ which the values should be calculated. }
+ \item{background.density.scaling}{ If TRUE, regions of significant tag
+ enrichment will be masked out when calculating size ratio of the
+ signal to control datasets (to estimate ratio of the background tag
+ density). If FALSE, the dataset ratio will be equal to the ratio of
+ the number of tags in each dataset.}
+}
+\value{
+ A list of elements corresponding to chromosomes, with each element
+ being an $x/$y data.frame giving the position and associated tag
+ density. Use \code{\link{writewig}} to output the structure to a WIG
+ file.
+}
+\seealso{ \code{\link{writewig}} }
+\examples{
+ smoothed.density <- get.smoothed.tag.density(chip.data,control.tags=input.data,bandwidth=200,step=100,tag.shift=round(binding.characteristics$peak$x/2));
+ writewig(smoothed.density,"example.density.wig","Example smoothed, background-subtracted tag density");
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/output.binding.results.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/output.binding.results.Rd
new file mode 100755
index 0000000..744476e
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/output.binding.results.Rd
@@ -0,0 +1,26 @@
+\name{output.binding.results}
+\alias{output.binding.results}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Write out determined binding peaks into a text file table }
+\description{
+ Writes out determined binding positions into a text file. The file
+ will contain a table with each row corresponding to a detected
+ position, with the following columns:
+ \itemize{
+ \item{chr}{ chromosome or target sequence }
+ \item{pos}{ position of detected binding site on the chromosome/sequence}
+ \item{score}{a score reflecting magnitude of the binding}
+ \item{Evalue}{E-value corresponding to the peak magnitude}
+ \item{FDR}{FDR corresponding to the peak magnitude}
+ \item{enrichment.lb}{lower bound of the fold-enrichment ratio}
+ \item{enrichment.mle}{maximum likelihood estimate of the fold-enrichment ratio}
+ }
+}
+\usage{
+output.binding.results(results, filename)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{results}{ output of the \code{\link{find.binding.positions}} }
+ \item{filename}{ file name }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bam.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bam.tags.Rd
new file mode 100755
index 0000000..c4a579e
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bam.tags.Rd
@@ -0,0 +1,24 @@
+\name{read.bam.tags}
+\alias{read.bam.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read BAM alignment file }
+\description{
+ Reads in aligned reads from BAM file. Note: no split (non-unique)
+ alignments should be reported in the BAM file.
+}
+\usage{
+read.bam.tags(filename, read.tag.names = F, fix.chromosome.names = F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ BAM file }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bin.maqmap.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bin.maqmap.tags.Rd
new file mode 100755
index 0000000..8260d61
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bin.maqmap.tags.Rd
@@ -0,0 +1,23 @@
+\name{read.bin.maqmap.tags}
+\alias{read.bin.maqmap.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read MAQ binary alignment map file }
+\description{
+ Reads in MAQ binary map alignment result file
+}
+\usage{
+read.bin.maqmap.tags(filename, read.tag.names = F, fix.chromosome.names = T)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ MAQ map output file (binary) }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bowtie.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bowtie.tags.Rd
new file mode 100755
index 0000000..678e9fc
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.bowtie.tags.Rd
@@ -0,0 +1,23 @@
+\name{read.bowtie.tags}
+\alias{read.bowtie.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read bowtie text alignment output file }
+\description{
+ Reads in bowtie alignment results in text format
+}
+\usage{
+read.bowtie.tags(filename, read.tag.names = F, fix.chromosome.names = F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ bowtie text output file }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.eland.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.eland.tags.Rd
new file mode 100755
index 0000000..aa29d6b
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.eland.tags.Rd
@@ -0,0 +1,30 @@
+\name{read.eland.tags}
+\alias{read.eland.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read eland output file }
+\description{
+ Reads in ELAND output file, returning 5'-end tag coordinates and
+ number of mismatches associated with each mapped tag.
+}
+\usage{
+read.eland.tags(filename, read.tag.names = F, fix.chromosome.names = T, max.eland.tag.length = -1,extended=F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ ELAND output file }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+ \item{max.eland.tag.length}{ Specifies max length of the tag sequence
+ considered by ELAND. This needs to be specified if the tags are
+ longer than the sequences considered by ELAND during alignment. }
+ \item{extended}{ Whether the file is written out in "extended" format
+ provided in GA pipeline 1.0. }
+ \item{multi}{ Whether the file is written in "multi" format, showing multiple alignments of the reads }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.maqmap.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.maqmap.tags.Rd
new file mode 100755
index 0000000..31c5309
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.maqmap.tags.Rd
@@ -0,0 +1,23 @@
+\name{read.maqmap.tags}
+\alias{read.maqmap.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read MAQ text alignment output file }
+\description{
+ Reads in MAQ alignment results in text format (that results from "maq mapview" command.)
+}
+\usage{
+read.maqmap.tags(filename, read.tag.names = F, fix.chromosome.names = T)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ MAQ text output file }
+ \item{read.tag.names}{ Whether the tag names should be read in }
+ \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of
+ the sequence names }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Number of mismatches }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.meland.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.meland.tags.Rd
new file mode 100755
index 0000000..c21a815
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/read.meland.tags.Rd
@@ -0,0 +1,29 @@
+\name{read.meland.tags}
+\alias{read.meland.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Read modified BED tag alignment file that contains variable match
+ length information }
+\description{
+ Reads in an extended BED tag alignment file. An example line given below:
+ \code{49 . U1 . 1 . . 23 chr2 -234567}
+ The line above specifies a 23-bp portion of the tag with id 49 was
+ aligned with 1 mismatch to the negative strand of chr2 at position 234567.
+}
+\usage{
+read.meland.tags(filename, read.tag.names = F, fix.chromosome.names = T)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{filename}{ name of the extended BED file }
+ \item{read.tag.names}{ whether to read in tag names }
+ \item{fix.chromosome.names}{ whether to remove ".fa" from the sequence
+ name ends. }
+}
+\value{
+ \item{tags }{ A vector of 5' tag coordinates, with negative values
+ corresponding to tags mapped to the negative strand. }
+ \item{quality }{ Quality expressed as a float x.y, where x is
+ tag.length - aligned.tag.portion.length, and y is the number of
+ mismatches (must be less than 10). }
+ \item{names }{ Tag names, if \code{read.tag.names} was set }
+} \ No newline at end of file
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/remove.local.tag.anomalies.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/remove.local.tag.anomalies.Rd
new file mode 100755
index 0000000..705705f
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/remove.local.tag.anomalies.Rd
@@ -0,0 +1,46 @@
+\name{remove.local.tag.anomalies}
+\alias{remove.local.tag.anomalies}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Restrict or remove positions with too many tags relative to
+ local background. }
+\description{
+ In Solexa ChIP-seq experiments some anomalous positions contain
+ extremely high number of tags at the exact coordinates. The function
+ scans the chromosomes, determining local tag density based on a
+ provided \code{window.size}, doing two types of corrections:
+ 1. removing all tags from positions that exceed local density by
+ \code{eliminate.fold}; 2. reducing the tag count at positions
+ exceeding \code{cap.fold} to the maximal allowed count. The
+ statistical significance of counts exceeding either of these two
+ threshold densities is calculated based on Poisson model, with
+ confidence interval determined by the \code{z.threshold} Z-score parameter.
+}
+\usage{
+remove.local.tag.anomalies(tags, window.size = 200, eliminate.fold = 10, cap.fold = 4, z.threshold = 3)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{tags}{ Chromosome-list of tag vectors }
+ \item{window.size}{ Size of the window used to assess local
+ density. Increasing the window size considerably beyond the size of
+ the binding features will result in flattened profiles, with bound
+ positions exhibiting a difference of just 1 tag beyond the background. }
+ \item{eliminate.fold}{ Threshold defining fold-over background
+ density above which the position is considered anomalous and removed
+ completely.}
+ \item{cap.fold}{ Threshold fold-over background density above which
+ the position is capped to the maximum statistically likely given
+ local tag density }
+ \item{z.threshold}{ Z-score used to assess significance of a given
+ position exceeding either of the two density thresholds. }
+}
+\value{
+ A modified chromosome-wise tag vector list.
+}
+\references{ ~put references to the literature/web site here ~ }
+
+\note{ ~~further notes~~
+ Increasing window.size to very large values will result in flat
+ profiles similar to those described by Zhang et al. "Model-based
+ Analysis of ChIP-Seq (MACS)." Genome Biol. 2008 Sep 17;9(9):R137.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/select.informative.tags.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/select.informative.tags.Rd
new file mode 100755
index 0000000..73a4155
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/select.informative.tags.Rd
@@ -0,0 +1,29 @@
+\name{select.informative.tags}
+\alias{select.informative.tags}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Choose informative tags }
+\description{
+ For datasets with tag alignment quality information (e.g. number of
+ mismatches for Eland alignments),
+ \code{\link{get.binding.characteristics}} determines whether inclusion
+ of tags from each specific quality bin improves the cross-correlation
+ profile. The present function is then used to actually select these
+ informative tags, discarding all other information, including quality
+ scores that are not used in further processing.
+}
+\usage{
+select.informative.tags(data, binding.characteristics)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{data}{ Full alignment data (a list with $tags and $quality elements) }
+ \item{binding.characteristics}{ result of a
+ \code{\link{get.binding.characteristics}} call. If NULL value is
+ supplied, all tags will be accepted. }
+}
+\value{
+ A chromosome-wise tag list. Each element of the list corresponds to a
+ chromosome and is a numeric vector of 5' tag coordinates, with sign
+ designating DNA strand.
+ This form of tag data is used for most of the other processing.
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/spp-package.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/spp-package.Rd
new file mode 100755
index 0000000..542bafc
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/spp-package.Rd
@@ -0,0 +1,144 @@
+\name{spp-package}
+\alias{spp-package}
+\alias{spp}
+\docType{package}
+\title{
+ChIP-seq (Solexa) Processing Pipeline
+}
+\description{
+A set of routines for reading short sequence alignments, calculating tag
+density, estimates of statistically significant enrichment/depletion
+along the chromosome, identifying point binding positions (peaks), and
+characterizing saturation properties related to sequencing depth.
+}
+\details{
+\tabular{ll}{
+Package: \tab spp\cr
+Type: \tab Package\cr
+Version: \tab 1.8\cr
+Date: \tab 2008-11-14\cr
+License: \tab What license is it under?\cr
+LazyLoad: \tab yes\cr
+}
+See example below for typical processing sequence.
+}
+\author{Peter Kharchenko <peter.kharchenko@post.harvard.edu>}
+\references{
+Kharchenko P., Tolstorukov M., Park P. "Design and analysis of ChIP-seq
+experiments for DNA-binding proteins." Nature Biotech. doi:10.1038/nbt.1508
+}
+
+\examples{
+
+ # load the library
+ library(spp);
+
+ ## The following section shows how to initialize a cluster of 8 nodes for parallel processing
+ ## To enable parallel processing, uncomment the next three lines, and comment out "cluster<-NULL";
+ ## see "snow" package manual for details.
+ #library(snow)
+ #cluster <- makeCluster(2);
+ #invisible(clusterCall(cluster,source,"routines.r"));
+ cluster <- NULL;
+
+
+
+ # read in tag alignments
+ chip.data <- read.eland.tags("chip.eland.alignment");
+ input.data <- read.eland.tags("input.eland.alignment");
+
+ # get binding info from cross-correlation profile
+ # srange gives the possible range for the size of the protected region;
+ # srange should be higher than tag length; making the upper boundary too high will increase calculation time
+ #
+ # bin - bin tags within the specified number of basepairs to speed up calculation;
+ # increasing bin size decreases the accuracy of the determined parameters
+ binding.characteristics <- get.binding.characteristics(chip.data,srange=c(50,500),bin=5,cluster=cluster);
+
+
+ # plot cross-correlation profile
+ pdf(file="example.crosscorrelation.pdf",width=5,height=5)
+ par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8);
+ plot(binding.characteristics$cross.correlation,type='l',xlab="strand shift",ylab="cross-correlation");
+ abline(v=binding.characteristics$peak$x,lty=2,col=2)
+ dev.off();
+
+ # select informative tags based on the binding characteristics
+ chip.data <- select.informative.tags(chip.data,binding.characteristics);
+ input.data <- select.informative.tags(input.data,binding.characteristics);
+
+ # restrict or remove positions with anomalous number of tags relative
+ # to the local density
+ chip.data <- remove.local.tag.anomalies(chip.data);
+ input.data <- remove.local.tag.anomalies(input.data);
+
+
+ # output smoothed tag density (subtracting re-scaled input) into a WIG file
+ # note that the tags are shifted by half of the peak separation distance
+ smoothed.density <- get.smoothed.tag.density(chip.data,control.tags=input.data,bandwidth=200,step=100,tag.shift=round(binding.characteristics$peak$x/2));
+ writewig(smoothed.density,"example.density.wig","Example smoothed, background-subtracted tag density");
+ rm(smoothed.density);
+
+ # output conservative enrichment estimates
+ # alpha specifies significance level at which confidence intervals will be estimated
+ enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=2*binding.characteristics$whs,step=100,alpha=0.01);
+ writewig(enrichment.estimates,"example.enrichment.estimates.wig","Example conservative fold-enrichment/depletion estimates shown on log2 scale");
+ rm(enrichment.estimates);
+
+
+ # binding detection parameters
+ # desired FDR. Alternatively, an E-value can be supplied to the method calls below instead of the fdr parameter
+ fdr <- 1e-2;
+ # the binding.characteristics contains the optimized half-size for binding detection window
+ detection.window.halfsize <- binding.characteristics$whs;
+
+ # determine binding positions using wtd method
+ bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize,cluster=cluster)
+
+ # alternatively determined binding positions using lwcc method (note: this takes longer than wtd)
+ # bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.lwcc,whs=detection.window.halfsize,cluster=cluster)
+
+ print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks"));
+
+ # output detected binding positions
+ output.binding.results(bp,"example.binding.positions.txt");
+
+
+ # -------------------------------------------------------------------------------------------
+ # the set of commands in the following section illustrates methods for saturation analysis
+ # these are separated from the previous section, since they are highly CPU intensive
+ # -------------------------------------------------------------------------------------------
+
+ # determine MSER
+ # note: this will take approximately 10-15x the amount of time the initial binding detection did
+ # The saturation criteria here is 0.99 consistency in the set of binding positions when adding 1e5 tags.
+ # To ensure convergence the number of subsampled chains (n.chains) should be higher (80)
+ mser <- get.mser(chip.data,input.data,step.size=1e5,test.agreement=0.99,n.chains=8,cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize)
+
+ print(paste("MSER at a current depth is",mser));
+
+ # note: an MSER value of 1 or very near one implies that the set of detected binding positions satisfies saturation criteria without
+ # additional selection by fold-enrichment ratios. In other words, the dataset has reached saturation in a traditional sense (absolute saturation).
+
+ # interpolate MSER dependency on tag count
+ # note: this requires considerably more calculations than the previous steps (~ 3x more than the first MSER calculation)
+ # Here we interpolate MSER dependency to determine a point at which MSER of 2 is reached
+ # The interpolation will be based on the difference in MSER at the current depth, and a depth at 5e5 fewer tags (n.steps=6);
+ # evaluation of the intermediate points is omitted here to speed up the calculation (excluded.steps parameter)
+ # A total of 7 chains is used here to speed up calculation, whereas a higher number of chains (50) would give good convergence
+ msers <- get.mser.interpolation(chip.data,input.data,step.size=1e5,test.agreement=0.99, target.fold.enrichment=2, n.chains=7,n.steps=6,excluded.steps=c(2:4),cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize)
+
+ print(paste("predicted sequencing depth =",round(unlist(lapply(msers,function(x) x$prediction))/1e6,5)," million tags"))
+
+
+ # note: the interpolation will return NA prediction if the dataset has reached absolute saturation at the current depth.
+ # note: use return.chains=T to also calculate random chains (returned under msers$chains field) - these can be passed back as
+ # "get.mser.interpolation( ..., chains=msers$chains)" to calculate predictions for another target.fold.enrichment value
+ # without having to recalculate the random chain predictions.
+
+ ## stop cluster if it was initialized
+ #stopCluster(cluster);
+
+
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/write.broadpeak.info.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/write.broadpeak.info.Rd
new file mode 100755
index 0000000..0ed5f66
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/write.broadpeak.info.Rd
@@ -0,0 +1,16 @@
+\name{write.broadpeak.info}
+\alias{write.broadpeak.info}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Write out determined broad enrichment regions using broadPeak format }
+\description{
+ Writes out broad regions of enrichment determined by the
+ get.broad.enrichment.clusters method in a broadPeak format.
+}
+\usage{
+write.broadpeak.info(broadpeak.results, filename)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{broadpeak.results}{ output of the \code{\link{get.broad.enrichment.clusters}} }
+ \item{filename}{ file name }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/write.narrowpeak.binding.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/write.narrowpeak.binding.Rd
new file mode 100755
index 0000000..ca259bb
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/write.narrowpeak.binding.Rd
@@ -0,0 +1,21 @@
+\name{write.narrowpeak.binding}
+\alias{write.narrowpeak.binding}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ Write out determined binding peaks using narrowPeak format }
+\description{
+ Writes out determined binding positions into a narrowPeak file.
+ The region will correspond to associated broad enrichment region, if
+ such were added using add.broad.peak.regions method. Otherwise the
+ region size will be determined using margin (which defaults to the
+ window half size that was used to determine binding positions)
+}
+\usage{
+write.narrowpeak.binding(results, filename,margin=results$whs)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{results}{ output of the \code{\link{find.binding.positions}} }
+ \item{filename}{ file name }
+ \item{margin}{ explicit value of the margin to be used if the broad
+ region information is absent (defaults to peak detection window half-size)}
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/writewig.Rd b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/writewig.Rd
new file mode 100755
index 0000000..f7e23d9
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/man/writewig.Rd
@@ -0,0 +1,31 @@
+\name{writewig}
+\alias{writewig}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{ A function to save a list of chromosome-wise x/y data frames
+ into a WIG file format. }
+\description{
+ Takes a list that contains an $x and $y data.frame for a number of
+ chromosomes and writes it out to a WIG BED style format.
+}
+\usage{
+writewig(dat, fname, feature, threshold = 5, zip = F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{dat}{ Chromosome coordinate-value data. \code{dat} is a list,
+ each member of a list is a data frame with $x and $y columns
+ containing chromosome positions and associated values. The names of
+ the list elements correspond to the chromosomes. }
+ \item{fname}{ Filename to which the output should be written }
+ \item{feature}{ Data description to be incorporated into the WIG header }
+ \item{threshold}{ Optional threshold to be saved in the WIG file}
+ \item{zip}{ Whether to invoke a zip program to compress the file }
+}
+
+\seealso{ ~~objects to See Also as \code{\link{help}}, ~~~ }
+\examples{
+
+data <- list("chr1"=data.frame(x=c(100,130,200),y=c(1.2,4.0,2.3)));
+writewig(data,"filename");
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BGZF.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BGZF.cpp
new file mode 100755
index 0000000..6a89987
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BGZF.cpp
@@ -0,0 +1,398 @@
+// ***************************************************************************
+// BGZF.cpp (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading & writing BGZF files
+// ***************************************************************************
+
+#include <BGZF.h>
+using namespace BamTools;
+
+#include <algorithm>
+using namespace std;
+
+// constructor: starts with default buffer sizes, a closed/unset stream, and
+// zeroed block bookkeeping, then allocates the two block buffers
+BgzfData::BgzfData(void)
+    : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
+    , CompressedBlockSize(MAX_BLOCK_SIZE)
+    , BlockLength(0)
+    , BlockOffset(0)
+    , BlockAddress(0)
+    , IsOpen(false)
+    , IsWriteOnly(false)
+    , IsWriteUncompressed(false)
+    , Stream(NULL)
+    , UncompressedBlock(NULL)
+    , CompressedBlock(NULL)
+{
+    // a failed allocation of either buffer is fatal: report it and terminate
+    try {
+        CompressedBlock   = new char[CompressedBlockSize];
+        UncompressedBlock = new char[UncompressedBlockSize];
+    }
+    catch ( const std::bad_alloc& ) {
+        fprintf(stderr, "BGZF ERROR: unable to allocate memory for our BGZF object.\n");
+        exit(1);
+    }
+}
+
+// destructor: releases the compressed & uncompressed block buffers
+BgzfData::~BgzfData(void) {
+    // delete[] on a null pointer is a no-op, so no explicit null checks are needed
+    delete[] CompressedBlock;
+    delete[] UncompressedBlock;
+}
+
+// closes the BGZF file (does nothing if the file is not open)
+// in write mode, buffered data is flushed first and an empty BGZF block is
+// appended to serve as the EOF marker
+void BgzfData::Close(void) {
+
+    // skip if file not open
+    if ( !IsOpen ) return;
+
+    // if writing to file, flush the current BGZF block,
+    // then write an empty block (as EOF marker)
+    if ( IsWriteOnly ) {
+        FlushBlock();
+
+        // FlushBlock() leaves nothing buffered, so this deflates an empty block
+        const int blockLength = DeflateBlock();
+        const size_t numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
+        if ( numBytesWritten != (size_t)blockLength )
+            fprintf(stderr, "BGZF ERROR: unable to write the EOF marker block while closing.\n");
+    }
+
+    // flush and close
+    fflush(Stream);
+    fclose(Stream);
+
+    // reset state; clear the handle so the closed FILE* cannot be reused by mistake
+    Stream = NULL;
+    IsWriteUncompressed = false;
+    IsOpen = false;
+}
+
+// compresses the data buffered in UncompressedBlock into one BGZF block
+// stored in CompressedBlock
+//
+// returns the total size of the block placed in CompressedBlock
+// (header + deflated data + footer)
+//
+// if the buffered data does not fit into a single block, the input is reduced
+// in 1024-byte steps; whatever was left out is moved to the front of
+// UncompressedBlock and BlockOffset is set to its size
+//
+// every failure is fatal: a message is printed to stderr and the process exits
+int BgzfData::DeflateBlock(void) {
+
+    // initialize the gzip header
+    char* buffer = CompressedBlock;
+    memset(buffer, 0, 18);
+    buffer[0]  = GZIP_ID1;
+    buffer[1]  = (char)GZIP_ID2;
+    buffer[2]  = CM_DEFLATE;
+    buffer[3]  = FLG_FEXTRA;
+    buffer[9]  = (char)OS_UNKNOWN;
+    buffer[10] = BGZF_XLEN;
+    buffer[12] = BGZF_ID1;
+    buffer[13] = BGZF_ID2;
+    buffer[14] = BGZF_LEN;
+
+    // set compression level (level 0 when writing uncompressed)
+    const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );
+
+    // loop to retry for blocks that do not compress enough
+    int inputLength = BlockOffset;
+    int compressedLength = 0;
+    unsigned int bufferSize = CompressedBlockSize;
+
+    while ( true ) {
+
+        // initialize zstream values; output starts right after the block header
+        // and leaves room for the footer
+        z_stream zs;
+        zs.zalloc    = NULL;
+        zs.zfree     = NULL;
+        zs.next_in   = (Bytef*)UncompressedBlock;
+        zs.avail_in  = inputLength;
+        zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
+        zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+        // initialize the zlib compression algorithm
+        if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {
+            fprintf(stderr, "BGZF ERROR: zlib deflate initialization failed.\n");
+            exit(1);
+        }
+
+        // compress the data
+        int status = deflate(&zs, Z_FINISH);
+        if ( status != Z_STREAM_END ) {
+
+            deflateEnd(&zs);
+
+            // Z_OK here means the output buffer filled up before the input was consumed:
+            // reduce the input length and try again
+            if ( status == Z_OK ) {
+                inputLength -= 1024;
+                if( inputLength < 0 ) {
+                    fprintf(stderr, "BGZF ERROR: input reduction failed.\n");
+                    exit(1);
+                }
+                continue;
+            }
+
+            // any other status is a failure of deflate() itself
+            fprintf(stderr, "BGZF ERROR: zlib::deflate() failed.\n");
+            exit(1);
+        }
+
+        // finalize the compression routine
+        if ( deflateEnd(&zs) != Z_OK ) {
+            fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n");
+            exit(1);
+        }
+
+        compressedLength = zs.total_out;
+        compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+        if ( compressedLength > MAX_BLOCK_SIZE ) {
+            fprintf(stderr, "BGZF ERROR: deflate overflow.\n");
+            exit(1);
+        }
+
+        break;
+    }
+
+    // store the compressed length (as total block size minus one)
+    BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
+
+    // store the CRC32 checksum and the uncompressed length in the block footer
+    unsigned int crc = crc32(0, NULL, 0);
+    crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+    // ensure that we have less than a block of data left
+    int remaining = BlockOffset - inputLength;
+    if ( remaining > 0 ) {
+        if ( remaining > inputLength ) {
+            fprintf(stderr, "BGZF ERROR: after deflate, remainder too large.\n");
+            exit(1);
+        }
+        // remaining <= inputLength, so source and destination cannot overlap
+        memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
+    }
+
+    BlockOffset = remaining;
+    return compressedLength;
+}
+
+// writes all buffered uncompressed data to the output stream as BGZF blocks
+// a short write is fatal (message to stderr, then exit)
+void BgzfData::FlushBlock(void) {
+
+    // DeflateBlock() updates BlockOffset to whatever it could not fit,
+    // so keep going until nothing is left in the buffer
+    while ( BlockOffset > 0 ) {
+
+        // compress as much buffered data as fits in one block
+        int compressedSize = DeflateBlock();
+
+        // push the finished block out to the stream
+        int written = fwrite(CompressedBlock, 1, compressedSize, Stream);
+        if ( written != compressedSize ) {
+            fprintf(stderr, "BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", compressedSize, written);
+            exit(1);
+        }
+
+        // advance the running block address by the size of the block just written
+        BlockAddress += compressedSize;
+    }
+}
+
+// de-compresses the block held in CompressedBlock into UncompressedBlock
+// returns the number of decompressed bytes, or -1 if any zlib call fails
+int BgzfData::InflateBlock(const int& blockLength) {
+
+    // set up the zlib stream: input begins 18 bytes into the compressed block,
+    // output may fill the whole uncompressed buffer
+    z_stream zs;
+    zs.zalloc    = NULL;
+    zs.zfree     = NULL;
+    zs.next_in   = (Bytef*)CompressedBlock + 18;
+    zs.avail_in  = blockLength - 16;
+    zs.next_out  = (Bytef*)UncompressedBlock;
+    zs.avail_out = UncompressedBlockSize;
+
+    // start the inflater
+    if ( inflateInit2(&zs, GZIP_WINDOW_BITS) != Z_OK ) {
+        fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");
+        return -1;
+    }
+
+    // the entire block must decompress in a single call
+    if ( inflate(&zs, Z_FINISH) != Z_STREAM_END ) {
+        inflateEnd(&zs);
+        fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflate() failed\n");
+        return -1;
+    }
+
+    // release the inflater
+    if ( inflateEnd(&zs) != Z_OK ) {
+        fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");
+        return -1;
+    }
+
+    return zs.total_out;
+}
+
+// opens a BGZF stream for reading or writing
+//
+// filename            - path of the file, or "stdin" (read mode only) / "stdout" (write mode only)
+// mode                - "rb" for reading, "wb" for writing; anything else is rejected
+// isWriteUncompressed - stored in IsWriteUncompressed on success
+//
+// returns true on success; on failure an error is printed to stderr, false is
+// returned, and the object's state is left untouched
+bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) {
+
+    // determine open mode
+    const bool isReadMode  = ( strcmp(mode, "rb") == 0 );
+    const bool isWriteMode = ( strcmp(mode, "wb") == 0 );
+    if ( !isReadMode && !isWriteMode ) {
+        fprintf(stderr, "BGZF ERROR: unknown file mode: %s\n", mode);
+        return false;
+    }
+
+    // ----------------------------------------------------------------
+    // open stream to read from/write to file, stdin, or stdout
+    // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)
+
+    // start from NULL so that a name/mode mismatch (e.g. "stdin" with "wb")
+    // is reported as a failure instead of silently reusing an old handle
+    FILE* stream = NULL;
+
+    // read/write BGZF data to/from a file
+    if ( (filename != "stdin") && (filename != "stdout") )
+        stream = fopen(filename.c_str(), mode);
+
+    // read BGZF data from stdin
+    else if ( (filename == "stdin") && isReadMode )
+        stream = freopen(NULL, mode, stdin);
+
+    // write BGZF data to stdout
+    else if ( (filename == "stdout") && isWriteMode )
+        stream = freopen(NULL, mode, stdout);
+
+    if ( !stream ) {
+        fprintf(stderr, "BGZF ERROR: unable to open file %s\n", filename.c_str() );
+        return false;
+    }
+
+    // commit the new state only once the stream is known to be valid
+    Stream = stream;
+    IsWriteOnly = isWriteMode;
+    IsOpen = true;
+    IsWriteUncompressed = isWriteUncompressed;
+    return true;
+}
+
+// copies up to dataLength decompressed bytes into data
+// returns the number of bytes copied (fewer than requested once no more blocks
+// are available), 0 if the stream is not open for reading or nothing was
+// requested, or -1 if reading a block fails
+int BgzfData::Read(char* data, const unsigned int dataLength) {
+
+    if ( dataLength == 0 || !IsOpen || IsWriteOnly ) return 0;
+
+    unsigned int totalCopied = 0;
+    char* destination = data;
+
+    while ( totalCopied < dataLength ) {
+
+        // load the next block once the current one is used up
+        int available = BlockLength - BlockOffset;
+        if ( available <= 0 ) {
+            if ( !ReadBlock() ) return -1;
+            available = BlockLength - BlockOffset;
+            if ( available <= 0 ) break;   // nothing more to read
+        }
+
+        // copy whichever is smaller: what is still wanted, or what the block holds
+        const int wanted = (int)(dataLength - totalCopied);
+        const int chunk  = min( wanted, available );
+        memcpy(destination, UncompressedBlock + BlockOffset, chunk);
+
+        BlockOffset += chunk;
+        destination += chunk;
+        totalCopied += chunk;
+    }
+
+    // current block fully consumed: record the stream position as the address
+    // of the next block and reset the in-block bookkeeping
+    if ( BlockOffset == BlockLength ) {
+        BlockAddress = ftell64(Stream);
+        BlockOffset = 0;
+        BlockLength = 0;
+    }
+
+    return totalCopied;
+}
+
+// reads the next BGZF block from Stream and decompresses it into UncompressedBlock
+//
+// returns true on success, including a clean end-of-file, where BlockLength is set to 0
+// returns false (after printing an error to stderr) if the header or data cannot
+// be read, the header is invalid, the stored block length is out of range, or
+// decompression fails
+bool BgzfData::ReadBlock(void) {
+
+    char header[BLOCK_HEADER_LENGTH];
+    int64_t blockAddress = ftell64(Stream);
+
+    // no bytes at all means end-of-file, which is not an error
+    const size_t headerBytes = fread(header, 1, sizeof(header), Stream);
+    if ( headerBytes == 0 ) {
+        BlockLength = 0;
+        return true;
+    }
+
+    if ( headerBytes != sizeof(header) ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - could not read block header\n");
+        return false;
+    }
+
+    if ( !BgzfData::CheckBlockHeader(header) ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - invalid block header\n");
+        return false;
+    }
+
+    // the header stores the total block size minus one
+    const int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;
+    const int remaining   = blockLength - BLOCK_HEADER_LENGTH;
+
+    // the length comes straight from the file: reject values that would make the
+    // read size negative (a huge size_t once passed to fread) or exceed the buffer
+    if ( remaining < 0 || (unsigned int)blockLength > CompressedBlockSize ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - invalid block length\n");
+        return false;
+    }
+
+    // keep the header and the rest of the block together in CompressedBlock
+    char* compressedBlock = CompressedBlock;
+    memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
+
+    const size_t dataBytes = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);
+    if ( dataBytes != (size_t)remaining ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - could not read data from block\n");
+        return false;
+    }
+
+    const int inflatedLength = InflateBlock(blockLength);
+    if ( inflatedLength < 0 ) {
+        fprintf(stderr, "BGZF ERROR: read block failed - could not decompress block data\n");
+        return false;
+    }
+
+    // BlockLength == 0 means BlockOffset may have been set by Seek() and must be kept;
+    // otherwise start at the beginning of the new block
+    if ( BlockLength != 0 )
+        BlockOffset = 0;
+
+    BlockAddress = blockAddress;
+    BlockLength  = inflatedLength;
+    return true;
+}
+
+// moves to the given position in the BGZF file
+// the low 16 bits of position become BlockOffset, the next 48 bits are the
+// file offset the stream is moved to
+// returns false if the file is not open or the underlying seek fails
+bool BgzfData::Seek(int64_t position) {
+
+    if ( !IsOpen ) return false;
+
+    // split the position into its two components
+    const int64_t fileOffset = (position >> 16) & 0xFFFFFFFFFFFFLL;
+    const int withinBlock    = (position & 0xFFFF);
+
+    if ( fseek64(Stream, fileOffset, SEEK_SET) != 0 ) {
+        fprintf(stderr, "BGZF ERROR: unable to seek in file\n");
+        return false;
+    }
+
+    // no block is loaded yet (BlockLength == 0); remember where to resume
+    BlockAddress = fileOffset;
+    BlockOffset  = withinBlock;
+    BlockLength  = 0;
+    return true;
+}
+
+// returns the current position in the BGZF file: BlockAddress in the upper
+// bits, the low 16 bits of BlockOffset below it; returns 0 if the file is not open
+int64_t BgzfData::Tell(void) {
+    if ( IsOpen )
+        return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
+    return 0;
+}
+
+// appends dataLen bytes from data to the uncompressed buffer, flushing the
+// buffer each time it fills up
+// returns the number of bytes accepted, or 0 if the file is not open for writing
+unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {
+
+    if ( !(IsOpen && IsWriteOnly) ) return 0;
+
+    const unsigned int capacity = UncompressedBlockSize;
+    const char* source = data;
+    unsigned int consumed = 0;
+
+    while ( consumed < dataLen ) {
+
+        // take as much as still fits into the buffer, but no more than is left to write
+        unsigned int chunk = min(capacity - BlockOffset, dataLen - consumed);
+        memcpy(UncompressedBlock + BlockOffset, source, chunk);
+
+        BlockOffset += chunk;
+        source      += chunk;
+        consumed    += chunk;
+
+        // buffer is full: compress it and write it out
+        if ( BlockOffset == capacity )
+            FlushBlock();
+    }
+
+    return consumed;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BGZF.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BGZF.h
new file mode 100755
index 0000000..46b82a3
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BGZF.h
@@ -0,0 +1,322 @@
+// ***************************************************************************
+// BGZF.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading & writing BGZF files
+// ***************************************************************************
+
+#ifndef BGZF_H
+#define BGZF_H
+
+#include <api_global.h>
+#include <zlib.h>
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+// Platform-specific large-file support
+#ifndef BAMTOOLS_LFS
+#define BAMTOOLS_LFS
+ #ifdef WIN32
+ #define ftell64(a) _ftelli64(a)
+ #define fseek64(a,b,c) _fseeki64(a,b,c)
+ #else
+ #define ftell64(a) ftello(a)
+ #define fseek64(a,b,c) fseeko(a,b,c)
+ #endif
+#endif // BAMTOOLS_LFS
+
+// Platform-specific type definitions
+#ifndef BAMTOOLS_TYPES
+#define BAMTOOLS_TYPES
+ #ifdef _MSC_VER
+ typedef char int8_t;
+ typedef unsigned char uint8_t;
+ typedef short int16_t;
+ typedef unsigned short uint16_t;
+ typedef int int32_t;
+ typedef unsigned int uint32_t;
+ typedef long long int64_t;
+ typedef unsigned long long uint64_t;
+ #else
+ #include <stdint.h>
+ #endif
+#endif // BAMTOOLS_TYPES
+
+namespace BamTools {
+
+// zlib constants
+const int GZIP_ID1 = 31;
+const int GZIP_ID2 = 139;
+const int CM_DEFLATE = 8;
+const int FLG_FEXTRA = 4;
+const int OS_UNKNOWN = 255;
+const int BGZF_XLEN = 6;
+const int BGZF_ID1 = 66;
+const int BGZF_ID2 = 67;
+const int BGZF_LEN = 2;
+const int GZIP_WINDOW_BITS = -15;
+const int Z_DEFAULT_MEM_LEVEL = 8;
+
+// BGZF constants
+const int BLOCK_HEADER_LENGTH = 18;
+const int BLOCK_FOOTER_LENGTH = 8;
+const int MAX_BLOCK_SIZE = 65536;
+const int DEFAULT_BLOCK_SIZE = 65536;
+
+// State and operations for reading or writing a single BGZF file.
+// All data members are public; the compress/decompress/flush steps are
+// private helpers, and the byte pack/unpack routines are static utilities.
+struct API_EXPORT BgzfData {
+
+ // data members
+ public:
+ // capacity used by Write() as the fill limit of UncompressedBlock
+ unsigned int UncompressedBlockSize;
+ unsigned int CompressedBlockSize;
+ // length of the current block (Seek() resets it to 0)
+ unsigned int BlockLength;
+ // cursor inside the current block; forms the low 16 bits of Tell()
+ unsigned int BlockOffset;
+ // file offset of the current block; forms the upper bits of Tell()
+ uint64_t BlockAddress;
+ bool IsOpen;
+ // Write() refuses to run unless this is set
+ bool IsWriteOnly;
+ bool IsWriteUncompressed;
+ FILE* Stream;
+ char* UncompressedBlock;
+ char* CompressedBlock;
+
+ // constructor & destructor
+ public:
+ BgzfData(void);
+ ~BgzfData(void);
+
+ // main interface methods
+ public:
+ // closes BGZF file
+ void Close(void);
+ // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)
+ bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false);
+ // reads BGZF data into a byte buffer
+ int Read(char* data, const unsigned int dataLength);
+ // seek to position in BGZF file
+ // ('position' packs the block address above the low 16-bit block offset)
+ bool Seek(int64_t position);
+ // get file position in BGZF file (same packing as accepted by Seek)
+ int64_t Tell(void);
+ // writes the supplied data into the BGZF buffer; returns the byte count consumed
+ unsigned int Write(const char* data, const unsigned int dataLen);
+
+ // internal methods
+ private:
+ // compresses the current block
+ int DeflateBlock(void);
+ // flushes the data in the BGZF block
+ void FlushBlock(void);
+ // de-compresses the current block
+ int InflateBlock(const int& blockLength);
+ // reads a BGZF block
+ bool ReadBlock(void);
+
+ // static 'utility' methods
+ // (each Unpack* routine copies the leading bytes of 'buffer' into a value
+ // of the named type, in host byte order)
+ public:
+ // checks BGZF block header
+ static inline bool CheckBlockHeader(char* header);
+ // packs an unsigned integer into the specified buffer
+ static inline void PackUnsignedInt(char* buffer, unsigned int value);
+ // packs an unsigned short into the specified buffer
+ static inline void PackUnsignedShort(char* buffer, unsigned short value);
+ // unpacks a buffer into a double
+ static inline double UnpackDouble(char* buffer);
+ static inline double UnpackDouble(const char* buffer);
+ // unpacks a buffer into a float
+ static inline float UnpackFloat(char* buffer);
+ static inline float UnpackFloat(const char* buffer);
+ // unpacks a buffer into a signed int
+ static inline signed int UnpackSignedInt(char* buffer);
+ static inline signed int UnpackSignedInt(const char* buffer);
+ // unpacks a buffer into a signed short
+ static inline signed short UnpackSignedShort(char* buffer);
+ static inline signed short UnpackSignedShort(const char* buffer);
+ // unpacks a buffer into an unsigned int
+ static inline unsigned int UnpackUnsignedInt(char* buffer);
+ static inline unsigned int UnpackUnsignedInt(const char* buffer);
+ // unpacks a buffer into an unsigned short
+ static inline unsigned short UnpackUnsignedShort(char* buffer);
+ static inline unsigned short UnpackUnsignedShort(const char* buffer);
+};
+
+// -------------------------------------------------------------
+// static 'utility' method implementations
+
+// checks BGZF block header
+// Returns true only if every fixed field of the header matches the expected
+// gzip / BGZF constant. Fields are tested in ascending byte order and the
+// first mismatch ends the check.
+inline
+bool BgzfData::CheckBlockHeader(char* header) {
+
+    // gzip magic bytes and compression method
+    if ( header[0] != GZIP_ID1 )       return false;
+    if ( header[1] != (char)GZIP_ID2 ) return false;
+    if ( header[2] != Z_DEFLATED )     return false;
+
+    // the 'extra field' flag must be set
+    if ( (header[3] & FLG_FEXTRA) == 0 ) return false;
+
+    // extra-field length, BGZF subfield identifiers and subfield length
+    if ( BgzfData::UnpackUnsignedShort(&header[10]) != BGZF_XLEN ) return false;
+    if ( header[12] != BGZF_ID1 ) return false;
+    if ( header[13] != BGZF_ID2 ) return false;
+
+    return ( BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN );
+}
+
+// 'packs' an unsigned integer into the specified buffer
+// Writes the four low-order bytes of 'value' to buffer[0..3], least
+// significant byte first.
+inline
+void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) {
+    for ( int byteIndex = 0; byteIndex < 4; ++byteIndex )
+        buffer[byteIndex] = (char)(value >> (8 * byteIndex));
+}
+
+// 'packs' an unsigned short into the specified buffer
+// Writes the two low-order bytes of 'value' to buffer[0..1], least
+// significant byte first.
+inline
+void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) {
+    for ( int byteIndex = 0; byteIndex < 2; ++byteIndex )
+        buffer[byteIndex] = (char)(value >> (8 * byteIndex));
+}
+
+// 'unpacks' a buffer into a double (includes both non-const & const char* flavors)
+// Copies sizeof(double) bytes from 'buffer' into a double, in host byte order.
+// 'buffer' must hold at least that many readable bytes.
+inline
+double BgzfData::UnpackDouble(char* buffer) {
+    // single implementation lives in the const overload
+    return UnpackDouble(static_cast<const char*>(buffer));
+}
+
+inline
+double BgzfData::UnpackDouble(const char* buffer) {
+    // memcpy instead of writing one union member and reading another
+    double value = 0;
+    std::memcpy(&value, buffer, sizeof(value));
+    return value;
+}
+
+// 'unpacks' a buffer into a float (includes both non-const & const char* flavors)
+// Copies sizeof(float) bytes from 'buffer' into a float, in host byte order.
+// 'buffer' must hold at least that many readable bytes.
+inline
+float BgzfData::UnpackFloat(char* buffer) {
+    // single implementation lives in the const overload
+    return UnpackFloat(static_cast<const char*>(buffer));
+}
+
+inline
+float BgzfData::UnpackFloat(const char* buffer) {
+    // memcpy instead of writing one union member and reading another
+    float value = 0;
+    std::memcpy(&value, buffer, sizeof(value));
+    return value;
+}
+
+// 'unpacks' a buffer into a signed int (includes both non-const & const char* flavors)
+// Copies sizeof(signed int) bytes from 'buffer' into a signed int, in host
+// byte order. 'buffer' must hold at least that many readable bytes.
+inline
+signed int BgzfData::UnpackSignedInt(char* buffer) {
+    // single implementation lives in the const overload
+    return UnpackSignedInt(static_cast<const char*>(buffer));
+}
+
+inline
+signed int BgzfData::UnpackSignedInt(const char* buffer) {
+    // memcpy instead of writing one union member and reading another
+    signed int value = 0;
+    std::memcpy(&value, buffer, sizeof(value));
+    return value;
+}
+
+// 'unpacks' a buffer into a signed short (includes both non-const & const char* flavors)
+// Copies sizeof(signed short) bytes from 'buffer' into a signed short, in
+// host byte order. 'buffer' must hold at least that many readable bytes.
+inline
+signed short BgzfData::UnpackSignedShort(char* buffer) {
+    // single implementation lives in the const overload
+    return UnpackSignedShort(static_cast<const char*>(buffer));
+}
+
+inline
+signed short BgzfData::UnpackSignedShort(const char* buffer) {
+    // memcpy instead of writing one union member and reading another
+    signed short value = 0;
+    std::memcpy(&value, buffer, sizeof(value));
+    return value;
+}
+
+// 'unpacks' a buffer into an unsigned int (includes both non-const & const char* flavors)
+// Copies sizeof(unsigned int) bytes from 'buffer' into an unsigned int, in
+// host byte order. 'buffer' must hold at least that many readable bytes.
+inline
+unsigned int BgzfData::UnpackUnsignedInt(char* buffer) {
+    // single implementation lives in the const overload
+    return UnpackUnsignedInt(static_cast<const char*>(buffer));
+}
+
+inline
+unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) {
+    // memcpy instead of writing one union member and reading another
+    unsigned int value = 0;
+    std::memcpy(&value, buffer, sizeof(value));
+    return value;
+}
+
+// 'unpacks' a buffer into an unsigned short (includes both non-const & const char* flavors)
+// Copies sizeof(unsigned short) bytes from 'buffer' into an unsigned short,
+// in host byte order. 'buffer' must hold at least that many readable bytes.
+inline
+unsigned short BgzfData::UnpackUnsignedShort(char* buffer) {
+    // single implementation lives in the const overload
+    return UnpackUnsignedShort(static_cast<const char*>(buffer));
+}
+
+inline
+unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) {
+    // memcpy instead of writing one union member and reading another
+    unsigned short value = 0;
+    std::memcpy(&value, buffer, sizeof(value));
+    return value;
+}
+
+} // namespace BamTools
+
+#endif // BGZF_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAlignment.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAlignment.cpp
new file mode 100755
index 0000000..73a586c
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAlignment.cpp
@@ -0,0 +1,696 @@
+// ***************************************************************************
+// BamAlignment.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 13 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#include <BamAlignment.h>
+using namespace BamTools;
+
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <map>
+#include <utility>
+using namespace std;
+
+// default ctor
+// Gives every numeric field a defined starting value. Reference IDs and
+// positions start at -1; lengths, bin, mapping quality and the flag word
+// start at 0. AlignmentFlag in particular must be defined, because the
+// Is*/SetIs* members read it with bitwise operators.
+// NOTE(review): initializer order follows the copy constructor's list, which
+// is assumed to mirror the declaration order in BamAlignment.h - confirm.
+BamAlignment::BamAlignment(void)
+    : Length(0)
+    , RefID(-1)
+    , Position(-1)
+    , Bin(0)
+    , MapQuality(0)
+    , AlignmentFlag(0)
+    , MateRefID(-1)
+    , MatePosition(-1)
+    , InsertSize(0)
+{ }
+
+// copy ctor
+// Member-wise copy: every field listed below is copy-constructed from the
+// corresponding field of 'other'.
+BamAlignment::BamAlignment(const BamAlignment& other)
+ : Name(other.Name)
+ , Length(other.Length)
+ , QueryBases(other.QueryBases)
+ , AlignedBases(other.AlignedBases)
+ , Qualities(other.Qualities)
+ , TagData(other.TagData)
+ , RefID(other.RefID)
+ , Position(other.Position)
+ , Bin(other.Bin)
+ , MapQuality(other.MapQuality)
+ , AlignmentFlag(other.AlignmentFlag)
+ , CigarData(other.CigarData)
+ , MateRefID(other.MateRefID)
+ , MatePosition(other.MatePosition)
+ , InsertSize(other.InsertSize)
+ , SupportData(other.SupportData)
+{ }
+
+// dtor
+// Empty body: nothing is released explicitly here.
+BamAlignment::~BamAlignment(void) { }
+
+// Queries against alignment flags
+// Each query tests one bit mask of AlignmentFlag. The "mapped", "mate mapped"
+// and "primary" queries are true when their (negative-sense) bit is clear.
+bool BamAlignment::IsDuplicate(void) const         { return (AlignmentFlag & DUPLICATE) != 0; }
+bool BamAlignment::IsFailedQC(void) const          { return (AlignmentFlag & QC_FAILED) != 0; }
+bool BamAlignment::IsFirstMate(void) const         { return (AlignmentFlag & READ_1) != 0; }
+bool BamAlignment::IsMapped(void) const            { return !(AlignmentFlag & UNMAPPED); }
+bool BamAlignment::IsMateMapped(void) const        { return !(AlignmentFlag & MATE_UNMAPPED); }
+bool BamAlignment::IsMateReverseStrand(void) const { return (AlignmentFlag & MATE_REVERSE) != 0; }
+bool BamAlignment::IsPaired(void) const            { return (AlignmentFlag & PAIRED) != 0; }
+bool BamAlignment::IsPrimaryAlignment(void) const  { return !(AlignmentFlag & SECONDARY); }
+bool BamAlignment::IsProperPair(void) const        { return (AlignmentFlag & PROPER_PAIR) != 0; }
+bool BamAlignment::IsReverseStrand(void) const     { return (AlignmentFlag & REVERSE) != 0; }
+bool BamAlignment::IsSecondMate(void) const        { return (AlignmentFlag & READ_2) != 0; }
+
+// Manipulate alignment flags
+// Each setter turns one bit mask of AlignmentFlag on (ok == true) or off
+// (ok == false). The positive-sense setters (Mapped, MateMapped, Primary)
+// forward the inverted argument to their negative-sense counterpart.
+void BamAlignment::SetIsDuplicate(bool ok)          { AlignmentFlag = ok ? (AlignmentFlag | DUPLICATE)     : (AlignmentFlag & ~DUPLICATE); }
+void BamAlignment::SetIsFailedQC(bool ok)           { AlignmentFlag = ok ? (AlignmentFlag | QC_FAILED)     : (AlignmentFlag & ~QC_FAILED); }
+void BamAlignment::SetIsFirstMate(bool ok)          { AlignmentFlag = ok ? (AlignmentFlag | READ_1)        : (AlignmentFlag & ~READ_1); }
+void BamAlignment::SetIsMapped(bool ok)             { SetIsUnmapped( ok ? false : true ); }
+void BamAlignment::SetIsMateMapped(bool ok)         { SetIsMateUnmapped( ok ? false : true ); }
+void BamAlignment::SetIsMateUnmapped(bool ok)       { AlignmentFlag = ok ? (AlignmentFlag | MATE_UNMAPPED) : (AlignmentFlag & ~MATE_UNMAPPED); }
+void BamAlignment::SetIsMateReverseStrand(bool ok)  { AlignmentFlag = ok ? (AlignmentFlag | MATE_REVERSE)  : (AlignmentFlag & ~MATE_REVERSE); }
+void BamAlignment::SetIsPaired(bool ok)             { AlignmentFlag = ok ? (AlignmentFlag | PAIRED)        : (AlignmentFlag & ~PAIRED); }
+void BamAlignment::SetIsPrimaryAlignment(bool ok)   { SetIsSecondaryAlignment( ok ? false : true ); }
+void BamAlignment::SetIsProperPair(bool ok)         { AlignmentFlag = ok ? (AlignmentFlag | PROPER_PAIR)   : (AlignmentFlag & ~PROPER_PAIR); }
+void BamAlignment::SetIsReverseStrand(bool ok)      { AlignmentFlag = ok ? (AlignmentFlag | REVERSE)       : (AlignmentFlag & ~REVERSE); }
+void BamAlignment::SetIsSecondaryAlignment(bool ok) { AlignmentFlag = ok ? (AlignmentFlag | SECONDARY)     : (AlignmentFlag & ~SECONDARY); }
+void BamAlignment::SetIsSecondMate(bool ok)         { AlignmentFlag = ok ? (AlignmentFlag | READ_2)        : (AlignmentFlag & ~READ_2); }
+void BamAlignment::SetIsUnmapped(bool ok)           { AlignmentFlag = ok ? (AlignmentFlag | UNMAPPED)      : (AlignmentFlag & ~UNMAPPED); }
+
+// calculates alignment end position, based on starting position and CIGAR operations
+//
+// usePadded : when true, insertions ('I') also advance the end position
+// zeroBased : when true, 1 is subtracted from the computed end
+//
+// Every CIGAR operation that consumes reference bases advances the end:
+// 'M', 'D', 'N', and also the sequence match/mismatch operations '=' and 'X'
+// (the latter two were previously skipped, giving an end position that was
+// too small for alignments written with extended CIGAR).
+int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const {
+
+    // initialize alignment end to starting position
+    int alignEnd = Position;
+
+    // iterate over cigar operations
+    vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
+    vector<CigarOp>::const_iterator cigarEnd  = CigarData.end();
+    for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+        const CigarOp& op = (*cigarIter);
+        switch ( op.Type ) {
+
+            // operations that consume the reference
+            case 'M':
+            case 'D':
+            case 'N':
+            case '=':
+            case 'X':
+                alignEnd += op.Length;
+                break;
+
+            // insertions only count in 'padded' mode
+            case 'I':
+                if ( usePadded )
+                    alignEnd += op.Length;
+                break;
+
+            // everything else leaves the end position untouched
+            default:
+                break;
+        }
+    }
+
+    // adjust for zeroBased, if necessary
+    return ( zeroBased ? alignEnd - 1 : alignEnd );
+}
+
+// Appends a new string-valued tag ('Z' or 'H') to TagData.
+//
+// tag   : two-character tag name
+// type  : one-character storage type; must be "Z" or "H"
+// value : tag contents; copied up to (not including) its first NUL byte
+//
+// Returns false if the alignment holds core data only, if 'tag'/'type' have
+// the wrong length, if 'type' is not a string type, or if the tag already
+// exists (use EditTag for that). Returns true once the tag is appended.
+bool BamAlignment::AddTag(const string& tag, const string& type, const string& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type != "Z" && type != "H" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+    // append name, type, value and the terminating NUL directly to TagData;
+    // this replaces a variable-length stack array (not standard C++, and
+    // unbounded in size) that was filled with memcpy/strcat
+    TagData.append(tag);
+    TagData.append(type);
+    TagData.append(value.c_str());
+    TagData.push_back('\0');
+
+    // return success
+    return true;
+}
+
+// Appends a new integer-valued tag to TagData.
+//
+// tag   : two-character tag name
+// type  : one-character storage type; "f", "Z" and "H" are rejected
+// value : stored as its 4 raw bytes, in host byte order
+//
+// Returns false if the alignment holds core data only, if 'tag'/'type' have
+// the wrong length, if 'type' is rejected, or if the tag already exists
+// (use EditTag for that). Returns true once the tag is appended.
+//
+// NOTE(review): four bytes are written whatever 'type' says, so a 1- or
+// 2-byte type code ("c", "C", "s", "S", "A") produces a record whose payload
+// is wider than its type implies - confirm callers only pass "i"/"I".
+bool BamAlignment::AddTag(const string& tag, const string& type, const uint32_t& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type == "f" || type == "Z" || type == "H" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+    // append name, type and the raw value bytes directly to TagData;
+    // this replaces a variable-length stack array (not standard C++, and
+    // unbounded in size) that was filled with memcpy/strcat
+    TagData.append(tag);
+    TagData.append(type);
+    TagData.append((const char*)&value, sizeof(uint32_t));
+
+    // return success
+    return true;
+}
+
+// Signed flavor: reinterprets 'value' as unsigned (same bits) and forwards
+// to the uint32_t overload.
+bool BamAlignment::AddTag(const string& tag, const string& type, const int32_t& value) {
+    const uint32_t& sameBits = reinterpret_cast<const uint32_t&>(value);
+    return AddTag(tag, type, sameBits);
+}
+
+// Appends a new float-valued tag to TagData.
+//
+// tag   : two-character tag name
+// type  : one-character storage type; "Z" and "H" are rejected
+// value : stored as its sizeof(float) raw bytes, in host byte order
+//
+// Returns false if the alignment holds core data only, if 'tag'/'type' have
+// the wrong length, if 'type' is a string type, or if the tag already exists
+// (use EditTag for that). Returns true once the tag is appended.
+//
+// NOTE(review): integer type codes are not rejected here, and the payload is
+// always the float's bytes - confirm callers only pass "f".
+bool BamAlignment::AddTag(const string& tag, const string& type, const float& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type == "Z" || type == "H" ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;
+
+    // append name, type and the raw value bytes directly to TagData;
+    // this replaces a variable-length stack array (not standard C++, and
+    // unbounded in size) that was filled with memcpy/strcat
+    TagData.append(tag);
+    TagData.append(type);
+    TagData.append((const char*)&value, sizeof(float));
+
+    // return success
+    return true;
+}
+
+// Replaces the value of an existing string tag, or adds the tag if absent.
+//
+// tag   : two-character tag name
+// type  : one-character storage type; must be "Z" or "H"
+// value : new contents; copied up to (not including) its first NUL byte
+//
+// Returns false on the same argument checks as AddTag. When the tag is not
+// present the result of AddTag is returned. When it is present the old value
+// is replaced in place (the stored type byte is left as it was) and true is
+// returned.
+//
+// The new tag data is assembled in a std::string. The earlier version used a
+// variable-length stack array and wrote a terminator at
+// [endTagOffset + endTagDataLength + 1], which lands past the end of that
+// array when the value being replaced is shorter than two characters.
+bool BamAlignment::EditTag(const string& tag, const string& type, const string& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type != "Z" && type != "H" ) return false;
+
+    // localize the tag data
+    char* pOriginalTagData = (char*)TagData.data();
+    char* pTagData = pOriginalTagData;
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, attempt AddTag
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return AddTag(tag, type, value);
+
+    // everything up to and including the tag's 3-byte header is kept
+    const unsigned int beginningTagDataLength = numBytesParsed;
+
+    // step over the old value; SkipToNextTag fails only for an unrecognized
+    // storage type, in which case TagData is left unchanged and (as before)
+    // true is still returned
+    const char storageType = *(pTagData - 1);
+    if ( !SkipToNextTag(storageType, pTagData, numBytesParsed) ) return true;
+
+    // header + new value + terminator
+    string newTagData(pOriginalTagData, beginningTagDataLength);
+    newTagData.append(value.c_str());
+    newTagData.push_back('\0');
+
+    // whatever followed the old value (guarded so the unsigned subtraction
+    // cannot wrap if the old value ran up to the end of the buffer)
+    if ( numBytesParsed < originalTagDataLength )
+        newTagData.append(pTagData, originalTagDataLength - numBytesParsed);
+
+    // save new tag data
+    TagData.swap(newTagData);
+    return true;
+}
+
+// Replaces the value of an existing integer tag, or adds the tag if absent.
+//
+// tag   : two-character tag name
+// type  : one-character storage type; "f", "Z" and "H" are rejected
+// value : stored as its 4 raw bytes, in host byte order
+//
+// Returns false on the same argument checks as AddTag. When the tag is not
+// present the result of AddTag is returned. When it is present the old value
+// is replaced in place (the stored type byte is left as it was) and true is
+// returned.
+//
+// The new tag data is assembled in a std::string. The earlier version used a
+// variable-length stack array and wrote a terminator at
+// [endTagOffset + endTagDataLength + 1], which lands past the end of that
+// array when the value being replaced is one byte wide.
+//
+// NOTE(review): four bytes are always written while the old value is skipped
+// according to its stored type, so editing a 1- or 2-byte tag leaves a
+// payload wider than its type code implies - confirm intended usage.
+bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type == "f" || type == "Z" || type == "H" ) return false;
+
+    // localize the tag data
+    char* pOriginalTagData = (char*)TagData.data();
+    char* pTagData = pOriginalTagData;
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, attempt AddTag
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return AddTag(tag, type, value);
+
+    // everything up to and including the tag's 3-byte header is kept
+    const unsigned int beginningTagDataLength = numBytesParsed;
+
+    // step over the old value; SkipToNextTag fails only for an unrecognized
+    // storage type, in which case TagData is left unchanged and (as before)
+    // true is still returned
+    const char storageType = *(pTagData - 1);
+    if ( !SkipToNextTag(storageType, pTagData, numBytesParsed) ) return true;
+
+    // header + raw bytes of the new value
+    string newTagData(pOriginalTagData, beginningTagDataLength);
+    newTagData.append((const char*)&value, sizeof(uint32_t));
+
+    // whatever followed the old value (guarded so the unsigned subtraction
+    // cannot wrap)
+    if ( numBytesParsed < originalTagDataLength )
+        newTagData.append(pTagData, originalTagDataLength - numBytesParsed);
+
+    // save new tag data
+    TagData.swap(newTagData);
+    return true;
+}
+
+// Signed flavor: reinterprets 'value' as unsigned (same bits) and forwards
+// to the uint32_t overload.
+bool BamAlignment::EditTag(const string& tag, const string& type, const int32_t& value) {
+    const uint32_t& sameBits = reinterpret_cast<const uint32_t&>(value);
+    return EditTag(tag, type, sameBits);
+}
+
+// Replaces the value of an existing float tag, or adds the tag if absent.
+//
+// tag   : two-character tag name
+// type  : one-character storage type; "Z" and "H" are rejected
+// value : stored as its sizeof(float) raw bytes, in host byte order
+//
+// Returns false on the same argument checks as AddTag. When the tag is not
+// present the result of AddTag is returned. When it is present the old value
+// is replaced in place (the stored type byte is left as it was) and true is
+// returned.
+//
+// The new tag data is assembled in a std::string. The earlier version used a
+// variable-length stack array and wrote a terminator at
+// [endTagOffset + endTagDataLength + 1], which lands past the end of that
+// array when the value being replaced is one byte wide.
+bool BamAlignment::EditTag(const string& tag, const string& type, const float& value) {
+
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( tag.size() != 2 || type.size() != 1 ) return false;
+    if ( type == "Z" || type == "H" ) return false;
+
+    // localize the tag data
+    char* pOriginalTagData = (char*)TagData.data();
+    char* pTagData = pOriginalTagData;
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, attempt AddTag
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return AddTag(tag, type, value);
+
+    // everything up to and including the tag's 3-byte header is kept
+    const unsigned int beginningTagDataLength = numBytesParsed;
+
+    // step over the old value; SkipToNextTag fails only for an unrecognized
+    // storage type, in which case TagData is left unchanged and (as before)
+    // true is still returned
+    const char storageType = *(pTagData - 1);
+    if ( !SkipToNextTag(storageType, pTagData, numBytesParsed) ) return true;
+
+    // header + raw bytes of the new value
+    string newTagData(pOriginalTagData, beginningTagDataLength);
+    newTagData.append((const char*)&value, sizeof(float));
+
+    // whatever followed the old value (guarded so the unsigned subtraction
+    // cannot wrap)
+    if ( numBytesParsed < originalTagDataLength )
+        newTagData.append(pTagData, originalTagDataLength - numBytesParsed);
+
+    // save new tag data
+    TagData.swap(newTagData);
+    return true;
+}
+
+// get "NM" tag data - originally contributed by Aaron Quinlan
+// Looks up the "NM" tag through the integer GetTag overload and stores the
+// result in 'editDistance'; returns success/fail.
+bool BamAlignment::GetEditDistance(uint32_t& editDistance) const {
+    const bool found = GetTag("NM", editDistance);
+    return found;
+}
+
+// get "RG" tag data
+// Looks up the "RG" tag through the string GetTag overload and stores the
+// result in 'readGroup'; returns success/fail.
+bool BamAlignment::GetReadGroup(string& readGroup) const {
+    const bool found = GetTag("RG", readGroup);
+    return found;
+}
+
+// Retrieves the value of 'tag' as a NUL-terminated string.
+// On success 'destination' receives the bytes from the start of the tag's
+// value up to (not including) the first NUL, and true is returned.
+// Returns false if only core data is loaded, if TagData is empty, or if the
+// tag is not found; 'destination' is untouched in those cases.
+// NOTE(review): the stored type code is not checked here - presumably the
+// caller is expected to ask only for 'Z'/'H' tags.
+bool BamAlignment::GetTag(const string& tag, string& destination) const {
+
+    // make sure tag data exists
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( TagData.empty() ) return false;
+
+    // localize the tag data
+    char* cursor = (char*)TagData.data();
+    const unsigned int totalLength = TagData.size();
+    unsigned int bytesParsed = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, cursor, totalLength, bytesParsed) )
+        return false;
+
+    // copy the value (everything before its terminator) into destination
+    destination.assign(cursor, strlen(cursor));
+    return true;
+}
+
+// Retrieves the value of 'tag' into an integer destination.
+// The number of bytes copied depends on the tag's stored type code: 1 for
+// 'A'/'c'/'C', 2 for 's'/'S', 4 for 'i'/'I'. 'destination' is zeroed first
+// and the bytes are copied to its start.
+// Returns false (after printing an error for the last two cases) if only
+// core data is loaded, TagData is empty, the tag is not found, the tag is a
+// float or string type, or its type code is unknown.
+bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const {
+
+    // make sure tag data exists
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( TagData.empty() ) return false;
+
+    // localize the tag data
+    char* cursor = (char*)TagData.data();
+    const unsigned int totalLength = TagData.size();
+    unsigned int bytesParsed = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, cursor, totalLength, bytesParsed) )
+        return false;
+
+    // the type code sits immediately before the value
+    const char type = cursor[-1];
+
+    // determine data byte-length
+    int byteCount = 0;
+    switch (type) {
+
+        // 1 byte data
+        case 'A': case 'c': case 'C':
+            byteCount = 1;
+            break;
+
+        // 2 byte data
+        case 's': case 'S':
+            byteCount = 2;
+            break;
+
+        // 4 byte data
+        case 'i': case 'I':
+            byteCount = 4;
+            break;
+
+        // unsupported type for integer destination (float or var-length strings)
+        case 'f': case 'Z': case 'H':
+            fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);
+            return false;
+
+        // unknown tag type
+        default:
+            fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
+            return false;
+    }
+
+    // store in destination
+    destination = 0;
+    memcpy(&destination, cursor, byteCount);
+    return true;
+}
+
+// Signed flavor: lets the uint32_t overload fill 'destination' through a
+// reinterpreted reference (same storage, same bits).
+bool BamAlignment::GetTag(const string& tag, int32_t& destination) const {
+    uint32_t& sameStorage = reinterpret_cast<uint32_t&>(destination);
+    return GetTag(tag, sameStorage);
+}
+
+// Retrieves the value of 'tag' into a float destination.
+// The number of bytes copied depends on the tag's stored type code: 1 for
+// 'A'/'c'/'C', 2 for 's'/'S', 4 for 'f'/'i'/'I'. 'destination' is zeroed
+// first and the raw bytes are copied to its start.
+// Returns false (after printing an error for the last two cases) if only
+// core data is loaded, TagData is empty, the tag is not found, the tag is a
+// string type, or its type code is unknown.
+// NOTE(review): for integer type codes the bytes are copied as-is rather
+// than converted numerically - confirm that is what callers expect.
+bool BamAlignment::GetTag(const string& tag, float& destination) const {
+
+    // make sure tag data exists
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( TagData.empty() ) return false;
+
+    // localize the tag data
+    char* cursor = (char*)TagData.data();
+    const unsigned int totalLength = TagData.size();
+    unsigned int bytesParsed = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, cursor, totalLength, bytesParsed) )
+        return false;
+
+    // the type code sits immediately before the value
+    const char type = cursor[-1];
+
+    // determine data byte-length
+    int byteCount = 0;
+    switch (type) {
+
+        // 1 byte data
+        case 'A': case 'c': case 'C':
+            byteCount = 1;
+            break;
+
+        // 2 byte data
+        case 's': case 'S':
+            byteCount = 2;
+            break;
+
+        // 4 byte data
+        case 'f': case 'i': case 'I':
+            byteCount = 4;
+            break;
+
+        // unsupported type (var-length strings)
+        case 'Z': case 'H':
+            fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type);
+            return false;
+
+        // unknown tag type
+        default:
+            fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
+            return false;
+    }
+
+    // store in destination
+    destination = 0.0;
+    memcpy(&destination, cursor, byteCount);
+    return true;
+}
+
+// Looks up 'tag' and reports its stored type code through 'type'.
+// Returns true if the tag exists and its code is one of
+// A, c, C, s, S, f, i, I, Z, H. Returns false if only core data is loaded,
+// TagData is empty or the tag is not found ('type' untouched), or if the
+// code is not recognized ('type' holds the unrecognized code and an error
+// is printed).
+bool BamAlignment::GetTagType(const string& tag, char& type) const {
+
+    // make sure tag data exists
+    if ( SupportData.HasCoreOnly ) return false;
+    if ( TagData.empty() ) return false;
+
+    // localize the tag data
+    char* cursor = (char*)TagData.data();
+    const unsigned int totalLength = TagData.size();
+    unsigned int bytesParsed = 0;
+
+    // tag not found, return failure
+    if ( !FindTag(tag, cursor, totalLength, bytesParsed) )
+        return false;
+
+    // retrieve tag type code (the byte just before the value)
+    type = cursor[-1];
+
+    // validate that type is a proper BAM tag type
+    switch (type) {
+        case 'A': case 'c': case 'C':
+        case 's': case 'S':
+        case 'f': case 'i': case 'I':
+        case 'Z': case 'H':
+            return true;
+
+        // unknown tag type
+        default:
+            break;
+    }
+
+    fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type);
+    return false;
+}
+
+// Removes 'tag' (its 3-byte header and its value) from TagData.
+//
+// Returns false if only core data is loaded, if TagData is empty, or if the
+// tag is not found. Returns true once the tag has been removed.
+//
+// The bytes are erased from TagData directly. The earlier version rebuilt the
+// data in a variable-length stack array (not standard C++, unbounded size)
+// and computed the trailing length with an unsigned subtraction that could
+// wrap.
+bool BamAlignment::RemoveTag(const string& tag) {
+
+    // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed
+    // also, return false if no data present to remove
+    if ( SupportData.HasCoreOnly || TagData.empty() ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int originalTagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // tag not found, no removal - return failure
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )
+        return false;
+
+    // FindTag stops just past the 3-byte header (2-char name + type code),
+    // so the tag itself starts 3 bytes earlier
+    const unsigned int tagStart = numBytesParsed - 3;
+    const char storageType = *(pTagData - 1);
+
+    // step over the value; SkipToNextTag fails only for an unrecognized
+    // storage type, in which case TagData is left unchanged and (as before)
+    // true is still returned
+    if ( !SkipToNextTag(storageType, pTagData, numBytesParsed) ) return true;
+
+    // clamp the end of the tag to the end of the data, then cut it out
+    const unsigned int tagEnd = ( numBytesParsed < originalTagDataLength ) ? numBytesParsed
+                                                                           : originalTagDataLength;
+    TagData.erase(tagStart, tagEnd - tagStart);
+    return true;
+}
+
+// Scans the tag data for 'tag', starting at pTagData.
+//
+// tag            : tag name; only its first two characters are compared
+// pTagData       : in/out cursor into the tag data
+// tagDataLength  : total number of bytes of tag data
+// numBytesParsed : in/out count of bytes consumed so far
+//
+// On a match, returns true with pTagData pointing at the first byte of the
+// tag's value (so pTagData[-1] is its storage type code) and numBytesParsed
+// advanced past the tag's 3-byte header. Returns false if no tag matches, if
+// a storage type code is NUL or unrecognized, or if the byte following a
+// skipped tag is NUL.
+bool BamAlignment::FindTag(const string& tag,
+ char* &pTagData,
+ const unsigned int& tagDataLength,
+ unsigned int& numBytesParsed)
+{
+
+ while ( numBytesParsed < tagDataLength ) {
+
+ // each tag starts with a 2-character name followed by a 1-character type code
+ const char* pTagType = pTagData;
+ const char* pTagStorageType = pTagData + 2;
+ // advance past the header before comparing, so a match leaves the cursor on the value
+ pTagData += 3;
+ numBytesParsed += 3;
+
+ // check the current tag, return true on match
+ if ( strncmp(pTagType, tag.c_str(), 2) == 0 )
+ return true;
+
+ // get the storage class and find the next tag
+ if ( *pTagStorageType == '\0' ) return false;
+ if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;
+ if ( *pTagData == '\0' ) return false;
+ }
+
+ // checked all tags, none match
+ return false;
+}
+
+// Advances pTagData and numBytesParsed past one tag value.
+//
+// storageType    : type code of the value the cursor currently points at
+// pTagData       : in/out cursor; moved to the byte after the value
+// numBytesParsed : in/out byte counter; increased by the same amount
+//
+// The step is 1 byte for 'A'/'c'/'C', 2 for 's'/'S', 4 for 'f'/'i'/'I', and
+// the string length plus its terminator for 'Z'/'H'. For any other code an
+// error is printed, nothing is advanced, and false is returned.
+bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {
+
+    // work out how many bytes the value occupies
+    unsigned int valueWidth = 0;
+    switch (storageType) {
+
+        case 'A': case 'c': case 'C':
+            valueWidth = 1;
+            break;
+
+        case 's': case 'S':
+            valueWidth = 2;
+            break;
+
+        case 'f': case 'i': case 'I':
+            valueWidth = 4;
+            break;
+
+        // variable-length: characters up to the terminator, plus the terminator
+        case 'Z': case 'H':
+            valueWidth = strlen(pTagData) + 1;
+            break;
+
+        // error case
+        default:
+            fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType);
+            return false;
+    }
+
+    // move both the cursor and the counter
+    numBytesParsed += valueWidth;
+    pTagData       += valueWidth;
+
+    // return success
+    return true;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAlignment.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAlignment.h
new file mode 100755
index 0000000..f469f5b
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAlignment.h
@@ -0,0 +1,203 @@
+// ***************************************************************************
+// BamAlignment.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 13 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the BamAlignment data structure
+// ***************************************************************************
+
+#ifndef BAMALIGNMENT_H
+#define BAMALIGNMENT_H
+
+#include <api_global.h>
+#include <BamAux.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+// forward declare BamAlignment's friend classes
+namespace Internal {
+ class BamReaderPrivate;
+ class BamWriterPrivate;
+} // namespace Internal
+
+// BamAlignment data structure
+// explicitly labeled as 'struct' to indicate that (most of) its fields are public
+//
+// Holds the fields of one alignment record (name, bases, qualities, CIGAR, mate info, raw tag data)
+// and declares flag query/manipulation methods plus tag accessors that operate on TagData.
+struct API_EXPORT BamAlignment {
+
+ // constructors & destructor
+ public:
+ BamAlignment(void);
+ BamAlignment(const BamAlignment& other);
+ ~BamAlignment(void);
+
+ // Queries against alignment flags
+ public:
+ bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate
+ bool IsFailedQC(void) const; // Returns true if this read failed quality control
+ bool IsFirstMate(void) const; // Returns true if alignment is first mate on read
+ bool IsMapped(void) const; // Returns true if alignment is mapped
+ bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped
+ bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand
+ bool IsPaired(void) const; // Returns true if alignment part of paired-end read
+ bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment
+ bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution
+ bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand
+ bool IsSecondMate(void) const; // Returns true if alignment is second mate on read
+
+ // Manipulate alignment flags
+ public:
+ void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag
+ void SetIsFailedQC(bool ok); // Sets "failed quality control" flag
+ void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag
+ void SetIsMapped(bool ok); // Sets "alignment is mapped" flag
+ void SetIsMateMapped(bool ok); // Sets "alignment's mate is mapped" flag
+ void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag
+ void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag
+ void SetIsPrimaryAlignment(bool ok); // Sets "position is primary alignment" flag
+ void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag
+ void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag
+ void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag
+
+ // legacy methods (deprecated, but available)
+ void SetIsMateUnmapped(bool ok); // Complement of IsMateMapped() flag
+ void SetIsSecondaryAlignment(bool ok); // Complement of IsPrimaryAlignment() flag
+ void SetIsUnmapped(bool ok); // Complement of IsMapped() flag
+
+ // Tag data access methods
+ public:
+ // -------------------------------------------------------------------------------------
+ // N.B. - The following tag access methods may not be used on BamAlignments fetched
+ // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in
+ // an error message (to keep output clean) but will ALWAYS return false. Only user-created
+ // BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here.
+
+ // add tag data (create new TAG entry with TYPE and VALUE)
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
+ // returns true if new data added, false if error or TAG already exists
+ // N.B. - will NOT modify existing tag. Use EditTag() instead
+ // @tag - two character tag name
+ // @type - single character tag type (see SAM/BAM spec for details)
+ // @value - value to associate with tag
+ bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
+ bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
+ bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
+ bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
+
+ // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
+ // returns true if edit was successful, false if error
+ // @tag - two character tag name
+ // @type - single character tag type (see SAM/BAM spec for details)
+ // @value - new value for tag
+ bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
+ bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i
+ bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i
+ bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f
+
+ // specific tag data access methods - these only remain for legacy support
+ // returns whether specific tag could be retrieved
+ bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (equivalent to GetTag("NM", editDistance))
+ bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (equivalent to GetTag("RG", readGroup))
+
+ // generic tag data access methods
+ // returns whether tag is found & tag type is compatible with DESTINATION
+ // @tag - two character tag name
+ // @destination - if found, tag value is stored here
+ bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings
+ bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data
+ bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data
+ bool GetTag(const std::string& tag, float& destination) const; // access floating point data
+
+ // retrieve the tag type code for TAG
+ // returns true if tag could be found and type determined
+ bool GetTagType(const std::string& tag, char& type) const;
+
+ // remove tag data
+ // returns true if removal was successful, false if error
+ // N.B. - returns false if TAG does not exist (no removal can occur)
+ // @tag - two character tag name
+ bool RemoveTag(const std::string& tag);
+
+ // Additional data access methods
+ public:
+ // calculates & returns alignment end position, based on starting position and CIGAR operations
+ // @usePadded - if true, counts inserted bases. Default is false, so that alignment end position matches the last base's position in reference
+ // @zeroBased - if true, returns 0-based coordinate; else returns 1-based. Setting this to false is useful when using BAM data along with other, half-open formats.
+ int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;
+
+ // 'internal' utility methods
+ // both walk a raw tag-data buffer: pTagData is the read cursor and numBytesParsed counts the bytes consumed
+ private:
+ static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);
+ static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);
+
+ // Data members
+ public:
+ std::string Name; // Read name
+ int32_t Length; // Query length
+ std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
+ std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
+ std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
+ std::string TagData; // Tag data (accessor methods will pull the requested information out)
+ int32_t RefID; // ID number for reference sequence
+ int32_t Position; // Position (0-based) where alignment starts
+ uint16_t Bin; // Bin in BAM file where this alignment resides
+ uint16_t MapQuality; // Mapping quality score
+ uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
+ std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
+ int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
+ int32_t MatePosition; // Position (0-based) where alignment's mate starts
+ int32_t InsertSize; // Mate-pair insert size
+
+ // Internal data, inaccessible to client code
+ // but available to BamReaderPrivate & BamWriterPrivate
+ private:
+ struct BamAlignmentSupportData {
+
+ // data members
+ // NOTE(review): AllCharData presumably holds the record's undecoded character data - confirm in BamReaderPrivate
+ std::string AllCharData;
+ uint32_t BlockLength;
+ uint32_t NumCigarOperations;
+ uint32_t QueryNameLength;
+ uint32_t QuerySequenceLength;
+ bool HasCoreOnly;
+
+ // constructor - zeroes all counters, clears the HasCoreOnly marker, leaves AllCharData empty
+ BamAlignmentSupportData(void)
+ : BlockLength(0)
+ , NumCigarOperations(0)
+ , QueryNameLength(0)
+ , QuerySequenceLength(0)
+ , HasCoreOnly(false)
+ { }
+ };
+ BamAlignmentSupportData SupportData;
+ friend class Internal::BamReaderPrivate;
+ friend class Internal::BamWriterPrivate;
+
+ // Alignment flag query constants
+ // Use the get/set methods above instead
+ // every constant is a distinct single bit (a power of two), so values can be combined in one bit-flag
+ private:
+ enum { PAIRED = 1
+ , PROPER_PAIR = 2
+ , UNMAPPED = 4
+ , MATE_UNMAPPED = 8
+ , REVERSE = 16
+ , MATE_REVERSE = 32
+ , READ_1 = 64
+ , READ_2 = 128
+ , SECONDARY = 256
+ , QC_FAILED = 512
+ , DUPLICATE = 1024
+ };
+};
+
+// convenience typedef(s)
+typedef std::vector<BamAlignment> BamAlignmentVector;
+
+} // namespace BamTools
+
+#endif // BAMALIGNMENT_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAux.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAux.h
new file mode 100755
index 0000000..9671303
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamAux.h
@@ -0,0 +1,227 @@
+// ***************************************************************************
+// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic constants, data structures, utilities etc.
+// used throughout the API for handling BAM files
+// ***************************************************************************
+
+#ifndef BAMAUX_H
+#define BAMAUX_H
+
+#include <api_global.h>
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// Platform-specific large-file support
+// ftell64/fseek64 expand to _ftelli64/_fseeki64 when WIN32 is defined, and to ftello/fseeko otherwise.
+// The BAMTOOLS_LFS guard keeps the macros from being defined more than once.
+#ifndef BAMTOOLS_LFS
+#define BAMTOOLS_LFS
+ #ifdef WIN32
+ #define ftell64(a) _ftelli64(a)
+ #define fseek64(a,b,c) _fseeki64(a,b,c)
+ #else
+ #define ftell64(a) ftello(a)
+ #define fseek64(a,b,c) fseeko(a,b,c)
+ #endif
+#endif // BAMTOOLS_LFS
+
+// Platform-specific type definitions
+// Compilers defining _MSC_VER get hand-written fixed-width typedefs; everything else uses <stdint.h>.
+// N.B. - int8_t is 'signed char' (not plain 'char'): the signedness of plain char is
+//        implementation-defined, and 'signed char' is what a standard <stdint.h> uses.
+#ifndef BAMTOOLS_TYPES
+#define BAMTOOLS_TYPES
+    #ifdef _MSC_VER
+        typedef signed char        int8_t;
+        typedef unsigned char      uint8_t;
+        typedef short              int16_t;
+        typedef unsigned short     uint16_t;
+        typedef int                int32_t;
+        typedef unsigned int       uint32_t;
+        typedef long long          int64_t;
+        typedef unsigned long long uint64_t;
+    #else
+        #include <stdint.h>
+    #endif
+#endif // BAMTOOLS_TYPES
+
+namespace BamTools {
+
+// ----------------------------------------------------------------
+// ----------------------------------------------------------------
+// BAM constants
+
+// CIGAR operation codes
+// NOTE(review): presumably these number the operation characters "MIDNSHP" in order - confirm against the BAM spec
+const int BAM_CMATCH = 0;
+const int BAM_CINS = 1;
+const int BAM_CDEL = 2;
+const int BAM_CREF_SKIP = 3;
+const int BAM_CSOFT_CLIP = 4;
+const int BAM_CHARD_CLIP = 5;
+const int BAM_CPAD = 6;
+// BAM_CIGAR_MASK selects the low BAM_CIGAR_SHIFT bits of a value (evaluates to 0xF)
+const int BAM_CIGAR_SHIFT = 4;
+const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);
+// NOTE(review): presumably the byte sizes of the fixed-length alignment core and of an encoded int - confirm in the reader/writer code
+const int BAM_CORE_SIZE = 32;
+const int BT_SIZEOF_INT = 4;
+
+// ----------------------------------------------------------------
+// ----------------------------------------------------------------
+// Data structs & typedefs
+
+// One CIGAR operation: an operation code together with the number of bases it spans
+struct API_EXPORT CigarOp {
+
+    // data members
+    char     Type;   // Operation type (MIDNSHP)
+    uint32_t Length; // Operation length (number of bases)
+
+    // constructor - the defaults produce an 'empty' operation ('\0', 0 bases)
+    CigarOp(const char type = '\0', const uint32_t length = 0) : Type(type), Length(length) { }
+};
+
+// Describes one reference sequence: its name, its length and whether any alignments map to it
+struct API_EXPORT RefData {
+
+    // data members
+    std::string RefName;          // Name of reference sequence
+    int32_t     RefLength;        // Length of reference sequence
+    bool        RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence
+
+    // constructor - RefName always starts out empty
+    RefData(const int32_t& length = 0, bool ok = false) : RefLength(length), RefHasAlignments(ok) { }
+};
+typedef std::vector<RefData> RefVector;
+
+// General (sequential) genome region, delimited by a left and a right (reference ID, position) pair
+// A bound counts as 'specified' only when both its reference ID and its position are non-negative.
+struct API_EXPORT BamRegion {
+
+    // data members
+    int LeftRefID;
+    int LeftPosition;
+    int RightRefID;
+    int RightPosition;
+
+    // constructor - every field defaults to -1, i.e. an unspecified bound
+    BamRegion(const int& leftID = -1, const int& leftPos = -1, const int& rightID = -1, const int& rightPos = -1)
+        : LeftRefID(leftID), LeftPosition(leftPos), RightRefID(rightID), RightPosition(rightPos)
+    { }
+
+    // copy constructor - plain member-wise copy
+    BamRegion(const BamRegion& other)
+        : LeftRefID(other.LeftRefID), LeftPosition(other.LeftPosition)
+        , RightRefID(other.RightRefID), RightPosition(other.RightPosition)
+    { }
+
+    // member functions
+
+    // resets both bounds to 'unspecified'
+    void clear(void) {
+        LeftRefID  = LeftPosition  = -1;
+        RightRefID = RightPosition = -1;
+    }
+
+    // true if neither left field is negative
+    bool isLeftBoundSpecified(void) const { return !( LeftRefID < 0 || LeftPosition < 0 ); }
+
+    // true if neither bound is specified
+    bool isNull(void) const { return !( isLeftBoundSpecified() || isRightBoundSpecified() ); }
+
+    // true if neither right field is negative
+    bool isRightBoundSpecified(void) const { return !( RightRefID < 0 || RightPosition < 0 ); }
+};
+
+// ----------------------------------------------------------------
+// ----------------------------------------------------------------
+// General utilities
+
+// returns true if system is big endian
+// (checks whether the first byte in memory of the 16-bit value 1 is zero)
+inline bool SystemIsBigEndian(void) {
+    const uint16_t probe = 0x0001;
+    const char* firstByte = (const char*) &probe;
+    return ( firstByte[0] == 0 );
+}
+
+// swaps endianness of 16-bit value 'in place'
+// N.B. - the bytes are exchanged on the unsigned bit pattern: right-shifting a negative
+//        signed value would smear sign bits into the swapped result
+inline void SwapEndian_16(int16_t& x) {
+    const uint16_t bits = static_cast<uint16_t>(x);
+    x = static_cast<int16_t>( static_cast<uint16_t>( (bits >> 8) | (bits << 8) ) );
+}
+
+// swaps endianness of unsigned 16-bit value 'in place'
+inline void SwapEndian_16(uint16_t& x) {
+    const unsigned int upperByte = x >> 8;
+    const unsigned int lowerByte = x & 0x00FF;
+    x = static_cast<uint16_t>( (lowerByte << 8) | upperByte );
+}
+
+// swaps endianness of 32-bit value 'in-place'
+// N.B. - the bytes are exchanged on the unsigned bit pattern: an unmasked right shift of a
+//        negative signed value would fill the upper bytes of the result with sign bits
+inline void SwapEndian_32(int32_t& x) {
+    const uint32_t bits = static_cast<uint32_t>(x);
+    const uint32_t swapped = ( (bits >> 24) |
+                               ((bits << 8) & 0x00FF0000) |
+                               ((bits >> 8) & 0x0000FF00) |
+                               (bits << 24)
+                             );
+    x = static_cast<int32_t>(swapped);
+}
+
+// swaps endianness of unsigned 32-bit value 'in-place'
+inline void SwapEndian_32(uint32_t& x) {
+    // pull the four bytes apart (byte0 = least significant) ...
+    const uint32_t byte0 =  x        & 0xFF;
+    const uint32_t byte1 = (x >> 8)  & 0xFF;
+    const uint32_t byte2 = (x >> 16) & 0xFF;
+    const uint32_t byte3 =  x >> 24;
+
+    // ... and reassemble them in the opposite order
+    x = (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
+}
+
+// swaps endianness of 64-bit value 'in-place'
+// N.B. - the bytes are exchanged on the unsigned bit pattern: an unmasked right shift of a
+//        negative signed value would fill the upper bytes of the result with sign bits
+inline void SwapEndian_64(int64_t& x) {
+    const uint64_t bits = static_cast<uint64_t>(x);
+    const uint64_t swapped = ( (bits >> 56) |
+                               ((bits << 40) & 0x00FF000000000000ull) |
+                               ((bits << 24) & 0x0000FF0000000000ull) |
+                               ((bits << 8)  & 0x000000FF00000000ull) |
+                               ((bits >> 8)  & 0x00000000FF000000ull) |
+                               ((bits >> 24) & 0x0000000000FF0000ull) |
+                               ((bits >> 40) & 0x000000000000FF00ull) |
+                               (bits << 56)
+                             );
+    x = static_cast<int64_t>(swapped);
+}
+
+// swaps endianness of unsigned 64-bit value 'in-place'
+inline void SwapEndian_64(uint64_t& x) {
+    uint64_t remaining = x;
+    uint64_t reversed  = 0;
+
+    // peel bytes off the low end of the input and push them onto the low end of the output
+    for ( int i = 0; i < 8; ++i ) {
+        reversed = (reversed << 8) | (remaining & 0xFF);
+        remaining >>= 8;
+    }
+    x = reversed;
+}
+
+// swaps endianness of 'next 2 bytes' in a char buffer (in-place)
+// N.B. - works byte-wise on the buffer, so 'data' needs no particular alignment and is never
+//        accessed through an integer type
+// @data - must point at 2 or more accessible bytes
+inline void SwapEndian_16p(char* data) {
+    const char first = data[0];
+    data[0] = data[1];
+    data[1] = first;
+}
+
+// swaps endianness of 'next 4 bytes' in a char buffer (in-place)
+// N.B. - works byte-wise on the buffer, so 'data' needs no particular alignment and is never
+//        accessed through an integer type
+// @data - must point at 4 or more accessible bytes
+inline void SwapEndian_32p(char* data) {
+    for ( int front = 0, back = 3; front < back; ++front, --back ) {
+        const char tmp = data[front];
+        data[front] = data[back];
+        data[back]  = tmp;
+    }
+}
+
+// swaps endianness of 'next 8 bytes' in a char buffer (in-place)
+// N.B. - works byte-wise on the buffer, so 'data' needs no particular alignment and is never
+//        accessed through an integer type
+// @data - must point at 8 or more accessible bytes
+inline void SwapEndian_64p(char* data) {
+    for ( int front = 0, back = 7; front < back; ++front, --back ) {
+        const char tmp = data[front];
+        data[front] = data[back];
+        data[back]  = tmp;
+    }
+}
+
+// returns whether file exists (can be opened OK)
+// the probe stream is closed again when it goes out of scope
+inline bool FileExists(const std::string& filename) {
+    std::ifstream probe(filename.c_str(), std::ifstream::in);
+    const bool openFailed = probe.fail();
+    return !openFailed;
+}
+
+} // namespace BamTools
+
+#endif // BAMAUX_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamIndex.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamIndex.cpp
new file mode 100755
index 0000000..eab8a69
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamIndex.cpp
@@ -0,0 +1,230 @@
+// ***************************************************************************
+// BamIndex.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index functionality - both for the default (standardized) BAM
+// index format (.bai) as well as a BamTools-specific (nonstandard) index
+// format (.bti).
+// ***************************************************************************
+
+#include <BamIndex.h>
+#include <BamReader.h>
+#include <BGZF.h>
+#include <BamStandardIndex_p.h>
+#include <BamToolsIndex_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <map>
+using namespace std;
+
+// --------------------------------------------------
+// BamIndex factory methods
+
+// returns index based on BAM filename 'stub'
+// checks first for preferred type, returns that type if found
+// (if not found, attempts to load other type(s), returns 0 if NONE found)
+//
+// ** default preferred type is BamToolsIndex ** use this anytime it exists
+//
+// N.B. - the returned index is allocated with 'new'
+BamIndex* BamIndex::FromBamFilename(const std::string& bamFilename,
+                                    BamTools::BgzfData* bgzf,
+                                    BamTools::BamReader* reader,
+                                    const BamIndex::PreferredIndexType& type)
+{
+    // BamTools index preferred: use the .bti file as soon as it is known to exist
+    const bool haveBamtoolsIndex = BamTools::FileExists(bamFilename + ".bti");
+    if ( haveBamtoolsIndex && (type == BamIndex::BAMTOOLS) )
+        return new BamToolsIndex(bgzf, reader);
+
+    // standard index preferred: use the .bai file if it exists
+    const bool haveStandardIndex = BamTools::FileExists(bamFilename + ".bai");
+    if ( haveStandardIndex && (type == BamIndex::STANDARD) )
+        return new BamStandardIndex(bgzf, reader);
+
+    // preferred type is not available - fall back to whichever index exists (.bti first)
+    if ( haveBamtoolsIndex )
+        return new BamToolsIndex(bgzf, reader);
+    if ( haveStandardIndex )
+        return new BamStandardIndex(bgzf, reader);
+
+    // no index file at all
+    return 0;
+}
+
+// returns true if 'filename' ends with 'extension'
+// (a filename shorter than the extension never matches)
+static bool HasIndexExtension(const std::string& filename, const std::string& extension) {
+    if ( filename.length() < extension.length() ) return false;
+    const std::string::size_type suffixBegin = filename.length() - extension.length();
+    return ( filename.compare(suffixBegin, extension.length(), extension) == 0 );
+}
+
+// returns index based on explicitly named index file (or 0 if not found)
+// the index type is chosen from the file extension: ".bti" -> BamToolsIndex, ".bai" -> BamStandardIndex
+// N.B. - only the END of the filename is examined, so names that contain an extension
+//        elsewhere (e.g. "reads.bti.bai") or that are shorter than an extension are classified correctly
+// N.B. - the returned index is allocated with 'new'
+BamIndex* BamIndex::FromIndexFilename(const std::string& indexFilename,
+                                      BamTools::BgzfData* bgzf,
+                                      BamTools::BamReader* reader)
+{
+    // see if specified file exists
+    const bool indexExists = BamTools::FileExists(indexFilename);
+    if ( !indexExists ) return 0;
+
+    const std::string bamtoolsIndexExtension(".bti");
+    const std::string standardIndexExtension(".bai");
+
+    // if has bamtoolsIndexExtension
+    if ( HasIndexExtension(indexFilename, bamtoolsIndexExtension) )
+        return new BamToolsIndex(bgzf, reader);
+
+    // if has standardIndexExtension
+    if ( HasIndexExtension(indexFilename, standardIndexExtension) )
+        return new BamStandardIndex(bgzf, reader);
+
+    // otherwise, unsupported file type
+    return 0;
+}
+
+// -------------------------------
+// BamIndex implementation
+
+// ctor
+// stores the BGZF/reader pointers, selects limited caching and starts with no index file open;
+// reference data is copied from the reader only if a reader was given and it is open
+BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader)
+    : m_BGZF(bgzf)
+    , m_reader(reader)
+    , m_cacheMode(BamIndex::LimitedIndexCaching)
+    , m_indexStream(0)
+{
+    if ( !m_reader ) return;
+    if ( !m_reader->IsOpen() ) return;
+    m_references = m_reader->GetReferenceData();
+}
+
+// dtor
+// closes the index file if one is still open
+BamIndex::~BamIndex(void) {
+    if ( !IsOpen() ) return;
+    fclose(m_indexStream);
+}
+
+// return true if FILE* is open
+// (i.e. the stored stream pointer is non-null)
+bool BamIndex::IsOpen(void) const {
+    const bool streamIsNull = ( m_indexStream == 0 );
+    return !streamIsNull;
+}
+
+// loads existing data from file into memory
+// @filename - index file to open for reading
+// returns true if the header and reference data were read OK, false otherwise
+// N.B. - on failure the index file is closed AND the stream pointer is reset, so IsOpen()
+//        reports false and the destructor does not close the same FILE* a second time
+bool BamIndex::Load(const string& filename) {
+
+    // open index file, abort on error
+    if ( !OpenIndexFile(filename, "rb") ) {
+        fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
+        return false;
+    }
+
+    // check magic number
+    if ( !LoadHeader() ) {
+        fclose(m_indexStream);
+        m_indexStream = 0;
+        return false;
+    }
+
+    // load reference data (but only keep in memory if full caching requested)
+    bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching );
+    if ( !LoadAllReferences(saveInitialLoad) ) {
+        fclose(m_indexStream);
+        m_indexStream = 0;
+        return false;
+    }
+
+    // update index cache based on selected mode
+    UpdateCache();
+
+    // return success
+    return true;
+}
+
+// opens index file for reading/writing, return true if opened OK
+// @mode - passed to fopen() unchanged
+// N.B. - the stored stream pointer is overwritten with the result, whatever it was before
+bool BamIndex::OpenIndexFile(const string& filename, const string& mode) {
+    FILE* stream = fopen(filename.c_str(), mode.c_str());
+    m_indexStream = stream;
+    return ( stream != 0 );
+}
+
+// rewind index file to beginning of index data, return true if rewound OK
+// (seeks to the offset reported by DataBeginOffset())
+bool BamIndex::Rewind(void) {
+    const int seekResult = fseek64(m_indexStream, DataBeginOffset(), SEEK_SET);
+    return ( seekResult == 0 );
+}
+
+// change the index caching behavior
+// does nothing if the requested mode is already active; otherwise the cache is refreshed
+void BamIndex::SetCacheMode(const BamIndexCacheMode mode) {
+    if ( mode == m_cacheMode ) return;
+    m_cacheMode = mode;
+    UpdateCache();
+}
+
+// updates in-memory cache of index data, depending on current cache mode
+// N.B. - the results of the rewind/skip/load calls are not checked
+void BamIndex::UpdateCache(void) {
+
+    // nothing can be (re)loaded without an open index file
+    if ( !IsOpen() ) return;
+
+    if ( m_cacheMode == BamIndex::FullIndexCaching ) {
+        // go back to the beginning of the index data and keep everything
+        Rewind();
+        LoadAllReferences(true);
+    }
+    else if ( m_cacheMode == BamIndex::LimitedIndexCaching ) {
+        if ( !HasFullDataCache() ) {
+            // start from scratch and load the first reference only
+            ClearAllData();
+            SkipToFirstReference();
+            LoadFirstReference(true);
+        }
+        else {
+            // everything is cached already - just trim it down
+            KeepOnlyFirstReferenceOffsets();
+        }
+    }
+    else if ( m_cacheMode == BamIndex::NoIndexCaching ) {
+        ClearAllData();
+    }
+    // any other mode value: nothing to do
+}
+
+// writes in-memory index data out to file
+// @bamFilename - original BAM filename; the index extension (see Extension()) is appended to it
+// returns true if the index was written and could be re-opened for reading, false otherwise
+// N.B. - a write failure is reported through the return value (after closing the file and
+//        resetting the stream pointer) instead of terminating the calling process
+bool BamIndex::Write(const string& bamFilename) {
+
+    // open index file for writing
+    string indexFilename = bamFilename + Extension();
+    if ( !OpenIndexFile(indexFilename, "wb") ) {
+        fprintf(stderr, "ERROR: Could not open file to save index.\n");
+        return false;
+    }
+
+    // write index header data
+    if ( !WriteHeader() ) {
+        fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n");
+        fflush(m_indexStream);
+        fclose(m_indexStream);
+        m_indexStream = 0;
+        return false;
+    }
+
+    // write main index data
+    if ( !WriteAllReferences() ) {
+        fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n");
+        fflush(m_indexStream);
+        fclose(m_indexStream);
+        m_indexStream = 0;
+        return false;
+    }
+
+    // flush any remaining output and close the file
+    fflush(m_indexStream);
+    fclose(m_indexStream);
+    m_indexStream = 0;
+
+    // re-open index file for later reading
+    if ( !OpenIndexFile(indexFilename, "rb") ) {
+        fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n");
+        return false;
+    }
+
+    // return success/failure of write
+    return true;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamIndex.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamIndex.h
new file mode 100755
index 0000000..a1dfbfe
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamIndex.h
@@ -0,0 +1,145 @@
+// ***************************************************************************
+// BamIndex.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides basic BAM index interface
+// ***************************************************************************
+
+#ifndef BAM_INDEX_H
+#define BAM_INDEX_H
+
+#include <api_global.h>
+#include <BamAux.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class BamReader;
+class BgzfData;
+
+namespace Internal {
+ class BamStandardIndex;
+ class BamToolsIndex;
+} // namespace Internal
+
+// --------------------------------------------------
+// BamIndex base class
+//
+// Abstract interface for BAM index implementations. The base class owns the index FILE*
+// and implements the generic Load/Write/caching flow; derived classes supply the
+// format-specific pieces through the pure virtual methods below.
+class API_EXPORT BamIndex {
+
+ // specify index-caching behavior
+ //
+ // @FullIndexCaching - store entire index file contents in memory
+ // @LimitedIndexCaching - store only index data for current reference
+ // being processed
+ // @NoIndexCaching - do not store any index data. Load as needed to
+ // calculate jump offset
+ public: enum BamIndexCacheMode { FullIndexCaching = 0
+ , LimitedIndexCaching
+ , NoIndexCaching
+ };
+
+ // ctor & dtor
+ public:
+ BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
+ virtual ~BamIndex(void);
+
+ // index interface
+ public:
+ // creates index data (in-memory) from current reader data
+ virtual bool Build(void) =0;
+ // returns supported file extension
+ virtual const std::string Extension(void) const =0;
+ // returns whether reference has alignments or not
+ virtual bool HasAlignments(const int& referenceID) const =0;
+ // attempts to use index to jump to region; returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0;
+ // loads existing data from file into memory
+ virtual bool Load(const std::string& filename);
+ // change the index caching behavior
+ virtual void SetCacheMode(const BamIndexCacheMode mode);
+ // writes in-memory index data out to file
+ // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+ virtual bool Write(const std::string& bamFilename);
+
+ // derived-classes MUST provide implementation
+ protected:
+ // clear all current index offset data in memory
+ virtual void ClearAllData(void) =0;
+ // return file position after header metadata
+ virtual const off_t DataBeginOffset(void) const =0;
+ // return true if all index data is cached
+ virtual bool HasFullDataCache(void) const =0;
+ // clears index data from all references except the first
+ virtual void KeepOnlyFirstReferenceOffsets(void) =0;
+ // load index data for all references, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ virtual bool LoadAllReferences(bool saveData = true) =0;
+ // load first reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ virtual bool LoadFirstReference(bool saveData = true) =0;
+ // load header data from index file, return true if loaded OK
+ virtual bool LoadHeader(void) =0;
+ // position file pointer to first reference begin, return true if skipped OK
+ virtual bool SkipToFirstReference(void) =0;
+ // write index reference data
+ virtual bool WriteAllReferences(void) =0;
+ // write index header data
+ virtual bool WriteHeader(void) =0;
+
+ // internal methods
+ protected:
+ // rewind index file to beginning of index data, return true if rewound OK
+ bool Rewind(void);
+
+ private:
+ // return true if FILE* is open
+ bool IsOpen(void) const;
+ // opens index file according to requested mode, return true if opened OK
+ bool OpenIndexFile(const std::string& filename, const std::string& mode);
+ // updates in-memory cache of index data, depending on current cache mode
+ void UpdateCache(void);
+
+ // factory methods for returning proper BamIndex-derived type based on available index files
+ public:
+
+ // returns index based on BAM filename 'stub'
+ // checks first for preferred type, returns that type if found
+ // (if not found, attempts to load other type(s), returns 0 if NONE found)
+ //
+ // ** default preferred type is BamToolsIndex ** use this anytime it exists
+ enum PreferredIndexType { BAMTOOLS = 0, STANDARD };
+ static BamIndex* FromBamFilename(const std::string& bamFilename,
+ BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader,
+ const BamIndex::PreferredIndexType& type = BamIndex::BAMTOOLS);
+
+ // returns index based on explicitly named index file (or 0 if not found)
+ static BamIndex* FromIndexFilename(const std::string& indexFilename,
+ BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader);
+
+ // data members
+ // NOTE(review): FILE and off_t are used in this class, but the header includes no C stdio or
+ // sys/types header directly - presumably they arrive transitively; confirm
+ protected:
+ BamTools::BgzfData* m_BGZF;
+ BamTools::BamReader* m_reader;
+ BamTools::RefVector m_references;
+ BamIndex::BamIndexCacheMode m_cacheMode;
+ FILE* m_indexStream;
+
+
+ // friends
+ friend class Internal::BamStandardIndex;
+ friend class Internal::BamToolsIndex;
+};
+
+} // namespace BamTools
+
+#endif // BAM_INDEX_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamMultiReader.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamMultiReader.cpp
new file mode 100755
index 0000000..15b8fb2
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamMultiReader.cpp
@@ -0,0 +1,450 @@
+// ***************************************************************************
+// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Uses BGZF routines that were adapted from the bgzf.c code developed at the
+// Broad Institute.
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files.
+//
+// This functionality allows applications to work on very large sets of files
+// without requiring intermediate merge, sort, and index steps for each file
+// subset. It also improves the performance of our merge system as it
+// precludes the need to sort merged files.
+// ***************************************************************************
+
+#include <BamMultiReader.h>
+#include <BGZF.h>
+using namespace BamTools;
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <vector>
+using namespace std;
+
+// -----------------------------------------------------
+// BamMultiReader implementation
+// -----------------------------------------------------
+
+// Default constructor: the position markers start at reference 0, left
+// coordinate 0. No readers exist until Open() adds them.
+BamMultiReader::BamMultiReader(void)
+    : CurrentRefID(0),
+      CurrentLeft(0)
+{
+}
+
+// Destructor: hands off to Close(), which releases every reader/alignment
+// pair this object still holds.
+BamMultiReader::~BamMultiReader(void)
+{
+    this->Close();
+}
+
+// Closes every BAM reader and frees the reader/alignment objects held by this
+// multireader. May be called repeatedly; afterwards the object holds no
+// readers and HasOpenReaders() reports false.
+void BamMultiReader::Close(void) {
+
+    // The alignment index only stores copies of the raw pointers freed below.
+    // Drop it first: otherwise HasOpenReaders() would stay true and
+    // GetNextAlignment*() would dereference freed memory after Close().
+    alignments.clear();
+
+    // close all BAM readers and clean up pointers
+    vector<pair<BamReader*, BamAlignment*> >::iterator readerIter = readers.begin();
+    vector<pair<BamReader*, BamAlignment*> >::iterator readerEnd  = readers.end();
+    for ( ; readerIter != readerEnd; ++readerIter ) {
+
+        BamReader* reader = readerIter->first;
+
+        // close the reader before destroying it
+        if ( reader ) reader->Close();
+
+        // deleting a null pointer is a no-op, so no extra checks are needed
+        delete reader;
+        delete readerIter->second;
+    }
+
+    // clear out the container (the pointers it held are now invalid)
+    readers.clear();
+}
+
+// Asks every reader to build and save an index (".bai"/".bti") for its BAM file.
+// @useStandardIndex - forwarded unchanged to BamReader::CreateIndex()
+// Returns true only if every reader reported success (true when there are no readers).
+bool BamMultiReader::CreateIndexes(bool useStandardIndex) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
+
+    bool allCreated = true;
+    for ( ReaderIterator entry = readers.begin(); entry != readers.end(); ++entry ) {
+        // call first, combine afterwards: every reader is asked even after an earlier failure
+        const bool created = entry->first->CreateIndex(useStandardIndex);
+        allCreated = allCreated && created;
+    }
+    return allCreated;
+}
+
+// Applies the given index caching mode to every reader.
+// @mode - forwarded unchanged to BamReader::SetIndexCacheMode()
+void BamMultiReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
+
+    const ReaderIterator lastEntry = readers.end();
+    for ( ReaderIterator entry = readers.begin(); entry != lastEntry; ++entry )
+        entry->first->SetIndexCacheMode(mode);
+}
+
+// Debugging aid: writes one "refID:position filename" line to stderr for each
+// entry currently held in the alignment index, in index order.
+void BamMultiReader::DumpAlignmentIndex(void) {
+    AlignmentIndex::const_iterator entry = alignments.begin();
+    const AlignmentIndex::const_iterator lastEntry = alignments.end();
+    while ( entry != lastEntry ) {
+        const pair<int, int>& key = entry->first;       // (reference id, position)
+        const BamReader* owner = entry->second.first;   // reader that produced the alignment
+        cerr << key.first << ":" << key.second << " " << owner->GetFilename() << endl;
+        ++entry;
+    }
+}
+
+// Builds a single, unified SAM header text for all BAM files in the multireader.
+//   - @HD and @SQ lines are copied from the first reader only
+//   - @RG lines are collected from every reader, each read group ID at most once
+// No other header line types are copied into the result.
+// Returns the merged header text (empty when there are no readers).
+const string BamMultiReader::GetHeaderText(void) const {
+
+    string mergedHeader = "";
+    map<string, bool> readGroups;   // read group IDs already written to mergedHeader
+
+    // foreach extraction entry (each BAM file)
+    for ( vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs ) {
+
+        BamReader* reader = rs->first;
+        const string headerText = reader->GetHeaderText();
+        if ( headerText.empty() ) continue;
+
+        map<string, bool> currentFileReadGroups;   // IDs that were first seen in this file
+
+        // walk the header one line at a time, skipping empty lines
+        stringstream header(headerText);
+        string headerLine;
+        while ( getline(header, headerLine) ) {
+
+            if ( headerLine.empty() ) continue;
+
+            // if first file, save HD & SQ entries
+            if ( rs == readers.begin() ) {
+                if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0 ) {
+                    mergedHeader.append(headerLine.c_str());
+                    mergedHeader.append(1, '\n');
+                }
+            }
+
+            // (for all files) only RG entries are considered from here on
+            if ( headerLine.find("@RG") != 0 ) continue;
+
+            // locate the ID field among the tab-separated TAG:VALUE fields;
+            // readGroup stays empty if the line carries no ID field
+            string readGroup;
+            stringstream headerLineSs(headerLine);
+            string part;
+            while ( std::getline(headerLineSs, part, '\t') ) {
+                stringstream partSs(part);
+                string subtag;
+                std::getline(partSs, subtag, ':');
+                if ( subtag == "ID" ) {
+                    // take everything after "ID:" - the value itself may contain ':',
+                    // and cutting it there would make distinct read groups collide
+                    std::getline(partSs, readGroup);
+                    break;
+                }
+            }
+
+            if ( readGroups.find(readGroup) == readGroups.end() ) { // prevents duplicate @RG entries
+                mergedHeader.append(headerLine.c_str());
+                mergedHeader.append(1, '\n');
+                readGroups[readGroup] = true;
+                currentFileReadGroups[readGroup] = true;
+            } else {
+                // warn iff the duplicate comes from within this same file;
+                // otherwise emit no warning, as we might be merging multiple BAM files with identical @RG tags
+                if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) {
+                    cerr << "WARNING: duplicate @RG tag " << readGroup
+                         << " entry in header of " << reader->GetFilename() << endl;
+                }
+            }
+        }
+    }
+
+    // return merged header text
+    return mergedHeader;
+}
+
+// Retrieves the next alignment among all files (fully parsed).
+// @nextAlignment - receives a copy of the alignment with the lowest
+//                  (RefID, Position) key currently held in the alignment index
+// Returns false when no reader has an alignment left; true otherwise.
+bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
+
+    // bail out if we are at EOF in all files, means no more alignments to process
+    if ( !HasOpenReaders() )
+        return false;
+
+    // when all alignments have stepped into a new target sequence, update our
+    // current reference sequence id
+    UpdateReferenceID();
+
+    // our lowest alignment and reader will be at the front of our alignment index
+    const AlignmentIndex::iterator lowest = alignments.begin();
+    BamReader*    reader    = lowest->second.first;
+    BamAlignment* alignment = lowest->second.second;
+
+    // save the lowest alignment by copy to our argument
+    // (assigned directly; no intermediate temporary copy is needed)
+    nextAlignment = *alignment;
+
+    // remove this entry from our alignment index; the reader and alignment
+    // objects themselves stay alive in 'readers'
+    alignments.erase(lowest);
+
+    // refill the same alignment object from its reader and re-index it;
+    // if the reader has nothing more, it simply drops out of the index
+    if ( reader->GetNextAlignment(*alignment) ) {
+        alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                    make_pair(reader, alignment)));
+    }
+
+    return true;
+}
+
+// Retrieves the next alignment among all files without parsing character data
+// from the alignments (readers are advanced with GetNextAlignmentCore()).
+// @nextAlignment - receives a copy of the alignment with the lowest
+//                  (RefID, Position) key currently held in the alignment index
+// Returns false when no reader has an alignment left; true otherwise.
+bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
+
+    // bail out if we are at EOF in all files, means no more alignments to process
+    if ( !HasOpenReaders() )
+        return false;
+
+    // when all alignments have stepped into a new target sequence, update our
+    // current reference sequence id
+    UpdateReferenceID();
+
+    // our lowest alignment and reader will be at the front of our alignment index
+    const AlignmentIndex::iterator lowest = alignments.begin();
+    BamReader*    reader    = lowest->second.first;
+    BamAlignment* alignment = lowest->second.second;
+
+    // save the lowest alignment by copy to our argument
+    // (assigned directly; no intermediate temporary copy is needed)
+    nextAlignment = *alignment;
+
+    // remove this entry from our alignment index; the reader and alignment
+    // objects themselves stay alive in 'readers'
+    alignments.erase(lowest);
+
+    // refill the same alignment object from its reader and re-index it;
+    // if the reader has nothing more, it simply drops out of the index
+    if ( reader->GetNextAlignmentCore(*alignment) ) {
+        alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                    make_pair(reader, alignment)));
+    }
+
+    return true;
+}
+
+// ---------------------------------------------------------------------------------------
+//
+// NB: The following GetReferenceX() functions assume that we have identical
+// references for all BAM files. We enforce this by invoking the validation
+// function ValidateReaders() (defined below) from Open() to verify that our
+// reference data is the same across all files, so we will not encounter a
+// situation in which there is a mismatch and we are still live.
+//
+// ---------------------------------------------------------------------------------------
+
+// Returns the number of reference sequences, as reported by the first reader.
+// The reader list must not be empty when this is called.
+const int BamMultiReader::GetReferenceCount(void) const {
+    const BamReader* firstReader = readers.front().first;
+    return firstReader->GetReferenceCount();
+}
+
+// Returns a copy of the reference sequence entries held by the first reader.
+// The reader list must not be empty when this is called.
+const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
+    const BamReader* firstReader = readers.front().first;
+    return firstReader->GetReferenceData();
+}
+
+// Looks up the reference ID for a reference name by asking the first reader.
+// @refName - reference sequence name, forwarded to BamReader::GetReferenceID()
+// The reader list must not be empty when this is called.
+const int BamMultiReader::GetReferenceID(const string& refName) const {
+    const BamReader* firstReader = readers.front().first;
+    return firstReader->GetReferenceID(refName);
+}
+
+// ---------------------------------------------------------------------------------------
+
+// Returns true while the alignment index still holds at least one entry,
+// i.e. while some reader has an alignment waiting to be handed out.
+bool BamMultiReader::HasOpenReaders() {
+    return !alignments.empty();
+}
+
+// Returns whether ALL underlying BAM readers report a loaded index; useful to
+// tell whether Jump() or SetRegion() are possible. Null reader entries are
+// skipped, and the result is true when there are no readers at all.
+bool BamMultiReader::IsIndexLoaded(void) const {
+    typedef vector<pair<BamReader*, BamAlignment*> >::const_iterator ConstReaderIterator;
+
+    bool allLoaded = true;
+    for ( ConstReaderIterator entry = readers.begin(); entry != readers.end(); ++entry ) {
+        const BamReader* reader = entry->first;
+        if ( reader == 0 ) continue;
+
+        // every reader is queried, even once the answer is already known to be false
+        if ( !reader->IsIndexLoaded() )
+            allLoaded = false;
+    }
+    return allLoaded;
+}
+
+// Jumps every reader to (refID, position) and rebuilds the alignment index.
+// @refID    - reference ID to jump to; also stored in CurrentRefID
+// @position - left bound to jump to; also stored in CurrentLeft
+// If any reader fails to jump, an error is printed and the program exits with
+// status 1, so a normal return from this function is always 'true'.
+bool BamMultiReader::Jump(int refID, int position) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
+
+    CurrentRefID = refID;
+    CurrentLeft  = position;
+
+    for ( ReaderIterator entry = readers.begin(); entry != readers.end(); ++entry ) {
+        BamReader* reader = entry->first;
+        if ( reader->Jump(refID, position) )
+            continue;
+
+        // a single failed reader is fatal
+        cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl;
+        exit(1);
+    }
+
+    // all readers repositioned: refill the alignment index from the new positions
+    UpdateAlignments();
+    return true;
+}
+
+// Opens the given BAM files and primes the alignment index with the first
+// alignment of each file.
+// @filenames           - BAM files to open; the list is also saved in fileNames
+// @openIndexes         - forwarded to BamReader::Open() as its 'lookForIndex' flag
+// @coreMode            - read each first alignment with GetNextAlignmentCore()
+//                        instead of GetNextAlignment()
+// @preferStandardIndex - forwarded to BamReader::Open()
+// Returns false if any file cannot be opened, if the only file given yields no
+// alignment, or if no readable file is left at the end; true otherwise. Files
+// whose first alignment cannot be read are otherwise skipped with a warning.
+// On success the readers are checked with ValidateReaders(), which exits the
+// program if their reference data differ.
+bool BamMultiReader::Open(const vector<string>& filenames, bool openIndexes, bool coreMode, bool preferStandardIndex) {
+
+    // for filename in filenames
+    fileNames = filenames; // save filenames in our multireader
+    for ( vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it ) {
+
+        const string& filename = *it;
+        BamReader* reader = new BamReader;
+
+        // TODO: any further error handling when the file cannot be opened ??
+        if ( !reader->Open(filename, "", openIndexes, preferStandardIndex) ) {
+            delete reader;   // not stored in 'readers', so it must be freed here
+            return false;
+        }
+
+        // file opened ok, check that it can be read
+        BamAlignment* alignment = new BamAlignment;
+        const bool fileOK = ( coreMode ? reader->GetNextAlignmentCore(*alignment)
+                                       : reader->GetNextAlignment(*alignment) );
+
+        if ( fileOK ) {
+            readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
+            alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                        make_pair(reader, alignment)));
+        } else {
+            cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
+
+            // the ignored file is not tracked in 'readers', so Close() would never free these
+            delete alignment;
+            delete reader;
+
+            // if only file available & could not be read, return failure
+            if ( filenames.size() == 1 ) return false;
+        }
+    }
+
+    // nothing usable (empty file list, or every file was ignored):
+    // ValidateReaders() and the GetReferenceX() functions need at least one reader
+    if ( readers.empty() )
+        return false;
+
+    // files opened ok, at least one alignment could be read,
+    // now need to check that all files use same reference data
+    ValidateReaders();
+    return true;
+}
+
+// Prints the filename of every reader to stdout, one per line.
+void BamMultiReader::PrintFilenames(void) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
+
+    const ReaderIterator lastEntry = readers.end();
+    for ( ReaderIterator entry = readers.begin(); entry != lastEntry; ++entry )
+        cout << entry->first->GetFilename() << endl;
+}
+
+// Rewinds every reader to the beginning of its alignment data.
+// Returns true only if every reader reported success (true when there are no readers).
+bool BamMultiReader::Rewind(void) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
+
+    bool allRewound = true;
+    for ( ReaderIterator entry = readers.begin(); entry != readers.end(); ++entry ) {
+        // call first, combine afterwards: every reader is rewound even after an earlier failure
+        const bool rewound = entry->first->Rewind();
+        allRewound = allRewound && rewound;
+    }
+    return allRewound;
+}
+
+// Convenience overload: packs the four bounds into a BamRegion and forwards to
+// SetRegion(const BamRegion&).
+bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) {
+    return SetRegion( BamRegion(leftRefID, leftPosition, rightRefID, rightPosition) );
+}
+
+// Sets the region of interest on every reader and rebuilds the alignment index.
+// @region - stored in Region and forwarded to each BamReader::SetRegion()
+// Always returns true: a reader that fails SetRegion() is only reported on
+// stderr, because in practice such a failure means "no alignments here".
+bool BamMultiReader::SetRegion(const BamRegion& region) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
+
+    Region = region;
+
+    for ( ReaderIterator entry = readers.begin(); entry != readers.end(); ++entry ) {
+        BamReader* reader = entry->first;
+        if ( reader->SetRegion(region) )
+            continue;
+
+        // accept the failure and carry on with the remaining readers
+        cerr << "ERROR: could not jump " << reader->GetFilename() << " to "
+             << region.LeftRefID << ":" << region.LeftPosition
+             << ".." << region.RightRefID << ":" << region.RightPosition << endl;
+    }
+
+    UpdateAlignments();
+    return true;
+}
+
+// Rebuilds the alignment index from scratch: empties it, then reads one
+// alignment from each reader (via GetNextAlignment) into that reader's
+// alignment object and indexes it by (RefID, Position).
+void BamMultiReader::UpdateAlignments(void) {
+    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
+
+    alignments.clear();
+
+    for ( ReaderIterator entry = readers.begin(); entry != readers.end(); ++entry ) {
+        BamReader*    reader = entry->first;
+        BamAlignment* cached = entry->second;
+
+        // a reader that yields nothing is assumed to be at end of region / EOF
+        if ( !reader->GetNextAlignment(*cached) )
+            continue;
+
+        alignments.insert(make_pair(make_pair(cached->RefID, cached->Position),
+                                    make_pair(reader, cached)));
+    }
+}
+
+// Updates the reference id stored in the BamMultiReader (CurrentRefID) to
+// reflect the current state of the readers. Does nothing when the alignment
+// index is empty.
+void BamMultiReader::UpdateReferenceID(void) {
+
+    // nothing to look at once every reader has run out of alignments
+    if ( alignments.empty() )
+        return;
+
+    // The index is ordered by (RefID, Position), so the front entry carries the
+    // lowest reference id. Take it directly: stepping a counter up to it one
+    // reference at a time would never terminate if that id were below
+    // CurrentRefID.
+    CurrentRefID = alignments.begin()->second.second->RefID;
+}
+
+// ValidateReaders checks that all the readers point to BAM files representing
+// alignments against the same set of reference sequences, and that the
+// sequences are identically ordered. If these checks fail the operation of
+// the multireader is undefined, so we force program exit (status 1) after
+// printing the mismatch to stderr. With no readers there is nothing to
+// compare and the function returns immediately.
+void BamMultiReader::ValidateReaders(void) const {
+
+    // readers.front() below requires at least one reader
+    if ( readers.empty() )
+        return;
+
+    // the first reader is the yardstick; its reference data is borrowed, not copied
+    const BamReader* firstReader = readers.front().first;
+    const int firstRefCount = firstReader->GetReferenceCount();
+    const BamTools::RefVector& firstRefData = firstReader->GetReferenceData();
+
+    for ( vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it ) {
+
+        const BamReader* reader = it->first;
+        const BamTools::RefVector& currentRefData = reader->GetReferenceData();
+
+        // same number of references?
+        if ( reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size() ) {
+            cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
+                 << " expected " << firstRefCount
+                 << " reference sequences but only found " << reader->GetReferenceCount() << endl;
+            exit(1);
+        }
+
+        // walking both vectors in lock-step is safe: we just checked above that they
+        // are identically sized. Here we check that they are, in fact, equal in content
+        BamTools::RefVector::const_iterator f = firstRefData.begin();
+        BamTools::RefVector::const_iterator c = currentRefData.begin();
+        for ( ; f != firstRefData.end(); ++f, ++c ) {
+            if ( f->RefName == c->RefName && f->RefLength == c->RefLength )
+                continue;
+
+            cerr << "ERROR: mismatched references found in " << reader->GetFilename()
+                 << " expected: " << endl;
+            for ( BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a )
+                cerr << a->RefName << " " << a->RefLength << endl;
+            cerr << "but found: " << endl;
+            for ( BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a )
+                cerr << a->RefName << " " << a->RefLength << endl;
+            exit(1);
+        }
+    }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamMultiReader.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamMultiReader.h
new file mode 100755
index 0000000..e3c05cc
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamMultiReader.h
@@ -0,0 +1,136 @@
+// ***************************************************************************
+// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files
+// ***************************************************************************
+
+#ifndef BAMMULTIREADER_H
+#define BAMMULTIREADER_H
+
+#include <api_global.h>
+#include <BamReader.h>
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+
+namespace BamTools {
+
+// index mapping reference/position pairings to bamreaders and their alignments
+// (a multimap, so several readers may sit at the same reference/position)
+typedef std::multimap<std::pair<int, int>, std::pair<BamReader*, BamAlignment*> > AlignmentIndex;
+
+// Reads several BAM files as if they were one stream: each GetNextAlignment*()
+// call hands back the alignment with the lowest (RefID, Position) key among
+// the readers that still have data.
+//
+// The BamReader/BamAlignment objects are allocated in Open() and released in
+// Close() (which the destructor calls).
+//
+// NOTE(review): no copy constructor / assignment operator is declared although
+// the class holds owning raw pointers; copying an instance looks unsafe (both
+// copies would delete the same objects) - confirm, and consider disabling copies.
+class API_EXPORT BamMultiReader {
+
+ // constructor / destructor
+ public:
+ BamMultiReader(void);
+ ~BamMultiReader(void);
+
+ // public interface
+ public:
+
+ // positioning
+ // (both are overwritten by Jump(); CurrentRefID is also kept up to date by UpdateReferenceID())
+ int CurrentRefID;
+ int CurrentLeft;
+
+ // region under analysis, specified using SetRegion
+ BamRegion Region;
+
+ // ----------------------
+ // BAM file operations
+ // ----------------------
+
+ // close BAM files
+ void Close(void);
+
+ // opens BAM files (and optional BAM index files, if provided)
+ // @openIndexes - triggers index opening, useful for suppressing
+ // error messages during merging of files in which we may not have
+ // indexes.
+ // @coreMode - setup our first alignments using GetNextAlignmentCore();
+ // also useful for merging
+ // @preferStandardIndex - look for standard BAM index ".bai" first. If false,
+ // will look for BamTools index ".bti".
+ bool Open(const std::vector<std::string>& filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false);
+
+ // returns whether underlying BAM readers ALL have an index loaded
+ // this is useful to indicate whether Jump() or SetRegion() are possible
+ bool IsIndexLoaded(void) const;
+
+ // performs random-access jump to reference, position
+ // (the implementation exits the program if any reader fails to jump)
+ bool Jump(int refID, int position = 0);
+
+ // sets the target region
+ // (the implementation always returns true; per-reader failures are only reported on stderr)
+ bool SetRegion(const BamRegion& region);
+ bool SetRegion(const int&, const int&, const int&, const int&); // convenience function to above
+
+ // returns file pointers to beginning of alignments
+ bool Rewind(void);
+
+ // ----------------------
+ // access alignment data
+ // ----------------------
+ // updates the reference id marker to match the lower limit of our readers
+ void UpdateReferenceID(void);
+
+ // retrieves next available alignment (returns success/fail) from all files
+ bool GetNextAlignment(BamAlignment&);
+ // retrieves next available alignment (returns success/fail) from all files
+ // and populates the support data with information about the alignment
+ // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT
+ bool GetNextAlignmentCore(BamAlignment&);
+ // returns true while the alignment index still holds at least one entry
+ // ... should this be private?
+ bool HasOpenReaders(void);
+
+ // ----------------------
+ // access auxiliary data
+ // ----------------------
+
+ // returns unified SAM header text for all files
+ const std::string GetHeaderText(void) const;
+ // returns number of reference sequences
+ const int GetReferenceCount(void) const;
+ // returns vector of reference objects
+ const BamTools::RefVector GetReferenceData(void) const;
+ // returns reference id (used for BamMultiReader::Jump()) for the given reference name
+ const int GetReferenceID(const std::string& refName) const;
+ // validates that we have a congruent set of BAM files that are aligned against the same reference sequences
+ // (the implementation exits the program when a mismatch is found)
+ void ValidateReaders() const;
+
+ // ----------------------
+ // BAM index operations
+ // ----------------------
+
+ // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai")
+ bool CreateIndexes(bool useStandardIndex = true);
+
+ // sets the index caching mode for the readers
+ void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode);
+
+ //const int GetReferenceID(const string& refName) const;
+
+ // utility
+ void PrintFilenames(void);
+ void DumpAlignmentIndex(void);
+ void UpdateAlignments(void); // updates our alignment cache
+
+ // private implementation
+ private:
+
+ // the set of readers and alignments which we operate on, maintained throughout the life of this class
+ std::vector<std::pair<BamReader*, BamAlignment*> > readers;
+
+ // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment
+ // when a reader reaches EOF, its entry is removed from this index
+ AlignmentIndex alignments;
+
+ // filenames passed to the most recent Open() call
+ std::vector<std::string> fileNames;
+};
+
+} // namespace BamTools
+
+#endif // BAMMULTIREADER_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader.cpp
new file mode 100755
index 0000000..70339a6
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader.cpp
@@ -0,0 +1,66 @@
+// ***************************************************************************
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#include <BamReader.h>
+#include <BamReader_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <vector>
+using namespace std;
+
+// Constructor: allocates the private implementation object and hands it a
+// pointer back to this reader.
+BamReader::BamReader(void)
+    : d(new BamReaderPrivate(this))
+{
+}
+
+// Destructor: frees the private implementation object and nulls the pointer.
+BamReader::~BamReader(void)
+{
+    delete d;
+    d = 0;
+}
+
+// ----------------------
+// file operations
+// (each member simply forwards to the private implementation 'd')
+// ----------------------
+
+// closes the BAM file
+void BamReader::Close(void)
+{
+    d->Close();
+}
+
+// returns the implementation's HasIndex flag
+bool BamReader::HasIndex(void) const
+{
+    return d->HasIndex;
+}
+
+// deprecated alias: same answer as HasIndex()
+bool BamReader::IsIndexLoaded(void) const
+{
+    return HasIndex();
+}
+
+// returns the IsOpen flag of the implementation's BGZF member
+bool BamReader::IsOpen(void) const
+{
+    return d->mBGZF.IsOpen;
+}
+
+// jump is expressed as a region built from (refID, position) only
+bool BamReader::Jump(int refID, int position)
+{
+    return d->SetRegion( BamRegion(refID, position) );
+}
+
+// opens the BAM file; all four arguments are passed through unchanged
+bool BamReader::Open(const std::string& filename,
+                     const std::string& indexFilename,
+                     const bool lookForIndex,
+                     const bool preferStandardIndex)
+{
+    return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex);
+}
+
+// rewinds via the implementation
+bool BamReader::Rewind(void)
+{
+    return d->Rewind();
+}
+
+// sets the region of interest
+bool BamReader::SetRegion(const BamRegion& region)
+{
+    return d->SetRegion(region);
+}
+
+// convenience overload: packs the four bounds into a BamRegion first
+bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound)
+{
+    return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) );
+}
+
+// ----------------------
+// access alignment data
+// (both members forward to the private implementation 'd')
+// ----------------------
+
+// retrieves the next alignment into bAlignment, returns success/fail
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment)
+{
+    return d->GetNextAlignment(bAlignment);
+}
+
+// retrieves the next alignment's core data into bAlignment, returns success/fail
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment)
+{
+    return d->GetNextAlignmentCore(bAlignment);
+}
+
+// ----------------------
+// access auxiliary data
+// (all answers come from the private implementation 'd')
+// ----------------------
+
+// returns the header text by value
+const string BamReader::GetHeaderText(void) const
+{
+    return d->GetHeaderText();
+}
+
+// returns the number of entries in the implementation's reference vector
+int BamReader::GetReferenceCount(void) const
+{
+    return d->References.size();
+}
+
+// returns a reference to the implementation's reference vector (no copy)
+const RefVector& BamReader::GetReferenceData(void) const
+{
+    return d->References;
+}
+
+// returns the reference id for the given reference name
+int BamReader::GetReferenceID(const string& refName) const
+{
+    return d->GetReferenceID(refName);
+}
+
+// returns the filename stored by the implementation, by value
+const std::string BamReader::GetFilename(void) const
+{
+    return d->Filename;
+}
+
+// ----------------------
+// index operations
+// (both members forward to the private implementation 'd')
+// ----------------------
+
+// creates an index for the BAM file, returns success/fail
+bool BamReader::CreateIndex(bool useStandardIndex)
+{
+    return d->CreateIndex(useStandardIndex);
+}
+
+// changes the index caching mode
+void BamReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode)
+{
+    d->SetIndexCacheMode(mode);
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader.h
new file mode 100755
index 0000000..b5d9a26
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader.h
@@ -0,0 +1,130 @@
+// ***************************************************************************
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#ifndef BAMREADER_H
+#define BAMREADER_H
+
+#include <api_global.h>
+#include <BamAlignment.h>
+#include <BamIndex.h>
+#include <string>
+
+namespace BamTools {
+
+namespace Internal {
+ class BamReaderPrivate;
+} // namespace Internal
+
+// Public reader interface for a single BAM file. The class itself holds only a
+// pointer ('d') to Internal::BamReaderPrivate; the member functions defined in
+// BamReader.cpp forward to that object.
+//
+// NOTE(review): 'd' is allocated in the constructor and deleted in the
+// destructor, but no copy constructor / assignment operator is declared;
+// copying a BamReader looks unsafe (both copies would delete the same object)
+// - confirm, and consider disabling copies.
+class API_EXPORT BamReader {
+
+ // constructor / destructor
+ public:
+ BamReader(void);
+ ~BamReader(void);
+
+ // public interface
+ public:
+
+ // ----------------------
+ // BAM file operations
+ // ----------------------
+
+ // close BAM file
+ void Close(void);
+ // returns whether reader is open for reading or not
+ bool IsOpen(void) const;
+ // performs random-access jump using (reference, position) as a left-bound
+ // (implemented as SetRegion() on a BamRegion built from refID and position only)
+ bool Jump(int refID, int position = 0);
+ // opens BAM file (and optional BAM index file, if provided)
+ // @lookForIndex - if no indexFilename provided, look in BAM file's directory for an existing index file
+ // default behavior is to skip index file search if no index filename given
+ // @preferStandardIndex - if true, give priority in index file searching to standard BAM index (*.bai)
+ // default behavior is to prefer the BamToolsIndex (*.bti) if both are available
+ bool Open(const std::string& filename,
+ const std::string& indexFilename = "",
+ const bool lookForIndex = false,
+ const bool preferStandardIndex = false);
+ // returns file pointer to beginning of alignments
+ bool Rewind(void);
+ // sets a region of interest (with left & right bound reference/position)
+ // returns success/failure of seeking to left bound of region
+ bool SetRegion(const BamRegion& region);
+ bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound);
+
+ // ----------------------
+ // access alignment data
+ // ----------------------
+
+ // retrieves next available alignment (returns success/fail)
+ bool GetNextAlignment(BamAlignment& bAlignment);
+ // retrieves next available alignment core data (returns success/fail)
+ // ** DOES NOT parse any character data (read name, bases, qualities, tag data) **
+ // useful for operations requiring ONLY aligner-related information
+ // (refId/position, alignment flags, CIGAR, mapQuality, etc)
+ bool GetNextAlignmentCore(BamAlignment& bAlignment);
+
+ // ----------------------
+ // access auxiliary data
+ // ----------------------
+
+ // returns SAM header text
+ const std::string GetHeaderText(void) const;
+ // returns number of reference sequences
+ int GetReferenceCount(void) const;
+ // returns vector of reference objects
+ // (a reference to data held by the implementation object, not a copy)
+ const BamTools::RefVector& GetReferenceData(void) const;
+ // returns reference id (used for BamReader::Jump()) for the given reference name
+ int GetReferenceID(const std::string& refName) const;
+ // returns the name of the file associated with this BamReader
+ const std::string GetFilename(void) const;
+
+ // ----------------------
+ // BAM index operations
+ // ----------------------
+
+ // creates index for BAM file, saves to file
+ // default behavior is to create the BAM standard index (".bai")
+ // set flag to false to create the BamTools-specific index (".bti")
+ bool CreateIndex(bool useStandardIndex = true);
+ // returns whether index data is available for reading
+ // (e.g. if true, BamReader should be able to seek to a region)
+ bool HasIndex(void) const;
+ // change the index caching behavior
+ // default BamReader/Index mode is LimitedIndexCaching
+ // @mode - can be either FullIndexCaching, LimitedIndexCaching,
+ // or NoIndexCaching. See BamIndex.h for more details
+ void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode);
+
+ // deprecated methods
+ public:
+
+ // deprecated (but still available): prefer HasIndex() instead
+ //
+ // Deprecated purely for API semantic clarity - HasIndex() should be clearer
+ // than IsIndexLoaded() in light of the new caching modes that may clear the
+ // index data from memory, but leave the index file open for later random access
+ // seeks.
+ //
+ // For example, what would (IsIndexLoaded() == true) mean when cacheMode has been
+ // explicitly set to NoIndexCaching? This is confusing at best, misleading about
+ // current memory behavior at worst.
+ //
+ // returns whether index data is available
+ // (e.g. if true, BamReader should be able to seek to a region)
+ // (the implementation simply returns HasIndex())
+ bool IsIndexLoaded(void) const;
+
+ // private implementation
+ private:
+ // allocated in the constructor, deleted in the destructor
+ Internal::BamReaderPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMREADER_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader_p.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader_p.cpp
new file mode 100755
index 0000000..f319a1e
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader_p.cpp
@@ -0,0 +1,729 @@
+// ***************************************************************************
+// BamReader_p.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#include <BamReader.h>
+#include <BGZF.h>
+#include <BamReader_p.h>
+#include <BamStandardIndex_p.h>
+#include <BamToolsIndex_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <vector>
+using namespace std;
+
+// Builds the private reader implementation for the given public BamReader.
+// The object starts with an empty header text, no index object, the
+// LimitedIndexCaching cache mode, and the lookup tables used to decode
+// 4-bit base codes and CIGAR operation codes into characters.
+// NOTE(review): CIGAR_LOOKUP lists seven operation characters (M,I,D,N,S,H,P).
+BamReaderPrivate::BamReaderPrivate(BamReader* parent)
+    : HeaderText("")
+    , Index(0)
+    , HasIndex(false)
+    , AlignmentsBeginOffset(0)
+    , IndexCacheMode(BamIndex::LimitedIndexCaching)
+    , IsBigEndian(SystemIsBigEndian())   // host byte order, probed once
+    , HasAlignmentsInRegion(true)
+    , Parent(parent)
+    , DNA_LOOKUP("=ACMGRSVTWYHKDBN")
+    , CIGAR_LOOKUP("MIDNSHP")
+{
+}
+
+// Tears the reader down by delegating to Close(), which closes the BGZF
+// stream and clears the index, header text and region.
+BamReaderPrivate::~BamReaderPrivate(void)
+{
+    this->Close();
+}
+
+// Moves the left edge of 'region' forward to the first reference that the
+// index reports as having alignments, and records in HasAlignmentsInRegion
+// whether any reference inside the region has data at all.
+// Does nothing when no index object exists.
+void BamReaderPrivate::AdjustRegion(BamRegion& region) {
+
+    // nothing can be adjusted without an index
+    if ( Index == 0 )
+        return;
+
+    // last reference to inspect: the right bound if given, else the final reference
+    int lastRefId;
+    if ( region.isRightBoundSpecified() )
+        lastRefId = region.RightRefID;
+    else
+        lastRefId = static_cast<int>( References.size() - 1 );
+
+    // scan forward until a reference with alignments is found
+    HasAlignmentsInRegion = false;
+    int firstRefWithData = region.LeftRefID;
+    for ( ; firstRefWithData <= lastRefId; ++firstRefWithData ) {
+        if ( Index->HasAlignments(firstRefWithData) ) {
+            HasAlignmentsInRegion = true;
+            break;
+        }
+    }
+
+    // if data starts on a later reference than requested, begin at that
+    // reference's first position; otherwise the region is left untouched
+    if ( HasAlignmentsInRegion && firstRefWithData != region.LeftRefID ) {
+        region.LeftRefID    = firstRefWithData;
+        region.LeftPosition = 0;
+    }
+}
+
+// Decodes the raw character block of an alignment (SupportData.AllCharData)
+// into Name, QueryBases, Qualities, AlignedBases and TagData, then clears the
+// core-only flag.
+//
+// @param bAlignment  alignment whose SupportData has already been populated
+//                    (see LoadNextAlignment())
+// @return true on success; false when the sizes recorded in SupportData do not
+//         fit inside the raw buffer, or the CIGAR consumes more query bases
+//         than exist (malformed / truncated record)
+//
+// Terminates the process via exit(1) on an unrecognised CIGAR op character or,
+// on big-endian hosts, an unrecognised tag value type.
+bool BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {
+
+    // calculate character lengths/offsets
+    const unsigned int queryLength    = bAlignment.SupportData.QuerySequenceLength;
+    const unsigned int dataLength     = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
+    const unsigned int seqDataOffset  = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);
+    const unsigned int qualDataOffset = seqDataOffset + (queryLength+1)/2;
+    const unsigned int tagDataOffset  = qualDataOffset + queryLength;
+
+    // sanity checks before touching the buffer:
+    //  - the raw buffer must really hold 'dataLength' bytes
+    //  - name + CIGAR + packed bases + qualities must fit inside it
+    // (64-bit arithmetic so that huge field values cannot wrap around)
+    const uint64_t fixedFieldsLength = (uint64_t)bAlignment.SupportData.QueryNameLength
+                                     + (uint64_t)bAlignment.SupportData.NumCigarOperations * 4
+                                     + ((uint64_t)queryLength + 1) / 2
+                                     + (uint64_t)queryLength;
+    if ( bAlignment.SupportData.AllCharData.size() < dataLength ) return false;
+    if ( fixedFieldsLength > dataLength ) return false;
+
+    const unsigned int tagDataLength = dataLength - tagDataOffset;
+
+    // check offsets to see what char data exists
+    const bool hasSeqData  = ( seqDataOffset  < dataLength );
+    const bool hasQualData = ( qualDataOffset < dataLength );
+    const bool hasTagData  = ( tagDataOffset  < dataLength );
+
+    // set up char buffers (c_str() guarantees a terminating null for the name scan)
+    const char* allCharData = bAlignment.SupportData.AllCharData.c_str();
+    const char* seqData     = allCharData + seqDataOffset;
+    const char* qualData    = allCharData + qualDataOffset;
+
+    // store alignment name (relies on null char in name as terminator)
+    bAlignment.Name.assign(allCharData);
+
+    // save query sequence: two bases per byte, high nibble first
+    bAlignment.QueryBases.clear();
+    if ( hasSeqData ) {
+        bAlignment.QueryBases.reserve(queryLength);
+        for (unsigned int i = 0; i < queryLength; ++i) {
+            const unsigned char packed = (unsigned char)seqData[i/2];
+            const char singleBase = DNA_LOOKUP[ ( (packed >> (4*(1-(i%2)))) & 0xf ) ];
+            bAlignment.QueryBases.append(1, singleBase);
+        }
+    }
+
+    // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
+    bAlignment.Qualities.clear();
+    if ( hasQualData ) {
+        bAlignment.Qualities.reserve(queryLength);
+        for (unsigned int i = 0; i < queryLength; ++i) {
+            const char singleQuality = (char)(qualData[i]+33);
+            bAlignment.Qualities.append(1, singleQuality);
+        }
+    }
+
+    // if QueryBases is empty (an allowed case), AlignedBases is empty too
+    if ( bAlignment.QueryBases.empty() )
+        bAlignment.AlignedBases = bAlignment.QueryBases;
+
+    // if QueryBases contains data, then build AlignedBases using CIGAR data
+    else {
+
+        bAlignment.AlignedBases.clear();
+        bAlignment.AlignedBases.reserve(queryLength);
+
+        // k = number of query bases consumed so far
+        size_t k = 0;
+        vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();
+        vector<CigarOp>::const_iterator cigarEnd  = bAlignment.CigarData.end();
+        for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+
+            const CigarOp& op = (*cigarIter);
+            switch(op.Type) {
+
+                case ('M') :
+                case ('I') :
+                    // CIGAR claims more query bases than the record holds:
+                    // report failure instead of letting substr() throw
+                    if ( k > bAlignment.QueryBases.size() ) return false;
+                    bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases
+                    // fall through
+
+                case ('S') :
+                    k += op.Length; // for 'S' - soft clip, skip over query bases
+                    break;
+
+                case ('D') :
+                    bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character
+                    break;
+
+                case ('P') :
+                    bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character
+                    break;
+
+                case ('N') :
+                    bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence
+                    break;
+
+                case ('H') :
+                    break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op
+
+                default:
+                    fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here
+                    exit(1);
+            }
+        }
+    }
+
+    // save tag data
+    bAlignment.TagData.clear();
+    if ( hasTagData ) {
+
+        // copy the raw tag bytes first; any byte swapping below is done on this
+        // private copy, so the raw buffer is never modified and calling this
+        // method twice cannot swap the same bytes back
+        bAlignment.TagData.assign(allCharData + tagDataOffset, tagDataLength);
+
+        if ( IsBigEndian ) {
+            char* tagData = &bAlignment.TagData[0];
+            unsigned int i = 0;
+            while ( i < tagDataLength ) {
+
+                // every entry needs a 2-char tag name plus a 1-char value type
+                if ( tagDataLength - i < 3 ) return false;
+
+                i += 2; // skip tag type (e.g. "RG", "NM", etc)
+                const uint8_t type = toupper((unsigned char)tagData[i]); // lower & upper case letters have same width
+                ++i;    // skip value type
+
+                switch (type) {
+
+                    case('A') :
+                    case('C') :
+                        ++i;
+                        break;
+
+                    case('S') :
+                        if ( tagDataLength - i < sizeof(uint16_t) ) return false;
+                        SwapEndian_16p(&tagData[i]);
+                        i += sizeof(uint16_t);
+                        break;
+
+                    case('F') :
+                    case('I') :
+                        if ( tagDataLength - i < sizeof(uint32_t) ) return false;
+                        SwapEndian_32p(&tagData[i]);
+                        i += sizeof(uint32_t);
+                        break;
+
+                    case('D') :
+                        if ( tagDataLength - i < sizeof(uint64_t) ) return false;
+                        SwapEndian_64p(&tagData[i]);
+                        i += sizeof(uint64_t);
+                        break;
+
+                    case('H') :
+                    case('Z') :
+                        // stop at the terminator or at the end of the tag block
+                        while ( i < tagDataLength && tagData[i] ) { ++i; }
+                        ++i; // increment one more for null terminator
+                        break;
+
+                    default :
+                        fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+                        exit(1);
+                }
+            }
+        }
+    }
+
+    // clear the core-only flag
+    bAlignment.SupportData.HasCoreOnly = false;
+
+    // return success
+    return true;
+}
+
+// Discards the index object owned by this reader (if any) and flags the
+// reader as having no index available.
+void BamReaderPrivate::ClearIndex(void)
+{
+    delete Index;       // deleting a null pointer is a no-op
+    Index    = 0;
+    HasIndex = false;
+}
+
+// Closes the BAM file and resets the per-file state: the BGZF stream is
+// closed, the index is discarded, and the header text and region are cleared.
+void BamReaderPrivate::Close(void)
+{
+    // underlying compressed stream
+    mBGZF.Close();
+
+    // index object and HasIndex flag
+    ClearIndex();
+
+    // header text
+    HeaderText.clear();
+
+    // user-specified region
+    Region.clear();
+}
+
+// Builds a new index for the open BAM file and writes it to disk.
+//
+// @param useStandardIndex  true  -> BamStandardIndex (".bai")
+//                          false -> BamToolsIndex    (".bti")
+// @return true only if both building and writing the index succeeded
+bool BamReaderPrivate::CreateIndex(bool useStandardIndex) {
+
+    // drop whatever index was held before
+    ClearIndex();
+
+    // instantiate the requested index flavour
+    Index = ( useStandardIndex ? static_cast<BamIndex*>( new BamStandardIndex(&mBGZF, Parent) )
+                               : static_cast<BamIndex*>( new BamToolsIndex(&mBGZF, Parent) ) );
+
+    // the whole index must be kept in memory while it is being written
+    Index->SetCacheMode(BamIndex::FullIndexCaching);
+
+    // build, then publish availability to the rest of the reader
+    const bool builtOk = Index->Build();
+    HasIndex = builtOk;
+
+    // flag references without alignment data
+    MarkReferences();
+
+    // the write is attempted regardless of the build result
+    const bool wroteOk = Index->Write(Filename);
+
+    // go back to the cache mode the client asked for
+    Index->SetCacheMode(IndexCacheMode);
+
+    return ( builtOk && wroteOk );
+}
+
+// Returns a copy of the header text stored by LoadHeaderData()
+// (an empty string after Close()).
+const string BamReaderPrivate::GetHeaderText(void) const
+{
+    const string& storedText = HeaderText;
+    return storedText;
+}
+
+// Fetches the next alignment (restricted to the current region, if one is set)
+// and fully decodes its character data.
+// Returns false when no alignment is available or decoding fails.
+bool BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {
+
+    // nothing (more) to read
+    if ( !GetNextAlignmentCore(bAlignment) )
+        return false;
+
+    // core data is in place - expand name, bases, qualities and tags
+    return BuildCharData(bAlignment);
+}
+
+// Retrieves the core data of the next alignment (returns success/fail).
+// Character data (read name, bases, qualities, tags) is NOT decoded here; it
+// stays in the alignment's support data. When a region with a left bound is
+// set, alignments lying before the region are skipped, and reaching one that
+// lies after the region ends the search with 'false'.
+bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {
+
+    // a region is set, but is already known to contain no alignments
+    if ( !Region.isNull() && !HasAlignmentsInRegion )
+        return false;
+
+    // first candidate
+    if ( !LoadNextAlignment(bAlignment) )
+        return false;
+
+    // mark alignment as holding core data only
+    bAlignment.SupportData.HasCoreOnly = true;
+
+    // without (at least) a left boundary every alignment qualifies
+    if ( !Region.isLeftBoundSpecified() )
+        return true;
+
+    // classify the candidate; keep reading while it still lies before the region
+    for ( ;; ) {
+        const BamReaderPrivate::RegionState state = IsOverlap(bAlignment);
+
+        if ( state == AFTER_REGION )  return false;   // ran past the region
+        if ( state == WITHIN_REGION ) return true;    // overlapping alignment found
+
+        // BEFORE_REGION: move on to the next record (failure is most likely EOF)
+        if ( !LoadNextAlignment(bAlignment) )
+            return false;
+    }
+}
+
+// Returns the zero-based ID of the reference called 'refName'.
+//
+// @param refName  reference sequence name to look up (exact match)
+// @return index of the first matching entry in References, or
+//         References.size() if no reference has that name
+int BamReaderPrivate::GetReferenceID(const string& refName) const {
+
+    // walk the reference list directly - no need to copy every name into a
+    // temporary vector just to search it
+    int refId = 0;
+    RefVector::const_iterator refIter = References.begin();
+    RefVector::const_iterator refEnd  = References.end();
+    for ( ; refIter != refEnd; ++refIter, ++refId ) {
+        if ( (*refIter).RefName == refName )
+            return refId;
+    }
+
+    // not found: refId now equals References.size()
+    return refId;
+}
+
+// Classifies an alignment relative to the current region: it ends before the
+// region, overlaps it, or starts after it.
+// This *internal* method should ONLY be called when (at least) the region's
+// left bound is specified.
+BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {
+
+    // --- alignment on a reference before the left-bound reference ---
+    if ( bAlignment.RefID < Region.LeftRefID )
+        return BEFORE_REGION;
+
+    // --- alignment on a reference after the left-bound reference ---
+    if ( bAlignment.RefID > Region.LeftRefID ) {
+
+        // open-ended region: everything from here on is inside
+        if ( !Region.isRightBoundSpecified() )
+            return WITHIN_REGION;
+
+        // strictly between the two bound references
+        if ( bAlignment.RefID < Region.RightRefID )
+            return WITHIN_REGION;
+
+        // beyond the right-bound reference
+        if ( bAlignment.RefID > Region.RightRefID )
+            return AFTER_REGION;
+
+        // on the right-bound reference: inside only if it starts at/before the right edge
+        return ( bAlignment.Position <= Region.RightPosition ? WITHIN_REGION : AFTER_REGION );
+    }
+
+    // --- alignment on the left-bound reference itself ---
+
+    // starts before the left edge: inside only if it reaches the edge
+    if ( bAlignment.Position < Region.LeftPosition )
+        return ( bAlignment.GetEndPosition() >= Region.LeftPosition ? WITHIN_REGION : BEFORE_REGION );
+
+    // starts at/after the left edge: outside only when both bounds share this
+    // reference and the alignment starts past the right edge
+    const bool startsPastRightEdge = Region.isRightBoundSpecified() &&
+                                     Region.LeftRefID == Region.RightRefID &&
+                                     bAlignment.Position > Region.RightPosition;
+    return ( startsPastRightEdge ? AFTER_REGION : WITHIN_REGION );
+}
+
+// Reads the BAM magic number and the header text from the start of the file
+// and stores the text in HeaderText.
+//
+// Like the magic-number check, any short read of the header is treated as
+// fatal: a message is printed to stderr and the process exits with status 1.
+void BamReaderPrivate::LoadHeaderData(void) {
+
+    // check to see if proper BAM header
+    char buffer[4];
+    if (mBGZF.Read(buffer, 4) != 4) {
+        fprintf(stderr, "Could not read header type\n");
+        exit(1);
+    }
+
+    if (strncmp(buffer, "BAM\001", 4)) {
+        fprintf(stderr, "wrong header type!\n");
+        exit(1);
+    }
+
+    // get BAM header text length (a failed read would leave the magic bytes
+    // in 'buffer' and yield a bogus length, so it must be checked)
+    if (mBGZF.Read(buffer, 4) != 4) {
+        fprintf(stderr, "Could not read header text length\n");
+        exit(1);
+    }
+    unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);
+    if ( IsBigEndian ) SwapEndian_32(headerTextLength);
+
+    // get BAM header text
+    // (zero-filled, one extra byte: the text is always null-terminated, and the
+    //  buffer is released automatically on every exit path)
+    vector<char> headerText(static_cast<size_t>(headerTextLength) + 1, '\0');
+    if (mBGZF.Read(&headerText[0], headerTextLength) != (signed int)headerTextLength) {
+        fprintf(stderr, "Could not read header text\n");
+        exit(1);
+    }
+
+    // keep text up to the first null character
+    HeaderText = string(&headerText[0]);
+}
+
+// Loads index data from a BAM index file (".bti" OR ".bai"), returns success/fail.
+// If the client supplied no index filename, one is located from the BAM
+// filename (honouring 'preferStandardIndex') and IndexFilename is filled in
+// from the extension of the index that was found.
+// NOTE: 'lookForIndex' is accepted but not consulted in this method.
+bool BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) {
+
+    // start from a clean slate
+    ClearIndex();
+
+    // create the index object, either by searching or from the given filename
+    const bool mustLocateIndex = IndexFilename.empty();
+    if ( mustLocateIndex ) {
+        BamIndex::PreferredIndexType wanted = BamIndex::BAMTOOLS;
+        if ( preferStandardIndex )
+            wanted = BamIndex::STANDARD;
+        Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, wanted);
+    }
+    else
+        Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent);
+
+    // no usable index object
+    if ( Index == 0 )
+        return false;
+
+    // remember which index file was picked
+    if ( mustLocateIndex )
+        IndexFilename = Filename + Index->Extension();
+
+    // apply the client's cache mode, then read the index data
+    Index->SetCacheMode(IndexCacheMode);
+    HasIndex = Index->Load(IndexFilename);
+
+    // flag references without alignment data
+    MarkReferences();
+
+    return HasIndex;
+}
+
+// Populates 'bAlignment' with the alignment record under the file pointer.
+//
+// Fills the core fields, copies the raw character block into
+// SupportData.AllCharData and decodes the CIGAR operations (needed so that
+// BamAlignment::GetEndPosition() works even for core-only access).
+//
+// @return true on success; false at end of file, on a short read, or when the
+//         record's declared sizes are inconsistent
+bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
+
+    // read in the 'block length' value; a short read (e.g. end of file) must
+    // not be decoded, since 'buffer' would hold indeterminate bytes
+    char buffer[4];
+    if ( mBGZF.Read(buffer, 4) != 4 ) return false;
+    bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);
+    if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }
+
+    // a block must at least hold the fixed-size core (this also rejects zero);
+    // otherwise the length subtraction below would wrap around
+    if ( bAlignment.SupportData.BlockLength < (unsigned int)BAM_CORE_SIZE ) return false;
+
+    // read in core alignment data, make sure the right size of data was read
+    char x[BAM_CORE_SIZE];
+    if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false;
+
+    if ( IsBigEndian ) {
+        for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) )
+            SwapEndian_32p(&x[i]);
+    }
+
+    // set BamAlignment 'core' and 'support' data
+    bAlignment.RefID    = BgzfData::UnpackSignedInt(&x[0]);
+    bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);
+
+    unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);
+    bAlignment.Bin        = tempValue >> 16;
+    bAlignment.MapQuality = tempValue >> 8 & 0xff;
+    bAlignment.SupportData.QueryNameLength = tempValue & 0xff;
+
+    tempValue = BgzfData::UnpackUnsignedInt(&x[12]);
+    bAlignment.AlignmentFlag = tempValue >> 16;
+    bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;
+
+    bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);
+    bAlignment.MateRefID    = BgzfData::UnpackSignedInt(&x[20]);
+    bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);
+    bAlignment.InsertSize   = BgzfData::UnpackSignedInt(&x[28]);
+
+    // set BamAlignment length
+    bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;
+
+    // read in character data - make sure proper data size was read
+    const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
+    if ( (signed int)dataLength < 0 ) return false;   // cannot be matched by Read()'s signed result
+
+    // zero-filled scratch buffer, released automatically on every return path
+    // (at least one element so that taking its address is always valid)
+    vector<char> charBuffer( (dataLength > 0 ? dataLength : 1), '\0' );
+    char* allCharData = &charBuffer[0];
+    if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength ) return false;
+
+    // store 'allCharData' in supportData structure
+    bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);
+
+    // save CIGAR ops
+    // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
+    // even when GetNextAlignmentCore() is called
+    bAlignment.CigarData.clear();
+
+    // the CIGAR array follows the read name; make sure it lies inside the block
+    const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;
+    const uint64_t cigarDataEnd = (uint64_t)cigarDataOffset
+                                + (uint64_t)bAlignment.SupportData.NumCigarOperations * sizeof(uint32_t);
+    if ( cigarDataEnd > dataLength ) return false;
+
+    // number of op characters the lookup table provides
+    const size_t cigarTableSize = strlen(CIGAR_LOOKUP);
+
+    CigarOp op;
+    bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);
+    for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {
+
+        // copy the packed op out byte-wise: the array starts right after the
+        // name, so it is not necessarily aligned for direct uint32_t access
+        uint32_t packedOp;
+        memcpy(&packedOp, allCharData + cigarDataOffset + i*sizeof(uint32_t), sizeof(uint32_t));
+
+        // swap if necessary
+        if ( IsBigEndian ) SwapEndian_32(packedOp);
+
+        // build CigarOp structure; op codes beyond the lookup table map to
+        // '\0' (same value an index onto the table's terminator yields) rather
+        // than reading past the end of the table
+        const uint32_t opCode = (packedOp & BAM_CIGAR_MASK);
+        op.Length = (packedOp >> BAM_CIGAR_SHIFT);
+        op.Type   = ( opCode < cigarTableSize ? CIGAR_LOOKUP[opCode] : '\0' );
+
+        // save CigarOp
+        bAlignment.CigarData.push_back(op);
+    }
+
+    return true;
+}
+
+// Reads the reference sequence dictionary (name + length per reference) from
+// the BAM file and appends one RefData entry per reference to References.
+//
+// Loading stops quietly at the first short read, so a truncated file yields
+// only the references that could be read completely.
+void BamReaderPrivate::LoadReferenceData(void) {
+
+    // get number of reference sequences
+    char buffer[4];
+    if ( mBGZF.Read(buffer, 4) != 4 ) return;
+    unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);
+    if ( IsBigEndian ) SwapEndian_32(numberRefSeqs);
+    if ( numberRefSeqs == 0 ) return;
+    References.reserve((int)numberRefSeqs);
+
+    // iterate over all references in header
+    for (unsigned int i = 0; i != numberRefSeqs; ++i) {
+
+        // get length of reference name
+        if ( mBGZF.Read(buffer, 4) != 4 ) return;
+        unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);
+        if ( IsBigEndian ) SwapEndian_32(refNameLength);
+
+        // zero-filled with one spare byte: the name is null-terminated even if
+        // the stored bytes lack a terminator or the length is zero
+        vector<char> refName(static_cast<size_t>(refNameLength) + 1, '\0');
+
+        // get reference name and reference sequence length
+        if ( mBGZF.Read(&refName[0], refNameLength) != (signed int)refNameLength ) return;
+        if ( mBGZF.Read(buffer, 4) != 4 ) return;
+        int refLength = BgzfData::UnpackSignedInt(buffer);
+        if ( IsBigEndian ) SwapEndian_32(refLength);
+
+        // store data for reference (name is taken up to its first null char)
+        RefData aReference;
+        aReference.RefName   = string(&refName[0]);
+        aReference.RefLength = refLength;
+        References.push_back(aReference);
+    }
+}
+
+// Records, for every known reference, whether the index reports alignments
+// on it (RefHasAlignments). Does nothing unless an index is available.
+void BamReaderPrivate::MarkReferences(void) {
+
+    if ( HasIndex ) {
+        // reference IDs are simply positions in the References vector
+        for ( int refId = 0; refId < (int)References.size(); ++refId ) {
+            const bool refHasData = Index->HasAlignments(refId);
+            References.at(refId).RefHasAlignments = refHasData;
+        }
+    }
+}
+
+// Opens a BAM file and, unless the caller opted out, its index.
+//
+// @param filename             BAM file to open
+// @param indexFilename        index file to load; may be empty
+// @param lookForIndex         with an empty indexFilename: whether an index
+//                             file should be searched for
+// @param preferStandardIndex  passed on to LoadIndex()
+// @return false if the BAM file cannot be opened or a requested index could
+//         not be loaded; true otherwise
+bool BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) {
+
+    // remember both filenames
+    Filename      = filename;
+    IndexFilename = indexFilename;
+
+    // the BGZF stream must open for reading
+    if ( !mBGZF.Open(filename, "rb") )
+        return false;
+
+    // header text & reference data precede the alignments
+    LoadHeaderData();
+    LoadReferenceData();
+
+    // file offset of the first alignment (used by Rewind())
+    AlignmentsBeginOffset = mBGZF.Tell();
+
+    // no index named and none wanted: sequential access only
+    const bool sequentialOnly = ( IndexFilename.empty() && !lookForIndex );
+    if ( sequentialOnly )
+        return true;
+
+    // either the client named an index file or asked us to look for one
+    return LoadIndex(lookForIndex, preferStandardIndex);
+}
+
+// Moves the BAM file pointer back to the first alignment and drops any region.
+// Returns false if either seek fails or if no alignment can be read at the
+// start of the alignment data.
+bool BamReaderPrivate::Rewind(void) {
+
+    // seek to the first alignment and verify that one can actually be read there
+    BamAlignment firstAlignment;
+    if ( !mBGZF.Seek(AlignmentsBeginOffset) || !LoadNextAlignment(firstAlignment) )
+        return false;
+
+    // back to the default "no region" state
+    Region.clear();
+    HasAlignmentsInRegion = true;
+
+    // the test read moved the file pointer - seek back again
+    // and report success/fail of that seek
+    return mBGZF.Seek(AlignmentsBeginOffset);
+}
+
+// Stores the requested index caching mode and, if an index object currently
+// exists, applies the mode to it immediately.
+void BamReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
+    IndexCacheMode = mode;
+    if ( Index != 0 )
+        Index->SetCacheMode(mode);
+}
+
+// Restricts subsequent reads to 'region' by asking the index to Jump() there.
+// Returns false if no index is available or the jump could not be performed;
+// returns true otherwise - including the case where the region is known to
+// contain no alignments.
+bool BamReaderPrivate::SetRegion(const BamRegion& region) {
+
+    // Forget any previous region first.
+    //
+    // N.B. - with no Region set, GetNextAlignmentCore() simply returns the next
+    // alignment without doing overlap checks of its own. That gives BamIndex
+    // free rein to call GetNextAlignmentCore() and do its own overlap checking
+    // (all without exposing LoadNextAlignment() to the public API, and
+    // potentially confusing clients with the nomenclature).
+    Region.clear();
+
+    // random access requires an index
+    if ( !HasIndex )
+        return false;
+
+    // shift the region's start to where data actually begins
+    BamRegion adjustedRegion(region);
+    AdjustRegion(adjustedRegion);
+
+    // Only jump when the region may contain data. An empty region is not an
+    // error: the reader just remembers that nothing is there (useful in a
+    // multi-reader setting where some BAM files lack data in regions that
+    // other BAMs have data).
+    //
+    // Index::Jump() may itself clear HasAlignmentsInRegion, e.g. when the
+    // region lies beyond the last alignment on a reference; later calls to
+    // GetNextAlignment[Core] then simply return false. A failed jump (invalid
+    // index, unknown reference, etc) is reported as failure.
+    if ( HasAlignmentsInRegion ) {
+        const bool jumped = Index->Jump(adjustedRegion, &HasAlignmentsInRegion);
+        if ( !jumped )
+            return false;
+    }
+
+    // save region and return success
+    Region = adjustedRegion;
+    return true;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader_p.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader_p.h
new file mode 100755
index 0000000..8011a1f
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamReader_p.h
@@ -0,0 +1,137 @@
+// ***************************************************************************
+// BamReader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#ifndef BAMREADER_P_H
+#define BAMREADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <BamAlignment.h>
+#include <BamIndex.h>
+#include <BGZF.h>
+#include <string>
+
+namespace BamTools {
+
+class BamReader;
+
+namespace Internal {
+
+// Private implementation behind BamReader: owns the BGZF stream, the header
+// text, the reference list, the optional index object and the current region.
+// Not part of the public BamTools API.
+class BamReaderPrivate {
+
+    // enums
+    public: enum RegionState { BEFORE_REGION = 0
+                             , WITHIN_REGION
+                             , AFTER_REGION
+                             };
+
+    // ctor & dtor
+    public:
+        BamReaderPrivate(BamReader* parent);
+        ~BamReaderPrivate(void);
+
+    // 'public' interface to BamReader
+    public:
+
+        // file operations
+        void Close(void);
+        bool Open(const std::string& filename,
+                  const std::string& indexFilename,
+                  const bool lookForIndex,
+                  const bool preferStandardIndex);
+        bool Rewind(void);
+        bool SetRegion(const BamRegion& region);
+
+        // access alignment data
+        bool GetNextAlignment(BamAlignment& bAlignment);
+        bool GetNextAlignmentCore(BamAlignment& bAlignment);
+
+        // access auxiliary data
+        const std::string GetHeaderText(void) const;
+        int GetReferenceID(const std::string& refName) const;
+
+        // index operations
+        bool CreateIndex(bool useStandardIndex);
+        void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode);
+
+    // 'internal' methods
+    public:
+
+        // ---------------------------------------
+        // reading alignments and auxiliary data
+
+        // adjusts requested region if necessary (depending on where data actually begins)
+        void AdjustRegion(BamRegion& region);
+        // fills out character data for BamAlignment data
+        bool BuildCharData(BamAlignment& bAlignment);
+        // checks to see if alignment overlaps current region
+        RegionState IsOverlap(BamAlignment& bAlignment);
+        // retrieves header text from BAM file
+        void LoadHeaderData(void);
+        // retrieves BAM alignment under file pointer
+        bool LoadNextAlignment(BamAlignment& bAlignment);
+        // builds reference data structure from BAM file
+        void LoadReferenceData(void);
+        // mark references with 'HasAlignments' status
+        void MarkReferences(void);
+
+        // ---------------------------------
+        // index file handling
+
+        // clear out internal index data structure
+        void ClearIndex(void);
+        // loads index from BAM index file
+        bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex);
+
+    // data members
+    public:
+
+        // general file data
+        BgzfData    mBGZF;
+        std::string HeaderText;
+        BamIndex*   Index;                  // owned: deleted in ClearIndex()
+        RefVector   References;
+        bool        HasIndex;
+        int64_t     AlignmentsBeginOffset;  // file offset of the first alignment
+        std::string Filename;
+        std::string IndexFilename;
+
+//        Internal::BamHeader* m_header;
+
+        // index caching mode
+        BamIndex::BamIndexCacheMode IndexCacheMode;
+
+        // system data
+        bool IsBigEndian;
+
+        // user-specified region values
+        BamRegion Region;
+        bool HasAlignmentsInRegion;
+
+        // parent BamReader
+        BamReader* Parent;
+
+        // BAM character constants
+        const char* DNA_LOOKUP;
+        const char* CIGAR_LOOKUP;
+
+    // copying is disabled: this object owns 'Index' through a raw pointer, so
+    // a member-wise copy would end with the same index being deleted twice.
+    // (declared but intentionally never defined)
+    private:
+        BamReaderPrivate(const BamReaderPrivate&);
+        BamReaderPrivate& operator=(const BamReaderPrivate&);
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMREADER_P_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamStandardIndex_p.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamStandardIndex_p.cpp
new file mode 100755
index 0000000..af9d093
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamStandardIndex_p.cpp
@@ -0,0 +1,910 @@
+// ***************************************************************************
+// BamStandardIndex.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the standardized BAM index format (".bai")
+// ***************************************************************************
+
+#include <BamAlignment.h>
+#include <BamReader.h>
+#include <BGZF.h>
+#include <BamStandardIndex_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+#include <iostream>
+#include <map>
+using namespace std;
+
+// constructs a standard (".bai") index attached to the given BGZF stream & reader;
+// starts with no full data cache and a data-begin offset of zero
+BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader)
+    : BamIndex(bgzf, reader),
+      m_dataBeginOffset(0),
+      m_hasFullDataCache(false)
+{
+    // remember host byte order; the load/write routines swap multi-byte fields when it is big-endian
+    const bool hostIsBigEndian = BamTools::SystemIsBigEndian();
+    m_isBigEndian = hostIsBigEndian;
+}
+
+// drops all cached index data when the index object goes away
+BamStandardIndex::~BamStandardIndex(void)
+{
+    this->ClearAllData();
+}
+
+// computes the IDs of all bins that may overlap @region
+// @isRightBoundSpecified - use region.RightPosition as the end coordinate, but only when
+//                          both bounds are on the same reference; otherwise the last base
+//                          of the left-bound reference is used as the cutoff
+// @bins                  - output array receiving the bin IDs
+// returns the number of bin IDs written to @bins
+int BamStandardIndex::BinsFromRegion(const BamRegion& region,
+                                     const bool isRightBoundSpecified,
+                                     uint16_t bins[MAX_BIN])
+{
+    // left edge comes straight from the region
+    const uint32_t begin = (unsigned int)region.LeftPosition;
+
+    // right edge: explicit bound if usable, else end of the left-bound reference
+    const bool useRightBound = isRightBoundSpecified && ( region.LeftRefID == region.RightRefID );
+    const uint32_t end = useRightBound
+                       ? (unsigned int)region.RightPosition
+                       : (unsigned int)m_references.at(region.LeftRefID).RefLength - 1;
+
+    // bin '0' is always a valid bin, so it goes in first
+    int count = 0;
+    bins[count++] = 0;
+
+    // remaining levels: ID of the first bin on the level, and the shift that maps a position to a bin
+    static const unsigned int levelFirstBin[] = { 1, 9, 73, 585, 4681 };
+    static const unsigned int levelShift[]    = { 26, 23, 20, 17, 14 };
+    for ( int level = 0; level < 5; ++level ) {
+        const unsigned int firstBin = levelFirstBin[level] + (begin >> levelShift[level]);
+        const unsigned int lastBin  = levelFirstBin[level] + (end   >> levelShift[level]);
+        for ( unsigned int k = firstBin; k <= lastBin; ++k )
+            bins[count++] = k;
+    }
+
+    // number of bins stored
+    return count;
+}
+
+// creates index data (in-memory) from current reader data; returns false if the reader/BGZF is unusable, a reference entry is missing, or the final rewind fails (calls exit(1) on unsorted input or non-advancing BGZF offsets)
+bool BamStandardIndex::Build(void) {
+
+ // be sure reader & BGZF file are valid & open for reading
+ if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
+ return false;
+
+ // move file pointer to beginning of alignments
+ m_reader->Rewind();
+
+ // get reference count, discard any previous index data, then create one empty entry per reference
+ const int numReferences = (int)m_references.size();
+ m_indexData.clear();
+ m_hasFullDataCache = false;
+ SetReferenceCount(numReferences);
+
+ // sets default constant for bin, ID, offset, coordinate variables ("nothing seen yet" marker)
+ const uint32_t defaultValue = 0xffffffffu;
+
+ // bin data: saveBin = bin of the chunk currently being accumulated, lastBin = bin of the previous alignment
+ uint32_t saveBin(defaultValue);
+ uint32_t lastBin(defaultValue);
+
+ // reference ID data (presumably the all-ones default converts to a negative, i.e. invalid, ID - see the 'saveRefID >= 0' check after the loop)
+ int32_t saveRefID(defaultValue);
+ int32_t lastRefID(defaultValue);
+
+ // offset data: saveOffset = start of the chunk being accumulated, lastOffset = file position before the current alignment
+ uint64_t saveOffset = m_BGZF->Tell();
+ uint64_t lastOffset = saveOffset;
+
+ // coordinate data: position of the previous alignment, used for the sortedness check
+ int32_t lastCoordinate = defaultValue;
+
+ BamAlignment bAlignment;
+ while ( m_reader->GetNextAlignmentCore(bAlignment) ) {
+
+ // change of chromosome, save ID, reset bin
+ if ( lastRefID != bAlignment.RefID ) {
+ lastRefID = bAlignment.RefID;
+ lastBin = defaultValue;
+ }
+
+ // if lastCoordinate greater than BAM position - file not sorted properly (only checked within the same reference)
+ else if ( lastCoordinate > bAlignment.Position ) {
+ fprintf(stderr, "BAM file not properly sorted:\n");
+ fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(),
+ lastCoordinate, bAlignment.Position, bAlignment.RefID); // NOTE(review): message has no trailing newline
+ exit(1);
+ }
+
+ // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
+ if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
+
+ // save linear offset entry (matched to BAM entry refID)
+ BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID);
+ if ( indexIter == m_indexData.end() ) return false; // error
+ ReferenceIndex& refIndex = (*indexIter).second;
+ LinearOffsetVector& offsets = refIndex.Offsets;
+ SaveLinearOffset(offsets, bAlignment, lastOffset);
+ }
+
+ // if current BamAlignment bin != lastBin, "then possibly write the binning index"
+ if ( bAlignment.Bin != lastBin ) {
+
+ // if not first time through
+ if ( saveBin != defaultValue ) {
+
+ // save Bam bin entry for the chunk that just ended ([saveOffset, lastOffset] under the previous ref ID)
+ BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
+ if ( indexIter == m_indexData.end() ) return false; // error
+ ReferenceIndex& refIndex = (*indexIter).second;
+ BamBinMap& binMap = refIndex.Bins;
+ SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
+ }
+
+ // update saveOffset (new chunk starts where the current alignment starts)
+ saveOffset = lastOffset;
+
+ // update bin values
+ saveBin = bAlignment.Bin;
+ lastBin = bAlignment.Bin;
+
+ // update saveRefID
+ saveRefID = bAlignment.RefID;
+
+ // if invalid RefID, break out (indexing stops at the first alignment with a negative RefID)
+ if ( saveRefID < 0 ) break;
+ }
+
+ // make sure that current file pointer is beyond lastOffset
+ if ( m_BGZF->Tell() <= (int64_t)lastOffset ) {
+ fprintf(stderr, "Error in BGZF offsets.\n");
+ exit(1);
+ }
+
+ // update lastOffset
+ lastOffset = m_BGZF->Tell();
+
+ // update lastCoordinate
+ lastCoordinate = bAlignment.Position;
+ }
+
+ // save any leftover BAM data (as long as refID is valid)
+ if ( saveRefID >= 0 ) {
+ // save Bam bin entry
+ BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID);
+ if ( indexIter == m_indexData.end() ) return false; // error
+ ReferenceIndex& refIndex = (*indexIter).second;
+ BamBinMap& binMap = refIndex.Bins;
+ SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
+ }
+
+ // simplify index by merging chunks
+ MergeChunks();
+
+ // iterate through references in index
+ // sort offsets in linear offset vector
+ BamStandardIndexData::iterator indexIter = m_indexData.begin();
+ BamStandardIndexData::iterator indexEnd = m_indexData.end();
+ for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { // NOTE(review): 'i' is incremented but never read
+
+ // get reference index data
+ ReferenceIndex& refIndex = (*indexIter).second;
+ LinearOffsetVector& offsets = refIndex.Offsets;
+
+ // sort linear offsets
+ sort(offsets.begin(), offsets.end());
+ }
+
+ // rewind file pointer to beginning of alignments, return success/fail
+ return m_reader->Rewind();
+}
+
+// check index file magic number, return true if OK
+// returns true only if all 4 bytes were read and they equal "BAI\1";
+// on any failure an error message is printed, the index stream is closed and false is returned
+bool BamStandardIndex::CheckMagicNumber(void) {
+
+    // read in magic number (zero-filled so a short read never leaves indeterminate bytes)
+    char magic[4] = { 0, 0, 0, 0 };
+    const size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream);
+
+    // check the read length BEFORE trusting the bytes, then compare to expected value
+    if ( (elementsRead != 4) || (strncmp(magic, "BAI\1", 4) != 0) ) {
+        fprintf(stderr, "Problem with index file - invalid format.\n");
+        // NOTE(review): the stream is closed here but m_indexStream is not reset;
+        // confirm that no caller touches or re-closes it after a failed check
+        fclose(m_indexStream);
+        return false;
+    }
+
+    // magic number fully read & matched
+    return true;
+}
+
+// drops the cached bin & linear-offset data of every reference in the index
+void BamStandardIndex::ClearAllData(void) {
+    for ( BamStandardIndexData::const_iterator refIter = m_indexData.begin();
+          refIter != m_indexData.end();
+          ++refIter )
+    {
+        ClearReferenceOffsets( refIter->first );
+    }
+}
+
+// drops the cached bin & linear-offset data of one reference (no-op if @refId is unknown)
+void BamStandardIndex::ClearReferenceOffsets(const int& refId) {
+
+    BamStandardIndexData::iterator found = m_indexData.find(refId);
+    if ( found != m_indexData.end() ) {
+
+        // empty both containers held by this reference entry
+        ReferenceIndex& entry = found->second;
+        entry.Bins.clear();
+        entry.Offsets.clear();
+
+        // the cache can no longer be complete
+        m_hasFullDataCache = false;
+    }
+}
+
+// returns the index-file position recorded right after the header (magic number)
+const off_t BamStandardIndex::DataBeginOffset(void) const {
+    const off_t dataBegin = m_dataBeginOffset;
+    return dataBegin;
+}
+
+// calculates offset(s) for a given region
+// @region                - region of interest; bins and the minimum offset are taken from its left-bound reference
+// @isRightBoundSpecified - passed through to BinsFromRegion()
+// @offsets               - receives the start offset of every candidate chunk; the vector is sorted before returning
+// @hasAlignmentsInRegion - on success, set to whether @offsets is non-empty
+// returns false if the reference is not in the index or its index data could not be loaded
+bool BamStandardIndex::GetOffsets(const BamRegion& region,
+                                  const bool isRightBoundSpecified,
+                                  vector<int64_t>& offsets,
+                                  bool* hasAlignmentsInRegion)
+{
+    // return false if leftBound refID is not found in index data
+    if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
+        return false;
+
+    // load index data for region if not already cached
+    if ( !IsDataLoaded(region.LeftRefID) ) {
+        bool loadedOk = true;
+        loadedOk &= SkipToReference(region.LeftRefID);
+        loadedOk &= LoadReference(region.LeftRefID);
+        if ( !loadedOk ) return false;
+    }
+
+    // calculate which bins overlap this region
+    // (scratch buffer is a vector so that every return path releases it, and
+    //  an allocation failure can no longer hand a null pointer to BinsFromRegion)
+    vector<uint16_t> bins(MAX_BIN);
+    const int numBins = BinsFromRegion(region, isRightBoundSpecified, &bins[0]);
+
+    // get bins for this reference
+    BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
+    if ( indexIter == m_indexData.end() ) return false; // error
+    const ReferenceIndex& refIndex = (*indexIter).second;
+    const BamBinMap& binMap = refIndex.Bins;
+
+    // get minimum offset to consider (0 if the left position lies beyond the linear offsets)
+    const LinearOffsetVector& linearOffsets = refIndex.Offsets;
+    const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() )
+                               ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
+
+    // store all alignment 'chunk' starts (file offsets) for bins in this region
+    for ( int i = 0; i < numBins; ++i ) {
+
+        const uint16_t binKey = bins[i];
+        map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
+        if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
+
+            // iterate over chunks
+            const ChunkVector& chunks = (*binIter).second;
+            std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
+            std::vector<Chunk>::const_iterator chunksEnd  = chunks.end();
+            for ( ; chunksIter != chunksEnd; ++chunksIter) {
+
+                // keep only chunks that end past the minimum offset
+                const Chunk& chunk = (*chunksIter);
+                if ( chunk.Stop > minOffset )
+                    offsets.push_back( chunk.Start );
+            }
+        }
+    }
+
+    // sort the offsets before returning
+    sort(offsets.begin(), offsets.end());
+
+    // set flag
+    *hasAlignmentsInRegion = (offsets.size() != 0 );
+
+    // if cache mode set to none, dump the data we just loaded
+    if (m_cacheMode == BamIndex::NoIndexCaching )
+        ClearReferenceOffsets(region.LeftRefID);
+
+    // return success
+    return true;
+}
+
+// reports whether the index flags @refId as having alignments (false if @refId is unknown)
+bool BamStandardIndex::HasAlignments(const int& refId) const {
+    const BamStandardIndexData::const_iterator found = m_indexData.find(refId);
+    return ( found != m_indexData.end() ) && found->second.HasAlignments;
+}
+
+// reports the current state of the "all index data is cached" flag
+bool BamStandardIndex::HasFullDataCache(void) const {
+    const bool fullyCached = m_hasFullDataCache;
+    return fullyCached;
+}
+
+// reports whether the index cache holds data for @refId (false if @refId is unknown)
+bool BamStandardIndex::IsDataLoaded(const int& refId) const {
+
+    const BamStandardIndexData::const_iterator found = m_indexData.find(refId);
+    if ( found == m_indexData.end() )
+        return false;
+
+    // a reference without alignments has no offset data to load, so it counts as loaded;
+    // otherwise "loaded" means the bin map holds something
+    const ReferenceIndex& entry = found->second;
+    return !entry.HasAlignments || !entry.Bins.empty();
+}
+
+// attempts to use index to jump to region; returns success/fail
+// @hasAlignmentsInRegion - cleared on failure; otherwise reflects the offset calculation
+//                          and the most recent attempt to read an alignment
+bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
+
+    // reader & BGZF file must both exist, and the BGZF file must be open
+    const bool streamsUsable = ( m_reader != 0 ) && ( m_BGZF != 0 ) && m_BGZF->IsOpen;
+    if ( !streamsUsable )
+        return false;
+
+    // left-bound position may not lie beyond the end of its reference
+    if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength )
+        return false;
+
+    // collect candidate file offsets for this region
+    vector<int64_t> offsets;
+    const bool offsetsOk = GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion);
+    if ( !offsetsOk ) {
+        fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n");
+        *hasAlignmentsInRegion = false;
+        return false;
+    }
+
+    // try each offset in turn
+    BamAlignment candidate;
+    bool seeksOk = true;
+    const size_t numOffsets = offsets.size();
+    for ( size_t idx = 0; idx < numOffsets; ++idx ) {
+
+        // seek, then read the first alignment available there
+        seeksOk &= m_BGZF->Seek( offsets[idx] );
+        *hasAlignmentsInRegion = m_reader->GetNextAlignmentCore(candidate);
+
+        // does this alignment reach past the left bound, or sit on a later reference?
+        const bool reachesLeftBound = ( candidate.RefID == region.LeftRefID ) &&
+                                      ( (candidate.Position + candidate.Length) > region.LeftPosition );
+        const bool onLaterReference = ( candidate.RefID > region.LeftRefID );
+
+        if ( reachesLeftBound || onLaterReference ) {
+            // step back one offset when possible (to cover overlaps) and finish there
+            const size_t target = ( idx == 0 ) ? 0 : idx - 1;
+            return m_BGZF->Seek( offsets[target] );
+        }
+    }
+
+    // no offset matched; report if any seek failed along the way
+    if ( !seeksOk ) {
+        fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n");
+        *hasAlignmentsInRegion = false;
+    }
+    return seeksOk;
+}
+
+// clears index data from all references except the first
+// (does nothing when the index holds no references)
+void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) {
+
+    // an empty index has nothing to keep or clear; returning here also avoids
+    // dereferencing begin() of an empty container
+    if ( m_indexData.empty() ) return;
+
+    KeepOnlyReferenceOffsets( m_indexData.begin()->first );
+}
+
+// clears the cached index data of every reference other than @refId
+void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) {
+    for ( BamStandardIndexData::iterator it = m_indexData.begin(); it != m_indexData.end(); ++it ) {
+        const int currentId = it->first;
+        if ( currentId == refId )
+            continue;   // this is the one to keep
+        ClearReferenceOffsets(currentId);
+    }
+}
+
+// loads every reference entry from the index file, return true if loaded OK
+// @saveData - save data in memory if true, just read & discard if false
+bool BamStandardIndex::LoadAllReferences(bool saveData) {
+
+    // nothing to do when everything is already cached
+    if ( m_hasFullDataCache )
+        return true;
+
+    // read how many reference entries follow
+    uint32_t refCount;
+    if ( !LoadReferenceCount((int&)refCount) )
+        return false;
+
+    // load entries 0 .. refCount-1 in file order
+    bool allLoaded = true;
+    const int stopAt = (int)refCount;
+    for ( int refId = 0; refId < stopAt; ++refId )
+        allLoaded &= LoadReference(refId, saveData);
+
+    // cache is complete only if everything loaded AND was kept in memory
+    if ( allLoaded && saveData ) m_hasFullDataCache = true;
+
+    return allLoaded;
+}
+
+// load header data from index file, return true if loaded OK
+// on success, the stream position right after the magic number is stored as the data-begin offset
+bool BamStandardIndex::LoadHeader(void) {
+
+    // stop on a bad magic number: CheckMagicNumber() closes the index stream
+    // when validation fails, so the stream must not be queried afterwards
+    if ( !CheckMagicNumber() )
+        return false;
+
+    // store offset of beginning of data
+    m_dataBeginOffset = ftell64(m_indexStream);
+
+    return true;
+}
+
+// load a single index bin entry from file, return true if loaded OK
+// @refEntry - reference entry whose bin map receives the bin
+// @saveData - save data in memory if true, just read & discard if false
+bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) {
+
+    // get bin ID (initialized so a failed read never leaves it indeterminate)
+    uint32_t binId = 0;
+    const size_t elementsRead = fread(&binId, sizeof(binId), 1, m_indexStream);
+    const bool binIdOk = ( elementsRead == 1 );
+    if ( m_isBigEndian ) SwapEndian_32(binId);
+
+    // load alignment chunks for this bin
+    ChunkVector chunks;
+    const bool chunksOk = LoadChunks(chunks, saveData);
+
+    // store bin entry only if both its ID and its chunks were read successfully
+    if ( binIdOk && chunksOk && saveData )
+        refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks));
+
+    // return success/failure of load
+    return ( binIdOk && chunksOk );
+}
+
+// load all bin entries of one reference from file, return true if loaded OK
+// @refEntry - reference entry being filled; its HasAlignments flag is set from the bin count
+// @saveData - save data in memory if true, just read & discard if false
+bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) {
+
+    // get number of bins; give up immediately if it cannot be read, so an
+    // indeterminate count never drives the flag or the loop below
+    int32_t numBins = 0;
+    if ( fread(&numBins, sizeof(numBins), 1, m_indexStream) != 1 )
+        return false;
+    if ( m_isBigEndian ) SwapEndian_32(numBins);
+
+    // set flag
+    refEntry.HasAlignments = ( numBins != 0 );
+
+    // iterate over bins
+    bool binsOk = true;
+    for ( int i = 0; i < numBins; ++i )
+        binsOk &= LoadBin(refEntry, saveData);
+
+    // return success/failure of load
+    return binsOk;
+}
+
+// load a single alignment chunk (start & stop offsets) from file, return true if loaded OK
+// @saveData - append the chunk to @chunks if true, just read & discard if false
+bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) {
+
+    // read the two offsets in file order
+    uint64_t chunkStart;
+    uint64_t chunkStop;
+    const size_t startRead = fread(&chunkStart, sizeof(chunkStart), 1, m_indexStream);
+    const size_t stopRead  = fread(&chunkStop,  sizeof(chunkStop),  1, m_indexStream);
+
+    // convert to host byte order on big-endian machines
+    if ( m_isBigEndian ) {
+        SwapEndian_64(chunkStart);
+        SwapEndian_64(chunkStop);
+    }
+
+    // keep the chunk only when asked to
+    if ( saveData )
+        chunks.push_back( Chunk(chunkStart, chunkStop) );
+
+    // both reads must have delivered exactly one element
+    return ( (startRead + stopRead) == 2 );
+}
+
+// load all alignment chunks of one bin from file, return true if loaded OK
+// @chunks   - receives the chunks (sorted with ChunkLessThan) when @saveData is true
+// @saveData - save data in memory if true, just read & discard if false
+bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) {
+
+    // read in number of chunks; give up immediately if it cannot be read, so an
+    // indeterminate count never reaches reserve() or the loop below
+    uint32_t numChunks = 0;
+    if ( fread(&numChunks, sizeof(numChunks), 1, m_indexStream) != 1 )
+        return false;
+    if ( m_isBigEndian ) SwapEndian_32(numChunks);
+
+    // initialize space for chunks if we're storing this data
+    if ( saveData ) chunks.reserve(numChunks);
+
+    // iterate over chunks, stopping at the first one that fails to load
+    bool chunksOk = true;
+    for ( int i = 0; chunksOk && i < (int)numChunks; ++i )
+        chunksOk &= LoadChunk(chunks, saveData);
+
+    // sort chunk vector
+    sort( chunks.begin(), chunks.end(), ChunkLessThan );
+
+    // return success/failure of load
+    return chunksOk;
+}
+
+// load the linear offsets of one reference from file, return true if loaded OK
+// @refEntry - reference entry whose Offsets are replaced when @saveData is true and the load succeeds
+// @saveData - save data in memory if true, just read & discard if false
+bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) {
+
+    // read in number of linear offsets; give up immediately if it cannot be read
+    int32_t numLinearOffsets = 0;
+    if ( fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream) != 1 )
+        return false;
+    if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
+
+    // a negative count cannot be honoured (and would become an enormous reserve() request)
+    if ( numLinearOffsets < 0 )
+        return false;
+
+    // set up destination vector (if we're saving the data)
+    LinearOffsetVector linearOffsets;
+    if ( saveData ) linearOffsets.reserve(numLinearOffsets);
+
+    // iterate over linear offsets, failing on the first short read
+    uint64_t linearOffset = 0;
+    for ( int i = 0; i < numLinearOffsets; ++i ) {
+        if ( fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream) != 1 )
+            return false;
+        if ( m_isBigEndian ) SwapEndian_64(linearOffset);
+        if ( saveData ) linearOffsets.push_back(linearOffset);
+    }
+
+    // sort linear offsets
+    sort ( linearOffsets.begin(), linearOffsets.end() );
+
+    // save in reference index entry if desired
+    if ( saveData ) refEntry.Offsets = linearOffsets;
+
+    // count and every offset were read
+    return true;
+}
+
+// loads the reference whose ID is the first key in the index data, return true if loaded OK
+// @saveData - save data in memory if true, just read & discard if false
+bool BamStandardIndex::LoadFirstReference(bool saveData) {
+    // NOTE(review): begin() is dereferenced unchecked - assumes the index data is non-empty; confirm with callers
+    const BamStandardIndexData::const_iterator firstEntry = m_indexData.begin();
+    return LoadReference(firstEntry->first, saveData);
+}
+
+// load a single reference (its bins, then its linear offsets) from file, return true if loaded OK
+// @refId    - reference to load; an empty entry is created first if none exists yet
+// @saveData - save data in memory if true, just read & discard if false
+bool BamStandardIndex::LoadReference(const int& refId, bool saveData) {
+
+    // find the entry for this reference, creating a blank one on first use
+    BamStandardIndexData::iterator entryIter = m_indexData.find(refId);
+    if ( entryIter == m_indexData.end() ) {
+        ReferenceIndex blank;
+        blank.HasAlignments = false;
+        m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, blank) );
+        entryIter = m_indexData.find(refId);
+    }
+    ReferenceIndex& entry = entryIter->second;
+
+    // both parts are always read, in file order, even if the first one fails
+    const bool binsOk    = LoadBins(entry, saveData);
+    const bool offsetsOk = LoadLinearOffsets(entry, saveData);
+    return binsOk && offsetsOk;
+}
+
+// reads the number of references from the index file into @numReferences, return true if loaded OK
+bool BamStandardIndex::LoadReferenceCount(int& numReferences) {
+
+    // one element expected
+    const size_t itemsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
+
+    // convert to host byte order on big-endian machines
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+
+    return ( itemsRead == 1 );
+}
+
+// merges 'alignment chunks' in BAM bin (used for index building)
+// within every bin of every reference, a chunk is folded into its predecessor when the
+// predecessor's Stop and the chunk's Start agree once their low 16 bits are dropped
+void BamStandardIndex::MergeChunks(void) {
+
+    // walk every reference entry
+    for ( BamStandardIndexData::iterator refIter = m_indexData.begin();
+          refIter != m_indexData.end();
+          ++refIter )
+    {
+        BamBinMap& bins = refIter->second.Bins;
+
+        // walk every bin of this reference
+        for ( BamBinMap::iterator binIter = bins.begin(); binIter != bins.end(); ++binIter ) {
+
+            // bins without chunks are left untouched
+            ChunkVector& original = binIter->second;
+            if ( original.empty() ) continue;
+
+            // seed the result with the first chunk
+            ChunkVector merged;
+            merged.push_back( original.front() );
+
+            // fold each remaining chunk into the tail of the result, or append it
+            for ( size_t c = 1; c < original.size(); ++c ) {
+                const Chunk& next = original[c];
+                Chunk& tail = merged.back();
+                if ( (tail.Stop >> 16) == (next.Start >> 16) )
+                    tail.Stop = next.Stop;
+                else
+                    merged.push_back(next);
+            }
+
+            // replace the bin's chunks with the merged list
+            binIter->second = merged;
+        }
+    }
+}
+
+// appends the chunk [saveOffset, lastOffset] to bin @saveBin of @binMap,
+// creating the bin's chunk list if the bin is not present yet
+void BamStandardIndex::SaveBinEntry(BamBinMap& binMap,
+                                    const uint32_t& saveBin,
+                                    const uint64_t& saveOffset,
+                                    const uint64_t& lastOffset)
+{
+    // operator[] yields the existing chunk list, or inserts an empty one for a new bin
+    binMap[saveBin].push_back( Chunk(saveOffset, lastOffset) );
+}
+
+// records @lastOffset in the linear-offset slots covered by @bAlignment
+// (slots are positions shifted right by BAM_LIDX_SHIFT; only slots still holding 0 are written)
+void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets,
+                                        const BamAlignment& bAlignment,
+                                        const uint64_t& lastOffset)
+{
+    // slots of the alignment's first and last covered positions
+    const int firstSlot = bAlignment.Position >> BAM_LIDX_SHIFT;
+    const int lastSlot  = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
+
+    // grow the vector (zero-filled) so that lastSlot is addressable
+    const int currentSize  = offsets.size();
+    const int requiredSize = lastSlot + 1;
+    if ( requiredSize > currentSize )
+        offsets.resize(requiredSize, 0);
+
+    // fill the still-empty slots after the first one
+    // NOTE(review): the slot of the start position itself (firstSlot) is not written here - confirm intended
+    int slot = firstSlot + 1;
+    while ( slot <= lastSlot ) {
+        if ( offsets[slot] == 0 )
+            offsets[slot] = lastOffset;
+        ++slot;
+    }
+}
+
+// makes sure the index data holds entries for reference IDs 0 .. @count-1,
+// each flagged as having no alignments
+void BamStandardIndex::SetReferenceCount(const int& count) {
+    int refId = 0;
+    while ( refId < count ) {
+        m_indexData[refId].HasAlignments = false;
+        ++refId;
+    }
+}
+
+// positions the file pointer at the reference whose ID is the first key in the index data,
+// return true if skipped OK
+bool BamStandardIndex::SkipToFirstReference(void) {
+    // NOTE(review): begin() is dereferenced unchecked - assumes the index data is non-empty; confirm with callers
+    const BamStandardIndexData::const_iterator firstEntry = m_indexData.begin();
+    return SkipToReference( firstEntry->first );
+}
+
+// position file pointer to desired reference begin, return true if skipped OK
+// rewinds the index, reads the reference count, then reads & discards every
+// reference entry that precedes @refId
+// returns false if the rewind or count read fails, if @refId is not a valid
+// reference number for this index, or if any preceding entry fails to load
+bool BamStandardIndex::SkipToReference(const int& refId) {
+
+    // attempt rewind
+    if ( !Rewind() ) return false;
+
+    // read in number of references
+    uint32_t numReferences = 0;
+    size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
+    if ( elementsRead != 1 ) return false;
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+
+    // refuse IDs the index does not contain: a negative ID would never terminate the
+    // loop below, and a too-large one would read past the end of the index data
+    if ( refId < 0 || (uint32_t)refId >= numReferences )
+        return false;
+
+    // iterate over reference entries
+    bool skippedOk = true;
+    int currentRefId = 0;
+    while (currentRefId != refId) {
+        skippedOk &= LoadReference(currentRefId, false);
+        ++currentRefId;
+    }
+
+    // return success
+    return skippedOk;
+}
+
+// writes the 4-byte magic number to the index file and records the stream position
+// that follows it as the data-begin offset; return true if all 4 bytes were written
+bool BamStandardIndex::WriteHeader(void) {
+
+    // magic number
+    const size_t bytesWritten = fwrite("BAI\1", sizeof(char), 4, m_indexStream);
+
+    // data begins right here
+    m_dataBeginOffset = ftell64(m_indexStream);
+
+    return ( bytesWritten == 4 );
+}
+
+// writes the reference count followed by the index data of every reference,
+// return true if everything was written
+bool BamStandardIndex::WriteAllReferences(void) {
+
+    // reference count first (byte-swapped on big-endian machines)
+    int32_t refCount = m_indexData.size();
+    if ( m_isBigEndian ) SwapEndian_32(refCount);
+    const size_t countWritten = fwrite(&refCount, sizeof(refCount), 1, m_indexStream);
+
+    // then each reference entry, in index order
+    bool allRefsOk = true;
+    for ( BamStandardIndexData::const_iterator refIter = m_indexData.begin();
+          refIter != m_indexData.end();
+          ++refIter )
+    {
+        allRefsOk &= WriteReference( refIter->second );
+    }
+
+    return ( (countWritten == 1) && allRefsOk );
+}
+
+// writes one bin (its ID, then its chunks) to the index file, return true if written OK
+bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) {
+
+    // bin ID goes out on a local copy so it can be byte-swapped on big-endian machines
+    uint32_t idToWrite = binId;
+    if ( m_isBigEndian ) SwapEndian_32(idToWrite);
+    const size_t idWritten = fwrite(&idToWrite, sizeof(idToWrite), 1, m_indexStream);
+
+    // chunks follow the ID
+    const bool chunksWritten = WriteChunks(chunks);
+
+    return ( (idWritten == 1) && chunksWritten );
+}
+
+// writes the bin count followed by every bin of @bins, return true if written OK
+bool BamStandardIndex::WriteBins(const BamBinMap& bins) {
+
+    // bin count first (byte-swapped on big-endian machines)
+    int32_t numBins = bins.size();
+    if ( m_isBigEndian ) SwapEndian_32(numBins);
+    const size_t countWritten = fwrite(&numBins, sizeof(numBins), 1, m_indexStream);
+
+    // then each bin, in map order
+    bool allBinsOk = true;
+    for ( BamBinMap::const_iterator bin = bins.begin(); bin != bins.end(); ++bin )
+        allBinsOk &= WriteBin( bin->first, bin->second );
+
+    return ( (countWritten == 1) && allBinsOk );
+}
+
+// writes one chunk (Start, then Stop) to the index file, return true if both were written
+bool BamStandardIndex::WriteChunk(const Chunk& chunk) {
+
+    // work on copies so the caller's chunk is never byte-swapped
+    uint64_t startToWrite = chunk.Start;
+    uint64_t stopToWrite  = chunk.Stop;
+    if ( m_isBigEndian ) {
+        SwapEndian_64(startToWrite);
+        SwapEndian_64(stopToWrite);
+    }
+
+    // Start goes out before Stop
+    const size_t startWritten = fwrite(&startToWrite, sizeof(startToWrite), 1, m_indexStream);
+    const size_t stopWritten  = fwrite(&stopToWrite,  sizeof(stopToWrite),  1, m_indexStream);
+
+    return ( (startWritten + stopWritten) == 2 );
+}
+
+// writes the chunk count followed by every chunk of @chunks, return true if written OK
+bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) {
+
+    // chunk count first (byte-swapped on big-endian machines)
+    int32_t numChunks = chunks.size();
+    if ( m_isBigEndian ) SwapEndian_32(numChunks);
+    const size_t countWritten = fwrite(&numChunks, sizeof(numChunks), 1, m_indexStream);
+
+    // then each chunk, in vector order
+    bool allChunksOk = true;
+    for ( ChunkVector::const_iterator chunk = chunks.begin(); chunk != chunks.end(); ++chunk )
+        allChunksOk &= WriteChunk( *chunk );
+
+    return ( (countWritten == 1) && allChunksOk );
+}
+
+// write index data for linear offsets entry to new index file
+// writes the offset count followed by every offset; returns true only if the
+// count and all offsets were written
+bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) {
+
+    size_t elementsWritten = 0;
+
+    // write number of linear offsets
+    // (byte-swap a separate copy: the host-order count is still needed for the
+    //  success check at the end, which the in-place swap used to corrupt on
+    //  big-endian machines)
+    const int32_t offsetCount = offsets.size();
+    int32_t offsetCountToWrite = offsetCount;
+    if ( m_isBigEndian ) SwapEndian_32(offsetCountToWrite);
+    elementsWritten += fwrite(&offsetCountToWrite, sizeof(offsetCountToWrite), 1, m_indexStream);
+
+    // iterate over linear offsets
+    LinearOffsetVector::const_iterator offsetIter = offsets.begin();
+    LinearOffsetVector::const_iterator offsetEnd  = offsets.end();
+    for ( ; offsetIter != offsetEnd; ++offsetIter ) {
+
+        // write linear offset
+        uint64_t linearOffset = (*offsetIter);
+        if ( m_isBigEndian ) SwapEndian_64(linearOffset);
+        elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream);
+    }
+
+    // return success/failure of write (one element for the count, plus one per offset)
+    return ( elementsWritten == (size_t)(offsetCount + 1) );
+}
+
+// writes one reference (its bins, then its linear offsets) to the index file,
+// return true if both parts were written OK
+bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) {
+    // both parts are always written, in this order, even if the first one fails
+    const bool binsWritten    = WriteBins(refEntry.Bins);
+    const bool offsetsWritten = WriteLinearOffsets(refEntry.Offsets);
+    return binsWritten && offsetsWritten;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamStandardIndex_p.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamStandardIndex_p.h
new file mode 100755
index 0000000..4a40ac0
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamStandardIndex_p.h
@@ -0,0 +1,213 @@
+// ***************************************************************************
+// BamStandardIndex.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the standardized BAM index format (".bai")
+// ***************************************************************************
+
+#ifndef BAM_STANDARD_INDEX_FORMAT_H
+#define BAM_STANDARD_INDEX_FORMAT_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <BamAux.h>
+#include <BamIndex.h>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class BamAlignment;
+
+namespace Internal {
+
+// BAM index constants
+// MAX_BIN is the size of the bin array handed to BinsFromRegion()
+const int MAX_BIN = 37450; // =(8^6-1)/7+1
+// NOTE(review): presumably the shift that maps a position to its linear-offset
+// slot (2^14 positions per slot) - confirm against the BAM index specification
+const int BAM_LIDX_SHIFT = 14;
+
+// --------------------------------------------------
+// BamStandardIndex data structures & typedefs
+struct Chunk {
+
+    // the two 64-bit values that delimit the chunk
+    uint64_t Start;
+    uint64_t Stop;
+
+    // constructor - both bounds default to zero
+    Chunk(const uint64_t& start = 0, const uint64_t& stop = 0)
+        : Start(start), Stop(stop)
+    { }
+};
+
+// ordering predicate for chunks: compares Start only, Stop is ignored
+inline
+bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
+    return rhs.Start > lhs.Start;
+}
+
+typedef std::vector<Chunk> ChunkVector;
+typedef std::map<uint32_t, ChunkVector> BamBinMap;
+typedef std::vector<uint64_t> LinearOffsetVector;
+
+// index data kept for one reference: its bins, its linear offsets,
+// and a flag telling whether the reference has alignments
+struct ReferenceIndex {
+
+    // data members
+    BamBinMap Bins;
+    LinearOffsetVector Offsets;
+    bool HasAlignments;
+
+    // constructor - defaults to empty containers and "no alignments"
+    ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
+                   const LinearOffsetVector& offsets = LinearOffsetVector(),
+                   const bool hasAlignments = false)
+        : Bins(binMap), Offsets(offsets), HasAlignments(hasAlignments)
+    { }
+};
+
+typedef std::map<int32_t, ReferenceIndex> BamStandardIndexData;
+
+// BamIndex implementation for the standardized ".bai" format (see Extension()).
+// Index data is held per reference ID in m_indexData; each reference stores a
+// bin -> chunks map plus a vector of linear offsets (see ReferenceIndex).
+class BamStandardIndex : public BamIndex {
+
+ // ctor & dtor
+ public:
+ BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
+ ~BamStandardIndex(void);
+
+ // interface (implements BamIndex virtual methods)
+ public:
+ // creates index data (in-memory) from current reader data
+ bool Build(void);
+ // returns supported file extension
+ const std::string Extension(void) const { return std::string(".bai"); }
+ // returns whether reference has alignments or no
+ bool HasAlignments(const int& referenceID) const;
+ // attempts to use index to jump to region; returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+ public:
+ // clear all current index offset data in memory
+ void ClearAllData(void);
+ // return file position after header metadata
+ const off_t DataBeginOffset(void) const;
+ // return true if all index data is cached
+ bool HasFullDataCache(void) const;
+ // clears index data from all references except the first
+ void KeepOnlyFirstReferenceOffsets(void);
+ // load index data for all references, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadAllReferences(bool saveData = true);
+ // load first reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadFirstReference(bool saveData = true);
+ // load header data from index file, return true if loaded OK
+ bool LoadHeader(void);
+ // position file pointer to first reference begin, return true if skipped OK
+ bool SkipToFirstReference(void);
+ // write index reference data
+ bool WriteAllReferences(void);
+ // write index header data
+ bool WriteHeader(void);
+
+ // 'internal' methods
+ // NOTE(review): these are declared public despite being labelled internal
+ public:
+
+ // -----------------------
+ // index file operations
+
+ // check index file magic number, return true if OK
+ bool CheckMagicNumber(void);
+ // check index file version, return true if OK
+ bool CheckVersion(void);
+ // load a single index bin entry from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
+ // NOTE(review): presumably loads every bin entry of a reference - confirm in the .cpp
+ bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
+ // load a single chunk entry from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadChunk(ChunkVector& chunks, bool saveData = true);
+ // NOTE(review): presumably loads every chunk entry of a bin - confirm in the .cpp
+ bool LoadChunks(ChunkVector& chunks, bool saveData = true);
+ // load a single index linear offset entry from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
+ // load a single reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadReference(const int& refId, bool saveData = true);
+ // loads number of references, return true if loaded OK
+ bool LoadReferenceCount(int& numReferences);
+ // position file pointer to desired reference begin, return true if skipped OK
+ bool SkipToReference(const int& refId);
+ // write index data for bin to new index file
+ bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
+ // write index data for bins to new index file
+ bool WriteBins(const BamBinMap& bins);
+ // write index data for chunk entry to new index file
+ bool WriteChunk(const Chunk& chunk);
+ // write index data for all chunk entries of a bin to new index file
+ bool WriteChunks(const ChunkVector& chunks);
+ // write index data for linear offsets entry to new index file
+ bool WriteLinearOffsets(const LinearOffsetVector& offsets);
+ // write index data single reference to new index file
+ bool WriteReference(const ReferenceIndex& refEntry);
+
+ // -----------------------
+ // index data operations
+
+ // calculate bins that overlap region
+ // @bins - caller-provided array of MAX_BIN entries
+ int BinsFromRegion(const BamRegion& region,
+ const bool isRightBoundSpecified,
+ uint16_t bins[MAX_BIN]);
+ // clear all index offset data for desired reference
+ void ClearReferenceOffsets(const int& refId);
+ // calculates offset(s) for a given region
+ bool GetOffsets(const BamRegion& region,
+ const bool isRightBoundSpecified,
+ std::vector<int64_t>& offsets,
+ bool* hasAlignmentsInRegion);
+ // returns true if index cache has data for desired reference
+ bool IsDataLoaded(const int& refId) const;
+ // clears index data from all references except the one specified
+ void KeepOnlyReferenceOffsets(const int& refId);
+ // simplifies index by merging 'chunks'
+ void MergeChunks(void);
+ // saves BAM bin entry for index
+ void SaveBinEntry(BamBinMap& binMap,
+ const uint32_t& saveBin,
+ const uint64_t& saveOffset,
+ const uint64_t& lastOffset);
+ // saves linear offset entry for index
+ void SaveLinearOffset(LinearOffsetVector& offsets,
+ const BamAlignment& bAlignment,
+ const uint64_t& lastOffset);
+ // initializes index data structure to hold @count references
+ void SetReferenceCount(const int& count);
+
+ // data members
+ private:
+
+ // index data, keyed by reference ID
+ BamStandardIndexData m_indexData;
+ // file position after header metadata (returned by DataBeginOffset())
+ off_t m_dataBeginOffset;
+ // true if all index data is cached (returned by HasFullDataCache())
+ bool m_hasFullDataCache;
+ // true when values must be byte-swapped on read/write
+ bool m_isBigEndian;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAM_STANDARD_INDEX_FORMAT_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamToolsIndex_p.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamToolsIndex_p.cpp
new file mode 100755
index 0000000..1728b62
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamToolsIndex_p.cpp
@@ -0,0 +1,577 @@
+// ***************************************************************************
+// BamToolsIndex.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the BamTools index format (".bti")
+// ***************************************************************************
+
+#include <BamAlignment.h>
+#include <BamReader.h>
+#include <BGZF.h>
+#include <BamToolsIndex_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <iostream>
+#include <map>
+using namespace std;
+
+// ctor: starts with a block size of 1000 alignments, no cached data, and the
+// host's endianness detected once up front
+BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader)
+    : BamIndex(bgzf, reader)
+    , m_blockSize(1000)
+    , m_dataBeginOffset(0)
+    , m_hasFullDataCache(false)
+    , m_isBigEndian(BamTools::SystemIsBigEndian())
+    , m_inputVersion(0)
+    , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
+{ }
+
+// dtor - drops whatever index offsets are still held in memory
+BamToolsIndex::~BamToolsIndex(void) {
+    this->ClearAllData();
+}
+
+// creates index data (in-memory) from current reader data
+//
+// Rewinds the reader, then walks every alignment and stores one index entry
+// (max end position, block start offset, block start position) per block of up to
+// m_blockSize alignments. A block is also closed as soon as the reference ID
+// changes, so an entry never mixes two references.
+// Returns false if the reader/BGZF handles are unusable or if a rewind fails.
+bool BamToolsIndex::Build(void) {
+
+    // be sure reader & BGZF file are valid & open for reading
+    if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
+        return false;
+
+    // move file pointer to beginning of alignments
+    if ( !m_reader->Rewind() ) return false;
+
+    // initialize index data structure with space for all references
+    const int numReferences = (int)m_references.size();
+    m_indexData.clear();
+    m_hasFullDataCache = false;
+    SetReferenceCount(numReferences);
+
+    // set up counters and markers
+    int32_t currentBlockCount      = 0;
+    int64_t currentAlignmentOffset = m_BGZF->Tell();
+    int32_t blockRefId             = 0;
+    int32_t blockMaxEndPosition    = 0;
+    int64_t blockStartOffset       = currentAlignmentOffset;
+    int32_t blockStartPosition     = -1;
+
+    // plow through alignments, storing index entries
+    BamAlignment al;
+    while ( m_reader->GetNextAlignmentCore(al) ) {
+
+        // if block contains data (not the first time through) AND alignment is on a new reference
+        if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
+
+            // store previous data
+            BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+            SaveOffsetEntry(blockRefId, entry);
+
+            // initialize new block for current alignment's reference
+            // (the running max end position is only restarted here, on a reference change)
+            currentBlockCount   = 0;
+            blockMaxEndPosition = al.GetEndPosition();
+            blockStartOffset    = currentAlignmentOffset;
+        }
+
+        // if beginning of block, save first alignment's refID & position
+        if ( currentBlockCount == 0 ) {
+            blockRefId         = al.RefID;
+            blockStartPosition = al.Position;
+        }
+
+        // increment block counter
+        ++currentBlockCount;
+
+        // check end position
+        int32_t alignmentEndPosition = al.GetEndPosition();
+        if ( alignmentEndPosition > blockMaxEndPosition )
+            blockMaxEndPosition = alignmentEndPosition;
+
+        // if block is full, get offset for next block, reset currentBlockCount
+        if ( currentBlockCount == m_blockSize ) {
+            BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+            SaveOffsetEntry(blockRefId, entry);
+            blockStartOffset  = m_BGZF->Tell();
+            currentBlockCount = 0;
+        }
+
+        // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
+        // necessary because we won't know if this next alignment is on a new reference until we actually read it
+        currentAlignmentOffset = m_BGZF->Tell();
+    }
+
+    // store final block, but only if it actually holds alignments:
+    // with an empty file, or when the last block was exactly full, there is nothing
+    // left to record and saving anyway would add a bogus entry (and, for an empty
+    // file, flag reference 0 as having alignments)
+    if ( currentBlockCount > 0 ) {
+        BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+        SaveOffsetEntry(blockRefId, entry);
+    }
+
+    // set flag
+    m_hasFullDataCache = true;
+
+    // return success/failure of rewind
+    return m_reader->Rewind();
+}
+
+// check index file magic number, return true if OK
+// reads 4 bytes at the current position of the index stream and compares them
+// with "BTI\1"; a message is printed to stderr when the bytes do not match
+bool BamToolsIndex::CheckMagicNumber(void) {
+
+    // the magic number is 4 raw bytes, not a NUL-terminated string,
+    // so it is compared byte-wise
+    static const char expectedMagic[4] = { 'B', 'T', 'I', '\1' };
+
+    // see if index is valid BAM index
+    char magic[4];
+    const size_t elementsRead = fread(magic, 1, sizeof(magic), m_indexStream);
+    if ( elementsRead != sizeof(magic) ) return false;
+
+    if ( std::memcmp(magic, expectedMagic, sizeof(magic)) != 0 ) {
+        fprintf(stderr, "Problem with index file - invalid format.\n");
+        return false;
+    }
+
+    // otherwise ok
+    return true;
+}
+
+// check index file version, return true if OK
+// the version read from the file is kept in m_inputVersion; every rejection
+// except a failed read prints an explanation to stderr
+bool BamToolsIndex::CheckVersion(void) {
+
+    // read version from file
+    if ( fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream) != 1 )
+        return false;
+    if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
+
+    // zero or negative values are not valid versions
+    if ( m_inputVersion <= 0 ) {
+        fprintf(stderr, "Problem with index file - invalid version.\n");
+        return false;
+    }
+
+    // version is newer than this code can write, hence newer than it can read
+    if ( m_inputVersion > m_outputVersion ) {
+        fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n");
+        fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
+        return false;
+    }
+
+    // ------------------------------------------------------------------
+    // deprecated, unsupported versions
+    // (typically whose format did not accommodate a particular bug fix)
+    switch ( (Version)m_inputVersion ) {
+
+        case BTI_1_0 :
+            fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
+            fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
+            return false;
+
+        case BTI_1_1 :
+            fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
+            fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
+            return false;
+
+        // anything else is a supported version
+        default :
+            return true;
+    }
+}
+
+// clear all current index offset data in memory
+// (the per-reference map entries themselves are kept; only their offsets are emptied)
+void BamToolsIndex::ClearAllData(void) {
+    BamToolsIndexData::const_iterator refIter = m_indexData.begin();
+    while ( refIter != m_indexData.end() ) {
+        ClearReferenceOffsets(refIter->first);
+        ++refIter;
+    }
+}
+
+// clear all index offset data for desired reference
+// does nothing for an unknown reference; otherwise empties the offsets list
+// (the HasAlignments flag is left alone) and marks the cache as incomplete
+void BamToolsIndex::ClearReferenceOffsets(const int& refId) {
+    BamToolsIndexData::iterator refIter = m_indexData.find(refId);
+    if ( refIter == m_indexData.end() ) return;
+    refIter->second.Offsets.clear();
+    m_hasFullDataCache = false;
+}
+
+// return file position after header metadata
+// (the value recorded by LoadHeader() / WriteHeader() right after the block size
+// field; 0 until one of them has run)
+const off_t BamToolsIndex::DataBeginOffset(void) const {
+ return m_dataBeginOffset;
+}
+
+// calculate BAM file offset for desired region
+// return true if no error (*NOT* equivalent to "has alignments or valid offset")
+// check @hasAlignmentsInRegion to determine this status
+// @region - target region (only its left bound is used)
+// @offset - resulting seek target
+// @hasAlignmentsInRegion - set to true when some index entry ends at or after the left bound
+// N.B. - ignores isRightBoundSpecified
+bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
+
+    // the left-bound reference must be known to the index
+    if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
+        return false;
+
+    // load index data for region if not already cached
+    // (the load is attempted even when the skip failed; either failure aborts)
+    if ( !IsDataLoaded(region.LeftRefID) ) {
+        const bool skippedOk = SkipToReference(region.LeftRefID);
+        const bool loadedOk  = LoadReference(region.LeftRefID);
+        if ( !(skippedOk && loadedOk) ) return false;
+    }
+
+    // look the reference up again now that loading is done, and make sure it has entries
+    BamToolsIndexData::const_iterator refIter = m_indexData.find(region.LeftRefID);
+    if ( refIter == m_indexData.end() ) return false;
+    const vector<BamToolsIndexEntry>& entries = refIter->second.Offsets;
+    if ( entries.empty() ) return false;
+
+    // -------------------------------------------------------
+    // calculate nearest index to jump to
+
+    // start from the first entry, then keep advancing while entries end before the
+    // left bound; @offset trails one entry behind the first entry that reaches it
+    offset = entries.front().StartOffset;
+    vector<BamToolsIndexEntry>::const_iterator entryIter = entries.begin();
+    const vector<BamToolsIndexEntry>::const_iterator entryEnd = entries.end();
+    while ( entryIter != entryEnd && entryIter->MaxEndPosition < region.LeftPosition ) {
+        offset = entryIter->StartOffset;
+        ++entryIter;
+    }
+
+    // set flag based on whether an index entry was found for this region
+    *hasAlignmentsInRegion = ( entryIter != entryEnd );
+
+    // if cache mode set to none, dump the data we just loaded
+    if ( m_cacheMode == BamIndex::NoIndexCaching )
+        ClearReferenceOffsets(region.LeftRefID);
+
+    // return success
+    return true;
+}
+
+// returns whether reference has alignments or no
+// (an unknown reference ID counts as "no alignments")
+bool BamToolsIndex::HasAlignments(const int& refId) const {
+    const BamToolsIndexData::const_iterator refIter = m_indexData.find(refId);
+    return ( refIter != m_indexData.end() ) && refIter->second.HasAlignments;
+}
+
+// return true if all index data is cached
+// (the flag is set after a complete Build() or a saving LoadAllReferences(),
+// and dropped again whenever a reference's offsets are cleared)
+bool BamToolsIndex::HasFullDataCache(void) const {
+ return m_hasFullDataCache;
+}
+
+// returns true if index cache has data for desired reference
+// unknown reference -> false; a reference without alignments has nothing to
+// load and therefore counts as loaded; otherwise the offsets list must be non-empty
+bool BamToolsIndex::IsDataLoaded(const int& refId) const {
+    const BamToolsIndexData::const_iterator refIter = m_indexData.find(refId);
+    if ( refIter == m_indexData.end() )
+        return false;
+    return !refIter->second.HasAlignments || !refIter->second.Offsets.empty();
+}
+
+// attempts to use index to jump to region; returns success/fail
+// @hasAlignmentsInRegion is cleared first and only set by the offset calculation
+bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
+
+    // clear flag
+    *hasAlignmentsInRegion = false;
+
+    // check valid BamReader state
+    const bool readerUsable = ( m_reader != 0 && m_BGZF != 0 && m_reader->IsOpen() );
+    if ( !readerUsable ) {
+        fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
+        return false;
+    }
+
+    // make sure left-bound position is valid
+    // (at() is range-checked, so an out-of-range LeftRefID raises std::out_of_range here)
+    const bool leftBoundTooFar = ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength );
+    if ( leftBoundTooFar )
+        return false;
+
+    // calculate nearest offset to jump to
+    int64_t offset;
+    const bool offsetFound = GetOffset(region, offset, hasAlignmentsInRegion);
+    if ( !offsetFound ) {
+        fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
+        return false;
+    }
+
+    // return success/failure of seek
+    return m_BGZF->Seek(offset);
+}
+
+// clears index data from all references except the first
+// ("first" is the smallest reference ID present in the index data)
+void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) {
+
+    // nothing to keep or clear when the index holds no references;
+    // begin() must not be dereferenced on an empty map
+    if ( m_indexData.empty() ) return;
+
+    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
+    KeepOnlyReferenceOffsets( (*indexBegin).first );
+}
+
+// clears index data from all references except the one specified
+void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) {
+    BamToolsIndexData::iterator refIter = m_indexData.begin();
+    while ( refIter != m_indexData.end() ) {
+        const int candidateId = refIter->first;
+        if ( candidateId != refId )
+            ClearReferenceOffsets(candidateId);
+        ++refIter;
+    }
+}
+
+// load index data for all references, return true if loaded OK
+// @saveData - save data in memory if true, just read & discard if false
+// every reference is attempted even after an earlier one failed
+bool BamToolsIndex::LoadAllReferences(bool saveData) {
+
+    // skip if data already loaded
+    if ( m_hasFullDataCache ) return true;
+
+    // read in number of references
+    int32_t numReferences;
+    if ( !LoadReferenceCount(numReferences) ) return false;
+
+    // references are stored consecutively, IDs 0 .. numReferences-1
+    bool everyReferenceLoaded = true;
+    int refId = 0;
+    while ( refId < numReferences ) {
+        if ( !LoadReference(refId, saveData) )
+            everyReferenceLoaded = false;
+        ++refId;
+    }
+
+    // the cache is only complete when everything loaded AND was kept
+    if ( everyReferenceLoaded && saveData )
+        m_hasFullDataCache = true;
+
+    // return success/failure of load
+    return everyReferenceLoaded;
+}
+
+// load header data from index file, return true if loaded OK
+// header layout as read here: magic number, version, block size
+bool BamToolsIndex::LoadHeader(void) {
+
+    // magic number first, then BTI version (not read if the magic number is wrong)
+    if ( !CheckMagicNumber() || !CheckVersion() )
+        return false;
+
+    // read in block size
+    if ( fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream) != 1 )
+        return false;
+    if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
+
+    // store offset of beginning of data
+    m_dataBeginOffset = ftell64(m_indexStream);
+
+    // every read above succeeded
+    return true;
+}
+
+// load a single index entry from file, return true if loaded OK
+// @refId    - reference the entry is stored under when saved
+// @saveData - save data in memory if true, just read & discard if false
+bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) {
+
+    // read in index entry data members, in their on-disk order
+    // (all three reads are issued even if an earlier one came up short)
+    BamToolsIndexEntry entry;
+    const size_t maxEndRead = fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream);
+    const size_t offsetRead = fread(&entry.StartOffset,    sizeof(entry.StartOffset),    1, m_indexStream);
+    const size_t startRead  = fread(&entry.StartPosition,  sizeof(entry.StartPosition),  1, m_indexStream);
+
+    const size_t elementsRead = maxEndRead + offsetRead + startRead;
+    if ( elementsRead != 3 ) {
+        cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl;
+        return false;
+    }
+
+    // swap endian-ness if necessary
+    if ( m_isBigEndian ) {
+        SwapEndian_32(entry.MaxEndPosition);
+        SwapEndian_64(entry.StartOffset);
+        SwapEndian_32(entry.StartPosition);
+    }
+
+    // keep the entry only when asked to
+    if ( saveData )
+        SaveOffsetEntry(refId, entry);
+
+    // return success/failure of load
+    return true;
+}
+
+// load first reference from file, return true if loaded OK
+// ("first" is the smallest reference ID present in the index data)
+// @saveData - save data in memory if true, just read & discard if false
+bool BamToolsIndex::LoadFirstReference(bool saveData) {
+
+    // no references known: nothing to load, and begin() must not be
+    // dereferenced on an empty map
+    if ( m_indexData.empty() ) return false;
+
+    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
+    return LoadReference( (*indexBegin).first, saveData );
+}
+
+// load a single reference from file, return true if loaded OK
+// @refId    - reference the loaded entries belong to
+// @saveData - save data in memory if true, just read & discard if false
+// expects the index stream to be positioned at the reference's offset count;
+// returns false if the count or any of its entries cannot be read
+bool BamToolsIndex::LoadReference(const int& refId, bool saveData) {
+
+    // read in number of offsets for this reference
+    uint32_t numOffsets;
+    size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
+    if ( elementsRead != 1 ) return false;
+    if ( m_isBigEndian ) SwapEndian_32(numOffsets);
+
+    // initialize offsets container for this reference
+    // (also records whether the reference has any alignments at all)
+    SetOffsetCount(refId, (int)numOffsets);
+
+    // iterate over offset entries, stopping at the first one that fails:
+    // once a read has failed the stream is no longer at a known entry boundary,
+    // so reading on would only produce further errors
+    for ( unsigned int j = 0; j < numOffsets; ++j ) {
+        if ( !LoadIndexEntry(refId, saveData) )
+            return false;
+    }
+
+    // return success/failure of load
+    return true;
+}
+
+// loads number of references, return true if loaded OK
+// @numReferences - receives the count read from the current stream position
+bool BamToolsIndex::LoadReferenceCount(int& numReferences) {
+
+    // read reference count
+    const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+
+    // return success/failure of load
+    return ( elementsRead == 1 );
+}
+
+// saves an index offset entry in memory
+// creates the reference's record if it does not exist yet, and flags the
+// reference as having alignments
+void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
+    BamToolsReferenceEntry& target = m_indexData[refId];
+    target.HasAlignments = true;
+    target.Offsets.push_back(entry);
+}
+
+// pre-allocates size for offset vector
+// also (re)sets the reference's HasAlignments flag: true only for a positive count
+void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) {
+    BamToolsReferenceEntry& target = m_indexData[refId];
+    target.Offsets.reserve(offsetCount);
+    target.HasAlignments = ( offsetCount > 0 );
+}
+
+// initializes index data structure to hold @count references
+// (IDs 0 .. count-1 each get a record flagged as having no alignments)
+void BamToolsIndex::SetReferenceCount(const int& count) {
+    int refId = 0;
+    while ( refId < count ) {
+        m_indexData[refId].HasAlignments = false;
+        ++refId;
+    }
+}
+
+// position file pointer to first reference begin, return true if skipped OK
+// ("first" is the smallest reference ID present in the index data)
+bool BamToolsIndex::SkipToFirstReference(void) {
+
+    // no references known: there is nothing to skip to, and begin() must not be
+    // dereferenced on an empty map
+    if ( m_indexData.empty() ) return false;
+
+    BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
+    return SkipToReference( (*indexBegin).first );
+}
+
+// position file pointer to desired reference begin, return true if skipped OK
+// @refId - reference to stop in front of; must lie in [0, number of references in file)
+// rewinds, reads the reference count, then reads and discards every reference
+// that precedes @refId
+bool BamToolsIndex::SkipToReference(const int& refId) {
+
+    // attempt rewind
+    if ( !Rewind() ) return false;
+
+    // read in number of references
+    int32_t numReferences;
+    size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
+    if ( elementsRead != 1 ) return false;
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+
+    // reject IDs the file cannot contain: without this check a negative or
+    // too-large ID would keep the loop below reading far beyond the stored data
+    if ( refId < 0 || refId >= numReferences ) return false;
+
+    // read past every preceding reference, giving up at the first failure
+    // (after a failed read the stream is not at a reference boundary any more)
+    for ( int currentRefId = 0; currentRefId < refId; ++currentRefId ) {
+        if ( !LoadReference(currentRefId, false) )
+            return false;
+    }
+
+    // return success/failure of skip
+    return true;
+}
+
+// write header to new index file
+// layout: 4 magic bytes, 32-bit format version, 32-bit block size
+bool BamToolsIndex::WriteHeader(void) {
+
+    // write BTI index format 'magic number' (counted per byte)
+    const size_t magicBytes = fwrite("BTI\1", 1, 4, m_indexStream);
+
+    // write BTI index format version
+    int32_t version = (int32_t)m_outputVersion;
+    if ( m_isBigEndian ) SwapEndian_32(version);
+    const size_t versionItems = fwrite(&version, sizeof(version), 1, m_indexStream);
+
+    // write block size (a local copy is swapped, the member stays untouched)
+    int32_t blockSize = m_blockSize;
+    if ( m_isBigEndian ) SwapEndian_32(blockSize);
+    const size_t blockSizeItems = fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream);
+
+    // store offset of beginning of data
+    m_dataBeginOffset = ftell64(m_indexStream);
+
+    // return success/failure of write: 4 magic bytes + version + block size
+    return ( magicBytes + versionItems + blockSizeItems == 6 );
+}
+
+// write index data for all references to new index file
+// writes the number of entries in the index data, then each reference in key order;
+// all references are attempted even after a failed write
+bool BamToolsIndex::WriteAllReferences(void) {
+
+    // write number of references
+    int32_t numReferences = (int32_t)m_indexData.size();
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+    const size_t countItems = fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream);
+
+    // iterate through references in index
+    bool everyReferenceWritten = true;
+    BamToolsIndexData::const_iterator refIter = m_indexData.begin();
+    while ( refIter != m_indexData.end() ) {
+        if ( !WriteReferenceEntry(refIter->second) )
+            everyReferenceWritten = false;
+        ++refIter;
+    }
+
+    return ( (countItems == 1) && everyReferenceWritten );
+}
+
+// write current reference index data to new index file
+// writes the 32-bit number of offset entries, then every entry in order;
+// all entries are attempted even after a failed write
+bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {
+
+    // write number of offsets listed for this reference
+    uint32_t numOffsets = refEntry.Offsets.size();
+    if ( m_isBigEndian ) SwapEndian_32(numOffsets);
+    const size_t countItems = fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
+
+    // iterate over offset entries
+    bool everyEntryWritten = true;
+    vector<BamToolsIndexEntry>::const_iterator entryIter = refEntry.Offsets.begin();
+    while ( entryIter != refEntry.Offsets.end() ) {
+        if ( !WriteIndexEntry(*entryIter) )
+            everyEntryWritten = false;
+        ++entryIter;
+    }
+
+    return ( (countItems == 1) && everyEntryWritten );
+}
+
+// write current index offset entry to new index file
+// field order on disk: max end position (32-bit), start offset (64-bit),
+// start position (32-bit)
+bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) {
+
+    // work on copies so the in-memory entry is never byte-swapped
+    int32_t maxEnd = entry.MaxEndPosition;
+    int64_t offset = entry.StartOffset;
+    int32_t start  = entry.StartPosition;
+
+    // swap endian-ness if necessary
+    if ( m_isBigEndian ) {
+        SwapEndian_32(maxEnd);
+        SwapEndian_64(offset);
+        SwapEndian_32(start);
+    }
+
+    // write the reference index entry (all three writes are always issued)
+    const size_t maxEndItems = fwrite(&maxEnd, sizeof(maxEnd), 1, m_indexStream);
+    const size_t offsetItems = fwrite(&offset, sizeof(offset), 1, m_indexStream);
+    const size_t startItems  = fwrite(&start,  sizeof(start),  1, m_indexStream);
+    return ( maxEndItems + offsetItems + startItems == 3 );
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamToolsIndex_p.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamToolsIndex_p.h
new file mode 100755
index 0000000..3305fb6
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamToolsIndex_p.h
@@ -0,0 +1,192 @@
+// ***************************************************************************
+// BamToolsIndex.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the BamTools index format (".bti")
+// ***************************************************************************
+
+#ifndef BAMTOOLS_INDEX_FORMAT_H
+#define BAMTOOLS_INDEX_FORMAT_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <BamAux.h>
+#include <BamIndex.h>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+namespace Internal {
+
+// individual index offset entry
+// describes one block of alignments: the largest end position seen in the block,
+// the file offset where the block starts, and the position of its first alignment
+struct BamToolsIndexEntry {
+
+    // data members
+    int32_t MaxEndPosition;
+    int64_t StartOffset;
+    int32_t StartPosition;
+
+    // ctor - every field defaults to zero
+    BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
+                       const int64_t& startOffset    = 0,
+                       const int32_t& startPosition  = 0)
+        : MaxEndPosition(maxEndPosition), StartOffset(startOffset), StartPosition(startPosition)
+    { }
+};
+
+// reference index entry
+// holds the offset entries of one reference plus a flag telling whether the
+// reference has any alignments
+struct BamToolsReferenceEntry {
+
+    // data members
+    bool HasAlignments;
+    std::vector<BamToolsIndexEntry> Offsets;
+
+    // ctor - starts out empty and flagged as having no alignments
+    BamToolsReferenceEntry(void) : HasAlignments(false) { }
+};
+
+// the actual index data structure
+typedef std::map<int, BamToolsReferenceEntry> BamToolsIndexData;
+
+// BamIndex implementation for the BamTools ".bti" format (see Extension()).
+// Index data is held per reference ID in m_indexData; each reference stores a
+// list of BamToolsIndexEntry records.
+class BamToolsIndex : public BamIndex {
+
+ // keep a list of any supported versions here
+ // (might be useful later to handle any 'legacy' versions if the format changes)
+ // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
+ //
+ // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
+ //
+ // if ( indexVersion >= BTI_1_2 )
+ // do something new
+ // else
+ // do the old thing
+ enum Version { BTI_1_0 = 1
+ , BTI_1_1
+ , BTI_1_2
+ };
+
+
+ // ctor & dtor
+ public:
+ BamToolsIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
+ ~BamToolsIndex(void);
+
+ // interface (implements BamIndex virtual methods)
+ public:
+ // creates index data (in-memory) from current reader data
+ bool Build(void);
+ // returns supported file extension
+ const std::string Extension(void) const { return std::string(".bti"); }
+ // returns whether reference has alignments or no
+ bool HasAlignments(const int& referenceID) const;
+ // attempts to use index to jump to region; returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+ public:
+ // clear all current index offset data in memory
+ void ClearAllData(void);
+ // return file position after header metadata
+ const off_t DataBeginOffset(void) const;
+ // return true if all index data is cached
+ bool HasFullDataCache(void) const;
+ // clears index data from all references except the first
+ void KeepOnlyFirstReferenceOffsets(void);
+ // load index data for all references, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadAllReferences(bool saveData = true);
+ // load first reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadFirstReference(bool saveData = true);
+ // load header data from index file, return true if loaded OK
+ bool LoadHeader(void);
+ // position file pointer to first reference begin, return true if skipped OK
+ bool SkipToFirstReference(void);
+ // write index reference data
+ bool WriteAllReferences(void);
+ // write index header data
+ bool WriteHeader(void);
+
+ // 'internal' methods
+ // NOTE(review): these are declared public despite being labelled internal
+ public:
+
+ // -----------------------
+ // index file operations
+
+ // check index file magic number, return true if OK
+ bool CheckMagicNumber(void);
+ // check index file version, return true if OK
+ bool CheckVersion(void);
+ // return true if FILE* is open
+ // NOTE(review): no definition is visible next to the other BamToolsIndex
+ // methods - confirm where this is implemented
+ bool IsOpen(void) const;
+ // load a single index entry from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadIndexEntry(const int& refId, bool saveData = true);
+ // load a single reference from file, return true if loaded OK
+ // @saveData - save data in memory if true, just read & discard if false
+ bool LoadReference(const int& refId, bool saveData = true);
+ // loads number of references, return true if loaded OK
+ bool LoadReferenceCount(int& numReferences);
+ // position file pointer to desired reference begin, return true if skipped OK
+ bool SkipToReference(const int& refId);
+ // write current reference index data to new index file
+ bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry);
+ // write current index offset entry to new index file
+ bool WriteIndexEntry(const BamToolsIndexEntry& entry);
+
+ // -----------------------
+ // index data operations
+
+ // clear all index offset data for desired reference
+ void ClearReferenceOffsets(const int& refId);
+ // calculate BAM file offset for desired region
+ // return true if no error (*NOT* equivalent to "has alignments or valid offset")
+ // check @hasAlignmentsInRegion to determine this status
+ // @region - target region
+ // @offset - resulting seek target
+ // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
+ bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
+ // returns true if index cache has data for desired reference
+ bool IsDataLoaded(const int& refId) const;
+ // clears index data from all references except the one specified
+ void KeepOnlyReferenceOffsets(const int& refId);
+ // saves an index offset entry in memory
+ void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry);
+ // pre-allocates size for offset vector
+ void SetOffsetCount(const int& refId, const int& offsetCount);
+ // initializes index data structure to hold @count references
+ void SetReferenceCount(const int& count);
+
+ // data members
+ private:
+ // number of alignments per index block
+ int32_t m_blockSize;
+ // index data, keyed by reference ID
+ BamToolsIndexData m_indexData;
+ // file position after header metadata (returned by DataBeginOffset())
+ off_t m_dataBeginOffset;
+ // true if all index data is cached (returned by HasFullDataCache())
+ bool m_hasFullDataCache;
+ // true when values must be byte-swapped on read/write
+ bool m_isBigEndian;
+ int32_t m_inputVersion; // Version is serialized as int
+ // format version written to new index files
+ Version m_outputVersion;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMTOOLS_INDEX_FORMAT_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter.cpp
new file mode 100755
index 0000000..f168a2f
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter.cpp
@@ -0,0 +1,47 @@
+// ***************************************************************************
+// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include <BamWriter.h>
+#include <BamWriter_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+using namespace std;
+
+// constructor: allocates the private implementation object that every
+// public method of this class forwards to
+BamWriter::BamWriter(void)
+    : d(new BamWriterPrivate)
+{ }
+
+// destructor: releases the private implementation object
+// (its own destructor closes the underlying BGZF stream)
+BamWriter::~BamWriter(void)
+{
+    BamWriterPrivate* impl = d;
+    d = 0;
+    delete impl;
+}
+
+// closes the alignment archive by forwarding to the private implementation
+void BamWriter::Close(void)
+{
+    BamWriterPrivate& impl = *d;
+    impl.Close();
+}
+
+// opens the alignment archive
+// @filename            - path of the BAM file to create
+// @samHeader           - SAM header text stored in the BAM header
+// @referenceSequences  - reference names/lengths for the sequence dictionary
+// @isWriteUncompressed - forwarded unchanged to the private implementation
+// returns the private implementation's result (false if the file could not be opened)
+bool BamWriter::Open(const string& filename,
+                     const string& samHeader,
+                     const RefVector& referenceSequences,
+                     bool isWriteUncompressed)
+{
+    const bool opened = d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);
+    return opened;
+}
+
+// saves the alignment to the alignment archive
+// (pure forwarding call; all encoding happens in the private implementation)
+void BamWriter::SaveAlignment(const BamAlignment& al)
+{
+    BamWriterPrivate& impl = *d;
+    impl.SaveAlignment(al);
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter.h
new file mode 100755
index 0000000..bcbdddd
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter.h
@@ -0,0 +1,50 @@
+// ***************************************************************************
+// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#ifndef BAMWRITER_H
+#define BAMWRITER_H
+
+#include <api_global.h>
+#include <BamAlignment.h>
+#include <string>
+
+namespace BamTools {
+
+namespace Internal {
+ class BamWriterPrivate;
+} // namespace Internal
+
+// Public facade for producing BAM files.  The class holds nothing but a pointer
+// to its private implementation; every method forwards to that object.
+class API_EXPORT BamWriter {
+
+    public:
+        // constructor/destructor
+        BamWriter(void);
+        ~BamWriter(void);
+
+        // opens the alignment archive
+        // (the header text and the reference dictionary are supplied up front)
+        bool Open(const std::string& filename,
+                  const std::string& samHeader,
+                  const BamTools::RefVector& referenceSequences,
+                  bool writeUncompressed = false);
+
+        // saves the alignment to the alignment archive
+        void SaveAlignment(const BamTools::BamAlignment& al);
+
+        // closes the alignment archive
+        void Close(void);
+
+    // private implementation
+    // NOTE(review): no copy constructor / assignment operator is declared here,
+    // so a copied BamWriter would share this raw pointer with the original.
+    private:
+        Internal::BamWriterPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMWRITER_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter_p.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter_p.cpp
new file mode 100755
index 0000000..bc3beb0
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter_p.cpp
@@ -0,0 +1,379 @@
+// ***************************************************************************
+// BamWriter_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include <BamAlignment.h>
+#include <BamWriter_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+using namespace std;
+
+// records the host byte order once; the write routines consult IsBigEndian
+// to decide whether multi-byte fields must be swapped before writing
+BamWriterPrivate::BamWriterPrivate(void)
+    : IsBigEndian(SystemIsBigEndian())
+{ }
+
+// closes the BGZF stream when the writer is destroyed
+BamWriterPrivate::~BamWriterPrivate(void)
+{
+    this->mBGZF.Close();
+}
+
+// closes the alignment archive (the underlying BGZF stream)
+void BamWriterPrivate::Close(void)
+{
+    this->mBGZF.Close();
+}
+
+// calculates minimum bin for a BAM alignment interval
+// 'end' is decremented before use, so the interval is treated as [begin, end).
+// Levels are tried from the narrowest shift (14) to the widest (26); the first
+// level at which both ends shift to the same value yields the bin, and 0 is
+// returned when no level matches.
+const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
+
+    // per level: shift applied to both ends, and constant added to the result
+    static const int LEVEL_SHIFT[]  = { 14, 17, 20, 23, 26 };
+    static const int LEVEL_OFFSET[] = { 4681, 585, 73, 9, 1 };
+    static const int LEVEL_COUNT    = 5;
+
+    const int last = end - 1;
+    for (int level = 0; level < LEVEL_COUNT; ++level) {
+        const int firstChunk = begin >> LEVEL_SHIFT[level];
+        const int lastChunk  = last  >> LEVEL_SHIFT[level];
+        if ( firstChunk == lastChunk )
+            return LEVEL_OFFSET[level] + firstChunk;
+    }
+    return 0;
+}
+
+// creates a packed cigar string from the supplied cigar operations
+// Each operation occupies one unsigned int in 'packedCigar': the length shifted
+// left by BAM_CIGAR_SHIFT, OR-ed with the numeric operation code.  Values are
+// stored in host byte order.  An unrecognised operation character terminates
+// the program.
+void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
+
+    // reserve one BT_SIZEOF_INT-sized slot per operation
+    const unsigned int opCount = cigarOperations.size();
+    packedCigar.resize(opCount * BT_SIZEOF_INT);
+
+    // view the string's storage as an array of packed operations
+    unsigned int* slot = (unsigned int*)packedCigar.data();
+
+    for (unsigned int k = 0; k < opCount; ++k, ++slot) {
+
+        const CigarOp& op = cigarOperations[k];
+
+        // translate the operation character into its BAM code
+        unsigned int opCode;
+        switch (op.Type) {
+            case 'M': opCode = BAM_CMATCH;     break;
+            case 'I': opCode = BAM_CINS;       break;
+            case 'D': opCode = BAM_CDEL;       break;
+            case 'N': opCode = BAM_CREF_SKIP;  break;
+            case 'S': opCode = BAM_CSOFT_CLIP; break;
+            case 'H': opCode = BAM_CHARD_CLIP; break;
+            case 'P': opCode = BAM_CPAD;       break;
+            default:
+                fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", op.Type);
+                exit(1);
+        }
+
+        *slot = (op.Length << BAM_CIGAR_SHIFT) | opCode;
+    }
+}
+
+// encodes the supplied query sequence into 4-bit notation
+// Two bases are packed per output byte, the first base of each pair in the high
+// nibble.  For an odd-length query the low nibble of the last byte stays zero.
+// Supported characters are {=, A, C, G, T, N}; any other character (including an
+// embedded NUL) is reported on stderr and terminates the program.
+// @query        - bases to encode
+// @encodedQuery - receives the packed bytes (previous contents are discarded)
+void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
+
+    // prepare the encoded query string: ceil(queryLen / 2) zero-filled bytes
+    const unsigned int queryLen = query.size();
+    const unsigned int encodedQueryLen = (queryLen / 2) + (queryLen % 2);
+    encodedQuery.assign(encodedQueryLen, '\0');
+
+    // iterate by length rather than up to a NUL terminator: the loop must not
+    // depend on data() being NUL-terminated, nor stop early on an embedded NUL
+    for (unsigned int i = 0; i < queryLen; ++i) {
+
+        unsigned char nucleotideCode;
+        switch (query[i]) {
+
+            case '=':
+                nucleotideCode = 0;
+                break;
+
+            case 'A':
+                nucleotideCode = 1;
+                break;
+
+            case 'C':
+                nucleotideCode = 2;
+                break;
+
+            case 'G':
+                nucleotideCode = 4;
+                break;
+
+            case 'T':
+                nucleotideCode = 8;
+                break;
+
+            case 'N':
+                nucleotideCode = 15;
+                break;
+
+            default:
+                fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", query[i]);
+                exit(1);
+        }
+
+        // pack the nucleotide code: even positions fill the high nibble,
+        // odd positions complete the byte with the low nibble
+        if ( (i % 2) == 0 )
+            encodedQuery[i / 2] = nucleotideCode << 4;
+        else
+            encodedQuery[i / 2] |= nucleotideCode;
+    }
+}
+
+// opens the alignment archive
+// Opens the BGZF stream for writing and emits the BAM header: the "BAM\1"
+// signature, the SAM header text preceded by its length, the number of
+// reference sequences, and one (name length, name, sequence length) entry
+// per reference.
+// @filename            - path of the file to create
+// @samHeader           - SAM header text
+// @referenceSequences  - entries of the sequence dictionary
+// @isWriteUncompressed - forwarded to the BGZF layer
+// returns false if the BGZF stream could not be opened, true otherwise
+bool BamWriterPrivate::Open(const string& filename,
+                            const string& samHeader,
+                            const RefVector& referenceSequences,
+                            bool isWriteUncompressed)
+{
+    // open the BGZF file for writing, return failure if error
+    if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
+        return false;
+
+    // ================
+    // write the header
+    // ================
+
+    // write the BAM signature
+    const unsigned char SIGNATURE_LENGTH = 4;
+    const char* BAM_SIGNATURE = "BAM\1";
+    mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
+
+    // write the SAM header text length
+    // (only the on-disk copy is byte-swapped; the host-order value is still
+    //  needed below as the byte count for the text itself)
+    const uint32_t samHeaderLen = samHeader.size();
+    uint32_t samHeaderLenField = samHeaderLen;
+    if (IsBigEndian) SwapEndian_32(samHeaderLenField);
+    mBGZF.Write((char*)&samHeaderLenField, BT_SIZEOF_INT);
+
+    // write the SAM header text
+    if (samHeaderLen > 0)
+        mBGZF.Write(samHeader.data(), samHeaderLen);
+
+    // write the number of reference sequences
+    uint32_t numReferenceSequences = referenceSequences.size();
+    if (IsBigEndian) SwapEndian_32(numReferenceSequences);
+    mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
+
+    // =============================
+    // write the sequence dictionary
+    // =============================
+
+    RefVector::const_iterator rsIter;
+    for (rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); ++rsIter) {
+
+        // write the reference sequence name length (includes the NUL terminator)
+        const uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
+        uint32_t referenceSequenceNameLenField = referenceSequenceNameLen;
+        if (IsBigEndian) SwapEndian_32(referenceSequenceNameLenField);
+        mBGZF.Write((char*)&referenceSequenceNameLenField, BT_SIZEOF_INT);
+
+        // write the reference sequence name, using the host-order length
+        mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
+
+        // write the reference sequence length
+        int32_t referenceLength = rsIter->RefLength;
+        if (IsBigEndian) SwapEndian_32(referenceLength);
+        mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
+    }
+
+    // return success
+    return true;
+}
+
+// saves the alignment to the alignment archive
+//
+// Two cases are handled:
+//  * "core only" alignments (SupportData.HasCoreOnly): the core fields and the
+//    raw character buffer kept in SupportData are written back as they are;
+//  * fully populated alignments: bin, packed cigar, encoded query, qualities
+//    and tag data are derived from the public fields and written in that order
+//    after the query name.
+// On big-endian hosts every multi-byte field is byte-swapped before writing.
+// The alignment passed in is never modified.
+void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
+
+    // if BamAlignment contains only the core data and a raw char data buffer
+    // (as a result of BamReader::GetNextAlignmentCore())
+    if ( al.SupportData.HasCoreOnly ) {
+
+        // write the block size
+        unsigned int blockSize = al.SupportData.BlockLength;
+        if (IsBigEndian) SwapEndian_32(blockSize);
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+        // assign the BAM core data
+        uint32_t buffer[8];
+        buffer[0] = al.RefID;
+        buffer[1] = al.Position;
+        buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
+        buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
+        buffer[4] = al.SupportData.QuerySequenceLength;
+        buffer[5] = al.MateRefID;
+        buffer[6] = al.MatePosition;
+        buffer[7] = al.InsertSize;
+
+        // swap BAM core endian-ness, if necessary
+        if ( IsBigEndian ) {
+            for ( int i = 0; i < 8; ++i )
+                SwapEndian_32(buffer[i]);
+        }
+
+        // write the BAM core
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+        // write the raw char data
+        mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
+    }
+
+    // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
+    // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
+    else {
+
+        // calculate char lengths
+        const unsigned int nameLength         = al.Name.size() + 1;
+        const unsigned int numCigarOperations = al.CigarData.size();
+        const unsigned int queryLength        = al.QueryBases.size();
+        const unsigned int tagDataLength      = al.TagData.size();
+
+        // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
+        // force calculation of Bin before storing
+        const int endPosition = al.GetEndPosition();
+        const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
+
+        // create our packed cigar string
+        string packedCigar;
+        CreatePackedCigar(al.CigarData, packedCigar);
+        const unsigned int packedCigarLength = packedCigar.size();
+
+        // encode the query
+        string encodedQuery;
+        EncodeQuerySequence(al.QueryBases, encodedQuery);
+        const unsigned int encodedQueryLength = encodedQuery.size();
+
+        // write the block size
+        const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
+        unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
+        if (IsBigEndian) SwapEndian_32(blockSize);
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+        // assign the BAM core data
+        uint32_t buffer[8];
+        buffer[0] = al.RefID;
+        buffer[1] = al.Position;
+        buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
+        buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+        buffer[4] = queryLength;
+        buffer[5] = al.MateRefID;
+        buffer[6] = al.MatePosition;
+        buffer[7] = al.InsertSize;
+
+        // swap BAM core endian-ness, if necessary
+        if ( IsBigEndian ) {
+            for ( int i = 0; i < 8; ++i )
+                SwapEndian_32(buffer[i]);
+        }
+
+        // write the BAM core
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+        // write the query name
+        mBGZF.Write(al.Name.c_str(), nameLength);
+
+        // write the packed cigar
+        if ( IsBigEndian ) {
+
+            // swap a private copy, one 32-bit operation at a time
+            // (stepping byte-by-byte would re-swap overlapping words and run
+            //  past the end of the buffer)
+            string swappedCigar(packedCigar);
+            for (unsigned int i = 0; i + BT_SIZEOF_INT <= packedCigarLength; i += BT_SIZEOF_INT)
+                SwapEndian_32p(&swappedCigar[i]);
+
+            mBGZF.Write(swappedCigar.data(), packedCigarLength);
+        }
+        else
+            mBGZF.Write(packedCigar.data(), packedCigarLength);
+
+        // write the encoded query sequence
+        mBGZF.Write(encodedQuery.data(), encodedQueryLength);
+
+        // write the base qualities
+        // The Phred+33 characters are converted in a local buffer so the caller's
+        // alignment stays untouched (saving the same alignment twice must give
+        // the same output).  Exactly queryLength bytes are written because that
+        // is what the block size above accounts for; positions without a
+        // supplied quality are filled with 0xFF.
+        // NOTE(review): 0xFF is the BAM "quality absent" value per the SAM/BAM
+        // specification - confirm this is the desired padding.
+        string baseQualities(queryLength, (char)0xFF);
+        const unsigned int numQualities =
+            ( al.Qualities.size() < queryLength ) ? (unsigned int)al.Qualities.size() : queryLength;
+        for (unsigned int i = 0; i < numQualities; ++i)
+            baseQualities[i] = al.Qualities[i] - 33;
+        mBGZF.Write(baseQualities.data(), queryLength);
+
+        // write the read group tag
+        if ( IsBigEndian ) {
+
+            // work on a private copy of the tag data
+            string tagData(al.TagData);
+
+            unsigned int i = 0;
+            while ( i < tagDataLength ) {
+
+                i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)
+                uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning
+                ++i;                                    // skip value type
+
+                switch (type) {
+
+                    case('A') :
+                    case('C') :
+                        ++i;
+                        break;
+
+                    case('S') :
+                        SwapEndian_16p(&tagData[i]);
+                        i += 2; // sizeof(uint16_t)
+                        break;
+
+                    case('F') :
+                    case('I') :
+                        SwapEndian_32p(&tagData[i]);
+                        i += 4; // sizeof(uint32_t)
+                        break;
+
+                    case('D') :
+                        SwapEndian_64p(&tagData[i]);
+                        i += 8; // sizeof(uint64_t)
+                        break;
+
+                    case('H') :
+                    case('Z') :
+                        // advance to the terminator without leaving the buffer
+                        while ( i < tagDataLength && tagData[i] ) { ++i; }
+                        ++i; // increment one more for null terminator
+                        break;
+
+                    default :
+                        fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+                        exit(1);
+                }
+            }
+
+            mBGZF.Write(tagData.data(), tagDataLength);
+        }
+        else
+            mBGZF.Write(al.TagData.data(), tagDataLength);
+    }
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter_p.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter_p.h
new file mode 100755
index 0000000..f738da7
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/BamWriter_p.h
@@ -0,0 +1,63 @@
+// ***************************************************************************
+// BamWriter_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#ifndef BAMWRITER_P_H
+#define BAMWRITER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include <BamAux.h>
+#include <BGZF.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+// Implementation object behind BamWriter: owns the BGZF output stream and
+// performs all BAM record encoding.
+class BamWriterPrivate {
+
+    public:
+        // ctor & dtor
+        BamWriterPrivate(void);
+        ~BamWriterPrivate(void);
+
+        // "public" interface to BamWriter
+        bool Open(const std::string& filename,
+                  const std::string& samHeader,
+                  const BamTools::RefVector& referenceSequences,
+                  bool isWriteUncompressed);
+        void SaveAlignment(const BamAlignment& al);
+        void Close(void);
+
+        // internal methods
+        // bin number for the interval [begin, end)
+        const unsigned int CalculateMinimumBin(const int begin, int end) const;
+        // packs cigar operations into one unsigned int each
+        void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations, std::string& packedCigar);
+        // packs query bases two per byte
+        void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);
+
+        // data members
+        BgzfData mBGZF;        // output stream
+        bool     IsBigEndian;  // host byte order, set once in the constructor
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMWRITER_P_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/Makevars.in b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/Makevars.in
new file mode 100755
index 0000000..1cf255a
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/Makevars.in
@@ -0,0 +1,4 @@
+PKG_LIBS=@LIBS@ -lz
+PKG_CFLAGS=-I./ -D_FASTMAP -DMAQ_LONGREADS @HAVE_LIBBZ2@
+PKG_CXXFLAGS=-I./ -D_FASTMAP -DMAQ_LONGREADS @HAVE_LIBBZ2@
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/api_global.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/api_global.h
new file mode 100755
index 0000000..24f72f2
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/api_global.h
@@ -0,0 +1,22 @@
+// ***************************************************************************
+// api_global.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides macros for exporting & importing BamTools API library symbols
+// ***************************************************************************
+
+#ifndef API_GLOBAL_H
+#define API_GLOBAL_H
+
+#include "bamtools_global.h"
+
+// API_EXPORT is placed on public API classes (e.g. "class API_EXPORT BamWriter").
+// When BAMTOOLS_API_LIBRARY is defined it expands to the export macro,
+// otherwise to the import macro.
+// NOTE(review): presumably BAMTOOLS_API_LIBRARY is defined only while building
+// the library itself - confirm against the build configuration.
+#ifdef BAMTOOLS_API_LIBRARY
+# define API_EXPORT BAMTOOLS_LIBRARY_EXPORT
+#else
+# define API_EXPORT BAMTOOLS_LIBRARY_IMPORT
+#endif
+
+#endif // API_GLOBAL_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bamread.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bamread.cpp
new file mode 100755
index 0000000..f90bafa
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bamread.cpp
@@ -0,0 +1,224 @@
+#include "pc.h"
+#include "config.h"
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <strstream>
+#include <algorithm>
+#include <string>
+#include <functional>
+#include <utility>
+#include <ext/hash_map>
+#include <boost/tokenizer.hpp>
+
+#include "BamAlignment.h"
+#include "BamAux.h" /* RefVector/RefData */
+#include "BamReader.h"
+
+
+extern "C" {
+// pliu 20160911
+//#include "R.h"
+//#include "Rmath.h"
+//////
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+
+
+// Comparison functor: orders two integers by their absolute value, so that
+// positions carrying a strand sign compare by magnitude only.
+class lessAbsoluteValue {
+public:
+  bool operator()(int a, int b) const {
+    const int magnitudeA = abs(a);
+    const int magnitudeB = abs(b);
+    return magnitudeA < magnitudeB;
+  }
+};
+
+
+
+
+
+//#define DEBUG 1
+
+extern "C" {
+
+
+ // read in bam file
+ //
+ // Collects, per reference sequence, the signed position and the edit distance
+ // of every mapped primary alignment.  Reverse-strand alignments are stored as
+ // -(Position + Length).
+ // @filename         - R character vector; its first element is the BAM path
+ // @read_tag_names_R - R integer/logical; non-zero also returns the read names
+ // Returns a list named by reference sequence; each element is a list with
+ // components "t" (positions), "n" (edit distances) and, optionally, "s" (names).
+ // If the file cannot be opened an error line is printed and an empty list is
+ // returned.
+ SEXP read_bam(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   const char* fname=CHAR(asChar(filename));
+   // normalised to 0/1 because the flag is also used to size the output lists
+   const int read_names=(*(INTEGER(read_tag_names_R))!=0) ? 1 : 0;
+#ifdef DEBUG
+   Rprintf("fname=%s\n",fname);
+#endif
+
+   // main data vector
+   // chr - pos
+   vector< vector<int> > pos;
+   vector< vector<int> > posnm; // number of mismatches
+   vector< vector<string> > tagnames;
+
+   // chromosome map
+   hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+   vector<string> cnames;
+
+   BamTools::BamReader bamf;
+
+   if (!bamf.Open(fname)) {
+     cout << "ERROR: failed to open BAM file '" << fname << "'" << endl;
+   } else {
+
+     Rprintf("opened %s\n",fname);
+     BamTools::RefVector refs = bamf.GetReferenceData();
+     BamTools::BamAlignment al;
+
+     int fcount=0;
+     while (bamf.GetNextAlignment(al)) {
+       if (!al.IsMapped() || !al.IsPrimaryAlignment()) {
+         continue;
+       }
+       // guard the reference lookup against ids outside the header dictionary
+       if (al.RefID < 0 || (size_t)al.RefID >= refs.size()) {
+         continue;
+       }
+
+       const string chr=refs[al.RefID].RefName;
+       int fpos=(int) (al.Position + (al.IsReverseStrand() ? al.Length : 0));
+       if(al.IsReverseStrand()) { fpos=-1*fpos; }
+
+       uint32_t nms;
+       int nm=0;
+       if (al.GetEditDistance(nms)) {
+         nm=nms;
+       }
+
+       // determine the chromosome index
+       hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+       int cind=-1;
+       if(li==cind_map.end()) {
+         // register new chromosome
+         cind=cnames.size();
+         cnames.push_back(chr);
+         cind_map[chr]=cind;
+         // allocate new pos vector
+         pos.push_back(vector<int>());
+         posnm.push_back(vector<int>());
+         if(read_names) {
+           tagnames.push_back(vector<string>());
+         }
+#ifdef DEBUG
+         Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+       } else {
+         cind=li->second;
+       }
+       fcount++;
+       (pos[cind]).push_back(fpos);
+       (posnm[cind]).push_back(nm);
+       if(read_names) {
+         (tagnames[cind]).push_back(al.Name);
+       }
+#ifdef DEBUG
+       Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d",chr.c_str(),cind,fpos,nm);
+       if(fcount>30) {
+         break;
+       }
+#endif
+
+     }
+     bamf.Close();
+
+     Rprintf("done. read %d fragments\n",fcount);
+   }
+
+
+   // construct output structures
+   int np=0; // number of protections held until return
+   SEXP chnames;
+   PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+   for(size_t ci=0;ci<cnames.size();++ci) {
+     SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+   }
+
+   SEXP ans;
+   PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+   for(size_t ci=0;ci<pos.size();++ci) {
+     const vector<int>& cpos=pos[ci];
+     const vector<int>& cnm=posnm[ci];
+     int nlocal=0; // protections taken for this chromosome only
+
+     SEXP dv,dnames_R;
+     PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); nlocal++;
+     SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+     SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+     if(read_names) {
+       SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+     }
+
+     SEXP tv,nv,sv=R_NilValue;
+     PROTECT(tv=allocVector(INTSXP,cpos.size())); nlocal++;
+     PROTECT(nv=allocVector(INTSXP,cpos.size())); nlocal++;
+     if(read_names) {
+       PROTECT(sv=allocVector(STRSXP,cpos.size())); nlocal++;
+     }
+     int* i_tv=INTEGER(tv);
+     int* i_nv=INTEGER(nv);
+
+     for(size_t k=0;k<cpos.size();++k) {
+       i_tv[k]=cpos[k];
+       i_nv[k]=cnm[k];
+     }
+     if(read_names) {
+       const vector<string>& cnamesv=tagnames[ci];
+       for(size_t k=0;k<cnamesv.size();++k) {
+         SET_STRING_ELT(sv,k,mkChar(cnamesv[k].c_str()));
+       }
+     }
+     PROTECT(dv = allocVector(VECSXP, 2+read_names)); nlocal++;
+     SET_VECTOR_ELT(dv, 0, tv);
+     SET_VECTOR_ELT(dv, 1, nv);
+     if(read_names) {
+       SET_VECTOR_ELT(dv, 2, sv);
+     }
+     setAttrib(dv, R_NamesSymbol, dnames_R);
+
+     SET_VECTOR_ELT(ans, ci, dv);
+     // everything allocated for this chromosome is now reachable from 'ans',
+     // so release it here; otherwise the protection stack grows with the
+     // number of reference sequences and can overflow on many-contig files
+     UNPROTECT(nlocal);
+   }
+
+   setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+   Rprintf("unprotecting %d elements\n",np);
+#endif
+
+   UNPROTECT(np);
+   return(ans);
+}
+
+
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bamtools_global.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bamtools_global.h
new file mode 100755
index 0000000..be7e034
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bamtools_global.h
@@ -0,0 +1,32 @@
+// ***************************************************************************
+// bamtools_global.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic definitions for exporting & importing library symbols
+// ***************************************************************************
+
+#ifndef BAMTOOLS_GLOBAL_H
+#define BAMTOOLS_GLOBAL_H
+
+// BAMTOOLS_LIBRARY_EXPORT
+// Decoration for symbols exported from the library: __declspec(dllexport) when
+// WIN32 is defined, default symbol visibility otherwise.  A definition supplied
+// before this header is included is left untouched.
+#ifndef BAMTOOLS_LIBRARY_EXPORT
+# if defined(WIN32)
+# define BAMTOOLS_LIBRARY_EXPORT __declspec(dllexport)
+# else
+# define BAMTOOLS_LIBRARY_EXPORT __attribute__((visibility("default")))
+# endif
+#endif // BAMTOOLS_LIBRARY_EXPORT
+
+// BAMTOOLS_LIBRARY_IMPORT
+// Decoration for symbols imported from the library: __declspec(dllimport) when
+// WIN32 is defined, empty otherwise.  A definition supplied before this header
+// is included is left untouched.
+#ifndef BAMTOOLS_LIBRARY_IMPORT
+# if defined(WIN32)
+# define BAMTOOLS_LIBRARY_IMPORT __declspec(dllimport)
+# else
+# define BAMTOOLS_LIBRARY_IMPORT
+# endif
+#endif // BAMTOOLS_LIBRARY_IMPORT
+
+#endif // BAMTOOLS_GLOBAL_H
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bed2vector.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bed2vector.cpp
new file mode 100755
index 0000000..c272f35
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/bed2vector.cpp
@@ -0,0 +1,2630 @@
+#include "pc.h"
+#include "config.h"
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <strstream>
+#include <algorithm>
+#include <string>
+#include <functional>
+#include <utility>
+#include <ext/hash_map>
+#include <boost/tokenizer.hpp>
+
+#ifdef HAVE_LIBBZ2
+#include <bzlib.h>
+#endif
+
+extern "C" {
+// pliu 20160911
+//#include "R.h"
+//#include "Rmath.h"
+//////
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+
+
+// Comparison functor used when sorting tag positions: compares two integers by
+// absolute value so the strand sign does not affect the order.
+class lessAbsoluteValue {
+public:
+  bool operator()(int a, int b) const {
+    const int magnitudeA = abs(a);
+    const int magnitudeB = abs(b);
+    return magnitudeA < magnitudeB;
+  }
+};
+
+
+
+#ifdef HAVE_LIBBZ2
+// Appends bytes from the bzip2 stream 'b' to 'line' until a newline is consumed
+// (the newline itself is not stored).
+// Returns BZ_OK when a complete line was read; otherwise the bzlib status that
+// ended the read (BZ_STREAM_END at end of data, or an error code).  Bytes read
+// before the stream ended are left in 'line'.
+int get_bzline(BZFILE* b,string& line) {
+  char c;
+  int bzerror=BZ_OK;
+
+  while(bzerror==BZ_OK) {
+    const int nBuf=BZ2_bzRead(&bzerror, b, &c, 1);
+    // BZ_STREAM_END can be reported together with the final byte of the
+    // stream; that byte is valid data and must not be discarded
+    if((bzerror==BZ_OK || bzerror==BZ_STREAM_END) && nBuf==1) {
+      if(c=='\n') {
+        return BZ_OK;
+      } else {
+        line+=c;
+      }
+    }
+  }
+  return bzerror;
+}
+
+// Reads the next line (without its trailing newline) into 'line', either from
+// the bzip2 stream 'b' (when bz2file is non-zero) or from the plain stream 'f'.
+// Returns 1 when a line was read and 0 at end of input or on error.
+int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) {
+  line="";
+  if(bz2file) {
+    int bzerror=get_bzline(b,line);
+    if(bzerror==BZ_OK) {
+      return(1);
+    }
+    if(bzerror==BZ_STREAM_END) {
+      // a final line without a trailing newline is still a line
+      return(line.empty() ? 0 : 1);
+    }
+    // Reading again after the stream end has already been reported makes
+    // bzlib answer BZ_SEQUENCE_ERROR with no data; that is the normal
+    // "nothing left" case once the last line was handed out above.
+    // NOTE(review): this also silences BZ_SEQUENCE_ERROR for a handle that was
+    // not opened for reading - confirm that is acceptable.
+    if(!(bzerror==BZ_SEQUENCE_ERROR && line.empty())) {
+      cerr<<"encountered BZERROR="<<bzerror<<endl;
+    }
+    return(0);
+  } else {
+    char *cline=NULL;
+    size_t n=0;
+    const bool got=(getline(&cline,&n,f) != -1);
+    if(got && cline) {
+      // drop the line terminator only if there is one, so the last line of a
+      // file lacking a final newline keeps its last character
+      size_t len=strlen(cline);
+      if(len>0 && cline[len-1]=='\n') {
+        --len;
+      }
+      line.append(cline,len);
+    }
+    // getline may allocate the buffer even when it fails, so always release it
+    free(cline);
+    return(got ? 1 : 0);
+  }
+}
+#endif
+
+
+/**
+ * Read .bed data into a list of per-chromosome vectors of 5' positions, with the
+ * sign of each position corresponding to the strand (negative for the "-" strand).
+ */
+
+//#define DEBUG 1
+
+extern "C" {
+// Reads a whitespace/tab separated .bed file and returns, per chromosome, the
+// tag positions sorted by absolute value.
+// Fields used: 1 = chromosome, 2 = start, 3 = end and, when at least six fields
+// are present, 6 = strand.  The stored position is the start, or -end for
+// "-" strand entries.  Lines with fewer than three fields are skipped.
+// @filename - R character vector; its first element is the file path
+// Returns a list of integer vectors named by chromosome.
+SEXP read_bed_ends(SEXP filename) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep(" \t");
+
+
+  ifstream bed_file(fname);
+
+#ifdef DEBUG
+  Rprintf("opened %s\n",fname);
+#endif
+
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+
+  int fcount=0;
+  while(getline(bed_file,line)) {
+
+#ifdef DEBUG
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+    // split the whole line first so every field access can be bounds-checked;
+    // advancing the tokenizer iterator past the end is undefined
+    tokType tok(line, sep);
+    const vector<string> fields(tok.begin(), tok.end());
+    if(fields.size()<3) {
+      continue; // blank or truncated line
+    }
+
+    const string& chr=fields[0]; //chr=chr.substr(3,strlen(chr.c_str()));
+    const int fstart=atoi(fields[1].c_str());
+    const int fend=atoi(fields[2].c_str());
+    int fpos=fstart;
+    // the strand is the sixth field; shorter records carry no strand
+    if(fields.size()>=6 && fields[5]=="-") {
+      fpos=-1*fend;
+    }
+
+    // determine the chromosome index
+    hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+    int cind=-1;
+    if(li==cind_map.end()) {
+      // register new chromosome
+      cind=cnames.size();
+      cnames.push_back(chr);
+      cind_map[chr]=cind;
+      // allocate new pos vector
+      pos.push_back(vector<int>());
+#ifdef DEBUG
+      Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+    } else {
+      cind=li->second;
+    }
+    fcount++;
+    (pos[cind]).push_back(fpos);
+#ifdef DEBUG
+    Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos);
+    if(fcount>30) {
+      break;
+    }
+#endif
+
+  }
+  bed_file.close();
+
+
+#ifdef DEBUG
+  Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+  Rprintf("done. read %d fragments\n",fcount);
+
+  // construct output structures
+  int np=0; // number of protections held until return
+  SEXP chnames;
+  PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<cnames.size();++ci) {
+    SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+  }
+
+  // sort
+  for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+    sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  }
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<pos.size();++ci) {
+    const vector<int>& cpos=pos[ci];
+    SEXP nv;
+    PROTECT(nv=allocVector(INTSXP,cpos.size()));
+    int* i_nv=INTEGER(nv);
+    for(size_t k=0;k<cpos.size();++k) {
+      i_nv[k]=cpos[k];
+    }
+    SET_VECTOR_ELT(ans, ci, nv);
+    // 'nv' is now reachable from 'ans'; releasing it here keeps the protection
+    // stack from growing with the number of chromosomes
+    UNPROTECT(1);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+// Reads a whitespace/tab separated alignment file and returns, per chromosome,
+// the tag positions together with mismatch counts and lengths.
+// Fields used (1-based): 3 = match code, of which only entries starting with
+// 'U' are kept and the digits after it give the mismatch count; 7 = length;
+// 8 = chromosome, whose first three characters are dropped; 9 = position.
+// Lines with too few fields are skipped.
+// NOTE(review): the three dropped characters are presumably a "chr" prefix - confirm.
+// @filename - R character vector; its first element is the file path
+// Returns a list named by chromosome; each element is a list with integer
+// components "t" (position), "n" (mismatches) and "l" (length).
+SEXP read_meland_old(SEXP filename) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<int> > poslen; // length
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep(" \t");
+
+
+  ifstream bed_file(fname);
+
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+
+  int fcount=0;
+  while(getline(bed_file,line)) {
+
+#ifdef DEBUG
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+    // split the whole line first so every field access can be bounds-checked;
+    // advancing the tokenizer iterator past the end is undefined
+    tokType tok(line, sep);
+    const vector<string> fields(tok.begin(), tok.end());
+    if(fields.size()<3) {
+      continue; // blank or truncated line
+    }
+
+    // only entries whose match code starts with 'U' are used
+    const string& str_nm=fields[2];
+    if(str_nm[0]!='U') {
+      continue;
+    }
+    const int nm=atoi((str_nm.c_str()+1));
+
+    if(fields.size()<9) {
+      continue; // length / chromosome / position fields missing
+    }
+    const int len=atoi(fields[6].c_str());
+    string chr=fields[7];
+    // drop the three-character prefix; shorter names are kept as they are
+    // because substr(3) would throw on them
+    if(chr.size()>=3) {
+      chr=chr.substr(3);
+    }
+    const int fpos=atoi(fields[8].c_str());
+
+    // determine the chromosome index
+    hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+    int cind=-1;
+    if(li==cind_map.end()) {
+      // register new chromosome
+      cind=cnames.size();
+      cnames.push_back(chr);
+      cind_map[chr]=cind;
+      // allocate new pos vector
+      pos.push_back(vector<int>());
+      posnm.push_back(vector<int>());
+      poslen.push_back(vector<int>());
+#ifdef DEBUG
+      Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+    } else {
+      cind=li->second;
+    }
+    fcount++;
+    (pos[cind]).push_back(fpos);
+    (posnm[cind]).push_back(nm);
+    (poslen[cind]).push_back(len);
+#ifdef DEBUG
+    Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+    if(fcount>30) {
+      break;
+    }
+#endif
+
+  }
+  bed_file.close();
+
+
+#ifdef DEBUG
+  Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+  Rprintf("done. read %d fragments\n",fcount);
+
+  // construct output structures
+  int np=0; // number of protections held until return
+  SEXP chnames;
+  PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<cnames.size();++ci) {
+    SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+  }
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<pos.size();++ci) {
+    const vector<int>& cpos=pos[ci];
+    const vector<int>& cnm=posnm[ci];
+    const vector<int>& clen=poslen[ci];
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 3));
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+
+    SEXP tv,nv,lv;
+    PROTECT(tv=allocVector(INTSXP,cpos.size()));
+    PROTECT(nv=allocVector(INTSXP,cpos.size()));
+    PROTECT(lv=allocVector(INTSXP,cpos.size()));
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    int* i_lv=INTEGER(lv);
+
+    for(size_t k=0;k<cpos.size();++k) {
+      i_tv[k]=cpos[k];
+      i_nv[k]=cnm[k];
+      i_lv[k]=clen[k];
+    }
+    PROTECT(dv = allocVector(VECSXP, 3));
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    SET_VECTOR_ELT(dv, 2, lv);
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, ci, dv);
+    // the five objects allocated above are now reachable from 'ans'; releasing
+    // them here keeps the protection stack from growing with the number of
+    // chromosomes
+    UNPROTECT(5);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+ // Reads the next line of text from `f` into `line`, keeping the trailing
+ // newline when the file supplies one. Lines longer than the internal buffer
+ // are assembled from successive fgets() chunks instead of being split.
+ // Returns 1 if fgets() delivered any data, 0 at end of file or on read error.
+ int get_a_line(FILE *f,string& line) {
+   line="";
+   char cline[1024];
+   int got_data=0;
+   while(fgets(cline,sizeof(cline),f)) {
+     got_data=1;
+     size_t n=strlen(cline);
+     line.append(cline,n);
+     // only a completely filled buffer that lacks a newline means the
+     // current line continues in the next chunk
+     if(n+1<sizeof(cline) || cline[n-1]=='\n') { break; }
+   }
+   return(got_data);
+ }
+
+
+ // Parses a whitespace-delimited alignment file and returns the retained
+ // records grouped by chromosome.
+ //
+ // Each line is split on spaces/tabs (empty tokens dropped). Column 1 is the
+ // tag name, column 3 the match code (only codes starting with 'U' are kept and
+ // the number following the 'U' is stored as the mismatch count), column 7 the
+ // length, column 8 the chromosome (its first three characters are removed) and
+ // column 9 the position. Lines with fewer than nine columns, or with a
+ // chromosome token shorter than three characters, are skipped.
+ //
+ // filename         - R value converted with asChar(); path of the input file
+ // read_tag_names_R - flag read through INTEGER(); when non-zero the tag names
+ //                    are returned too (the value is added to the length of the
+ //                    per-chromosome list, so it is expected to be 0 or 1)
+ //
+ // Returns an R list named by chromosome. Each element is a list with integer
+ // vectors "t" (position), "n" (mismatches), "l" (length) and, when requested,
+ // a character vector "s" (tag names). If the file cannot be opened a message
+ // is printed and an empty list is returned.
+ SEXP read_meland(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // per-chromosome record storage, indexed by chromosome index
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<int> > poslen; // length
+  vector< vector<string> > tagnames;
+
+  // chromosome name -> index, plus the names in order of first appearance
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep(" \t");
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) {
+    // nothing to parse: fall through and return an empty list
+    cout<<"can't open input file \""<<fname<<"\"\n";
+  } else {
+    Rprintf("opened %s\n",fname);
+
+    string line;
+    int fcount=0;
+    while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+
+      // materialise the tokens so that columns can be bounds-checked
+      tokType tok(line, sep);
+      vector<string> fields(tok.begin(),tok.end());
+      if(fields.size()<9) { continue; }
+
+      const string& tagname=fields[0];
+      const string& str_nm=fields[2];
+      if(str_nm[0]!='U') { continue; }
+      int nm=atoi(str_nm.c_str()+1);
+      int len=atoi(fields[6].c_str());
+
+      // the chromosome name is the token minus its first three characters
+      if(fields[7].size()<3) { continue; }
+      string chr=fields[7].substr(3);
+      int fpos=atoi(fields[8].c_str());
+
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+        // register new chromosome
+        cind=cnames.size();
+        cnames.push_back(chr);
+        cind_map[chr]=cind;
+        // allocate new per-chromosome vectors
+        pos.push_back(vector<int>());
+        posnm.push_back(vector<int>());
+        poslen.push_back(vector<int>());
+        if(read_names) {
+          tagnames.push_back(vector<string>());
+        }
+#ifdef DEBUG
+        Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+      } else {
+        cind=li->second;
+      }
+      fcount++;
+      pos[cind].push_back(fpos);
+      posnm[cind].push_back(nm);
+      poslen[cind].push_back(len);
+      if(read_names) {
+        tagnames[cind].push_back(tagname);
+      }
+#ifdef DEBUG
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+        break;
+      }
+#endif
+    }
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+
+  // construct output structures
+  int np=0; // number of protections held until return
+  SEXP chnames;
+  PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<cnames.size();ci++) {
+    SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+  }
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<pos.size();ci++) {
+    const size_t nrec=pos[ci].size();
+    int nlp=0; // protections taken for this chromosome only
+
+    SEXP dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 3, mkChar("s"));
+    }
+
+    SEXP tv,nv,lv,sv=R_NilValue;
+    PROTECT(tv=allocVector(INTSXP,nrec)); nlp++;
+    PROTECT(nv=allocVector(INTSXP,nrec)); nlp++;
+    PROTECT(lv=allocVector(INTSXP,nrec)); nlp++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,nrec)); nlp++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    int* i_lv=INTEGER(lv);
+    for(size_t i=0;i<nrec;i++) {
+      i_tv[i]=pos[ci][i];
+      i_nv[i]=posnm[ci][i];
+      i_lv[i]=poslen[ci][i];
+    }
+    if(read_names) {
+      const vector<string>& names=tagnames[ci];
+      for(size_t i=0;i<names.size();i++) {
+        SET_STRING_ELT(sv,i,mkChar(names[i].c_str()));
+      }
+    }
+
+    SEXP dv;
+    PROTECT(dv = allocVector(VECSXP, 3+read_names)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    SET_VECTOR_ELT(dv, 2, lv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 3, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, ci, dv);
+    // dv and everything attached to it is now reachable from the protected
+    // `ans`, so release this chromosome's protections instead of letting the
+    // protection stack grow with the number of chromosomes
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+// Reads a tab-delimited eland file and records, for every uniquely matched
+// tag, its position and the positions of up to two mismatches.
+//
+// Columns (empty columns are kept): 2 = tag sequence, 3 = match code (only
+// codes starting with 'U' are kept), 7 = reference name, 8 = position,
+// 9 = strand, 11 and 12 = optional mismatch descriptors. Lines with fewer than
+// nine columns are skipped.
+//
+// The chromosome name is the text following the first "chr" in the reference
+// name (or the whole name when "chr" is absent) up to the next '.'.
+// For strand "R" the stored position is -(position+sequence length-1) and the
+// mismatch offsets are mirrored as (sequence length - offset + 1).
+//
+// filename - R value converted with asChar(); path of the input file
+//
+// Returns an R list named by chromosome; each element is a list of three
+// integer vectors named "t" (position), "f" (first mismatch) and "s" (second
+// mismatch). If the file cannot be opened a message is printed and an empty
+// list is returned.
+SEXP read_eland_mismatches(SEXP filename) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // per-chromosome record storage, indexed by chromosome index
+  vector< vector<int> > pos;
+  vector< vector<int> > mm1; // position of the first mismatch (or 0 for none)
+  vector< vector<int> > mm2; // position of the second mismatch
+
+  // chromosome name -> index, plus the names in order of first appearance
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) {
+    // nothing to parse: fall through and return an empty list
+    cout<<"can't open input file \""<<fname<<"\"\n";
+  } else {
+    Rprintf("opened %s\n",fname);
+
+    string line;
+    int fcount=0;
+    while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+
+      // materialise the tokens so that columns can be bounds-checked
+      tokType tok(line, sep);
+      vector<string> fields(tok.begin(),tok.end());
+      if(fields.size()<9) { continue; }
+
+      const string& str_nm=fields[2];
+      if(str_nm.empty() || str_nm[0]!='U') { continue; }
+      const int seqlen=fields[1].size();
+
+      // extract chromosome name from the reference name
+      string chr=fields[6];
+      size_t name_start=chr.find("chr");
+      name_start=(name_start==string::npos) ? 0 : name_start+3;
+      size_t name_end=chr.find('.',name_start);
+      if(name_end==string::npos) {
+        chr=chr.substr(name_start);
+      } else {
+        chr=chr.substr(name_start,name_end-name_start);
+      }
+
+      int fpos=atoi(fields[7].c_str());
+      int nstrand=0;
+      if(fields[8]=="R") {
+        fpos=-1*(fpos+seqlen-1);
+        nstrand=1;
+      }
+
+      // optional mismatch descriptors: the last character is dropped and the
+      // leading number is taken as the mismatch offset
+      int nm1=0; int nm2=0;
+      if(fields.size()>10) {
+        const string& nms=fields[10];
+        nm1=atoi(nms.empty() ? "" : nms.substr(0,nms.size()-1).c_str());
+        if(nstrand) { nm1=seqlen-nm1+1; }
+      }
+      if(fields.size()>11) {
+        const string& nms=fields[11];
+        nm2=atoi(nms.empty() ? "" : nms.substr(0,nms.size()-1).c_str());
+        if(nstrand) { nm2=seqlen-nm2+1; }
+      }
+
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+        // register new chromosome
+        cind=cnames.size();
+        cnames.push_back(chr);
+        cind_map[chr]=cind;
+        // allocate new per-chromosome vectors
+        pos.push_back(vector<int>());
+        mm1.push_back(vector<int>());
+        mm2.push_back(vector<int>());
+#ifdef DEBUG
+        Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+      } else {
+        cind=li->second;
+      }
+      fcount++;
+      pos[cind].push_back(fpos);
+      mm1[cind].push_back(nm1);
+      mm2[cind].push_back(nm2);
+#ifdef DEBUG
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2);
+      if(fcount>30) {
+        break;
+      }
+#endif
+    }
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+
+  // construct output structures
+  int np=0; // number of protections held until return
+  SEXP chnames;
+  PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<cnames.size();ci++) {
+    SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+  }
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<pos.size();ci++) {
+    const size_t nrec=pos[ci].size();
+    int nlp=0; // protections taken for this chromosome only
+
+    SEXP dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 3)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("f"));
+    SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+
+    SEXP tv,nv,lv;
+    PROTECT(tv=allocVector(INTSXP,nrec)); nlp++;
+    PROTECT(nv=allocVector(INTSXP,nrec)); nlp++;
+    PROTECT(lv=allocVector(INTSXP,nrec)); nlp++;
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    int* i_lv=INTEGER(lv);
+    for(size_t i=0;i<nrec;i++) {
+      i_tv[i]=pos[ci][i];
+      i_nv[i]=mm1[ci][i];
+      i_lv[i]=mm2[ci][i];
+    }
+
+    SEXP dv;
+    PROTECT(dv = allocVector(VECSXP, 3)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    SET_VECTOR_ELT(dv, 2, lv);
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, ci, dv);
+    // dv is now reachable from the protected `ans`; release this
+    // chromosome's protections so the stack does not grow per chromosome
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+ // Reads a tab-delimited eland file, adjusting the negative-strand coordinate
+ // by the sequence length.
+ //
+ // Columns (empty columns are kept): 1 = tag name, 2 = tag sequence, 3 = match
+ // code (only codes starting with 'U' are kept; the number after the 'U' is
+ // stored as the mismatch count), 7 = chromosome, 8 = position, 9 = strand.
+ // Lines with fewer than nine columns are skipped. When the strand starts with
+ // 'R' the stored position is -(position+len-1), where len is the sequence
+ // length, capped at eland_tag_length when that value is positive.
+ //
+ // filename           - R value converted with asChar(); path of the input file
+ // read_tag_names_R   - flag read through INTEGER(); when non-zero the tag
+ //                      names are returned too (the value is added to the length
+ //                      of the per-chromosome list, so it is expected to be 0/1)
+ // eland_tag_length_R - value read through INTEGER(); optional length cap
+ //
+ // Returns an R list named by chromosome; each element is a list with integer
+ // vectors "t" (position), "n" (mismatches) and, when requested, a character
+ // vector "s" (tag names). If the file cannot be opened a message is printed
+ // and an empty list is returned.
+ SEXP read_eland(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+  int eland_tag_length=*(INTEGER(eland_tag_length_R));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // per-chromosome record storage, indexed by chromosome index
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome name -> index, plus the names in order of first appearance
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+    Rprintf("opened %s\n",fname);
+
+    string line;
+    int fcount=0;
+    while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+
+      // materialise the tokens so that columns can be bounds-checked
+      tokType tok(line, sep);
+      vector<string> fields(tok.begin(),tok.end());
+      if(fields.size()<9) { continue; }
+
+      const string& tagname=fields[0];
+      const string& str_nm=fields[2];
+      if(str_nm.empty() || str_nm[0]!='U') { continue; }
+      int nm=atoi(str_nm.c_str()+1);
+
+      int len=fields[1].size();
+      // adjust probe length if eland length limit was specified
+      if(eland_tag_length>0 && len>eland_tag_length) {
+        len=eland_tag_length;
+      }
+
+      const string& chr=fields[6];
+      int fpos=atoi(fields[7].c_str());
+      const string& str_strand=fields[8];
+      if(!str_strand.empty() && str_strand[0]=='R') {
+        fpos=-1*(fpos+len-1);
+      }
+
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+        // register new chromosome
+        cind=cnames.size();
+        cnames.push_back(chr);
+        cind_map[chr]=cind;
+        // allocate new per-chromosome vectors
+        pos.push_back(vector<int>());
+        posnm.push_back(vector<int>());
+        if(read_names) {
+          tagnames.push_back(vector<string>());
+        }
+#ifdef DEBUG
+        Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+      } else {
+        cind=li->second;
+      }
+      fcount++;
+      pos[cind].push_back(fpos);
+      posnm[cind].push_back(nm);
+      if(read_names) {
+        tagnames[cind].push_back(tagname);
+      }
+#ifdef DEBUG
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+        break;
+      }
+#endif
+    }
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+
+  // construct output structures
+  int np=0; // number of protections held until return
+  SEXP chnames;
+  PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<cnames.size();ci++) {
+    SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+  }
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<pos.size();ci++) {
+    const size_t nrec=pos[ci].size();
+    int nlp=0; // protections taken for this chromosome only
+
+    SEXP dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+
+    SEXP tv,nv,sv=R_NilValue;
+    PROTECT(tv=allocVector(INTSXP,nrec)); nlp++;
+    PROTECT(nv=allocVector(INTSXP,nrec)); nlp++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,nrec)); nlp++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    for(size_t i=0;i<nrec;i++) {
+      i_tv[i]=pos[ci][i];
+      i_nv[i]=posnm[ci][i];
+    }
+    if(read_names) {
+      const vector<string>& names=tagnames[ci];
+      for(size_t i=0;i<names.size();i++) {
+        SET_STRING_ELT(sv,i,mkChar(names[i].c_str()));
+      }
+    }
+
+    SEXP dv;
+    PROTECT(dv = allocVector(VECSXP, 2+read_names)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, ci, dv);
+    // dv is now reachable from the protected `ans`; release this
+    // chromosome's protections so the stack does not grow per chromosome
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+ // Reads a tab-delimited extended eland file, adjusting the negative-strand
+ // coordinate by the sequence length.
+ //
+ // Columns (empty columns are kept): 1 = machine name, 2 = run number,
+ // 3 = lane number, 5 = x, 6 = y, 9 = tag sequence, 11 = chromosome,
+ // 12 = contig, 13 = position, 14 = strand, 15 = match descriptor.
+ // The tag name is "machine.run.lane.x.y"; the chromosome name is the
+ // concatenation of columns 11 and 12; the mismatch count is the number of
+ // non-digit characters in the match descriptor. Lines with an empty position
+ // or with fewer than fifteen columns are skipped. When the strand starts with
+ // 'R' the stored position is -(position+len-1), where len is the sequence
+ // length, capped at eland_tag_length when that value is positive.
+ //
+ // filename           - R value converted with asChar(); path of the input file
+ // read_tag_names_R   - flag read through INTEGER(); when non-zero the tag
+ //                      names are returned too (the value is added to the length
+ //                      of the per-chromosome list, so it is expected to be 0/1)
+ // eland_tag_length_R - value read through INTEGER(); optional length cap
+ //
+ // Returns an R list named by chromosome; each element is a list with integer
+ // vectors "t" (position), "n" (mismatches) and, when requested, a character
+ // vector "s" (tag names). If the file cannot be opened a message is printed
+ // and an empty list is returned.
+ SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+  int eland_tag_length=*(INTEGER(eland_tag_length_R));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // per-chromosome record storage, indexed by chromosome index
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome name -> index, plus the names in order of first appearance
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+    Rprintf("opened %s\n",fname);
+
+    string line;
+    int fcount=0;
+    while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+
+      // materialise the tokens so that columns can be bounds-checked
+      tokType tok(line, sep);
+      vector<string> fields(tok.begin(),tok.end());
+      if(fields.size()<13) { continue; }
+
+      // entries without a position carry no alignment
+      const string& str_pos=fields[12];
+      if(str_pos.size()<1) { continue; }
+      if(fields.size()<15) { continue; }
+
+      string tagname=fields[0]+"."+fields[1]+"."+fields[2]+"."+fields[4]+"."+fields[5];
+      string chr=fields[10]+fields[11];
+
+      int len=fields[8].size();
+      // adjust probe length if eland length limit was specified
+      if(eland_tag_length>0 && len>eland_tag_length) {
+        len=eland_tag_length;
+      }
+
+      int fpos=atoi(str_pos.c_str());
+      const string& str_strand=fields[13];
+      if(!str_strand.empty() && str_strand[0]=='R') {
+        fpos=-1*(fpos+len-1);
+      }
+
+      // count non-digit characters of the match descriptor
+      const string& str_nm=fields[14];
+      int nm=0;
+      for(size_t i=0;i<str_nm.size();i++) {
+        // isdigit() requires a value representable as unsigned char
+        if(!isdigit(static_cast<unsigned char>(str_nm[i]))) { nm++; }
+      }
+
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+        // register new chromosome
+        cind=cnames.size();
+        cnames.push_back(chr);
+        cind_map[chr]=cind;
+        // allocate new per-chromosome vectors
+        pos.push_back(vector<int>());
+        posnm.push_back(vector<int>());
+        if(read_names) {
+          tagnames.push_back(vector<string>());
+        }
+#ifdef DEBUG
+        Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+      } else {
+        cind=li->second;
+      }
+      fcount++;
+      pos[cind].push_back(fpos);
+      posnm[cind].push_back(nm);
+      if(read_names) {
+        tagnames[cind].push_back(tagname);
+      }
+#ifdef DEBUG
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+        break;
+      }
+#endif
+    }
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+
+  // construct output structures
+  int np=0; // number of protections held until return
+  SEXP chnames;
+  PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<cnames.size();ci++) {
+    SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+  }
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<pos.size();ci++) {
+    const size_t nrec=pos[ci].size();
+    int nlp=0; // protections taken for this chromosome only
+
+    SEXP dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+
+    SEXP tv,nv,sv=R_NilValue;
+    PROTECT(tv=allocVector(INTSXP,nrec)); nlp++;
+    PROTECT(nv=allocVector(INTSXP,nrec)); nlp++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,nrec)); nlp++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    for(size_t i=0;i<nrec;i++) {
+      i_tv[i]=pos[ci][i];
+      i_nv[i]=posnm[ci][i];
+    }
+    if(read_names) {
+      const vector<string>& names=tagnames[ci];
+      for(size_t i=0;i<names.size();i++) {
+        SET_STRING_ELT(sv,i,mkChar(names[i].c_str()));
+      }
+    }
+
+    SEXP dv;
+    PROTECT(dv = allocVector(VECSXP, 2+read_names)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, ci, dv);
+    // dv is now reachable from the protected `ans`; release this
+    // chromosome's protections so the stack does not grow per chromosome
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+ // Reads an eland "multi" file, adjusting the negative-strand coordinate by
+ // the sequence length.
+ //
+ // Each line is split on spaces/tabs (empty tokens dropped): 1 = tag name,
+ // 2 = tag sequence, 3 = match spec, 4 = comma-separated match list.
+ // A match spec of "NM" or "QC" is skipped. Otherwise it is read as
+ // colon-separated counts n0:n1:n2 and the accepted mismatch level nm is
+ //   0 when n0 is 1, 1 when n0 is 0 and n1 is 1, 2 when n0 and n1 are 0;
+ // a count above 1 at the deciding level rejects the line.
+ // Every entry of the match list ends in a strand letter (F or R) followed by
+ // a single mismatch digit and may start with "chromosome:"; an entry without
+ // that prefix uses the chromosome of the closest preceding entry that has
+ // one. The line is kept only if exactly one entry has mismatch digit nm.
+ //
+ // filename           - R value converted with asChar(); path of the input file
+ // read_tag_names_R   - flag read through INTEGER(); when non-zero the tag
+ //                      names are returned too (the value is added to the length
+ //                      of the per-chromosome list, so it is expected to be 0/1)
+ // eland_tag_length_R - value read through INTEGER(); when positive, caps the
+ //                      sequence length used for the reverse-strand adjustment
+ //
+ // Returns an R list named by chromosome; each element is a list with integer
+ // vectors "t" (position), "n" (mismatches) and, when requested, a character
+ // vector "s" (tag names). If the file cannot be opened a message is printed
+ // and an empty list is returned.
+SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
+
+#ifdef DEBUG
+  Rprintf("read_eland_muti() : start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+  int eland_tag_length=*(INTEGER(eland_tag_length_R));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // per-chromosome record storage, indexed by chromosome index
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome name -> index, plus the names in order of first appearance
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep(" \t","");
+  boost::char_separator<char> comsep(",","",boost::keep_empty_tokens);
+  boost::char_separator<char> colsep(":","",boost::keep_empty_tokens);
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+    Rprintf("opened %s\n",fname);
+
+    string line;
+    int nline=0;
+    int fcount=0;
+    while(get_a_line(f,line)) {
+      nline++;
+      // chomp
+      size_t elpos = line.find_last_not_of("\n");
+      if(elpos != string::npos) {
+        line = line.substr(0, elpos+1);
+      }
+#ifdef DEBUG
+      Rprintf("line %d: %s\n",nline,line.c_str());
+#endif
+
+      // materialise the tokens so that columns can be bounds-checked
+      tokType tok(line, sep);
+      vector<string> fields(tok.begin(),tok.end());
+      if(fields.size()<3) { continue; }
+      const string& tagname=fields[0];
+      const string& sequence=fields[1];
+      const string& mspec=fields[2];
+
+      // parse out match spec
+      if(mspec=="NM" || mspec=="QC") { continue; }
+#ifdef DEBUG
+      Rprintf("parsing out spec \"%s\" : ",mspec.c_str());
+#endif
+
+      tokType stok(mspec, colsep);
+      vector<string> counts(stok.begin(),stok.end());
+      if(counts.empty()) { continue; }
+
+      int nm=0;
+      int nm0=atoi(counts[0].c_str());
+      if(nm0>1) {
+#ifdef DEBUG
+        Rprintf("rejected for nm0\n");
+#endif
+        continue;
+      }
+      if(nm0==0) {
+        if(counts.size()<2) { continue; }
+        int nm1=atoi(counts[1].c_str());
+        if(nm1>1) {
+#ifdef DEBUG
+          Rprintf("rejected for nm1\n");
+#endif
+          continue;
+        }
+        if(nm1==0) {
+          if(counts.size()<3) { continue; }
+          int nm2=atoi(counts[2].c_str());
+          if(nm2>1) {
+#ifdef DEBUG
+            Rprintf("rejected for nm2\n");
+#endif
+            continue;
+          }
+          nm=2;
+        } else {
+          nm=1;
+        }
+      }
+
+#ifdef DEBUG
+      Rprintf("accepted (nm=%d)\n",nm);
+#endif
+      // a missing match list is treated as empty and reported below
+      string mpos;
+      if(fields.size()>3) { mpos=fields[3]; }
+
+      int npos=0;
+      vector<string> mposc;
+      vector<int> mposp;
+      tokType ptok(mpos, comsep);
+      string prevchr;
+      bool have_prevchr=false;
+      for(tokType::iterator psit=ptok.begin();psit!=ptok.end();++psit) {
+        string cpos=*psit;
+        npos++;
+        if(cpos.size()<5) {
+          Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str());
+          // fewer than two characters cannot hold the strand letter and
+          // mismatch digit that are read below
+          if(cpos.size()<2) {
+            Rprintf("skipped\n");
+            continue;
+          }
+        }
+
+        // remember the chromosome prefix of every entry, including the ones
+        // filtered out below, so later prefix-less entries can inherit it
+        size_t colpos=cpos.find(":");
+        if(colpos!=string::npos) {
+          prevchr=cpos.substr(0,colpos);
+          have_prevchr=true;
+        }
+
+        // mismatch count is the last character; non-digits count as 0
+        char lc=cpos.at(cpos.size()-1);
+        int entry_nm=(lc>='0' && lc<='9') ? (lc-'0') : 0;
+        if(entry_nm!=nm) { continue; }
+
+        int strand=1;
+        switch(cpos.at(cpos.size()-2)) {
+        case 'R': strand=-1; break;
+        case 'F': strand=1; break;
+        default:
+          Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2));
+          continue; // skips this entry (continues the enclosing for loop)
+        }
+
+        string chr,str_pos;
+        if(colpos==string::npos) {
+          if(have_prevchr) {
+            chr=prevchr;
+            str_pos=cpos.substr(0,cpos.size()-2);
+          } else {
+            Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str());
+            continue;
+          }
+        } else {
+          chr=cpos.substr(0,colpos);
+          str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos);
+        }
+#ifdef DEBUG
+        Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand);
+#endif
+        int signed_pos=strand*atoi(str_pos.c_str());
+        mposc.push_back(chr);
+        mposp.push_back(signed_pos);
+      }
+
+      // keep the line only when exactly one entry matched the level
+      if(mposc.size()!=1) {
+        if(mposc.size()==0) {
+          Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str());
+        } else {
+          Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in \"%s\"\n",nline,(int)mposc.size(),nm,mpos.c_str());
+        }
+        continue;
+      }
+      string chr=mposc[0];
+      int fpos=mposp[0];
+
+      int len=sequence.size();
+      // adjust probe length if eland length limit was specified
+      if(eland_tag_length>0 && len>eland_tag_length) {
+        len=eland_tag_length;
+      }
+
+      if(fpos<0) {
+        fpos=-1*(-1*fpos+len-1);
+      }
+
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+        // register new chromosome
+        cind=cnames.size();
+        cnames.push_back(chr);
+        cind_map[chr]=cind;
+        // allocate new per-chromosome vectors
+        pos.push_back(vector<int>());
+        posnm.push_back(vector<int>());
+        if(read_names) {
+          tagnames.push_back(vector<string>());
+        }
+#ifdef DEBUG
+        Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+      } else {
+        cind=li->second;
+      }
+      fcount++;
+      pos[cind].push_back(fpos);
+      posnm[cind].push_back(nm);
+      if(read_names) {
+        tagnames[cind].push_back(tagname);
+      }
+#ifdef DEBUG
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+        break;
+      }
+#endif
+    }
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+
+  // construct output structures
+  int np=0; // number of protections held until return
+  SEXP chnames;
+  PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<cnames.size();ci++) {
+    SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+  }
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<pos.size();ci++) {
+    const size_t nrec=pos[ci].size();
+    int nlp=0; // protections taken for this chromosome only
+
+    SEXP dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+
+    SEXP tv,nv,sv=R_NilValue;
+    PROTECT(tv=allocVector(INTSXP,nrec)); nlp++;
+    PROTECT(nv=allocVector(INTSXP,nrec)); nlp++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,nrec)); nlp++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    for(size_t i=0;i<nrec;i++) {
+      i_tv[i]=pos[ci][i];
+      i_nv[i]=posnm[ci][i];
+    }
+    if(read_names) {
+      const vector<string>& names=tagnames[ci];
+      for(size_t i=0;i<names.size();i++) {
+        SET_STRING_ELT(sv,i,mkChar(names[i].c_str()));
+      }
+    }
+
+    SEXP dv;
+    PROTECT(dv = allocVector(VECSXP, 2+read_names)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, ci, dv);
+    // dv is now reachable from the protected `ans`; release this
+    // chromosome's protections so the stack does not grow per chromosome
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+ // Reads a tab-delimited bowtie alignment file, adjusting the negative-strand
+ // coordinate by the sequence length.
+ //
+ // Columns (empty columns are kept): 1 = tag name, 2 = strand, 3 = chromosome,
+ // 4 = position, 5 = tag sequence, 8 = comma-separated mismatch descriptors.
+ // The mismatch count is 0 for an empty or missing 8th column, otherwise the
+ // number of commas plus one. When the strand starts with '-' the stored
+ // position is -(position+sequence length-1). Lines with fewer than five
+ // columns are skipped.
+ //
+ // When built with HAVE_LIBBZ2 and the file name contains ".bz2", the file is
+ // read through the bzip2 reader.
+ //
+ // filename         - R value converted with asChar(); path of the input file
+ // read_tag_names_R - flag read through INTEGER(); when non-zero the tag names
+ //                    are returned too (the value is added to the length of the
+ //                    per-chromosome list, so it is expected to be 0 or 1)
+ //
+ // Returns an R list named by chromosome; each element is a list with integer
+ // vectors "t" (position), "n" (mismatches) and, when requested, a character
+ // vector "s" (tag names). If the file cannot be opened a message is printed
+ // and an empty list is returned.
+ SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // per-chromosome record storage, indexed by chromosome index
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome name -> index, plus the names in order of first appearance
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+  typedef boost::tokenizer<boost::char_separator<char> > tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+  FILE *f=fopen(fname,"rb");
+  if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
+  } else {
+#ifdef HAVE_LIBBZ2
+    // the handle stays NULL unless the file is actually opened as bzip2
+    BZFILE* b=NULL;
+    int bzerror=BZ_OK;
+
+    int bz2file=0;
+    if(strstr(fname,".bz2")) {
+      bz2file=1;
+      b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+      if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
+    }
+#endif
+
+    Rprintf("opened %s\n",fname);
+
+    string line;
+    int fcount=0;
+#ifdef HAVE_LIBBZ2
+    while(get_a_line(f,b,bz2file,line)) {
+#else
+    while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG
+      Rprintf("line: %s\n",line.c_str());
+#endif
+
+      // drop the line terminator so that it does not end up in the last
+      // column, where it would be counted as a mismatch descriptor
+      size_t last=line.find_last_not_of("\r\n");
+      line.erase(last==string::npos ? 0 : last+1);
+
+      // materialise the tokens so that columns can be bounds-checked
+      tokType tok(line, sep);
+      vector<string> fields(tok.begin(),tok.end());
+      if(fields.size()<5) { continue; }
+
+      const string& tagname=fields[0];
+      const string& str_strand=fields[1];
+      const string& chr=fields[2];
+      int fpos=atoi(fields[3].c_str());
+      int len=fields[4].size();
+      if(!str_strand.empty() && str_strand[0]=='-') {
+        fpos=-1*(fpos+len-1);
+      }
+
+      // determine number of mismatches: one per comma-separated descriptor
+      int nm=0;
+      if(fields.size()>7 && !fields[7].empty()) {
+        const string& mm=fields[7];
+        nm=1;
+        for(size_t i=0;i<mm.size();i++) {
+          if(mm[i]==',') { ++nm; }
+        }
+      }
+
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+        // register new chromosome
+        cind=cnames.size();
+        cnames.push_back(chr);
+        cind_map[chr]=cind;
+        // allocate new per-chromosome vectors
+        pos.push_back(vector<int>());
+        posnm.push_back(vector<int>());
+        if(read_names) {
+          tagnames.push_back(vector<string>());
+        }
+#ifdef DEBUG
+        Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+      } else {
+        cind=li->second;
+      }
+      fcount++;
+      pos[cind].push_back(fpos);
+      posnm[cind].push_back(nm);
+      if(read_names) {
+        tagnames[cind].push_back(tagname);
+      }
+#ifdef DEBUG
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+        break;
+      }
+#endif
+    }
+
+#ifdef HAVE_LIBBZ2
+    // only close the bzip2 reader when one was opened
+    if(bz2file) {
+      BZ2_bzReadClose( &bzerror, b);
+    }
+#endif
+    fclose(f);
+
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+
+  // construct output structures
+  int np=0; // number of protections held until return
+  SEXP chnames;
+  PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<cnames.size();ci++) {
+    SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+  }
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+  for(size_t ci=0;ci<pos.size();ci++) {
+    const size_t nrec=pos[ci].size();
+    int nlp=0; // protections taken for this chromosome only
+
+    SEXP dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); nlp++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+
+    SEXP tv,nv,sv=R_NilValue;
+    PROTECT(tv=allocVector(INTSXP,nrec)); nlp++;
+    PROTECT(nv=allocVector(INTSXP,nrec)); nlp++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,nrec)); nlp++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    for(size_t i=0;i<nrec;i++) {
+      i_tv[i]=pos[ci][i];
+      i_nv[i]=posnm[ci][i];
+    }
+    if(read_names) {
+      const vector<string>& names=tagnames[ci];
+      for(size_t i=0;i<names.size();i++) {
+        SET_STRING_ELT(sv,i,mkChar(names[i].c_str()));
+      }
+    }
+
+    SEXP dv;
+    PROTECT(dv = allocVector(VECSXP, 2+read_names)); nlp++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+
+    SET_VECTOR_ELT(ans, ci, dv);
+    // dv is now reachable from the protected `ans`; release this
+    // chromosome's protections so the stack does not grow per chromosome
+    UNPROTECT(nlp);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+ // read in helicos tab-separated alignment output (regular or bz2)
+ //
+ // filename         - R character value; its first element is the input path.
+ //                    When built with HAVE_LIBBZ2, a path containing ".bz2" is read through libbz2.
+ // read_tag_names_R - flag read through INTEGER(); when non-zero read names are returned as well
+ //
+ // Returns a list with one element per chromosome (named by chromosome, in order of
+ // first appearance). Each element is a list with components
+ //   t - signed position: start position, or -(end position) for '-' strand records
+ //   n - number of mismatches (deletions + insertions + substitutions)
+ //   l - length of the match (tend - tstart)
+ //   s - read names (only when read names were requested)
+ // Lines starting with '#', the "Reference_ID" header line, and lines with fewer than
+ // 12 tab-separated columns are skipped.
+ SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   const char* fname=CHAR(asChar(filename));
+   int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG
+   Rprintf("fname=%s\n",fname);
+#endif
+
+   // per-chromosome data; the outer index is the chromosome index (cind)
+   vector< vector<int> > pos;          // signed positions
+   vector< vector<int> > posnm;        // number of mismatches
+   vector< vector<int> > poslen;       // length of the match
+   vector< vector<string> > tagnames;  // read names (filled only when read_names is set)
+
+   // chromosome name -> index map, and the names in order of first appearance
+   hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+   vector<string> cnames;
+
+   typedef boost::tokenizer<boost::char_separator<char> > tokType;
+   boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+   FILE *f=fopen(fname,"rb");
+   if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
+   } else {
+#ifdef HAVE_LIBBZ2
+     // b stays NULL for plain input so it is never used (or closed) uninitialized
+     BZFILE* b=NULL;
+     int bzerror=BZ_OK;
+
+     int bz2file=0;
+     if(strstr(fname,".bz2")) {
+       bz2file=1;
+       b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+       if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
+     }
+#endif
+
+     Rprintf("opened %s\n",fname);
+
+     string line;
+     vector<string> fld; // columns of the current line, reused between lines
+     int fcount=0;
+     int nlines=0;
+#ifdef HAVE_LIBBZ2
+     while(get_a_line(f,b,bz2file,line)) {
+#else
+     while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG
+       Rprintf("line: %s\n",line.c_str());
+#endif
+       nlines++;
+       // skip comments
+       if(!line.empty() && line[0]=='#') { continue; }
+       if(line.compare(0,12,"Reference_ID")==0) {
+#ifdef DEBUG
+         Rprintf("matched header on line %d\n",nlines);
+#endif
+         continue;
+       }
+
+       // split the whole line up front so that blank or truncated lines are
+       // rejected instead of advancing the token iterator past its end
+       tokType tok(line, sep);
+       fld.assign(tok.begin(),tok.end());
+       if(fld.size()<12) { continue; }
+
+       // columns: 0 chr, 1 read name, 2 start, 3 end, 4 tstart, 5 tend,
+       //          8 ndel, 9 nins, 10 nsub, 11 strand (6 and 7 are not used)
+       const string& chr=fld[0];
+       const string& tagname=fld[1];
+       const string& str_strand=fld[11];
+       int len=atoi(fld[5].c_str())-atoi(fld[4].c_str());
+
+       int fpos;
+       if(!str_strand.empty() && str_strand[0]=='-') {
+         fpos=-1*atoi(fld[3].c_str());
+       } else {
+         fpos=atoi(fld[2].c_str());
+       }
+
+       // determine number of mismatches
+       int nm=atoi(fld[8].c_str())+atoi(fld[9].c_str())+atoi(fld[10].c_str());
+
+       // determine the chromosome index
+       hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+       int cind=-1;
+       if(li==cind_map.end()) {
+         // register new chromosome
+         cind=cnames.size();
+         cnames.push_back(chr);
+         cind_map[chr]=cind;
+         // allocate new per-chromosome vectors
+         pos.push_back(vector<int>());
+         posnm.push_back(vector<int>());
+         poslen.push_back(vector<int>());
+         if(read_names) {
+           tagnames.push_back(vector<string>());
+         }
+#ifdef DEBUG
+         Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+       } else {
+         cind=li->second;
+       }
+       fcount++;
+       (pos[cind]).push_back(fpos);
+       (posnm[cind]).push_back(nm);
+       (poslen[cind]).push_back(len);
+       if(read_names) {
+         (tagnames[cind]).push_back(tagname);
+       }
+#ifdef DEBUG
+       Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm);
+       if(fcount>30) {
+         break;
+       }
+#endif
+
+     }
+
+#ifdef HAVE_LIBBZ2
+     if(bz2file && b) { BZ2_bzReadClose( &bzerror, b); }
+#endif
+     fclose(f);
+
+     Rprintf("done. read %d fragments\n",fcount);
+   }
+
+   // construct output structures
+   SEXP chnames;
+   PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+   for(size_t ci=0;ci<cnames.size();ci++) {
+     SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+   }
+
+   SEXP ans;
+   PROTECT(ans = allocVector(VECSXP, cnames.size()));
+   for(size_t ci=0;ci<pos.size();ci++) {
+     const vector<int>& cpos=pos[ci];
+     const vector<int>& cnm=posnm[ci];
+     const vector<int>& clen=poslen[ci];
+     int np=0; // protections taken for this chromosome only
+
+     SEXP dv,dnames_R;
+     PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
+     SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+     SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+     SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+     if(read_names) {
+       SET_STRING_ELT(dnames_R, 3, mkChar("s"));
+     }
+
+     SEXP tv,nv,lv,sv=R_NilValue;
+     PROTECT(tv=allocVector(INTSXP,cpos.size())); np++;
+     PROTECT(nv=allocVector(INTSXP,cpos.size())); np++;
+     PROTECT(lv=allocVector(INTSXP,cpos.size())); np++;
+     if(read_names) {
+       PROTECT(sv=allocVector(STRSXP,cpos.size())); np++;
+     }
+     int* i_tv=INTEGER(tv);
+     int* i_nv=INTEGER(nv);
+     int* i_lv=INTEGER(lv);
+
+     // the per-chromosome vectors are filled in lock-step, so they share one length
+     for(size_t i=0;i<cpos.size();i++) {
+       i_tv[i]=cpos[i];
+       i_nv[i]=cnm[i];
+       i_lv[i]=clen[i];
+     }
+     if(read_names) {
+       const vector<string>& ctags=tagnames[ci];
+       for(size_t i=0;i<ctags.size();i++) {
+         SET_STRING_ELT(sv,i,mkChar(ctags[i].c_str()));
+       }
+     }
+     PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++;
+     SET_VECTOR_ELT(dv, 0, tv);
+     SET_VECTOR_ELT(dv, 1, nv);
+     SET_VECTOR_ELT(dv, 2, lv);
+     if(read_names) {
+       SET_VECTOR_ELT(dv, 3, sv);
+     }
+     setAttrib(dv, R_NamesSymbol, dnames_R);
+
+     SET_VECTOR_ELT(ans, ci, dv);
+     // dv and everything attached to it is now reachable from the protected ans;
+     // releasing here keeps the protection stack depth independent of the
+     // number of chromosomes
+     UNPROTECT(np);
+   }
+
+   setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+   Rprintf("unprotecting %d elements\n",2);
+#endif
+
+   UNPROTECT(2); // chnames, ans
+   return(ans);
+ }
+
+
+
+ // read in text version of maq map
+ //
+ // filename         - R character value; its first element is the input path
+ // read_tag_names_R - flag read through INTEGER(); when non-zero read names are returned as well
+ //
+ // Returns a list with one element per chromosome (named by chromosome, in order of
+ // first appearance). Each element is a list with components
+ //   t - signed position: the position column, or -(position+length-1) for '-' strand records
+ //   n - number of mismatches (column 10 of the record)
+ //   s - read names (only when read names were requested)
+ // Lines with fewer than 14 tab-separated columns are skipped.
+ SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   const char* fname=CHAR(asChar(filename));
+   int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG
+   Rprintf("fname=%s\n",fname);
+#endif
+
+   // per-chromosome data; the outer index is the chromosome index (cind)
+   vector< vector<int> > pos;          // signed positions
+   vector< vector<int> > posnm;        // number of mismatches
+   vector< vector<string> > tagnames;  // read names (filled only when read_names is set)
+
+   // chromosome name -> index map, and the names in order of first appearance
+   hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+   vector<string> cnames;
+
+   typedef boost::tokenizer<boost::char_separator<char> > tokType;
+   boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+   FILE *f=fopen(fname,"rb");
+   if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+   else {
+     Rprintf("opened %s\n",fname);
+
+     string line;
+     vector<string> fld; // columns of the current line, reused between lines
+     int fcount=0;
+     while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+       Rprintf("line: %s\n",line.c_str());
+#endif
+
+       // split the whole line up front so that blank or truncated lines are
+       // rejected instead of advancing the token iterator past its end
+       tokType tok(line, sep);
+       fld.assign(tok.begin(),tok.end());
+       if(fld.size()<14) { continue; }
+
+       // columns: 0 read name, 1 chr, 2 position, 3 strand, 9 mismatches, 13 read length
+       const string& tagname=fld[0];
+       const string& chr=fld[1];
+       const string& str_strand=fld[3];
+       int fpos=atoi(fld[2].c_str());
+       int nm=atoi(fld[9].c_str());
+       int len=atoi(fld[13].c_str());
+
+       if(!str_strand.empty() && str_strand[0]=='-') {
+         fpos=-1*(fpos+len-1);
+       }
+
+       // determine the chromosome index
+       hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+       int cind=-1;
+       if(li==cind_map.end()) {
+         // register new chromosome
+         cind=cnames.size();
+         cnames.push_back(chr);
+         cind_map[chr]=cind;
+         // allocate new per-chromosome vectors
+         pos.push_back(vector<int>());
+         posnm.push_back(vector<int>());
+         if(read_names) {
+           tagnames.push_back(vector<string>());
+         }
+#ifdef DEBUG
+         Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+       } else {
+         cind=li->second;
+       }
+       fcount++;
+       (pos[cind]).push_back(fpos);
+       (posnm[cind]).push_back(nm);
+       if(read_names) {
+         (tagnames[cind]).push_back(tagname);
+       }
+#ifdef DEBUG
+       Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+       if(fcount>30) {
+         break;
+       }
+#endif
+
+     }
+     fclose(f);
+
+     Rprintf("done. read %d fragments\n",fcount);
+   }
+
+   // construct output structures
+   SEXP chnames;
+   PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+   for(size_t ci=0;ci<cnames.size();ci++) {
+     SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+   }
+
+   SEXP ans;
+   PROTECT(ans = allocVector(VECSXP, cnames.size()));
+   for(size_t ci=0;ci<pos.size();ci++) {
+     const vector<int>& cpos=pos[ci];
+     const vector<int>& cnm=posnm[ci];
+     int np=0; // protections taken for this chromosome only
+
+     SEXP dv,dnames_R;
+     PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+     SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+     SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+     if(read_names) {
+       SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+     }
+
+     SEXP tv,nv,sv=R_NilValue;
+     PROTECT(tv=allocVector(INTSXP,cpos.size())); np++;
+     PROTECT(nv=allocVector(INTSXP,cpos.size())); np++;
+     if(read_names) {
+       PROTECT(sv=allocVector(STRSXP,cpos.size())); np++;
+     }
+     int* i_tv=INTEGER(tv);
+     int* i_nv=INTEGER(nv);
+
+     // the per-chromosome vectors are filled in lock-step, so they share one length
+     for(size_t i=0;i<cpos.size();i++) {
+       i_tv[i]=cpos[i];
+       i_nv[i]=cnm[i];
+     }
+     if(read_names) {
+       const vector<string>& ctags=tagnames[ci];
+       for(size_t i=0;i<ctags.size();i++) {
+         SET_STRING_ELT(sv,i,mkChar(ctags[i].c_str()));
+       }
+     }
+     PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
+     SET_VECTOR_ELT(dv, 0, tv);
+     SET_VECTOR_ELT(dv, 1, nv);
+     if(read_names) {
+       SET_VECTOR_ELT(dv, 2, sv);
+     }
+     setAttrib(dv, R_NamesSymbol, dnames_R);
+
+     SET_VECTOR_ELT(ans, ci, dv);
+     // dv and everything attached to it is now reachable from the protected ans;
+     // releasing here keeps the protection stack depth independent of the
+     // number of chromosomes
+     UNPROTECT(np);
+   }
+
+   setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+   Rprintf("unprotecting %d elements\n",2);
+#endif
+
+   UNPROTECT(2); // chnames, ans
+   return(ans);
+ }
+
+
+
+
+
+ // read in tagalign file
+ //
+ // filename - R character value; its first element is the input path
+ //
+ // Returns a list with one element per chromosome (named by chromosome, in order of
+ // first appearance). Each element is a list with components
+ //   t - signed position: the start column for '+' strand records, -(end column) otherwise
+ //   n - integer value of the fifth column
+ // Columns are separated by spaces or tabs; lines with fewer than 6 columns are skipped.
+ SEXP read_tagalign(SEXP filename) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+   Rprintf("fname=%s\n",fname);
+#endif
+
+   // per-chromosome data; the outer index is the chromosome index (cind)
+   vector< vector<int> > pos;    // signed positions
+   vector< vector<int> > posnm;  // value of the fifth column
+
+   // chromosome name -> index map, and the names in order of first appearance
+   hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+   vector<string> cnames;
+
+   typedef boost::tokenizer<boost::char_separator<char> > tokType;
+   boost::char_separator<char> sep(" \t");
+
+   FILE *f=fopen(fname,"rb");
+   if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+   else {
+     Rprintf("opened %s\n",fname);
+
+     string line;
+     vector<string> fld; // columns of the current line, reused between lines
+     int fcount=0;
+     while(get_a_line(f,line)) {
+
+#ifdef DEBUG
+       Rprintf("line: %s\n",line.c_str());
+#endif
+
+       // split the whole line up front so that blank or truncated lines are
+       // rejected instead of advancing the token iterator past its end
+       tokType tok(line, sep);
+       fld.assign(tok.begin(),tok.end());
+       if(fld.size()<6) { continue; }
+
+       // columns: 0 chr, 1 start, 2 end, 4 score, 5 strand (3 is not used)
+       const string& chr=fld[0];
+       const string& str_strand=fld[5];
+
+       int fpos;
+       if(str_strand[0]=='+') {
+         fpos=atoi(fld[1].c_str());
+       } else {
+         fpos=-1*atoi(fld[2].c_str());
+       }
+       int nm=atoi(fld[4].c_str());
+
+       // determine the chromosome index
+       hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+       int cind=-1;
+       if(li==cind_map.end()) {
+         // register new chromosome
+         cind=cnames.size();
+         cnames.push_back(chr);
+         cind_map[chr]=cind;
+         // allocate new per-chromosome vectors
+         pos.push_back(vector<int>());
+         posnm.push_back(vector<int>());
+#ifdef DEBUG
+         Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+       } else {
+         cind=li->second;
+       }
+       fcount++;
+       (pos[cind]).push_back(fpos);
+       (posnm[cind]).push_back(nm);
+#ifdef DEBUG
+       Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
+       if(fcount>30) {
+         break;
+       }
+#endif
+
+     }
+     fclose(f);
+
+     Rprintf("done. read %d fragments\n",fcount);
+   }
+
+   // construct output structures
+   SEXP chnames;
+   PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+   for(size_t ci=0;ci<cnames.size();ci++) {
+     SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+   }
+
+   SEXP ans;
+   PROTECT(ans = allocVector(VECSXP, cnames.size()));
+   for(size_t ci=0;ci<pos.size();ci++) {
+     const vector<int>& cpos=pos[ci];
+     const vector<int>& cnm=posnm[ci];
+
+     SEXP dv,dnames_R;
+     PROTECT(dnames_R = allocVector(STRSXP, 2));
+     SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+     SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+
+     SEXP tv,nv;
+     PROTECT(tv=allocVector(INTSXP,cpos.size()));
+     PROTECT(nv=allocVector(INTSXP,cpos.size()));
+     int* i_tv=INTEGER(tv);
+     int* i_nv=INTEGER(nv);
+
+     // the per-chromosome vectors are filled in lock-step, so they share one length
+     for(size_t i=0;i<cpos.size();i++) {
+       i_tv[i]=cpos[i];
+       i_nv[i]=cnm[i];
+     }
+     PROTECT(dv = allocVector(VECSXP, 2));
+     SET_VECTOR_ELT(dv, 0, tv);
+     SET_VECTOR_ELT(dv, 1, nv);
+     setAttrib(dv, R_NamesSymbol, dnames_R);
+
+     SET_VECTOR_ELT(ans, ci, dv);
+     // dv and everything attached to it is now reachable from the protected ans;
+     // releasing here keeps the protection stack depth independent of the
+     // number of chromosomes
+     UNPROTECT(4); // dnames_R, tv, nv, dv
+   }
+
+   setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+   Rprintf("unprotecting %d elements\n",2);
+#endif
+
+   UNPROTECT(2); // chnames, ans
+   return(ans);
+ }
+
+
+
+
+ // arachne madness
+ //
+ // Reads a whitespace-separated file with "chr position [mismatches]" per line
+ // (regular, or bz2 when built with HAVE_LIBBZ2 and the path contains ".bz2").
+ //
+ // filename - R character value; its first element is the input path
+ //
+ // Returns a list with one element per chromosome (named by chromosome, in order of
+ // first appearance). Each element is a list with components
+ //   t - position (second column, taken as is)
+ //   n - number of mismatches (third column; 0 when the column is absent)
+ // Lines with fewer than 2 columns are skipped.
+ SEXP read_arachne(SEXP filename) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+   Rprintf("fname=%s\n",fname);
+#endif
+
+   // per-chromosome data; the outer index is the chromosome index (cind)
+   vector< vector<int> > pos;    // positions
+   vector< vector<int> > posnm;  // number of mismatches
+
+   // chromosome name -> index map, and the names in order of first appearance
+   hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+   vector<string> cnames;
+
+   typedef boost::tokenizer<boost::char_separator<char> > tokType;
+   boost::char_separator<char> sep(" \t");
+
+   FILE *f=fopen(fname,"rb");
+   if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+   else {
+
+#ifdef HAVE_LIBBZ2
+     // b stays NULL for plain input so it is never used (or closed) uninitialized
+     BZFILE* b=NULL;
+     int bzerror=BZ_OK;
+
+     int bz2file=0;
+     if(strstr(fname,".bz2")) {
+       bz2file=1;
+       b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+       if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
+     }
+#endif
+
+     Rprintf("opened %s\n",fname);
+
+     string line;
+     vector<string> fld; // columns of the current line, reused between lines
+     int fcount=0;
+#ifdef HAVE_LIBBZ2
+     while(get_a_line(f,b,bz2file,line)) {
+#else
+     while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG
+       Rprintf("line: %s\n",line.c_str());
+#endif
+
+       // split the whole line up front so that blank or single-column lines are
+       // rejected instead of advancing the token iterator past its end
+       tokType tok(line, sep);
+       fld.assign(tok.begin(),tok.end());
+       if(fld.size()<2) { continue; }
+
+       // columns: 0 chr, 1 position, 2 mismatches (optional)
+       const string& chr=fld[0];
+       int fpos=atoi(fld[1].c_str());
+       int nm=0;
+       if(fld.size()>2) {
+         nm=atoi(fld[2].c_str());
+       }
+
+       // determine the chromosome index
+       hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+       int cind=-1;
+       if(li==cind_map.end()) {
+         // register new chromosome
+         cind=cnames.size();
+         cnames.push_back(chr);
+         cind_map[chr]=cind;
+         // allocate new per-chromosome vectors
+         pos.push_back(vector<int>());
+         posnm.push_back(vector<int>());
+#ifdef DEBUG
+         Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+       } else {
+         cind=li->second;
+       }
+       fcount++;
+       (pos[cind]).push_back(fpos);
+       (posnm[cind]).push_back(nm);
+#ifdef DEBUG
+       Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
+       if(fcount>30) {
+         break;
+       }
+#endif
+
+     }
+#ifdef HAVE_LIBBZ2
+     if(bz2file && b) { BZ2_bzReadClose( &bzerror, b); }
+#endif
+
+     fclose(f);
+
+     Rprintf("done. read %d fragments\n",fcount);
+   }
+
+   // construct output structures
+   SEXP chnames;
+   PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+   for(size_t ci=0;ci<cnames.size();ci++) {
+     SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+   }
+
+   SEXP ans;
+   PROTECT(ans = allocVector(VECSXP, cnames.size()));
+   for(size_t ci=0;ci<pos.size();ci++) {
+     const vector<int>& cpos=pos[ci];
+     const vector<int>& cnm=posnm[ci];
+
+     SEXP dv,dnames_R;
+     PROTECT(dnames_R = allocVector(STRSXP, 2));
+     SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+     SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+
+     SEXP tv,nv;
+     PROTECT(tv=allocVector(INTSXP,cpos.size()));
+     PROTECT(nv=allocVector(INTSXP,cpos.size()));
+     int* i_tv=INTEGER(tv);
+     int* i_nv=INTEGER(nv);
+
+     // the per-chromosome vectors are filled in lock-step, so they share one length
+     for(size_t i=0;i<cpos.size();i++) {
+       i_tv[i]=cpos[i];
+       i_nv[i]=cnm[i];
+     }
+     PROTECT(dv = allocVector(VECSXP, 2));
+     SET_VECTOR_ELT(dv, 0, tv);
+     SET_VECTOR_ELT(dv, 1, nv);
+     setAttrib(dv, R_NamesSymbol, dnames_R);
+
+     SET_VECTOR_ELT(ans, ci, dv);
+     // dv and everything attached to it is now reachable from the protected ans;
+     // releasing here keeps the protection stack depth independent of the
+     // number of chromosomes
+     UNPROTECT(4); // dnames_R, tv, nv, dv
+   }
+
+   setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+   Rprintf("unprotecting %d elements\n",2);
+#endif
+
+   UNPROTECT(2); // chnames, ans
+   return(ans);
+ }
+
+
+ // arachne madness (long format)
+ //
+ // Reads whitespace-separated records whose first column is "QUERY"
+ // (regular, or bz2 when built with HAVE_LIBBZ2 and the path contains ".bz2").
+ // Lines whose first column is not "QUERY" are ignored.
+ //
+ // filename - R character value; its first element is the input path
+ //
+ // Returns a list with one element per chromosome (named by chromosome, in order of
+ // first appearance). Each element is a list with components
+ //   t - signed position: -(end position) when the strand column starts with '1',
+ //       the start position otherwise
+ //   n - mismatch tally: sum over blocks of |sgs| + snm, plus (number of blocks - 1)
+ //   l - total length: sum of the block lengths
+ // QUERY lines that are too short for their fixed columns or for the number of
+ // blocks they announce are skipped.
+ SEXP read_arachne_long(SEXP filename) {
+
+#ifdef DEBUG
+   Rprintf("start\n");
+#endif
+   const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG
+   Rprintf("fname=%s\n",fname);
+#endif
+
+   // per-chromosome data; the outer index is the chromosome index (cind)
+   vector< vector<int> > pos;     // signed positions
+   vector< vector<int> > posnm;   // number of mismatches
+   vector< vector<int> > poslen;  // length of the match
+
+   // chromosome name -> index map, and the names in order of first appearance
+   hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+   vector<string> cnames;
+
+   typedef boost::tokenizer<boost::char_separator<char> > tokType;
+   boost::char_separator<char> sep(" \t");
+
+   FILE *f=fopen(fname,"rb");
+   if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
+   else {
+
+#ifdef HAVE_LIBBZ2
+     // b stays NULL for plain input so it is never used (or closed) uninitialized
+     BZFILE* b=NULL;
+     int bzerror=BZ_OK;
+
+     int bz2file=0;
+     if(strstr(fname,".bz2")) {
+       bz2file=1;
+       b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+       if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
+     }
+#endif
+
+     Rprintf("opened %s\n",fname);
+
+     string line;
+     vector<string> fld; // columns of the current line, reused between lines
+     int fcount=0;
+#ifdef HAVE_LIBBZ2
+     while(get_a_line(f,b,bz2file,line)) {
+#else
+     while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG
+       Rprintf("line: %s\n",line.c_str());
+#endif
+
+       // split the whole line up front so that truncated records are rejected
+       // instead of advancing the token iterator past its end
+       tokType tok(line, sep);
+       fld.assign(tok.begin(),tok.end());
+       if(fld.empty() || fld[0]!="QUERY") { continue; }
+       // fixed columns: 5 strand, 6 chr, 7 start, 8 end, 10 number of blocks
+       if(fld.size()<11) { continue; }
+
+       const string& str_strand=fld[5];
+       const string& chr=fld[6];
+
+       int fpos;
+       if(str_strand[0]=='1') {
+         fpos=-1*atoi(fld[8].c_str());
+       } else {
+         fpos=atoi(fld[7].c_str());
+       }
+#ifdef DEBUG
+       Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos);
+#endif
+       int nblocks=atoi(fld[10].c_str());
+#ifdef DEBUG
+       Rprintf("nblocks=%d\n",nblocks);
+#endif
+       // every block occupies three columns (sgs, slen, snm) after the block count
+       if(nblocks>0 && static_cast<size_t>(nblocks)>(fld.size()-11)/3) { continue; }
+
+       // tally up the read length and the number of mismatches for all blocks
+       int len=0; int nm=0;
+       for(int i=0;i<nblocks;i++) {
+         int sgs=atoi(fld[11+3*i].c_str());
+         int slen=atoi(fld[12+3*i].c_str());
+         int snm=atoi(fld[13+3*i].c_str());
+#ifdef DEBUG
+         Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm);
+#endif
+         len+=slen;
+         nm+=abs(sgs)+snm;
+       }
+       nm+=nblocks-1;
+
+       // determine the chromosome index
+       hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+       int cind=-1;
+       if(li==cind_map.end()) {
+         // register new chromosome
+         cind=cnames.size();
+         cnames.push_back(chr);
+         cind_map[chr]=cind;
+         // allocate new per-chromosome vectors
+         pos.push_back(vector<int>());
+         posnm.push_back(vector<int>());
+         poslen.push_back(vector<int>());
+#ifdef DEBUG
+         Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+       } else {
+         cind=li->second;
+       }
+       fcount++;
+       (pos[cind]).push_back(fpos);
+       (posnm[cind]).push_back(nm);
+       (poslen[cind]).push_back(len);
+#ifdef DEBUG
+       Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len);
+       if(fcount>30) {
+         break;
+       }
+#endif
+
+     }
+#ifdef HAVE_LIBBZ2
+     if(bz2file && b) { BZ2_bzReadClose( &bzerror, b); }
+#endif
+
+     fclose(f);
+
+     Rprintf("done. read %d fragments\n",fcount);
+   }
+
+   // construct output structures
+   SEXP chnames;
+   PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+   for(size_t ci=0;ci<cnames.size();ci++) {
+     SET_STRING_ELT(chnames, ci, mkChar(cnames[ci].c_str()));
+   }
+
+   SEXP ans;
+   PROTECT(ans = allocVector(VECSXP, cnames.size()));
+   for(size_t ci=0;ci<pos.size();ci++) {
+     const vector<int>& cpos=pos[ci];
+     const vector<int>& cnm=posnm[ci];
+     const vector<int>& clen=poslen[ci];
+
+     SEXP dv,dnames_R;
+     PROTECT(dnames_R = allocVector(STRSXP, 3));
+     SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+     SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+     SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+
+     SEXP tv,nv,lv;
+     PROTECT(tv=allocVector(INTSXP,cpos.size()));
+     PROTECT(nv=allocVector(INTSXP,cpos.size()));
+     PROTECT(lv=allocVector(INTSXP,cpos.size()));
+     int* i_tv=INTEGER(tv);
+     int* i_nv=INTEGER(nv);
+     int* i_lv=INTEGER(lv);
+
+     // the per-chromosome vectors are filled in lock-step, so they share one length
+     for(size_t i=0;i<cpos.size();i++) {
+       i_tv[i]=cpos[i];
+       i_nv[i]=cnm[i];
+       i_lv[i]=clen[i];
+     }
+     PROTECT(dv = allocVector(VECSXP, 3));
+     SET_VECTOR_ELT(dv, 0, tv);
+     SET_VECTOR_ELT(dv, 1, nv);
+     SET_VECTOR_ELT(dv, 2, lv);
+     setAttrib(dv, R_NamesSymbol, dnames_R);
+
+     SET_VECTOR_ELT(ans, ci, dv);
+     // dv and everything attached to it is now reachable from the protected ans;
+     // releasing here keeps the protection stack depth independent of the
+     // number of chromosomes
+     UNPROTECT(5); // dnames_R, tv, nv, lv, dv
+   }
+
+   setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+   Rprintf("unprotecting %d elements\n",2);
+#endif
+
+   UNPROTECT(2); // chnames, ans
+   return(ans);
+ }
+
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/cdensum.c b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/cdensum.c
new file mode 100755
index 0000000..fdf3138
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/cdensum.c
@@ -0,0 +1,144 @@
+#include <math.h>
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+
+
+#undef DEBUG 1
+
+// Add a Gaussian-weighted contribution of every tag position to a binned profile.
+// n     - number of positions in pos (and length of tc count array)
+// pos   - tag positions
+// tc    - tag count for each position (converted to int before use)
+// spos  - starting position; output bin j is evaluated at offset j*step from spos
+// bw    - bandwidth of the Gaussian (in position units)
+// dw    - half-width of the contribution window, in multiples of bw and of the tag count
+// npos  - length of dout
+// step  - distance between consecutive output bins
+// dout  - npos-length output array; contributions are added to its current content
+void cdensum(int *n, double *pos, double *tc, double *spos, int *bw,int *dw, int *npos, int *step,double *dout)
+{
+  int i,j;
+  double dbw=(double) *bw;
+  double dstep=(double) *step;
+
+  // a zero step cannot be mapped onto bins (and would divide by zero below)
+  if(*step==0) { return; }
+
+  for(i = 0; i< *n; i++) {
+    // size of the window to which the contributions should be added
+    int in=(int) (pos[i]- *spos);
+    int ic=tc[i];
+    int whs=(*dw)*(*bw)*ic;
+    // divide in floating point: with integer division floor()/ceil() had nothing
+    // left to round, so the window end was truncated instead of rounded up
+    int ws=(int) floor((in-whs)/dstep);
+    int we=(int) ceil((in+whs)/dstep);
+    if(ws<0) { ws=0; }
+    // NOTE(review): with this clamp and the j<we loop below, bin npos-1 is never written - confirm intent
+    if(we>= *npos) { we= *npos -1; }
+
+    for(j=ws;j<we;j++) {
+      double beta=((double)(j*(*step)-in))/dbw;
+      dout[j]+=((double)ic)*exp(-0.5*beta*beta);
+    }
+  }
+}
+
+
+// Sliding-window tag counts.
+// Windows of width window_size are moved by window_step; the first window is
+// placed so that its left edge is at spos - window_size/2 (integer division).
+// A tag is counted when left edge <= position <= right edge.
+// n    - number of positions in the sorted tag array pos (positive only)
+// spos - starting position
+// npos - number of windows to evaluate
+// dout - npos-length output array that receives the count for each window
+void window_n_tags(int *n, double *pos, double *spos, int *window_size, int *window_step, int *npos, int *dout)
+{
+  int w;
+  int lo=0;  // index of the first tag that has not dropped off the left edge
+  int hi=0;  // index one past the last tag that entered through the right edge
+  double left=*spos-(*window_size)/2;
+  for(w=0;w<*npos;w++) {
+    double right=left+(*window_size);
+    // take in tags up to and including the right edge
+    for(;hi<(*n) && pos[hi]<=right;hi++) { }
+    // drop tags that lie strictly left of the window
+    for(;lo<*n && pos[lo]<left;lo++) { }
+    // entered minus dropped is the number of tags currently inside
+    dout[w]=hi-lo;
+    left+=*window_step;
+  }
+}
+
+// Sliding-window tag counts (.Call interface).
+// pos_R         - tag positions (positive, pre-shifted), read as a double vector
+// spos_R        - starting position (double)
+// window_size_R - window width (integer)
+// window_step_R - distance between consecutive windows (integer)
+// nsteps_R      - number of windows (integer)
+// returns an nsteps-length integer vector with the tag count of each window
+SEXP cwindow_n_tags(SEXP pos_R, SEXP spos_R, SEXP window_size_R, SEXP window_step_R, SEXP nsteps_R) {
+  double* tag_pos=REAL(pos_R);
+  int n_tags=LENGTH(pos_R);
+  int width=*INTEGER(window_size_R);
+  int stride=*INTEGER(window_step_R);
+  int n_windows=*INTEGER(nsteps_R);
+  double start=*REAL(spos_R);
+
+  // allocate return array
+  SEXP counts_R;
+  PROTECT(counts_R=allocVector(INTSXP,n_windows));
+
+  // the sweep is the same one window_n_tags performs on plain arrays
+  window_n_tags(&n_tags,tag_pos,&start,&width,&stride,&n_windows,INTEGER(counts_R));
+
+  UNPROTECT(1);
+  return(counts_R);
+}
+
+// Tag counts in windows centred on the given positions.
+// pos_R              - tag positions (double vector)
+// ntags_R            - number of tags at each position (integer vector)
+// wpos_R             - window centre positions (double vector)
+// window_half_size_R - half-width of every window (integer)
+// returns an integer vector with one entry per window position: the sum of ntags over
+// the tag positions that lie within window_half_size of that window position.
+// Both index cursors only move forward while the windows are visited in order.
+SEXP cwindow_n_tags_around(SEXP pos_R, SEXP ntags_R, SEXP wpos_R, SEXP window_half_size_R) {
+  double* tag_pos=REAL(pos_R);
+  int* tag_cnt=INTEGER(ntags_R);
+  int n_tags=LENGTH(pos_R);
+  double* centre=REAL(wpos_R);
+  int n_win=LENGTH(wpos_R);
+  double half=(double) *INTEGER(window_half_size_R);
+
+  // allocate return array
+  SEXP out_R;
+  PROTECT(out_R=allocVector(INTSXP,n_win));
+  int* out=INTEGER(out_R);
+
+  int w;
+  int lo=0;     // first tag not yet dropped on the left
+  int hi=0;     // one past the last tag taken in on the right
+  int total=0;  // running tag count of the current window
+  for(w=0;w<n_win;w++) {
+    double right=centre[w]+half;
+    double left=centre[w]-half;
+    // take in tags up to and including the right edge
+    for(;hi<n_tags && tag_pos[hi]<=right;hi++) {
+      total+=tag_cnt[hi];
+    }
+    // drop tags strictly left of the window
+    for(;lo<n_tags && tag_pos[lo]<left;lo++) {
+      total-=tag_cnt[lo];
+    }
+    out[w]=total;
+  }
+  UNPROTECT(1);
+  return(out_R);
+}
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/const.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/const.h
new file mode 100755
index 0000000..2a06313
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/const.h
@@ -0,0 +1,18 @@
+/* Shared integer typedefs and nucleotide lookup-table declarations. */
+#ifndef NST_CONST_H
+#define NST_CONST_H
+
+/* all 64 bits set */
+#define MAX_ULL 0xffffffffffffffffull
+
+/* Fixed-width style aliases. The names state the intended width; the actual
+   width depends on the platform's unsigned types. */
+typedef unsigned long long bit64_t;
+typedef unsigned bit32_t;
+typedef unsigned short bit16_t;
+typedef unsigned char bit8_t;
+
+/* Lookup tables defined in another translation unit.
+   NOTE(review): judging by the names these convert between characters and
+   4-/16-letter nucleotide codes - confirm against the defining source file. */
+extern bit8_t nst_nt4_table[];
+extern bit8_t nst_nt16_table[];
+extern char *nst_nt4_rev_table;
+extern char *nst_nt16_rev_table;
+extern bit8_t nst_nt16_nt4_table[];
+extern int nst_nt16_count_table[];
+
+#endif
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqmap.c b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqmap.c
new file mode 100755
index 0000000..96b4fff
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqmap.c
@@ -0,0 +1,164 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <unistd.h>
+#include "const.h"
+#include "maqmap.h"
+
+/* Allocate a zero-initialised map header marked with the new map format.
+ * Returns NULL when the allocation fails. Release with maq_delete_maqmap(). */
+maqmap_t *maq_new_maqmap()
+{
+    maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t));
+    if (mm == 0) return 0; /* do not write through a failed allocation */
+    mm->format = MAQMAP_FORMAT_NEW;
+    return mm;
+}
+/* Free a map header together with its reference names and mapped-read array.
+ * Passing NULL is allowed and does nothing. */
+void maq_delete_maqmap(maqmap_t *mm)
+{
+    int idx = 0;
+    if (!mm) return;
+    /* the names are owned individually, so release them before the table */
+    while (idx < mm->n_ref) {
+        free(mm->ref_name[idx]);
+        ++idx;
+    }
+    free(mm->ref_name);
+    free(mm->mapped_reads);
+    free(mm);
+}
+/* Write the map header to fp: format tag, number of references, then for each
+ * reference its byte length (including the terminating NUL) followed by the
+ * name itself, and finally the number of mapped reads. */
+void maqmap_write_header(gzFile fp, const maqmap_t *mm)
+{
+    int idx;
+    gzwrite(fp, &mm->format, sizeof(int));
+    gzwrite(fp, &mm->n_ref, sizeof(int));
+    for (idx = 0; idx != mm->n_ref; ++idx) {
+        const char *ref = mm->ref_name[idx];
+        int nbytes = strlen(ref) + 1; /* the NUL is stored in the file as well */
+        gzwrite(fp, &nbytes, sizeof(int));
+        gzwrite(fp, ref, nbytes);
+    }
+    gzwrite(fp, &mm->n_mapped_reads, sizeof(bit64_t));
+}
+/* Report an unusable map header and terminate, in the same way the
+ * obsolete-format check in maqmap_read_header() does. */
+static void maqmap_header_fail(const char *msg)
+{
+    fprintf(stderr, "** %s\n", msg);
+    exit(3);
+}
+/* Read a map header from fp (the layout written by maqmap_write_header()).
+ * Returns a newly allocated header; release it with maq_delete_maqmap().
+ * Terminates the process with exit code 3 when the file uses the obsolete
+ * format or when the reference-name table is corrupted or truncated. */
+maqmap_t *maqmap_read_header(gzFile fp)
+{
+    maqmap_t *mm;
+    int k, len;
+    mm = maq_new_maqmap();
+    if (mm == 0) maqmap_header_fail("Out of memory while reading the map header.");
+    gzread(fp, &mm->format, sizeof(int));
+    if (mm->format != MAQMAP_FORMAT_NEW) {
+        if (mm->format > 0) {
+            fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n");
+            exit(3);
+        }
+        assert(mm->format == MAQMAP_FORMAT_NEW);
+    }
+    gzread(fp, &mm->n_ref, sizeof(int));
+    /* n_ref comes straight from the file: a negative count would make the
+       loop below run away and turns into a huge allocation request */
+    if (mm->n_ref < 0)
+        maqmap_header_fail("Corrupted map header: negative number of reference sequences.");
+    mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*));
+    if (mm->n_ref > 0 && mm->ref_name == 0)
+        maqmap_header_fail("Out of memory while reading the map header.");
+    for (k = 0; k != mm->n_ref; ++k) {
+        /* the stored length includes the terminating NUL, so it is at least 1 */
+        if (gzread(fp, &len, sizeof(int)) != (int)sizeof(int) || len <= 0)
+            maqmap_header_fail("Corrupted map header: invalid reference name length.");
+        mm->ref_name[k] = (char*)malloc(len * sizeof(char));
+        if (mm->ref_name[k] == 0)
+            maqmap_header_fail("Out of memory while reading the map header.");
+        if (gzread(fp, mm->ref_name[k], len) != len)
+            maqmap_header_fail("Truncated map header: incomplete reference name.");
+        /* never hand out an unterminated name, whatever the file contains */
+        mm->ref_name[k][len - 1] = '\0';
+    }
+    /* read number of mapped reads */
+    gzread(fp, &mm->n_mapped_reads, sizeof(bit64_t));
+    return mm;
+}
+
+/* mapvalidate */
+
+/* Read a whole map file from fpin and print a validation report to stdout:
+ * the number of reference sequences, the first fatal problem found (truncated
+ * record, reference index out of range, read size out of range), a warning when
+ * the header's read count disagrees with the number of records read, and the
+ * number of records seen per reference. */
+static void mapvalidate_core(gzFile fpin)
+{
+    maqmap_t *m = maqmap_read_header(fpin);
+    maqmap1_t *m1, mm1;
+    bit64_t n = 0;
+    int i, l;
+    bit64_t *cnt;
+    m1 = &mm1;
+    /* always request at least one slot so that NULL can only mean out of memory */
+    cnt = (bit64_t*)calloc(m->n_ref > 0 ? m->n_ref : 1, sizeof(bit64_t));
+    if (cnt == 0) {
+        printf("[fatal error] out of memory.\n");
+        maq_delete_maqmap(m);
+        return;
+    }
+    printf("[message] number of reference sequences: %d\n", m->n_ref);
+    while ((l = maqmap_read1(fpin, m1)) != 0) {
+        if (l != sizeof(maqmap1_t)) {
+            printf("[fatal error] truncated map file.\n");
+            break;
+        }
+        ++n;
+        /* compare as unsigned: casting seqid to int let values >= 2^31 pass
+           the check and index far outside cnt[] */
+        if (m->n_ref <= 0 || m1->seqid >= (bit32_t)m->n_ref) {
+            printf("[fatal error] maqmap1_t::seqid is invalid (%u >= %d).\n", m1->seqid, m->n_ref);
+            break;
+        }
+        ++cnt[m1->seqid];
+        if (m1->size >= MAX_READLEN - 1) {
+            printf("[fatal error] maqmap1_t::size is invalid (%d >= %d).\n", m1->size, MAX_READLEN - 1);
+            break;
+        }
+    }
+    if (m->n_mapped_reads != 0) {
+        if (m->n_mapped_reads != n) {
+            printf("[warning] maqmap1_t::n_mapped_reads is set, but not equals the real number (%llu != %llu).\n",
+                   m->n_mapped_reads, n);
+        }
+    }
+    for (i = 0; i < m->n_ref; ++i)
+        printf("[message] %s : %llu\n", m->ref_name[i], cnt[i]);
+    free(cnt);
+    maq_delete_maqmap(m);
+}
+
+/* mapview */
+
+/* Print every record of the map read from fpin as one tab-separated line on fpout.
+ * Columns: read name, reference name, 1-based position ((pos>>1)+1), strand
+ * ('-' when the low bit of pos is set, '+' otherwise), dist, flag, map_qual,
+ * single-end quality (last byte of seq), alt_qual, low 4 bits of info1, info2,
+ * c[0], c[1], size.
+ * is_verbose - also print the read bases and the quality string
+ * is_mm      - also print, in hex, the 64-bit word stored at byte 55 of seq
+ * Reading stops at the first incomplete record, read error, or record whose
+ * reference index is outside the header's reference table. */
+static void mapview_core(FILE *fpout, gzFile fpin, int is_verbose, int is_mm)
+{
+    bit32_t j, nbases;
+    maqmap_t *m = maqmap_read_header(fpin);
+    maqmap1_t *m1, mm1;
+    m1 = &mm1;
+    /* gzread() returns -1 on error and a short count on a truncated file;
+       only a full record may be printed */
+    while (maqmap_read1(fpin, m1) == (int)sizeof(maqmap1_t)) {
+        if (m->n_ref <= 0 || m1->seqid >= (bit32_t)m->n_ref) {
+            fprintf(stderr, "[fatal error] maqmap1_t::seqid is invalid (%u >= %d).\n", m1->seqid, m->n_ref);
+            break;
+        }
+        /* name is a fixed-size field; bound the print in case it is not terminated */
+        fprintf(fpout, "%.*s\t%s\t%d\t%c\t%d\t%u\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d",
+                MAX_NAMELEN, m1->name, m->ref_name[m1->seqid], (m1->pos>>1) + 1,
+                (m1->pos&1)? '-' : '+', m1->dist, m1->flag, m1->map_qual, (signed char)m1->seq[MAX_READLEN-1],
+                m1->alt_qual, m1->info1&0xf, m1->info2, m1->c[0], m1->c[1], m1->size);
+        if (is_verbose) {
+            /* the last byte of seq holds a quality value, so at most
+               MAX_READLEN-1 bases exist; never walk past the array */
+            nbases = m1->size < MAX_READLEN ? m1->size : MAX_READLEN - 1;
+            fputc('\t', fpout);
+            for (j = 0; j != nbases; ++j) {
+                if (m1->seq[j] == 0) fputc('n', fpout);
+                else if ((m1->seq[j]&0x3f) < 27) fputc("acgt"[m1->seq[j]>>6&3], fpout);
+                else fputc("ACGT"[m1->seq[j]>>6&3], fpout);
+            }
+            fputc('\t', fpout);
+            for (j = 0; j != nbases; ++j)
+                fputc((m1->seq[j]&0x3f) + 33, fpout);
+        }
+        if (is_mm) {
+            /* copy instead of casting: seq + 55 is not suitably aligned for bit64_t */
+            bit64_t mm_bits;
+            memcpy(&mm_bits, m1->seq + 55, sizeof(mm_bits));
+            fprintf(fpout, "\t%llx", mm_bits);
+        }
+        fputc('\n', fpout);
+    }
+    maq_delete_maqmap(m);
+}
+
+/* Command-line entry point: "maq mapview [-bN] <in.map>".
+ * -b  do not print the read bases and qualities
+ * -N  also print the 64-bit word stored in the sequence field
+ * "-" as file name reads from standard input.
+ * Returns 0 on success, 1 on a usage error or when the input cannot be opened. */
+int ma_mapview(int argc, char *argv[])
+{
+    int c, is_verbose = 1, is_mm = 0;
+    gzFile fp;
+    while ((c = getopt(argc, argv, "bN")) >= 0) {
+        switch (c) {
+        case 'b': is_verbose = 0; break;
+        case 'N': is_mm = 1; break;
+        }
+    }
+    if (optind >= argc) {
+        fprintf(stderr, "Usage: maq mapview [-bN] <in.map>\n");
+        return 1;
+    }
+    fp = (strcmp(argv[optind], "-") == 0)? gzdopen(STDIN_FILENO, "r") : gzopen(argv[optind], "r");
+    if (fp == 0) {
+        /* gzopen()/gzdopen() return NULL on failure; reading from it would never end cleanly */
+        fprintf(stderr, "Cannot open map file '%s'.\n", argv[optind]);
+        return 1;
+    }
+    mapview_core(stdout, fp, is_verbose, is_mm);
+    gzclose(fp);
+    return 0;
+}
+
+/* Command-line entry point: "maq mapvalidate <in.map>".
+ * "-" as file name reads from standard input.
+ * Returns 0 on success, 1 on a usage error or when the input cannot be opened. */
+int ma_mapvalidate(int argc, char *argv[])
+{
+    gzFile fp;
+    if (argc < 2) {
+        fprintf(stderr, "Usage: maq mapvalidate <in.map>\n");
+        return 1;
+    }
+    /* no options are parsed here, so the file name is always argv[1]; using
+       optind for the "-" test depended on getopt() state left by other commands */
+    fp = (strcmp(argv[1], "-") == 0)? gzdopen(STDIN_FILENO, "r") : gzopen(argv[1], "r");
+    if (fp == 0) {
+        fprintf(stderr, "Cannot open map file '%s'.\n", argv[1]);
+        return 1;
+    }
+    mapvalidate_core(fp);
+    gzclose(fp);
+    return 0;
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqmap.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqmap.h
new file mode 100755
index 0000000..9beba0c
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqmap.h
@@ -0,0 +1,70 @@
+#ifndef MAQMAP_H_
+#define MAQMAP_H_
+
+#ifdef MAQ_LONGREADS
+# define MAX_READLEN 128
+#else
+# define MAX_READLEN 64
+#endif
+
+#define MAX_NAMELEN 36
+#define MAQMAP_FORMAT_OLD 0
+#define MAQMAP_FORMAT_NEW -1
+
+#define PAIRFLAG_FF 0x01
+#define PAIRFLAG_FR 0x02
+#define PAIRFLAG_RF 0x04
+#define PAIRFLAG_RR 0x08
+#define PAIRFLAG_PAIRED 0x10
+#define PAIRFLAG_DIFFCHR 0x20
+#define PAIRFLAG_NOMATCH 0x40
+#define PAIRFLAG_SW 0x80
+
+#include <string.h>
+#include <zlib.h>
+#include "const.h"
+
+/*
+ One alignment record of a maq map file. maqmap_read1() reads it from the
+ stream as a raw block of sizeof(maqmap1_t) bytes.
+
+ name: read name
+ size: the length of the read
+ seq: read sequence (see also below); mapview prints the base from the top two
+      bits of each byte and the quality from the low six bits
+ seq[MAX_READLEN-1]: single-end mapping quality (equal to map_qual if not paired)
+ map_qual: the final mapping quality
+ alt_qual: the lower quality of the two ends (equal to map_qual if not paired)
+ flag: status of the pair
+ dist: offset of the mate (zero if not paired)
+ info1: mismatches in the 24bp (higher 4 bits) and mismatches (lower 4 bits)
+ info2: sum of errors of the best hit
+ c[2]: count of all 0- and 1-mismatch hits on the reference
+ seqid: index into maqmap_t::ref_name
+ pos: mapview prints (pos>>1)+1 as the coordinate and '-' as the strand when
+      the lowest bit is set
+ */
+typedef struct
+{
+ bit8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. */
+ bit8_t size, map_qual, info1, info2, c[2], flag, alt_qual;
+ bit32_t seqid, pos;
+ int dist;
+ char name[MAX_NAMELEN];
+} maqmap1_t;
+
+/*
+ In-memory map header as returned by maqmap_read_header() and released with
+ maq_delete_maqmap().
+
+ format: presumably one of the MAQMAP_FORMAT_* constants - TODO confirm
+ n_ref: presumably the number of entries in ref_name - TODO confirm
+ ref_name: reference sequence names, indexed by maqmap1_t::seqid
+ n_mapped_reads, mapped_reads: not used by the code in this header; presumably
+      the record count and an optional in-memory record array - TODO confirm
+ */
+typedef struct
+{
+ int format, n_ref;
+ char **ref_name;
+ bit64_t n_mapped_reads;
+ maqmap1_t *mapped_reads;
+} maqmap_t;
+
+/* Read one raw record from gzip stream fp into *m1. Expands to a single
+   gzread() call for sizeof(maqmap1_t) bytes and passes its result through
+   unchanged, so callers that need a complete record should compare the result
+   against sizeof(maqmap1_t) rather than test it for non-zero. */
+#define maqmap_read1(fp, m1) gzread((fp), (m1), sizeof(maqmap1_t))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ maqmap_t *maq_new_maqmap();
+ void maq_delete_maqmap(maqmap_t *mm);
+ void maqmap_write_header(gzFile fp, const maqmap_t *mm);
+ maqmap_t *maqmap_read_header(gzFile fp);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqread.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqread.cpp
new file mode 100755
index 0000000..dcc992a
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/maqread.cpp
@@ -0,0 +1,209 @@
+#include "pc.h"
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <strstream>
+#include <algorithm>
+#include <string>
+#include <functional>
+#include <utility>
+#include <zlib.h>
+
+extern "C" {
+// pliu 20160911
+//#include "R.h"
+//#include "Rmath.h"
+//////
+#include "Rinternals.h"
+#include "Rdefines.h"
+#include "maqmap.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+
+
+// Comparison functor ordering integers by magnitude, ignoring their sign.
+class lessAbsoluteValue {
+public:
+  // True when |lhs| is strictly smaller than |rhs|.
+  bool operator()(int lhs, int rhs) const {
+    const int lhs_magnitude = abs(lhs);
+    const int rhs_magnitude = abs(rhs);
+    return lhs_magnitude < rhs_magnitude;
+  }
+};
+
+
+
+//#define DEBUG 1
+
+extern "C" {
+
+  // Read a binary maq map file (gzip stream) into per-reference tag vectors.
+  //
+  // filename         - R character value holding the path of the map file
+  // read_tag_names_R - integer flag; when non-zero the read names are returned too
+  //
+  // Returns a list named by reference sequence, in order of first appearance.
+  // Each element is a list with
+  //   t - signed tag positions: (pos>>1)+1 when the low bit of pos is clear,
+  //       -((pos>>1)+1+size-1) when it is set
+  //   n - mismatch counts (low four bits of info1)
+  //   s - read names (only when read_tag_names_R is non-zero)
+  // If the file cannot be opened a message is printed and an empty list is returned.
+  SEXP read_binmaqmap(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    const char* fname=CHAR(asChar(filename));
+    int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG
+    Rprintf("fname=%s\n",fname);
+#endif
+
+    // main data vectors, indexed by chromosome index
+    vector< vector<int> > pos;
+    vector< vector<int> > posnm; // number of mismatches
+    vector< vector<string> > tagnames;
+
+    // chromosome name -> index map, and the names in index order
+    hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+    vector<string> cnames;
+
+    gzFile f=gzopen(fname,"r");
+
+    if (!f) {
+      cout<<"can't open input file \""<<fname<<"\"\n";
+    } else {
+      Rprintf("opened %s\n",fname);
+
+      // The header is read only once the stream is known to be valid.
+      maqmap_t *m = maqmap_read_header(f);
+      maqmap1_t mm1;
+      maqmap1_t *m1 = &mm1;
+
+      int fcount=0;
+      // Accept complete records only; a short read or a read error ends the loop.
+      while(maqmap_read1(f, m1) == (int)sizeof(maqmap1_t)) {
+        string tagname=string(m1->name);
+        string chr=string(m->ref_name[m1->seqid]);
+        int len=m1->size;
+        int fpos=(m1->pos>>1) + 1;
+        if(m1->pos&1) {
+          // low bit set: store the negated last covered coordinate
+          fpos=-1*(fpos+len-1);
+        }
+        int nm=m1->info1&0xf;
+
+#ifdef DEBUG
+        Rprintf("read in map line chr=%s tagname=%s fpos=%d, nm=%d, len=%d\n",chr.c_str(),tagname.c_str(),fpos,nm,len);
+#endif
+
+        // determine the chromosome index
+        hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+        int cind=-1;
+        if(li==cind_map.end()) {
+          // register new chromosome
+          cind=cnames.size();
+          cnames.push_back(chr);
+          cind_map[chr]=cind;
+          // allocate new pos vector
+          pos.push_back(vector<int>());
+          posnm.push_back(vector<int>());
+          if(read_names) {
+            tagnames.push_back(vector<string>());
+          }
+#ifdef DEBUG
+          Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,(int)pos.size());
+#endif
+        } else {
+          cind=li->second;
+        }
+        fcount++;
+        (pos[cind]).push_back(fpos);
+        (posnm[cind]).push_back(nm);
+        if(read_names) {
+          (tagnames[cind]).push_back(tagname);
+        }
+#ifdef DEBUG
+        Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+        if(fcount>30) {
+          break;
+        }
+#endif
+
+      }
+      gzclose(f);
+      // release the header, as mapview_core does after its read loop
+      maq_delete_maqmap(m);
+      Rprintf("done. read %d fragments\n",fcount);
+    }
+
+
+    // construct output structures
+    SEXP chnames;
+    int np=0; // number of long-lived protections
+    PROTECT(chnames = allocVector(STRSXP, cnames.size())); np++;
+    for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+      SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+    }
+
+    SEXP ans;
+    PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
+    for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+      const size_t chr_index=csi-pos.begin();
+      vector<vector<int> >::const_iterator nsi=posnm.begin()+chr_index;
+      int lnp=0; // protections taken for this chromosome only
+
+      SEXP dnames_R;
+      PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); lnp++;
+      SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+      SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+      if(read_names) {
+        SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+      }
+
+      SEXP tv,nv,sv=R_NilValue;
+      PROTECT(tv=allocVector(INTSXP,csi->size())); lnp++;
+      PROTECT(nv=allocVector(INTSXP,csi->size())); lnp++;
+      if(read_names) {
+        PROTECT(sv=allocVector(STRSXP,csi->size())); lnp++;
+      }
+      int* i_tv=INTEGER(tv);
+      int* i_nv=INTEGER(nv);
+
+      int i=0;
+      vector<int>::const_iterator ini=nsi->begin();
+      for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+        i_tv[i]=*pi;
+        i_nv[i]=*ini++;
+        i++;
+      }
+      if(read_names) {
+        int k=0;
+        vector<vector<string> >::const_iterator ssi=tagnames.begin()+chr_index;
+        for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+          SET_STRING_ELT(sv,k,mkChar(si->c_str()));
+          k++;
+        }
+      }
+
+      SEXP dv;
+      PROTECT(dv = allocVector(VECSXP, 2+read_names)); lnp++;
+      SET_VECTOR_ELT(dv, 0, tv);
+      SET_VECTOR_ELT(dv, 1, nv);
+      if(read_names) {
+        SET_VECTOR_ELT(dv, 2, sv);
+      }
+      setAttrib(dv, R_NamesSymbol, dnames_R);
+
+      SET_VECTOR_ELT(ans, chr_index, dv);
+      // dv and everything attached to it is now reachable from the protected
+      // ans, so release this chromosome's protections; otherwise the
+      // protection stack would grow with the number of reference sequences.
+      UNPROTECT(lnp);
+    }
+
+    setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG
+    Rprintf("unprotecting %d elements\n",np);
+#endif
+
+    UNPROTECT(np);
+    return(ans);
+}
+
+
+}
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/pc.h b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/pc.h
new file mode 100755
index 0000000..8be0911
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/pc.h
@@ -0,0 +1,20 @@
+#ifndef PC_H
+#define PC_H 1
+#include <functional>
+//#include <hash_map.h>
+#include <ext/hash_set>
+#include <ext/hash_map>
+
+
+// Adds a std::string specialization of __gnu_cxx::hash so that std::string can
+// be used as the key type of the hash_map / hash_set containers included above.
+namespace __gnu_cxx
+{
+ template<> struct hash< std::string >
+ {
+ // Delegates to the existing const char* hash applied to x.c_str().
+ size_t operator()( const std::string& x ) const
+ {
+ return hash< const char* >()( x.c_str() );
+ }
+ };
+}
+
+#endif
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/peaks.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/peaks.cpp
new file mode 100755
index 0000000..095e8f5
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/peaks.cpp
@@ -0,0 +1,808 @@
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <string>
+#include <set>
+
+extern "C" {
+// pliu 20160911
+//#include "R.h"
+//#include "Rmath.h"
+#include <stdlib.h>
+#include <math.h>
+//////
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+
+/**
+ * Calculate all local peaks
+ */
+
+//#define DEBUG 1
+
+extern "C" {
+  // Find local maxima of a numeric vector.
+  //
+  // x_R        - numeric vector of values
+  // thr_R      - numeric; a maximum must be >= this value
+  // max_span_R - integer; when > 2, maxima lying within max_span positions of
+  //              the currently tracked maximum are merged and only the highest
+  //              of them is kept; otherwise every maximum is reported
+  //
+  // A position i (neither the first nor the last) is a maximum when x[i]
+  // exceeds x[i+1] and the most recent preceding value that differed from its
+  // successor, so a plateau is reported at its last position.
+  // Returns an integer vector of 1-based positions.
+  SEXP find_peaks(SEXP x_R,SEXP thr_R,SEXP max_span_R) {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    double* x=REAL(x_R);
+    int nx=LENGTH(x_R);
+    int max_span=*INTEGER(max_span_R);
+    double thr=REAL(thr_R)[0];
+#ifdef DEBUG
+    Rprintf("n=%d; thr=%f; max_span=%d\n",nx,thr,max_span);
+#endif
+
+    vector<int> pos;
+
+    // An empty input has no x[0]; the scan loop below does not run in that case.
+    double pv=(nx>0) ? x[0] : 0.0;
+    double ppv=0; // previous peak value
+    int ppp=-max_span-1; // previous peak position (initially far enough left to never merge)
+
+    for(int i=1;i<(nx-1);i++) {
+      if(x[i]>pv && x[i]>=thr && x[i]>x[i+1]) {
+        if(max_span>2) {
+          if(i-ppp > max_span) {
+            // far enough from the tracked peak: emit it and start tracking this one
+            if(ppp>=0) {
+              pos.push_back(ppp);
+            }
+            ppp=i; ppv=x[i];
+          } else if(x[i]>ppv) {
+            // within the span: keep only the higher of the two
+            ppp=i; ppv=x[i];
+          }
+        } else {
+          pos.push_back(i);
+        }
+      }
+      // remember the last value that differed from its successor
+      if(x[i]!=x[i+1]) { pv=x[i]; }
+    }
+
+    // add remaining peak
+    if(max_span>2 && ppp>=0) {
+      pos.push_back(ppp);
+    }
+
+    SEXP nv;
+    PROTECT(nv=allocVector(INTSXP,pos.size()));
+    int* i_nv=INTEGER(nv);
+    int i=0;
+    for(vector<int> ::const_iterator pi=pos.begin();pi!=pos.end();++pi) {
+      i_nv[i++]=1+(*pi); // convert to R's 1-based indexing
+    }
+
+    UNPROTECT(1);
+    return(nv);
+  }
+
+
+
+
+  /************************************************************************/
+  // Given a data vector d (positive values) and a set of signed center
+  // coordinates pos, return the coordinates of the data points relative to the
+  // centers. The scan only moves forward through both vectors, so both are
+  // presumably expected in ascending order (of |pos| for the centers) - TODO confirm.
+  //
+  // d_R    - integer vector of data positions
+  // pos_R  - integer vector of signed centers; the sign selects the orientation
+  // size_R - integer; half-width of the region around each center
+  //
+  // Returns a list with
+  //   x - relative coordinates (d - |pos|, negated for negative centers)
+  //   i - 1-based indices of the centers each coordinate was calculated against
+  SEXP get_relative_coordinates(SEXP d_R,
+                                SEXP pos_R,
+                                SEXP size_R)
+  {
+    int *d, *pos;
+    int npos,nd,size;
+
+    d = INTEGER(d_R); pos = INTEGER(pos_R);
+    npos=LENGTH(pos_R); nd=LENGTH(d_R);
+    size = INTEGER(size_R)[0];
+#ifdef DEBUG
+    Rprintf("|d|=%d, |c|=%d, size=%d\n",nd,npos,size);
+#endif
+
+    vector<int> x; vector<int> xi;
+    int k=0; // current pos index
+
+    // k<npos only matters on entry: it keeps an empty pos vector from being
+    // read; inside the loop k==npos always breaks out before the next test.
+    for(int i=0;i<nd && k<npos;i++) {
+      // increment k until pos[k]+size>=d[i]
+      while((abs(pos[k])+size) < d[i]) { k++; if(k==npos) { break; };
+#ifdef DEBUG
+        Rprintf("advancing k to %d\n",k);
+#endif
+      }
+      if(k==npos) { break; };
+      // increment i until d[i]>=pos[k]-size
+      while((abs(pos[k])-size) > d[i]) { i++; if(i==nd) { break; }
+#ifdef DEBUG
+        Rprintf("advancing i to %d\n",i);
+#endif
+      }
+      if(i==nd) { break; }
+
+      // l is one past the last center whose left edge is at or before d[i]
+      int l=k;
+      while((l<npos) && ((abs(pos[l])-size) <= d[i])) { l++;
+#ifdef DEBUG
+        Rprintf("advancing l to %d\n",l);
+#endif
+      }
+      for(int j=k;j<l;j++) {
+        int pd=d[i]-abs(pos[j]);
+        if(abs(pd)<=size) {
+          // record, flipping the sign for negative centers
+          if(pos[j]>0) {
+            x.push_back(pd);
+          } else {
+            x.push_back(-1*pd);
+          }
+          xi.push_back(j);
+#ifdef DEBUG
+          Rprintf("recorded i=%d, j=%d\n",i,j);
+#endif
+        } else {
+          break;
+        }
+      }
+    }
+
+    SEXP xv_R,xiv_R;
+    PROTECT(xv_R=allocVector(INTSXP,x.size()));
+    PROTECT(xiv_R=allocVector(INTSXP,x.size()));
+    int* xv=INTEGER(xv_R);
+    int* xiv=INTEGER(xiv_R);
+
+    int i=0;
+    for(vector<int> ::const_iterator pi=x.begin();pi!=x.end();++pi) {
+      xv[i++]=*pi;
+    }
+    i=0;
+    for(vector<int> ::const_iterator pi=xi.begin();pi!=xi.end();++pi) {
+      xiv[i++]=1+(*pi); // convert to R's 1-based indexing
+    }
+
+    SEXP ans_R, names_R;
+    PROTECT(names_R = allocVector(STRSXP, 2));
+    SET_STRING_ELT(names_R, 0, mkChar("x"));
+    SET_STRING_ELT(names_R, 1, mkChar("i"));
+
+    PROTECT(ans_R = allocVector(VECSXP, 2));
+    SET_VECTOR_ELT(ans_R, 0, xv_R);
+    SET_VECTOR_ELT(ans_R, 1, xiv_R);
+    setAttrib(ans_R, R_NamesSymbol, names_R);
+
+    UNPROTECT(4);
+    return(ans_R);
+  }
+
+
+ // determines a set of points within a set of fragments
+ // note: all vectors sorted in ascending order
+ // note: all vectors are integers
+ // x_R - vector of point positions
+ // se_R - vector of start and end positions
+ // fi_R - vector of signed fragment indices, parallel to se_R (positive: the
+ //        fragment starts at that position, otherwise: fragment -fi ends there)
+ // return_list_R - whether a list of fragments should be returned for each point
+ // return_unique_R - whether points in multiple fragments should be omitted
+ // return_point_counts_R - whether to return, per fragment, the number of points
+ //        that fall inside it (integer vector of length LENGTH(se_R)/2)
+ // return: depending on the flags, the per-fragment counts, a list with the
+ //        fragment indices covering each point (entries for points with no
+ //        fragment, or with several when return_unique is set, are left
+ //        unassigned), or an integer vector holding for each point the
+ //        smallest covering fragment index (-1 for none, or for several when
+ //        return_unique is set)
+ // NOTE(review): the result is allocated by testing return_point_counts first,
+ // while the main loop tests return_list first; setting both flags looks
+ // unsupported - confirm against the R callers.
+ SEXP points_within(SEXP x_R,SEXP se_R,SEXP fi_R,SEXP return_list_R,SEXP return_unique_R,SEXP return_point_counts_R) {
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+ int* x=INTEGER(x_R);
+ int nx=LENGTH(x_R);
+ int* se=INTEGER(se_R);
+ int* fi=INTEGER(fi_R);
+ int nf=LENGTH(se_R);
+
+ int return_list=*(INTEGER(return_list_R));
+ int return_unique=*(INTEGER(return_unique_R));
+ int return_point_counts=*(INTEGER(return_point_counts_R));
+
+#ifdef DEBUG
+ Rprintf("nf=%d; nx=%d, return_list=%d, return_unique=%d, return_point_counts=%d\n",nf/2,nx,return_list,return_unique,return_point_counts);
+#endif
+ // fragments that are open at the current point
+ set<int> fset;
+
+
+ SEXP nv; int *i_nv;
+ int np=0;
+ if(return_point_counts) {
+ PROTECT(nv = allocVector(INTSXP, nf/2)); np++;
+ i_nv=INTEGER(nv);
+ for(int i=0;i<nf/2;i++) { i_nv[i]=0; }
+ } else if(return_list) {
+ PROTECT(nv = allocVector(VECSXP, nx)); np++;
+ } else {
+ PROTECT(nv=allocVector(INTSXP,nx)); np++;
+ i_nv=INTEGER(nv);
+ }
+
+ // index of the next start/end event to apply
+ int j=0;
+
+ for(int i=0;i<nx;i++) {
+ // advance j: apply every event positioned strictly before the point
+ while(j<nf && se[j]<x[i]) {
+ int frag=fi[j];
+ if(frag>0) { // insert
+ fset.insert(frag);
+#ifdef DEBUG
+ Rprintf("inserted frag %d, size=%d\n",frag,fset.size());
+#endif
+ } else { // remove
+ fset.erase(-frag);
+#ifdef DEBUG
+ Rprintf("removed frag %d, size=%d\n",-frag,fset.size());
+#endif
+ }
+ j++;
+ }
+#ifdef DEBUG
+ Rprintf("i=%d j=%d\n",i,j);
+#endif
+ if(return_list) {
+ if(fset.empty() || (return_unique && fset.size()>1)) {
+ // assign null list?
+ } else {
+ SEXP fil_R;
+ PROTECT(fil_R=allocVector(INTSXP,fset.size())); np++;
+ int* fil=INTEGER(fil_R);
+ int k=0;
+ for(set<int>::const_iterator ki=fset.begin();ki!=fset.end();++ki) {
+ fil[k]=*ki; k++;
+ }
+ SET_VECTOR_ELT(nv, i, fil_R);
+ // fil_R is now held by nv, so its own protection can be dropped
+ UNPROTECT(1); np--;
+ }
+ } else {
+ if(return_point_counts) {
+ // fragment indices are 1-based, the count vector is 0-based
+ for(set<int>::const_iterator ki=fset.begin();ki!=fset.end();++ki) {
+ i_nv[*ki-1]++;
+ }
+ } else {
+ if(fset.empty() || (return_unique && fset.size()>1)) {
+ i_nv[i]=-1;
+ } else {
+ i_nv[i]=*fset.begin();
+ }
+ }
+ }
+ }
+
+ UNPROTECT(np);
+ return(nv);
+ }
+
+
+  // Sliding score over a grid of positions spos, spos+step, ... computed from
+  // the points lying within mdist upstream of each grid position.
+  //
+  // For a grid position cpos, with n the number of points x in
+  // [cpos-mdist, cpos] and sx their sum, the score is
+  //   (1-n)*log(lambda) - lambda*(n*(cpos+1) - sx)
+  //
+  // Returns a numeric vector with one score per grid position. A separate
+  // peak-returning mode is not implemented: return_peaks_R and min_peak_lr_R
+  // are accepted but the full score vector is always returned (previously a
+  // non-zero return_peaks_R wrote through an unallocated vector).
+  SEXP expuni_lr(SEXP x_R, // positions and their number (assumed sorted in ascending order)
+                 SEXP mdist_R, // max distance at which points should be considered
+                 SEXP lambda_R, // lambda value
+                 SEXP spos_R, // starting position
+                 SEXP epos_R, // ending position
+                 SEXP step_R, // step size
+                 SEXP return_peaks_R, // whether peak positions should be returned, or entire score vector
+                 SEXP min_peak_lr_R // min peak height (lr)
+                 )
+  {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    int* x=INTEGER(x_R);
+    int nx=LENGTH(x_R);
+    int mdist=INTEGER(mdist_R)[0];
+    double lambda=*(REAL(lambda_R));
+
+    // read for interface compatibility only; see the note above
+    int return_peaks=*(INTEGER(return_peaks_R));
+    double min_peak=*(REAL(min_peak_lr_R));
+    (void)return_peaks; (void)min_peak;
+
+    int spos=*(INTEGER(spos_R));
+    int epos=*(INTEGER(epos_R));
+    int step=*(INTEGER(step_R));
+
+    if(step==0) {
+      // a zero step would divide by zero below
+      Rf_error("expuni_lr: step must be non-zero");
+    }
+    int nsteps=(int) (epos-spos)/step;
+
+
+#ifdef DEBUG
+    Rprintf("n=%d; lambda=%f; mdist=%d; spos=%d; epos=%d; step=%d; nsteps=%d\n",nx,lambda,mdist,spos,epos,step,nsteps);
+#endif
+
+
+    SEXP nv;
+    PROTECT(nv=allocVector(REALSXP,nsteps+1));
+    double *d_nv=REAL(nv);
+
+    const double log_lambda=log(lambda); // loop invariant
+
+    int i=0; // current index of the first point being used in the calculations
+    int j=0; // current index of the last point being used in the calculations
+    // The sum of positions is kept in a double: a sum of genomic coordinates
+    // easily exceeds the int range, while a double holds it exactly.
+    double sx=0; // current sum of all positions
+    int n=0; // current number of points in the window
+
+    for(int k=0; k<=nsteps; k++) {
+      int cpos=spos+k*step;
+      // increase i until x[i]>=cpos-mdist; remove x from sx; decrement n;
+      while(i<nx && x[i]<(cpos-mdist)) {
+        n--; sx-=x[i]; i++;
+      }
+
+      // increase j until x[j]>cpos
+      while(j<nx && x[j]<=cpos) {
+        n++; sx+=x[j]; j++;
+      }
+
+      // calculate lr
+      d_nv[k]=((double)(1-n))*log_lambda-lambda*(((double)n)*(((double)cpos)+1.0)-sx);
+    }
+    UNPROTECT(1);
+    return(nv);
+  }
+
+
+  // Collect all pairwise distances x[j]-x[i] (j>i) that do not exceed max_dist.
+  //
+  // x_R        - numeric vector of positions; the inner scan stops at the first
+  //              distance above max_dist, so ascending order is presumably
+  //              expected - TODO confirm
+  // max_dist_R - numeric; largest distance to report
+  //
+  // Returns a numeric vector of distances, ordered by i and then by j.
+  SEXP allpdist(SEXP x_R,SEXP max_dist_R) {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    double* x=REAL(x_R);
+    int nx=LENGTH(x_R);
+    double max_dist=*REAL(max_dist_R);
+#ifdef DEBUG
+    // max_dist is a double, so it must be printed with %f
+    Rprintf("n=%d; max_dist=%f\n",nx,max_dist);
+#endif
+
+    vector<double> dist;
+
+    for(int i=0;i<nx;i++) {
+      for(int j=i+1;j<nx;j++) {
+        const double d=x[j]-x[i];
+#ifdef DEBUG
+        Rprintf("i=%d; j=%d; d=%f\n",i,j,d);
+#endif
+        if(d>max_dist) {
+          break; // all later points are at least as far away
+        }
+        dist.push_back(d);
+      }
+    }
+
+    SEXP nv;
+    PROTECT(nv=allocVector(REALSXP,dist.size()));
+    double* d_out=REAL(nv);
+    for(size_t k=0;k<dist.size();k++) {
+      d_out[k]=dist[k];
+    }
+
+    UNPROTECT(1);
+    return(nv);
+  }
+
+  // Collect the distances y[j]-x[i] between two sets of positions that lie in
+  // [-max_dist, max_dist].
+  //
+  // x_R, y_R   - numeric vectors of positions; both are scanned forward only,
+  //              so ascending order is presumably expected - TODO confirm
+  // max_dist_R - numeric; largest absolute distance to report
+  //
+  // Returns a numeric vector of distances, ordered by i and then by j.
+  SEXP allxpdist(SEXP x_R,SEXP y_R, SEXP max_dist_R) {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    double* x=REAL(x_R);
+    double* y=REAL(y_R);
+    int nx=LENGTH(x_R);
+    int ny=LENGTH(y_R);
+    double max_dist=*REAL(max_dist_R);
+#ifdef DEBUG
+    // max_dist is a double, so it must be printed with %f
+    Rprintf("nx=%d; ny=%d; max_dist=%f\n",nx,ny,max_dist);
+#endif
+
+    vector<double> dist;
+    int yi=0; // latest y start index
+
+    for(int i=0;i<nx;i++) {
+      // adjust yi so that y[yi]>=x[i]-max_dist; the bound is tested first so
+      // that y is never read at index ny (or at all when y is empty)
+      while(yi<ny && y[yi]<(x[i]-max_dist)) { yi++; }
+      if(yi==ny) { break; }
+
+      for(int j=yi;j<ny;j++) {
+        const double d=y[j]-x[i];
+#ifdef DEBUG
+        Rprintf("i=%d; j=%d; d=%f\n",i,j,d);
+#endif
+        if(d>max_dist) {
+          break;
+        }
+        dist.push_back(d);
+      }
+    }
+
+    SEXP nv;
+    PROTECT(nv=allocVector(REALSXP,dist.size()));
+    double* d_out=REAL(nv);
+    for(size_t k=0;k<dist.size();k++) {
+      d_out[k]=dist[k];
+    }
+
+    UNPROTECT(1);
+    return(nv);
+  }
+
+  // For each point, count the other points lying within max_dist of it.
+  //
+  // x_R        - numeric vector of positions; the inner scan stops at the first
+  //              distance above max_dist, so ascending order is presumably
+  //              expected - TODO confirm
+  // max_dist_R - numeric; largest distance that counts as "within"
+  //
+  // Returns a numeric vector of the same length as x_R holding the counts.
+  SEXP nwithindist(SEXP x_R,SEXP max_dist_R) {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    double* x=REAL(x_R);
+    int nx=LENGTH(x_R);
+    double max_dist=*REAL(max_dist_R);
+
+    SEXP nv;
+    PROTECT(nv=allocVector(REALSXP,nx));
+    double* counts=REAL(nv);
+    for(int i=0;i<nx;i++) { counts[i]=0; }
+
+#ifdef DEBUG
+    // max_dist is a double, so it must be printed with %f
+    Rprintf("n=%d; max_dist=%f\n",nx,max_dist);
+#endif
+
+    for(int i=0;i<nx;i++) {
+      for(int j=i+1;j<nx;j++) {
+        const double d=x[j]-x[i];
+#ifdef DEBUG
+        Rprintf("i=%d; j=%d; d=%f\n",i,j,d);
+#endif
+        if(d>max_dist) {
+          break;
+        }
+        // each close pair counts once for both of its members
+        counts[i]++;
+        counts[j]++;
+      }
+    }
+
+    UNPROTECT(1);
+    return(nv);
+  }
+
+
+
+
+  // Given sorted tag positions of one chromosome, each flagged as belonging to
+  // one of two sets, slide a window of width wsize along the tags and report
+  // contiguous clusters of windows in which set 1 is enriched over set 0.
+  //
+  // pos_R    - numeric vector of tag positions (sorted)
+  // flag_R   - integer vector parallel to pos_R; used as an index (0 or 1) into
+  //            the per-window counters (presumably 0 = background,
+  //            1 = signal - TODO confirm)
+  // wsize_R  - integer window size
+  // thr_R    - numeric threshold entering the lower-bound estimate lb
+  // mcs_R    - integer minimal cluster size (end - start)
+  // bgm_R    - numeric; a window qualifies when lb >= bgm ...
+  // mintag_R - numeric; ... and the set-1 count is >= mintag
+  // either_R - integer flag; when non-zero the two conditions are OR-ed
+  //
+  // Returns a list with numeric vectors s (cluster starts) and e (cluster ends).
+  SEXP find_poisson_enrichment_clusters(SEXP pos_R,SEXP flag_R,SEXP wsize_R,SEXP thr_R,SEXP mcs_R,SEXP bgm_R,SEXP mintag_R,SEXP either_R) {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+    double* pos=REAL(pos_R);
+    int* flag=INTEGER(flag_R);
+    int nt=LENGTH(pos_R);
+
+    int mcs=*INTEGER(mcs_R);
+    int wsize=*INTEGER(wsize_R);
+    int either=*INTEGER(either_R);
+    double thr=REAL(thr_R)[0];
+    double bgm=REAL(bgm_R)[0];
+    double mintag=REAL(mintag_R)[0];
+
+#ifdef DEBUG
+    Rprintf("nt=%d; wsize=%d; thr=%f; mcs=%d; min.tag=%f; bgm=%f\n",nt,wsize,thr,mcs,mintag,bgm);
+#endif
+
+
+    vector< pair<double,double> > contigs;
+
+    // running indices (start and end)
+    int si=0;
+    int ei=0;
+
+    // current window tag counts
+    int cc[2]={0,0};
+
+
+    if(nt>0) {
+      // current window coordinate; read only once pos is known to be non-empty
+      double ws=pos[0];
+
+      cc[flag[si]]++;
+      // increment window end
+      while(ei<(nt-1) && (pos[ei+1]-ws) <= wsize) {
+        ei++;
+        cc[flag[ei]]++;
+      }
+
+
+      // cluster start,end positions
+      double cs,ce;
+      int inclust=0;
+
+      while(si<nt-1) {
+
+        // The bound on ei is tested first so that pos[ei+1] is never read past
+        // the end of the vector.
+        if(ei!=(nt-1) && (pos[si+1]-ws) > (pos[ei+1] - ws - wsize)) {
+          // move end boundary
+          ei++;
+          ws=pos[ei]-wsize;
+          cc[flag[ei]]++;
+          while(ei<(nt-1) && pos[ei+1]==ws+wsize) {
+            ei++;
+            cc[flag[ei]]++;
+          }
+
+          // increment window start
+          while(si<(nt-1) && pos[si] < ws) {
+            cc[flag[si]]--;
+            si++;
+          }
+
+        } else {
+          // move up start boundary
+          ws=pos[si+1];
+          cc[flag[si]]--;
+          si++;
+          while(si<(nt-1) && pos[si+1]==ws) {
+            cc[flag[si]]--;
+            si++;
+          }
+
+          // increment window end
+          while(ei<(nt-1) && (pos[ei+1] - ws) <= wsize) {
+            ei++;
+            cc[flag[ei]]++;
+          }
+
+        }
+
+        // calculate z score
+        double dc0=((double)cc[0])+0.5;
+        double dc1=((double)cc[1])+0.5;
+        double rte=dc0+dc1-0.25*thr*thr;
+        double lb;
+        if(rte<=0) {
+          lb=0;
+        } else {
+          lb=(sqrt(dc1*dc0) - 0.5*thr*sqrt(rte))/(dc0 - 0.25*thr*thr);
+          if(lb<0) { lb=0; }
+          lb*=lb;
+        }
+
+#ifdef DEBUG
+        double ub=(sqrt(dc1*dc0) + 0.5*thr*sqrt(rte))/(dc0 - 0.25*thr*thr);
+        ub*=ub;
+        Rprintf("s=%d (%f); e=%d (%f); window: %f-%f; cc=[%d,%d]; lb=%f; ub=%f\n",si,pos[si],ei,pos[ei],ws,ws+wsize,cc[0],cc[1],lb,ub);
+#endif
+
+        // does the current window qualify?
+        int bc=lb>=bgm && cc[1]>=mintag;
+        if(either) {
+          bc=lb>=bgm || cc[1]>=mintag;
+        }
+        if(bc) {
+          if(inclust) {
+            double nce=ws+wsize/2.0;
+            if(nce-ce > wsize/2.0) {
+              // next point is too far removed, end cluster
+              if(ce-cs >= mcs) {
+                contigs.push_back(pair<double,double>(cs,ce));
+#ifdef DEBUG
+                Rprintf("recorded cluster %f-%f\n",cs,ce);
+#endif
+              }
+              inclust=0;
+            } else {
+              ce=nce;
+            }
+          } else {
+            // open a new cluster at the window centre
+            inclust=1;
+            cs=ws+wsize/2.0;
+            ce=cs;
+          }
+        } else {
+          if(inclust) {
+            if(ce-cs >= mcs) {
+              contigs.push_back(pair<double,double>(cs,ce));
+#ifdef DEBUG
+              Rprintf("recorded cluster %f-%f\n",cs,ce);
+#endif
+            }
+            inclust=0;
+          }
+        }
+
+      }
+
+      // flush a cluster that is still open at the end of the data
+      if(inclust) {
+        if(ce-cs >= mcs) {
+          contigs.push_back(pair<double,double>(cs,ce));
+#ifdef DEBUG
+          Rprintf("recorded cluster %f-%f\n",cs,ce);
+#endif
+        }
+        inclust=0;
+      }
+    }
+
+    SEXP cs_R,ce_R;
+    PROTECT(cs_R=allocVector(REALSXP,contigs.size()));
+    PROTECT(ce_R=allocVector(REALSXP,contigs.size()));
+    double* csa=REAL(cs_R);
+    double* cea=REAL(ce_R);
+
+    int i=0;
+    for(vector< pair<double,double> >::const_iterator ci=contigs.begin(); ci!=contigs.end();++ci) {
+      csa[i]=ci->first;
+      cea[i]=ci->second;
+      i++;
+    }
+
+    SEXP ans_R, names_R;
+    PROTECT(names_R = allocVector(STRSXP, 2));
+    SET_STRING_ELT(names_R, 0, mkChar("s"));
+    SET_STRING_ELT(names_R, 1, mkChar("e"));
+
+    PROTECT(ans_R = allocVector(VECSXP, 2));
+    SET_VECTOR_ELT(ans_R, 0, cs_R);
+    SET_VECTOR_ELT(ans_R, 1, ce_R);
+    setAttrib(ans_R, R_NamesSymbol, names_R);
+
+    UNPROTECT(4);
+    return(ans_R);
+
+  }
+
+
+  // Finds the intersection (or union) of a list of region sets.
+  //
+  // n_R       - integer; number of region sets
+  // pos_R     - numeric vector of segment boundary positions
+  // flags_R   - integer vector parallel to pos_R with +k/-k values marking the
+  //             start/end of a segment in the k-th region set; every value
+  //             must satisfy 1 <= |flag| <= n
+  // vals_R    - numeric vector parallel to pos_R (read only when max_val != 0)
+  // max_val_R - 1: report max overlapping value, -1: report min, 0: don't look at values
+  // union_R   - non-zero: a position is covered when any set covers it,
+  //             zero: only when all sets cover it
+  //
+  // Returns a list with $s, $e and, when max_val != 0, $v.
+  SEXP region_intersection(SEXP n_R,SEXP pos_R,SEXP flags_R,SEXP vals_R,SEXP max_val_R,SEXP union_R) {
+    const int max_val=*INTEGER(max_val_R);
+    const int unionr=*INTEGER(union_R);
+    const int n=*INTEGER(n_R);
+    double* pos=REAL(pos_R);
+    int* flags=INTEGER(flags_R);
+    double* val=REAL(vals_R);
+    const int npos=LENGTH(pos_R);
+
+#ifdef DEBUG
+    Rprintf("n=%d; npos=%d; max_val=%d\n",n,npos,max_val);
+#endif
+
+    // number of currently open segments in each set; a std::vector replaces
+    // the non-standard variable-length array
+    vector<int> s(static_cast<size_t>(n>0 ? n : 0), 0);
+
+    vector<double> starts;
+    vector<double> ends;
+    vector<double> values;
+
+    int start=-1; // index of the boundary that opened the current fragment, -1 if none
+    double mval=0;
+    for(int i=0;i<npos;i++) {
+      // update flags
+      int f=flags[i];
+      if(f>0) {
+        s[abs(f)-1]++;
+      } else {
+        s[abs(f)-1]--;
+      }
+
+      if(max_val!=0 && val[i]*max_val > mval*max_val) { mval=val[i]; }
+
+      // joined status
+      int all;
+      if(unionr) {
+        all=0;
+        for(int j=0;j<n;j++) { if(s[j]>0) { all=1; break;} }
+      } else {
+        all=1;
+        for(int j=0;j<n;j++) { all=all & (s[j]>0); }
+      }
+
+      if(start>=0) {
+        // in fragment
+        if(!all) {
+          // end fragment
+          starts.push_back(pos[start]);
+          ends.push_back(pos[i]);
+          if(max_val!=0) { values.push_back(mval); }
+
+#ifdef DEBUG
+          // printed before start is reset so that pos[start] is a valid read
+          Rprintf("recorded new fragment (s=%f,e=%f,v=%f);\n",pos[start],pos[i],mval);
+#endif
+          start=-1;
+        }
+      } else {
+        // should a fragment be started?
+        if(all) {
+          start=i;
+          if(max_val!=0) { mval=val[i]; }
+#ifdef DEBUG
+          Rprintf("starting new fragment (s=%f,i=%d);\n",pos[start],i);
+#endif
+        }
+      }
+    }
+    SEXP cs_R,ce_R,cv_R=R_NilValue;
+    PROTECT(cs_R=allocVector(REALSXP,starts.size()));
+    PROTECT(ce_R=allocVector(REALSXP,ends.size()));
+
+    double* csa=REAL(cs_R);
+    int i=0;
+    for(vector<double>::const_iterator ci=starts.begin(); ci!=starts.end(); ++ci) {
+      csa[i]=*ci; i++;
+    }
+
+    csa=REAL(ce_R);
+    i=0;
+    for(vector<double>::const_iterator ci=ends.begin(); ci!=ends.end(); ++ci) {
+      csa[i]=*ci; i++;
+    }
+
+    if(max_val!=0) {
+      PROTECT(cv_R=allocVector(REALSXP,values.size()));
+      csa=REAL(cv_R);
+      i=0;
+      for(vector<double>::const_iterator ci=values.begin(); ci!=values.end(); ++ci) {
+        csa[i]=*ci; i++;
+      }
+    }
+
+    SEXP ans_R, names_R;
+    if(max_val!=0) {
+      PROTECT(names_R = allocVector(STRSXP, 3));
+      SET_STRING_ELT(names_R, 0, mkChar("s"));
+      SET_STRING_ELT(names_R, 1, mkChar("e"));
+      SET_STRING_ELT(names_R, 2, mkChar("v"));
+
+      PROTECT(ans_R = allocVector(VECSXP, 3));
+      SET_VECTOR_ELT(ans_R, 0, cs_R);
+      SET_VECTOR_ELT(ans_R, 1, ce_R);
+      SET_VECTOR_ELT(ans_R, 2, cv_R);
+    } else {
+      PROTECT(names_R = allocVector(STRSXP, 2));
+      SET_STRING_ELT(names_R, 0, mkChar("s"));
+      SET_STRING_ELT(names_R, 1, mkChar("e"));
+
+      PROTECT(ans_R = allocVector(VECSXP, 2));
+      SET_VECTOR_ELT(ans_R, 0, cs_R);
+      SET_VECTOR_ELT(ans_R, 1, ce_R);
+    }
+
+    setAttrib(ans_R, R_NamesSymbol, names_R);
+
+    // cs_R, ce_R, names_R, ans_R, plus cv_R when values are reported
+    if(max_val!=0) {
+      UNPROTECT(5);
+    } else {
+      UNPROTECT(4);
+    }
+    return(ans_R);
+  }
+
+}
+
diff --git a/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/wdl.cpp b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/wdl.cpp
new file mode 100755
index 0000000..9219106
--- /dev/null
+++ b/pRSEM/phantompeakqualtools/spp_1.10.1_on_R3.3/src/wdl.cpp
@@ -0,0 +1,660 @@
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <string>
+#include <set>
+
+extern "C" {
+// pliu 20160911
+//#include "R.h"
+//#include "Rmath.h"
+#include <math.h>
+//////
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx;
+
+//#define DEBUG 1
+
+extern "C" {
+
+  /************************************************************************/
+  /*
+   * lwcc - calculate local window cross-correlation
+   *
+   * For every position i with a full outer window, the positive-strand counts
+   * on one flank are paired with the negative-strand counts on the mirrored
+   * flank, leaving out the inner region closer than isize to i. The
+   * correlation of those pairs is combined with a tag-count term tagw as
+   *   val = cor*sqrt(tagw) + tagw
+   * Positions where either strand has no tags in the window keep the
+   * sentinel value -1e3.
+   *
+   * Returns either the full numeric vector of values (return_peaks == 0) or a
+   * list with peak positions x (0-based, as computed here) and peak values v.
+   */
+
+  SEXP lwcc(SEXP x_R, // positive strand hist
+            SEXP y_R, // negative strand hist of the same length
+            SEXP osize_R, // outer boundary distance
+            SEXP isize_R, // inner boundary distance
+            SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned
+            SEXP min_peak_dist_R, // distance between closest peaks
+            SEXP min_peak_val_R, // min peak threshold
+            SEXP tag_weight_R, // tag weight
+            SEXP bg_subtract_R, // a flag whether to do background subtraction
+            SEXP bgp_R, // optional background hist for positive strand
+            SEXP bgn_R, // optional background hist for negative strand
+            SEXP bg_wsize_R, // window size for the background counts
+            SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference (including is cutout)
+            SEXP round_up_R // whether to round up fractional signal tag counts
+            )
+  {
+
+#ifdef DEBUG
+    Rprintf("start\n");
+#endif
+
+    int is=INTEGER(isize_R)[0];
+    int os=INTEGER(osize_R)[0];
+    double rs=((double)(2*os+1));
+    int* x=INTEGER(x_R);
+    int* y=INTEGER(y_R);
+    int n_x=LENGTH(x_R);
+
+    // background-related
+    int* bgp=INTEGER(bgp_R);
+    int* bgn=INTEGER(bgn_R);
+    int bg_whs=INTEGER(bg_wsize_R)[0];
+
+    int return_peaks=*(INTEGER(return_peaks_R));
+    double min_peak_val=*(REAL(min_peak_val_R));
+    int min_peak_dist=*(INTEGER(min_peak_dist_R));
+    double tag_weight=*(REAL(tag_weight_R));
+
+    const int round_up=*(INTEGER(round_up_R));
+    const int bg_subtract=*(INTEGER(bg_subtract_R));
+    const double bg_weight=*(REAL(bg_weight_R));
+
+    int i; // point at which the value is being calculated
+    int start=os;
+    int end=n_x-os-1;
+
+    // bg tag counts within bg window (1: left of i, 2: right of i)
+    int bg_pn1=0;
+    int bg_nn1=0;
+    int bg_pn2=0;
+    int bg_nn2=0;
+
+
+
+    // illustration for counting:
+    //
+    // 012345678901234567890123456789012
+    // ==========------|------==========
+    //
+    // osize=16; isize=6;
+
+
+    SEXP nv=R_NilValue;
+    double *d_nv=NULL;
+    vector<int> ppos;
+    vector<double> pval;
+    if(!return_peaks) {
+      PROTECT(nv=allocVector(REALSXP,n_x));
+      d_nv=REAL(nv);
+      for(int k=0;k<n_x;k++) {
+        d_nv[k]=0;
+      }
+    }
+
+#ifdef DEBUG
+    Rprintf("start=%d end=%d tag_weight=%f\n", start,end,tag_weight);
+    Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]);
+#endif
+
+    int lpp=-1; // last peak position
+    double lpv=-1e3; // last peak value
+
+    double ppv=-1e3; // last value
+    // NOTE(review): -11e-3 is kept from the original; it may have been meant
+    // as -1e3 like the neighbouring sentinels - confirm before changing.
+    double pppv=-11e-3; // value before last
+
+    int pn1,pn2,nn1,nn2;
+
+
+    if(bg_subtract) {
+      // pre-initialize background tag counts,
+      for(int k=0;k<bg_whs;k++) {
+        if(k<n_x) {
+          bg_pn2+=bgp[k];
+          bg_nn2+=bgn[k];
+        }
+      }
+    }
+
+
+    for(i=0;i<end;i++) {
+
+      if(bg_subtract) {
+        // update background counts: slide the left and right windows by one
+        int nl=i-bg_whs-1;
+
+        if(nl>=0) {
+          bg_pn1-=bgp[nl];
+          bg_nn1-=bgn[nl];
+        }
+        bg_pn1+=bgp[i];
+        bg_nn1+=bgn[i];
+
+        if(i>0) {
+          bg_pn2-=bgp[i-1];
+          bg_nn2-=bgn[i-1];
+        }
+        int nr=i+bg_whs;
+        if(nr<n_x) {
+          bg_pn2+=bgp[nr];
+          bg_nn2+=bgn[nr];
+        }
+      }
+
+      if(i >= start) {
+        // update counts, taking into account masked out regions (value -1)
+        pn1=pn2=nn1=nn2=0;
+
+        for(int k=0;k<=(os-is);k++) {
+          int xp1=x[i-os+k];
+          int xp2=x[i+os-k];
+          int xn1=y[i+os-k];
+          int xn2=y[i-os+k];
+
+          if(xp1!=-1 && xn1!=-1) {
+            pn1+=xp1;
+            nn1+=xn1;
+          }
+          if(xp2!=-1 && xn2!=-1) {
+            pn2+=xp2;
+            nn2+=xn2;
+          }
+        }
+
+        // calculate the means; the negative-strand mean is taken from the
+        // negative-strand sums (it was previously a copy of the positive one)
+        double mp=((double)(pn1+pn2))/rs;
+        double mn=((double)(nn1+nn2))/rs;
+#ifdef DEBUG
+        Rprintf("mp=%f mn=%f\n",mp,mn);
+#endif
+        // calculate correlation
+        double varp=0;
+        double varn=0;
+        double num=0;
+        double val=-1e3;
+        if(mp>0 && mn>0) {
+          for(int k=0;k<=(os-is);k++) {
+            int xp1=x[i-os+k];
+            int xp2=x[i+os-k];
+            int xn1=y[i+os-k];
+            int xn2=y[i-os+k];
+
+
+            if(xp1!=-1 && xn1!=-1) {
+              double nnp1=((double) xp1)-mp;
+              double nnn1=((double) xn1)-mn;
+              num+=nnp1*nnn1;
+              varp+=nnp1*nnp1;
+              varn+=nnn1*nnn1;
+            }
+
+            if(xp2!=-1 && xn2!=-1) {
+              double nnp2=((double) xp2)-mp;
+              double nnn2=((double) xn2)-mn;
+              num+=nnp2*nnn2;
+              varp+=nnp2*nnp2;
+              varn+=nnn2*nnn2;
+            }
+
+          }
+          double tagw;
+          double spn1=((double)pn1)*tag_weight;
+          double snn1=((double)nn1)*tag_weight;
+          double spn2=((double)pn2)*tag_weight;
+          double snn2=((double)nn2)*tag_weight;
+          if(round_up) {
+            // only the *1 (expected-signal) counts are rounded up
+            if(pn1>0 && spn1<1) { spn1=1.0; }
+            if(nn1>0 && snn1<1) { snn1=1.0; }
+          }
+
+          if(bg_subtract) {
+            spn1-=((double)bg_pn1)*bg_weight;
+            snn1-=((double)bg_nn2)*bg_weight;
+            spn2-=((double)bg_pn2)*bg_weight;
+            snn2-=((double)bg_nn1)*bg_weight;
+
+            if(spn2<0) spn2=0;
+            if(snn2<0) snn2=0;
+
+            if(spn1>0 && snn1>0) {
+              tagw=(2.0*sqrt(spn1*snn1)-(spn2+snn2+1.0));
+            } else {
+              tagw=-(spn2+snn2+1.0);
+            }
+          } else {
+            tagw=2.0*sqrt(spn1*snn1)-(spn2+snn2);
+          }
+
+          if(tagw<0) {
+            val=0.0;
+          } else {
+            if(num==0.0) {
+              val=0;
+            } else {
+              val=num/(sqrt(varp*varn));
+            }
+            val=val*sqrt(tagw) + tagw;
+
+          }
+
+#ifdef DEBUG
+          Rprintf("pn1=%d pn2=%d nn1=%d nn2=%d tag.weight=%f tagw=%f\n",pn1,pn2,nn1,nn2,tag_weight,tagw);
+          Rprintf("tagw=%f varp=%f varn=%f num=%f cor=%f val=%f\n",tagw,varp,varn,num,num/sqrt(varp*varn),val);
+#endif
+        }
+
+
+
+        if(return_peaks) {
+          // determine if previous position was a peak
+          if(ppv>min_peak_val && ppv>val && ppv>pppv) {
+            if(lpp>0 && (i-lpp+1)>min_peak_dist) {
+              // record previous peak position
+              ppos.push_back(lpp);
+              pval.push_back(lpv);
+#ifdef DEBUG
+              Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp));
+#endif
+              lpp=i-1; lpv=ppv;
+#ifdef DEBUG
+              Rprintf("updated peak to x=%d y=%f\n",lpp,lpv);
+#endif
+            } else {
+              if(ppv>lpv) {
+                // update last peak positions
+#ifdef DEBUG
+                Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv);
+#endif
+                lpp=i-1; lpv=ppv;
+              }
+            }
+          }
+
+          // update previous values (plateaus are collapsed)
+          if(val!=ppv) {
+            pppv=ppv; ppv=val;
+          }
+        } else {
+          d_nv[i]=val;
+        }
+      }
+    }
+
+    if(return_peaks) {
+      // record last position
+      if(lpp>0) {
+#ifdef DEBUG
+        Rprintf("recording last peak x=%d y=%f\n",lpp,lpv);
+#endif
+        ppos.push_back(lpp);
+        pval.push_back(lpv);
+      }
+
+      SEXP rpp_R,rpv_R;
+      PROTECT(rpp_R=allocVector(INTSXP,ppos.size()));
+      PROTECT(rpv_R=allocVector(REALSXP,ppos.size()));
+      int* rpp=INTEGER(rpp_R);
+      double* rpv=REAL(rpv_R);
+
+      for(size_t k=0;k<ppos.size();k++) {
+        rpp[k]=ppos[k];
+        rpv[k]=pval[k];
+      }
+
+      SEXP ans_R, names_R;
+      PROTECT(names_R = allocVector(STRSXP, 2));
+      SET_STRING_ELT(names_R, 0, mkChar("x"));
+      SET_STRING_ELT(names_R, 1, mkChar("v"));
+
+      PROTECT(ans_R = allocVector(VECSXP, 2));
+      SET_VECTOR_ELT(ans_R, 0, rpp_R);
+      SET_VECTOR_ELT(ans_R, 1, rpv_R);
+      setAttrib(ans_R, R_NamesSymbol, names_R);
+
+      UNPROTECT(4);
+      return(ans_R);
+    } else {
+      UNPROTECT(1);
+      return(nv);
+    }
+
+  }
+
+
+
+ /************************************************************************/
+ /*
+ * wtd - window tag difference implementation
+ *
+ * Scores each position i in [wsize+1, n-wsize-1) of two strand histograms by
+ * comparing tag counts in windows of wsize positions on either side of i:
+ *   pn1 / nn1 - positive / negative strand counts over [i-wsize, i-1]
+ *   pn2 / nn2 - positive / negative strand counts over [i, i+wsize-1]
+ * (bounds as summed by the masked branch). A count of -1 marks a masked
+ * position; unless ignore_masking is 1, offset k is skipped entirely when any
+ * of its four cells is masked.
+ *
+ * Score, computed on counts scaled by tag_weight:
+ *   direct_count : pn1 + nn2 (raised to 1 when round_up is set and the sum is
+ *                  below 1), minus (bg_pn1 + bg_nn2) * bg_weight if bg_subtract
+ *   otherwise    : 2*sqrt(pn1*nn2) - (pn2 + nn1 + tag_weight)
+ *                  with bg_subtract: background counts scaled by bg_weight are
+ *                  subtracted from all four counts, pn2 and nn1 are clamped at
+ *                  0, the constant becomes 1.0, and the sqrt term is dropped
+ *                  unless both pn1 and nn2 remain positive.
+ *
+ * Return value:
+ *   return_peaks != 0 : list(x = integer peak positions, v = peak scores).
+ *                       Positions are the C array indices used here; no +1
+ *                       adjustment is applied in this function.
+ *   return_peaks == 0 : numeric vector of length n holding the score at every
+ *                       scanned position and 0 elsewhere.
+ *
+ * Peak calling: a candidate is the previous distinct score (ppv) when it
+ * exceeds min_peak_val, the current score and the score before it (pppv).
+ * The pending peak (lpp, lpv) is emitted once a later candidate lies more
+ * than min_peak_dist away; a closer candidate replaces it only if higher.
+ * The pending peak is flushed after the scan.
+ *
+ * NOTE(review): no bounds checks are made. The code appears to assume
+ * n >= 2*wsize+1, y as long as x, and background histograms long enough for
+ * the indices read (bgp[i] for i <= wsize is read unguarded) - confirm callers.
+ * NOTE(review): with ignore_masking==1 the incremental pn2/nn2 update
+ * (+x[i+whs] - x[i-1], seeded from x[whs+1 .. 2*whs]) does not appear to
+ * reproduce the [i, i+whs-1] window of the masked branch - confirm intent.
+ * NOTE(review): round((ppl+i-1)/2) divides two ints first, so the midpoint is
+ * truncated and round() has no effect - confirm whether /2.0 was intended.
+ */
+
+ SEXP wtd(SEXP x_R, // positive strand hist
+ SEXP y_R, // negative strand hist of the same length
+ SEXP wsize_R, // outer boundary distance
+ SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned
+ SEXP min_peak_dist_R, // distance between closest peaks
+ SEXP min_peak_val_R, // min peak threshold
+ SEXP direct_count_R, // whether tag weighting should not be done
+ SEXP tag_weight_R, // tag weight
+ SEXP ignore_masking_R, // whether to ignore masked regions
+ SEXP bg_subtract_R, // a flag whether to do background subtraction
+ SEXP bgp_R, // optional background hist for positive strand
+ SEXP bgn_R, // optional background hist for negative strand
+ SEXP bg_wsize_R, // window size for the background counts
+ SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference
+ SEXP round_up_R // whether to round up fractional signal tag counts
+ )
+ {
+
+#ifdef DEBUG
+ Rprintf("start\n");
+#endif
+
+ int whs=INTEGER(wsize_R)[0]; // window (half-)size in positions
+ int* x=INTEGER(x_R);
+ int* y=INTEGER(y_R);
+ int n_x=LENGTH(x_R); // only x's length is read; y is indexed with the same bounds
+
+ // background-related
+ int* bgp=INTEGER(bgp_R);
+ int* bgn=INTEGER(bgn_R);
+ int bg_whs=INTEGER(bg_wsize_R)[0];
+
+
+ const int return_peaks=*(INTEGER(return_peaks_R));
+ const int direct_count=*(INTEGER(direct_count_R));
+ const int ignore_masking=*(INTEGER(ignore_masking_R));
+ const double min_peak_val=*(REAL(min_peak_val_R));
+ const int min_peak_dist=*(INTEGER(min_peak_dist_R));
+ const double tag_weight=*(REAL(tag_weight_R));
+
+ const int round_up=*(INTEGER(round_up_R));
+ const int bg_subtract=*(INTEGER(bg_subtract_R));
+ const double bg_weight=*(REAL(bg_weight_R));
+
+ int i; // point at which the value is being calculated
+ int start=whs+1; // first scanned position
+ int end=n_x-whs-1; // one past the last scanned position
+
+ // tag counts to calculate the means
+ int pn1=0;
+ int pn2=0;
+ int nn1=0;
+ int nn2=0;
+
+ // bg tag counts within bg window
+ int bg_pn1=0;
+ int bg_pn2=0;
+ int bg_nn1=0;
+ int bg_nn2=0;
+
+ SEXP nv; // result vector, allocated only when all values are returned
+ double *d_nv;
+ vector<int> ppos; // accumulated peak positions
+ vector<double> pval; // accumulated peak values
+ if(!return_peaks) {
+ PROTECT(nv=allocVector(REALSXP,n_x));
+ d_nv=REAL(nv);
+ for(int i=0;i<n_x;i++) {
+ d_nv[i]=0;
+ }
+ }
+
+#ifdef DEBUG
+ Rprintf("whs=%d start=%d end=%d tag_weight=%f ignore_masing=%d\n", whs, start,end,tag_weight,ignore_masking);
+ Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]);
+#endif
+
+ int lpp=-1; // last peak position
+ double lpv=-1000; // last peak value
+
+ double ppv=-1000; // last value
+ int ppl=-1; // position of the last value
+ double pppv=-1000; // value before last
+
+
+ if(ignore_masking==1) { // seed the running window sums used by the incremental update below
+ for(int i=0;i<whs;i++) {
+ pn1+=x[i];
+ pn2+=x[i+whs+1];
+ nn1+=y[i];
+ nn2+=y[i+whs+1];
+
+ }
+ }
+
+ if(bg_subtract) {
+ // pre-initialize background tag counts,
+ for(int i=0;i<bg_whs;i++) {
+ if(i<n_x) {
+ bg_pn2+=bgp[i];
+ bg_nn2+=bgn[i];
+ }
+ }
+ // increment center of background count window to the start position
+ for(int i=0;i<start;i++) {
+ // update background counts
+ int nl=i-bg_whs-1; // index leaving the left background window
+
+ if(nl>=0) {
+ bg_pn1-=bgp[nl];
+ bg_nn1-=bgn[nl];
+ }
+ bg_pn1+=bgp[i];
+ bg_nn1+=bgn[i];
+
+ if(i>0) {
+ bg_pn2-=bgp[i-1];
+ bg_nn2-=bgn[i-1];
+ }
+ int nr=i+bg_whs; // index entering the right background window
+ if(nr<n_x) {
+ bg_pn2+=bgp[nr];
+ bg_nn2+=bgn[nr];
+ }
+ }
+
+ }
+
+
+#ifdef DEBUG
+ Rprintf("initialization: i=%d pn1=%d, pn2=%d, nn1=%d, nn2=%d", i,pn1,pn2,nn1,nn2);
+#endif
+
+ for(i=start;i<end;i++) {
+ if(bg_subtract) {
+ // update background counts
+ int nl=i-bg_whs-1;
+
+ if(nl>=0) {
+ bg_pn1-=bgp[nl];
+ bg_nn1-=bgn[nl];
+ }
+ bg_pn1+=bgp[i];
+ bg_nn1+=bgn[i];
+
+ if(i>0) {
+ bg_pn2-=bgp[i-1];
+ bg_nn2-=bgn[i-1];
+ }
+ int nr=i+bg_whs;
+ if(nr<n_x) {
+ bg_pn2+=bgp[nr];
+ bg_nn2+=bgn[nr];
+ }
+ }
+
+ // update counts
+ if(ignore_masking==1) { // incremental update of the running sums
+ pn1+=x[i-1]-x[i-whs-1];
+ pn2+=x[i+whs]-x[i-1];
+ nn1+=y[i-1]-y[i-whs-1];
+ nn2+=y[i+whs]-y[i-1];
+
+ } else {
+
+ pn1=pn2=nn1=nn2=0; // recount both windows from scratch at every position
+
+ for(int k=0;k<whs;k++) {
+ int xp1=x[i-k-1];
+ int xp2=x[i+k];
+ int xn1=y[i-k-1];
+ int xn2=y[i+k];
+
+ // omit masked positions
+ if(xp1!=-1 && xn1!=-1 && xp2!=-1 && xn2!=-1) {
+ pn1+=xp1;
+ nn1+=xn1;
+ pn2+=xp2;
+ nn2+=xn2;
+ }
+ }
+ }
+
+ double val; // score at position i
+ double spn1=((double)pn1)*tag_weight;
+ double snn1=((double)nn1)*tag_weight;
+ double spn2=((double)pn2)*tag_weight;
+ double snn2=((double)nn2)*tag_weight;
+ if(round_up) { // only the two counts entering the positive term are raised to 1
+ if(pn1>0 && spn1<1) { spn1=1.0; }
+ //if(pn2>0 && spn2<1) { spn2=1.0; }
+ //if(nn1>0 && snn1<1) { snn1=1.0; }
+ if(nn2>0 && snn2<1) { snn2=1.0; }
+ }
+
+ if(direct_count) {
+ val=spn1+snn2;
+ if(round_up && val<1) {
+ val=1.0;
+ }
+ if(bg_subtract) {
+ val-=((double) (bg_pn1+bg_nn2))*bg_weight;
+ }
+ } else {
+ if(bg_subtract) {
+ spn1-=((double)bg_pn1)*bg_weight;
+ snn1-=((double)bg_nn1)*bg_weight;
+ spn2-=((double)bg_pn2)*bg_weight;
+ snn2-=((double)bg_nn2)*bg_weight;
+
+ if(spn2<0) spn2=0; // the penalty terms are not allowed to go negative
+ if(snn1<0) snn1=0;
+
+ if(spn1>0 && snn2>0) {
+ val=(2.0*sqrt(spn1*snn2)-(spn2+snn1+1.0));
+ } else {
+ val=-(spn2+snn1+1.0);
+ }
+ } else {
+ val=2.0*sqrt(spn1*snn2)-(spn2+snn1+tag_weight);
+ }
+ }
+ //double val=sqrt(pn1*nn2);
+ //if(pn2>nn1) { val-=pn2; } else { val-=pn1; }
+#ifdef DEBUG
+ Rprintf("update: i=%d pn1=%d pn2=%d nn1=%d nn2=%d val=%f\n",i,pn1,pn2,nn1,nn2,val);
+#endif
+
+ if(return_peaks) {
+ // determine if previous position was a peak
+ if(ppv>min_peak_val && ppv>val && ppv>pppv) {
+ if(lpp>0 && (i-lpp+1)>min_peak_dist) {
+ // record previous peak position
+ ppos.push_back(lpp);
+ pval.push_back(lpv);
+#ifdef DEBUG
+ Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp));
+#endif
+ if(ppl!=-1 && ppl!=i-1) { // the value was held over several positions: use the middle of that run
+ lpp=(int) round((ppl+i-1)/2);
+ } else {
+ lpp=i-1;
+ }
+ lpv=ppv;
+#ifdef DEBUG
+ Rprintf("updated peak to x=%d y=%f\n",lpp,lpv);
+#endif
+ } else {
+ if(ppv>lpv) {
+ // update last peak positions
+#ifdef DEBUG
+ Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv);
+#endif
+ if(ppl!=-1 && ppl!=i-1) {
+ lpp=(int) round((ppl+i-1)/2);
+ } else {
+ lpp=i-1;
+ }
+ lpv=ppv;
+ }
+ }
+ }
+
+ // update previous values
+ if(val!=ppv) { // history advances only when the value changes
+ pppv=ppv; ppv=val; ppl=i;
+ }
+ } else {
+ d_nv[i]=val;
+ }
+ }
+
+ if(return_peaks) {
+ // record last position
+ if(lpp>0) {
+#ifdef DEBUG
+ Rprintf("recording last peak x=%d y=%f\n",lpp,lpv);
+#endif
+ ppos.push_back(lpp);
+ pval.push_back(lpv);
+ }
+
+ SEXP rpp_R,rpv_R;
+ PROTECT(rpp_R=allocVector(INTSXP,ppos.size()));
+ PROTECT(rpv_R=allocVector(REALSXP,ppos.size()));
+ int* rpp=INTEGER(rpp_R);
+ double* rpv=REAL(rpv_R);
+
+ for(int i=0;i<ppos.size();i++) {
+ rpp[i]=ppos[i];
+ rpv[i]=pval[i];
+ }
+
+ SEXP ans_R, names_R;
+ PROTECT(names_R = allocVector(STRSXP, 2));
+ SET_STRING_ELT(names_R, 0, mkChar("x"));
+ SET_STRING_ELT(names_R, 1, mkChar("v"));
+
+ PROTECT(ans_R = allocVector(VECSXP, 2));
+ SET_VECTOR_ELT(ans_R, 0, rpp_R);
+ SET_VECTOR_ELT(ans_R, 1, rpv_R);
+ setAttrib(ans_R, R_NamesSymbol, names_R);
+
+ UNPROTECT(4); // rpp_R, rpv_R, names_R, ans_R
+ return(ans_R);
+ } else {
+ UNPROTECT(1); // nv
+ return(nv);
+ }
+
+ }
+
+
+}
+
+
diff --git a/pRSEM/process-chipseq.R b/pRSEM/process-chipseq.R
new file mode 100644
index 0000000..11bb114
--- /dev/null
+++ b/pRSEM/process-chipseq.R
@@ -0,0 +1,61 @@
+#
+# pliu 20150509
+#
+# module for processing ChIP-seq data
+#
+
+main <- function() {
+  ## Command-line dispatcher: the first trailing argument names the task to
+  ## run, all remaining arguments are handed to it as one character vector.
+  name2func <- list(
+    'guessFqEncoding' = guessFqEncoding
+  )
+
+  argv <- commandArgs(trailingOnly=T)
+  if ( length(argv) < 1 || ! argv[1] %in% names(name2func) ) {
+    stop('Missing or unknown task name; expected one of: ',
+         paste(names(name2func), collapse=', '), call.=F)
+  }
+
+  ## argv[-1] is empty when only the task name is given; 2:length(argv) would
+  ## count down (2:1) in that case and pass c(NA, <task name>) instead
+  name2func[[argv[1]]](argv[-1])
+}
+
+
+guessFqEncoding <- function(argv){
+  ## Guess the quality-score encoding of each FASTQ file and write a table.
+  ## argv: [1] number of workers
+  ##       [2] comma-separated list of FASTQ files
+  ##       [3] output file (tab-delimited, with header)
+  ##       [4] R library directory to put first on the library search path
+  n_workers   <- strtoi(argv[1])
+  fastq_files <- unlist(strsplit(argv[2], ',', fixed=T))
+  out_path    <- argv[3]
+
+  .libPaths(c(argv[4], .libPaths()))
+  suppressMessages(library(data.table))
+  suppressMessages(library(ShortRead))
+
+  ## mclapply cannot be used here: ShortRead::qa is bound to BiocParallel, so
+  ## the files are spread with bplapply and qa itself is run single-core
+  ## (only available in newer versions of ShortRead)
+  register(MulticoreParam(workers=n_workers))
+  per_file <- bplapply(fastq_files, guessFqEncodingByFile)
+
+  write.table(rbindlist(per_file), out_path, sep="\t", quote=F, col.names=T,
+              row.names=F)
+}
+
+
+## Examine one FASTQ file and guess the format of its quality scores.
+## Returns list(file=<input path>, encoding=<aligner option or 'unknown'>).
+guessFqEncodingByFile <- function(fq) {
+  base_qual <- qa(fq, BPPARAM=registered()$SerialParam)[['baseQuality']]
+  ## quality characters that occur at least once in the file
+  seen <- base_qual$score[which(base_qual$count > 0)]
+
+  ## TRUE if any character with one of the given code points was observed
+  anySeen <- function(codes) {
+    any(intToUtf8(codes, multiple=T) %in% seen)
+  }
+
+  ## ranges are tested in this order; the first match wins
+  if ( anySeen(33:58) ) {
+    encod <- '--phred33-quals'
+  } else if ( anySeen(59:64) ) {
+    encod <- '--solexa-quals'      # solexa ver. <1.3
+  } else if ( anySeen(75:104) ) {
+    encod <- '--phred64-quals'
+  } else {
+    encod <- 'unknown'
+  }
+
+  return(list(file=fq, encoding=encod))
+}
+
+
+main()  # script entry point: dispatch on the command-line arguments
diff --git a/pRSEM/process-rnaseq.R b/pRSEM/process-rnaseq.R
new file mode 100644
index 0000000..3cee45a
--- /dev/null
+++ b/pRSEM/process-rnaseq.R
@@ -0,0 +1,982 @@
+#
+# pliu 20150608
+#
+# module for processing RNA-seq data
+#
+
+main <- function() {
+  ## Command-line dispatcher: the first trailing argument names the task to
+  ## run, all remaining arguments are handed to it as one character vector.
+  name2func <- list(
+    'selTrainingTr'                = selTrainingTr,
+    'prepTSSPeakFeatures'          = prepTSSPeakFeatures,
+    'prepPeakSignalGCLenFeatures'  = prepPeakSignalGCLenFeatures,
+    'prepMultiTargetsFeatures'     = prepMultiTargetsFeatures,
+    'genPriorByTSSPeak'            = genPriorByTSSPeak,
+    'genPriorByPeakSignalGCLen'    = genPriorByPeakSignalGCLen,
+    'genPriorByCombinedTSSSignals' = genPriorByCombinedTSSSignals
+  )
+
+  argv <- commandArgs(trailingOnly=T)
+  if ( length(argv) < 1 || ! argv[1] %in% names(name2func) ) {
+    stop('Missing or unknown task name; expected one of: ',
+         paste(names(name2func), collapse=', '), call.=F)
+  }
+
+  ## argv[-1] is empty when only the task name is given; 2:length(argv) would
+  ## count down (2:1) in that case and pass c(NA, <task name>) instead
+  name2func[[argv[1]]](argv[-1])
+}
+
+
+genPriorByCombinedTSSSignals <- function(argv=NA){
+  ## Derive transcript priors from the TSS signals of several targets combined
+  ## in a logistic regression fitted on the training transcripts.
+  ## argv: [1] libloc       R library directory, put first on .libPaths()
+  ##       [2] finfo        tab-delimited table with columns 'targetid' and
+  ##                        'fftrs' (per-target feature file)
+  ##       [3] fout_glmmdl  output: fitted glm object (save())
+  ##       [4] fout_ftrs    output: combined feature table
+  ##       [5] fout_pvalLL  output: Wilcoxon p-value and log-likelihood
+  ##       [6] fout_prior   output: 'prior # trid' lines
+  libloc      <- argv[1]
+  finfo       <- argv[2]
+  fout_glmmdl <- argv[3]
+  fout_ftrs   <- argv[4]
+  fout_pvalLL <- argv[5]
+  fout_prior  <- argv[6]
+
+  .libPaths(c(libloc, .libPaths()))
+  suppressMessages(library(data.table))
+
+  infodt <- fread(finfo, header=T, sep="\t")
+  tgtids <- infodt[, targetid]
+
+  ## long table: one row per (transcript, target) holding log10 of the TSS
+  ## signal, with -4 standing in for non-positive signals
+  ftrsdt <- rbindlist(lapply(tgtids,
+    function(tgtid) {
+      fin <- subset(infodt, targetid==tgtid)[, fftrs]
+      dt <- fread(fin, header=T, sep="\t")
+      dt[, `:=`( log10_tss_sig = ifelse(tss_sig > 0, log10(tss_sig),
+                                        -4),
+                 targetid      = tgtid )]
+      dt[, `:=`(tss_sig = NULL, nrd = NULL) ]
+      return(dt)
+    }))
+
+  ## wide table: one column of log10 signal per target
+  alldt <- dcast(ftrsdt, ... ~ targetid, value.var = 'log10_tss_sig')
+  trndt <- subset(alldt, is_training == 1)
+
+  ## back-quote the target ids so that ids which are not syntactic R names
+  ## (leading digit, '-', ...) still form a valid model formula
+  frm <- paste0('is_expr ~ ',
+                paste(paste0('`', sort(tgtids), '`'), collapse=' + '))
+
+  glmmdl <- glm(frm, family='binomial', data=trndt)
+  save(glmmdl, file=fout_glmmdl)
+
+  ## two partitions: predicted expressed (1) or not (0)
+  alldt[, prd_expr_prob := predict(glmmdl, alldt, type='response')]
+  alldt[, partition := factor(ifelse(prd_expr_prob > 0.5, 1, 0))]
+
+  trn_prtdt <- subset(alldt, is_training == 1)
+  fit <- getFitByMLDM(trn_prtdt[, pme_count], trn_prtdt[, partition])
+  alldt[, prior:= fit$par[partition]]
+
+  ## restore the transcript order of the first target's feature file
+  orig_ordered_trids <- subset(ftrsdt, targetid == tgtids[1])[, trid]
+  setkey(alldt, trid)
+
+  write.table(alldt[orig_ordered_trids], fout_ftrs, quote=F, sep='\t',
+              col.names=T, row.names=F)
+
+  not_expr_cnt <- subset(trn_prtdt, partition == 0)[, pme_count]
+  expr_cnt     <- subset(trn_prtdt, partition == 1)[, pme_count]
+  wrs <- suppressWarnings(wilcox.test(expr_cnt, not_expr_cnt,
+                          alternative='greater', paired=F, exact=T))
+  pval <- wrs$p.value
+  loglikelihood <- fit$value
+  pvalLLdt <- data.table(pvalue = pval, loglikelihood = loglikelihood)
+  write.table(pvalLLdt, fout_pvalLL, quote=F, sep="\t", col.names=T,
+              row.names=F)
+
+  out_priordt <- alldt[orig_ordered_trids, list(prior, trid)]
+  write.table(out_priordt, fout_prior, quote=F, sep=' # ', col.names=F,
+              row.names=F)
+}
+
+
+genPriorByPeakSignalGCLen <- function(argv=NA) {
+ ## Compute transcript priors with the partition model named on the command
+ ## line and write them as 'prior # trid' lines.
+ ## argv: [1] libloc            R library directory, put first on .libPaths()
+ ##       [2] fall_tr_features  tab-delimited feature table with header
+ ##       [3] partition_model   key into partition_model2func, e.g. 'pk_lgtnopk'
+ ##       [4] fout              output file
+ libloc <- argv[1]
+ fall_tr_features <- argv[2]
+ partition_model <- argv[3]
+ fout <- argv[4]
+
+ ## model name -> function(training_trdt, all_trdt); these functions are
+ ## defined elsewhere in this script
+ partition_model2func <- list(
+ ## not needed in this function
+ #'pk' = getSampleAndPriorByPeak
+ #'1prt' = getSampleAndPriorByOnePartition,
+
+ ## w/ peak + logit on no peak
+ 'pk_lgtnopk' = getSampleAndPriorByPeakLogitNoPeak,
+
+ ## linear regression on all with different number of bins
+ 'lm3' = getSampleAndPriorByLM3, # 3 bins
+ 'lm4' = getSampleAndPriorByLM4, # 4 bins
+ 'lm5' = getSampleAndPriorByLM5, # 5 bins
+ 'lm6' = getSampleAndPriorByLM6, # 6 bins
+
+ ## no peak + lm on w/ peak with different number of bins
+ 'nopk_lm2pk' = getSampleAndPriorByNoPeakLM2Peak,
+ 'nopk_lm3pk' = getSampleAndPriorByNoPeakLM3Peak,
+ 'nopk_lm4pk' = getSampleAndPriorByNoPeakLM4Peak,
+ 'nopk_lm5pk' = getSampleAndPriorByNoPeakLM5Peak,
+
+ ## w/ peak + lm on no peak with different number of bins
+ 'pk_lm2nopk' = getSampleAndPriorByPeakLM2NoPeak,
+ 'pk_lm3nopk' = getSampleAndPriorByPeakLM3NoPeak,
+ 'pk_lm4nopk' = getSampleAndPriorByPeakLM4NoPeak,
+ 'pk_lm5nopk' = getSampleAndPriorByPeakLM5NoPeak
+ )
+
+ .libPaths(c(libloc, .libPaths()))
+ suppressMessages(library(data.table))
+
+ all_trdt <- fread(fall_tr_features, header=T, sep="\t")
+ GC_mean <- mean(all_trdt[, GC_fraction])
+ ## derived columns: log10 transforms use -4 in place of log10 of a
+ ## non-positive value; no_*_pk are the complements of the peak indicators
+ all_trdt[, `:=`( log10_count = log10(pme_count + 1),
+ log10_tss_sig = ifelse(tss_sig > 0, log10(tss_sig), -4.0),
+ log10_body_sig = ifelse(body_sig > 0, log10(body_sig), -4.0),
+ log10_tes_sig = ifelse(tes_sig > 0, log10(tes_sig), -4.0),
+ log10_eff_len = ifelse(efflen > 0, log10(efflen), -4.0),
+ log10_GC_ov_mean = ifelse(GC_fraction > 0,
+ log10(GC_fraction/GC_mean), -4.0),
+ no_tss_pk = 1 - tss_pk,
+ no_body_pk = 1 - body_pk,
+ no_tes_pk = 1 - tes_pk
+ )]
+
+ training_trdt <- subset(all_trdt, is_training==1)
+
+ ## NOTE(review): an unrecognised partition_model yields NULL here and the
+ ## call below then fails with "attempt to apply non-function"
+ func <- partition_model2func[[partition_model]]
+
+ outdt <- func(training_trdt, all_trdt)
+ write.table(outdt[, list(prior, trid)], fout, quote=F, sep=' # ', col.names=F,
+ row.names=F)
+}
+
+
+prepMultiTargetsFeatures <- function(argv=NA){
+  ## Prepare one TSS-signal feature file per target listed in finfo.
+  ## argv: [1] libloc             R library directory, put first on .libPaths()
+  ##       [2] fall_tr_crd        coordinates of all transcripts (tab, header)
+  ##       [3] ftraining_tr_crd   coordinates of the training transcripts
+  ##       [4] fisoforms_results  RSEM isoforms.results file
+  ##       [5] flanking_width     half-width of the TSS window, e.g. 500
+  ##       [6] cap_stacked_chipseq_reads    'True' to cap duplicated reads
+  ##       [7] n_max_stacked_chipseq_reads  cap per identical interval, e.g. 5
+  ##       [8] finfo              table with columns targetid, faln, fftrs
+  ##       [9] nthr               number of worker processes, e.g. 16
+  libloc                      <- argv[1]
+  fall_tr_crd                 <- argv[2]
+  ftraining_tr_crd            <- argv[3]
+  fisoforms_results           <- argv[4]
+  flanking_width              <- as.numeric(argv[5])
+  cap_stacked_chipseq_reads   <- argv[6]
+  ## must be numeric: compared as a string, the cap test would be
+  ## lexicographic ("10" <= "5")
+  n_max_stacked_chipseq_reads <- as.numeric(argv[7])
+  finfo                       <- argv[8]
+  nthr                        <- as.integer(argv[9])
+
+  .libPaths(c(libloc, .libPaths()))
+  suppressMessages(library(data.table))
+  suppressMessages(library(GenomicRanges))
+
+  all_trdt <- fread(fall_tr_crd, header=T, sep="\t")
+  training_trdt <- fread(ftraining_tr_crd, header=T, sep="\t", select='trid')
+
+  rsemdt <- fread(fisoforms_results, header=T, sep="\t", select=c(
+                  'transcript_id', 'posterior_mean_count', 'pme_TPM'))
+  setnames(rsemdt, 1:2, c('trid', 'pme_count'))
+
+  trdt <- merge(all_trdt, rsemdt, by='trid', all.x=T)
+  trdt[, `:=`( tss = ifelse(strand == '+', start, end),
+               is_training = ifelse(trid %in% training_trdt[, trid], 1, 0) )]
+
+  ## TSS window: tss +/- flanking_width
+  tssdt <- trdt[, list(chrom, tss, trid)]
+  tssdt[, `:=`( start = tss - flanking_width,
+                end   = tss + flanking_width ) ]
+
+  infodt <- fread(finfo, header=T, sep="\t")
+  ## parallel:: is spelled out so this does not depend on another package
+  ## happening to attach 'parallel'
+  dum <- parallel::mclapply(infodt[, targetid], prepTSSSignalsFeatures, tssdt,
+                            infodt, trdt, all_trdt, flanking_width,
+                            cap_stacked_chipseq_reads,
+                            n_max_stacked_chipseq_reads,
+                            mc.cores=nthr)
+
+  ## mclapply hands back a 'try-error' object for a failed worker instead of
+  ## raising; report it here rather than leave a feature file missing
+  failed <- vapply(dum, function(res) inherits(res, 'try-error'), logical(1))
+  if ( any(failed) ) {
+    stop('Failed to prepare TSS signal features for target(s): ',
+         paste(infodt[, targetid][failed], collapse=', '), call.=F)
+  }
+}
+
+
+## Count the reads of one target that fall within each TSS window, convert
+## the count to an RPKM-like signal and write the per-transcript feature file.
+##   tgtid           target id selecting the row of infodt
+##   tssdt           TSS windows (chrom, start, end, trid)
+##   infodt          table with columns targetid, faln (gzipped alignments)
+##                   and fftrs (output feature file)
+##   trdt            transcript table carrying pme_count and pme_TPM
+##   all_trdt        table whose trid order defines the output row order
+##   flanking_width  half-width of the TSS window
+##   is_cap          'True' to cap reads sharing the same interval and strand
+##   n_max_cap       maximum number of such reads kept
+prepTSSSignalsFeatures <- function(tgtid, tssdt, infodt, trdt, all_trdt,
+                                   flanking_width, is_cap, n_max_cap) {
+  faln <- subset(infodt, targetid == tgtid)[, faln]
+  fout <- subset(infodt, targetid == tgtid)[, fftrs]
+  ## the path is shell-quoted so spaces or metacharacters in it are harmless
+  allrddt <- fread(paste0('zcat ', shQuote(faln)), header=F, sep="\t",
+                   select=c('V1', 'V2', 'V3', 'V6'))
+  setnames(allrddt, 1:4, c('chrom', 'start', 'end', 'strand'))
+
+  if ( is_cap == 'True' ) {
+    ## keep at most n_max_cap reads per strand-specific interval; the cap may
+    ## arrive as a command-line string, and comparing an integer with a
+    ## string is lexicographic ("10" <= "5"), so force it to numeric first
+    n_max_cap <- as.numeric(n_max_cap)
+    allrddt[, dupi := seq_len(.N), by=list(chrom, start, end, strand)]
+    rddt <- subset(allrddt, dupi <= n_max_cap)
+    rddt[, dupi := NULL]
+  } else {
+    rddt <- allrddt
+  }
+
+  ## since no peak is called here, just use the average read length as fraglen
+  ## count # of reads as signals rather than # of overlapping nucleotide
+  ## normalize by TSS interval length and read depth to RPKM
+  tssgrs <- makeGRangesFromDataFrame(tssdt, keep.extra.columns=T)
+  rdgrs  <- makeGRangesFromDataFrame(rddt,  keep.extra.columns=T)
+
+  ## only reads lying entirely within a TSS window are counted
+  ol <- findOverlaps(rdgrs, tssgrs, type='within', ignore.strand=T)
+  oldt <- data.table(query=queryHits(ol), subject=subjectHits(ol))
+  oldt[, trid := tssdt[, trid][subject]]
+  nrddt <- oldt[, list(nrd = .N), by=trid]
+
+  trdt <- merge(trdt, nrddt, by='trid', all.x=T)
+
+  n_tot_rds <- length(rdgrs)
+  trdt[, `:=`( tss_sig = ifelse(is.na(nrd), 0,
+                                nrd*1e+9/(flanking_width*2+1)/n_tot_rds),
+               is_expr = ifelse(pme_count > 0 & pme_TPM >= 1, 1, 0) )]
+
+  setkey(trdt, trid)
+  trdt <- trdt[all_trdt[, trid]] ## keep the order of original trids
+  write.table(trdt, fout, quote=F, sep="\t", col.names=T, row.names=F)
+}
+
+
+prepPeakSignalGCLenFeatures <- function(argv=NA){
+ ## Build the per-transcript feature table: peak indicators and signals for
+ ## the TSS, body and TES regions, plus the merged RSEM and GC columns.
+ ## argv: [1] libloc             R library directory, put first on .libPaths()
+ ##       [2] fall_tr_crd        coordinates of all transcripts (tab, header)
+ ##       [3] ftraining_tr_crd   coordinates of the training transcripts
+ ##       [4] fout               output feature table
+ ##       [5] fisoforms_results  RSEM isoforms.results file
+ ##       [6] flanking_width     half-width of the TSS/TES windows, e.g. 500
+ ##       [7] partition_model    e.g. 'lm4' (read but not used in this function)
+ ##       [8] fchipseq_peaks     gzipped peak file; 10 columns are read
+ ##       [9] fchipseq_target_signals  gzipped read file; 6 columns are read
+ ##      [10] fall_tr_gc         per-transcript GC table keyed by trid
+ ##      [11] nthr               number of worker processes, e.g. 20
+ ##      [12] fraglen            fragment length, e.g. 110
+ libloc <- argv[1]
+ fall_tr_crd <- argv[2]
+ ftraining_tr_crd <- argv[3]
+ fout <- argv[4]
+ fisoforms_results <- argv[5]
+ flanking_width <- as.numeric(argv[6])
+ partition_model <- argv[7]
+ fchipseq_peaks <- argv[8]
+ fchipseq_target_signals <- argv[9]
+ fall_tr_gc <- argv[10]
+ nthr <- argv[11]
+ fraglen <- as.numeric(argv[12])
+
+ .libPaths(c(libloc, .libPaths()))
+ suppressMessages(library(data.table))
+ suppressMessages(library(GenomicRanges))
+
+ all_trdt <- fread(fall_tr_crd, header=T, sep="\t")
+
+ rsemdt <- fread(fisoforms_results, header=T, sep="\t", select=c(
+ 'transcript_id', 'effective_length', 'posterior_mean_count'))
+ setnames(rsemdt, 1:3, c('trid', 'efflen', 'pme_count'))
+
+ gcdt <- fread(fall_tr_gc, header=T, sep="\t")
+
+ trdt <- merge(all_trdt, rsemdt, by='trid', all.x=T)
+ trdt <- merge(trdt, gcdt, by='trid', all.x=T)
+
+ ## strand-aware transcript start (tss) and end (tes)
+ trdt[, `:=`( tss = ifelse(strand == '+', start, end ),
+ tes = ifelse(strand == '+', end, start))]
+ tssdt <- trdt[, list(chrom, tss, trid)]
+ bodydt <- trdt[, list(chrom, start, end, trid)]
+ tesdt <- trdt[, list(chrom, tes, trid)]
+
+ ## TSS/TES regions are +/- flanking_width around the point; the body is the
+ ## transcript shrunk by flanking_width + 1 at both ends, with the two bounds
+ ## swapped if the shrunk interval would be inverted
+ tssdt[, `:=`( start = tss - flanking_width,
+ end = tss + flanking_width ) ]
+ bodydt[, `:=`( body_start = start + flanking_width + 1,
+ body_end = end - flanking_width - 1)]
+ bodydt[, `:=`( start = ifelse(body_start <= body_end, body_start, body_end),
+ end = ifelse(body_start <= body_end, body_end, body_start))]
+ tesdt[, `:=`( start = tes - flanking_width,
+ end = tes + flanking_width ) ]
+
+ ## only the first three of the peak file's ten columns are kept
+ pkdt <- data.table(read.table(gzfile(fchipseq_peaks), header=F, sep="\t",
+ colClasses=c('character', 'numeric', 'numeric',
+ rep('NULL', 7))))
+ setnames(pkdt, 1:3, c('chrom', 'start', 'end'))
+
+ has_tss_pk_trids <- getRegionPeakOLTrID(tssdt, pkdt)
+ has_body_pk_trids <- getRegionPeakOLTrID(bodydt, pkdt)
+ has_tes_pk_trids <- getRegionPeakOLTrID(tesdt, pkdt)
+
+ ## columns 1-3 and 6 of the read file are kept
+ rddt <- data.table(read.table(gzfile(fchipseq_target_signals), header=F,
+ sep="\t", colClasses=c('character', 'numeric',
+ 'numeric', rep('NULL', 2), 'character')))
+ setnames(rddt, 1:4, c('chrom', 'start', 'end', 'strand'))
+ tss_sigdt <- countRegionSignal(tssdt, rddt, fraglen, nthr, 'tss')
+ body_sigdt <- countRegionSignal(bodydt, rddt, fraglen, nthr, 'body')
+ tes_sigdt <- countRegionSignal(tesdt, rddt, fraglen, nthr, 'tes')
+
+ trdt <- merge(trdt, tss_sigdt, by='trid', all.x=T)
+ trdt <- merge(trdt, body_sigdt, by='trid', all.x=T)
+ trdt <- merge(trdt, tes_sigdt, by='trid', all.x=T)
+
+ ## transcripts without a signal row after the left joins get a signal of 0
+ trdt[, `:=`(tss_sig = ifelse(is.na(tss_sig), 0.0, tss_sig),
+ body_sig = ifelse(is.na(body_sig), 0.0, body_sig),
+ tes_sig = ifelse(is.na(tes_sig), 0.0, tes_sig ))]
+
+ training_trdt <- fread(ftraining_tr_crd, header=T, sep="\t", select='trid')
+ trdt[, `:=`( tss_pk = ifelse(trid %in% has_tss_pk_trids, 1, 0),
+ body_pk = ifelse(trid %in% has_body_pk_trids, 1, 0),
+ tes_pk = ifelse(trid %in% has_tes_pk_trids, 1, 0),
+ is_training = ifelse(trid %in% training_trdt[, trid], 1, 0))]
+
+ setkey(trdt, trid)
+ trdt <- trdt[all_trdt[, trid]] ## keep the order of original trids
+ write.table(trdt, fout, quote=F, sep="\t", col.names=T, row.names=F)
+}
+
+#
+# Signal of a region = number of fragment nucleotides inside the region,
+# averaged over the region's width (rather than the number of reads
+# overlapping the region).
+#
+# tagAlign lists the read
+#   '+' strand, [start, start + read_length]
+#   '-' strand, [end - read_length, end]
+#
+# 1. extend read to fragment
+# 2. find fragments overlapping target region
+# 3. remove fragments that have middle position outside target region (as how
+#    dpeak works)
+# 4. count number of fragment nucleotide and average it by target region's
+#    width to get signal
+#
+# regiondt: regions (chrom, start, end, trid); readdt: reads (chrom, start,
+# end, strand); fraglen: fragment length; nthr: number of worker processes;
+# prefix: the signal column of the result is named '<prefix>_sig'.
+# Returns one table for all chromosomes, computed chromosome by chromosome.
+#
+countRegionSignal <- function(regiondt, readdt, fraglen, nthr, prefix=''){
+  regiondtl <- split(regiondt[, list(chrom, start, end, trid)],
+                     regiondt[, chrom])
+  readdtl <- split(readdt, readdt[, chrom])
+  chroms <- names(regiondtl)
+
+  ## parallel:: is spelled out so this does not depend on another package
+  ## happening to attach 'parallel'
+  sigdtl <- parallel::mclapply(chroms, countRegionSignalByChrom,
+                               regiondtl, readdtl, fraglen, prefix,
+                               mc.cores = nthr)
+
+  ## mclapply hands back a 'try-error' object for a failed worker instead of
+  ## raising; report the chromosome here, before rbindlist trips over it
+  failed <- vapply(sigdtl, function(res) inherits(res, 'try-error'),
+                   logical(1))
+  if ( any(failed) ) {
+    stop('Failed to count ', prefix, ' signal on chromosome(s): ',
+         paste(chroms[failed], collapse=', '), call.=F)
+  }
+
+  outdt <- rbindlist(sigdtl)
+  return(outdt)
+}
+
+
+## Signal of every region on one chromosome.
+##   chrom      name of the chromosome to process
+##   regiondtl  list of region tables (chrom, start, end, trid) by chromosome
+##   readdtl    list of read tables (chrom, start, end, strand) by chromosome
+##   fraglen    fragment length that reads are extended to
+##   prefix     the signal column is named '<prefix>_sig'
+## Returns a table with columns trid and '<prefix>_sig', holding only the
+## regions that received at least one selected fragment.
+countRegionSignalByChrom <- function(chrom, regiondtl, readdtl, fraglen,
+                                     prefix) {
+  colname  <- paste0(prefix, '_sig')
+  regiondt <- copy(regiondtl[[chrom]])
+
+  ## a chromosome can carry regions but no reads at all; return an empty
+  ## table of the usual shape instead of failing on a NULL read table
+  if ( is.null(readdtl[[chrom]]) ) {
+    emptydt <- data.table(trid = regiondt[0L, trid], sig = numeric(0))
+    setnames(emptydt, 2, colname)
+    return(emptydt)
+  }
+  readdt <- copy(readdtl[[chrom]])
+
+  ## extend each read to a fragment of fraglen, anchored at the read start on
+  ## '+' and measured back from the read end otherwise
+  readdt[, frag_start := ifelse(strand == '+', start, end-fraglen)]
+  readdt[, frag_end   := frag_start + fraglen - 1]
+
+  fragdt <- readdt[, list(chrom, frag_start, frag_end)]
+  setnames(fragdt, 2:3, c('start', 'end'))
+  fraggrs <- makeGRangesFromDataFrame(fragdt, ignore.strand=T)
+
+  regiongrs <- makeGRangesFromDataFrame(regiondt[, list(chrom, start, end,
+                                                        trid)],
+                                        keep.extra.columns=T, ignore.strand=T)
+
+  ol <- findOverlaps(regiongrs, fraggrs, type='any', ignore.strand=T)
+  oldt <- data.table(query=queryHits(ol), subject=subjectHits(ol))
+
+  oldt[, `:=`( region_start = regiondt[, start][query],
+               region_end   = regiondt[, end][query],
+               trid         = regiondt[, trid][query],
+               frag_start   = fragdt[, start][subject],
+               frag_end     = fragdt[, end][subject] )]
+
+  oldt[, frag_mid := (frag_start + frag_end)/2]
+
+  ## as dpeak, only select fragment which has mid position falling into region
+  seloldt <- subset(oldt, (frag_mid >= region_start) & (frag_mid <= region_end))
+
+  ## clip each selected fragment to the region it overlaps
+  seloldt[, `:=`(start = ifelse(frag_start < region_start, region_start,
+                                frag_start),
+                 end   = ifelse(frag_end > region_end, region_end,
+                                frag_end))]
+
+  ## covered nucleotides per region, divided by the region width
+  sigdt <- seloldt[, list(nuc = sum(end - start + 1)), by=trid]
+  sigdt <- merge(sigdt, regiondt, by='trid', all.x=T)
+  sigdt[, eval(colname) := nuc/(end - start + 1)]
+  sigdt[, `:=`(nuc=NULL, chrom=NULL, start=NULL, end=NULL)]
+  return(sigdt)
+}
+
+
+genPriorByTSSPeak <- function(argv=NA){
+  ## Partition transcripts by presence of a TSS peak, derive priors, and test
+  ## whether training transcripts with a peak have higher counts.
+  ## argv: [1] R library directory, put first on .libPaths()
+  ##       [2] feature table; read, then rewritten with a 'partition' column
+  ##       [3] output: Wilcoxon p-value and log-likelihood
+  ##       [4] output: 'prior # trid' lines
+  libloc           <- argv[1]
+  fall_tr_features <- argv[2]
+  fpval_LL         <- argv[3]
+  fall_tr_prior    <- argv[4]
+
+  .libPaths(c(libloc, .libPaths()))
+  suppressMessages(library(data.table))
+
+  alldt <- fread(fall_tr_features, header=T, sep="\t")
+  alldt[, partition := tss_pk]
+
+  trndt <- subset(alldt, is_training == 1)
+  outdt <- getSampleAndPriorByTSSPeak(trndt, alldt)
+
+  ## one-sided rank-sum test: counts with a TSS peak vs. counts without one
+  counts_with_pk    <- trndt[tss_pk == 1, pme_count]
+  counts_without_pk <- trndt[tss_pk == 0, pme_count]
+  rank_test <- suppressWarnings(
+    wilcox.test(counts_with_pk, counts_without_pk, alternative='greater',
+                paired=F, exact=T))
+
+  ll <- unique(outdt[, loglikelihood])
+  pval_LLdt <- data.table(pvalue=rank_test$p.value, loglikelihood=ll)
+
+  write.table(alldt, fall_tr_features, quote=F, sep="\t", col.names=T,
+              row.names=F)
+  write.table(pval_LLdt, fpval_LL, quote=F, sep="\t", col.names=T, row.names=F)
+  write.table(outdt[, list(prior, trid)], fall_tr_prior, quote=F, sep=" # ",
+              col.names=F, row.names=F)
+}
+
+
+prepTSSPeakFeatures <- function(argv=NA) {
+  ## Flag, for every transcript, whether a peak overlaps its TSS window and
+  ## whether it is a training transcript, then write the feature table.
+  ## argv: [1] libloc             R library directory, put first on .libPaths()
+  ##       [2] fall_tr_crd        coordinates of all transcripts (tab, header)
+  ##       [3] ftraining_tr_crd   coordinates of the training transcripts
+  ##       [4] fout               output feature table
+  ##       [5] fisoforms_results  RSEM isoforms.results file
+  ##       [6] flanking_width     half-width of the TSS window, e.g. 500
+  ##       [7] fchipseq_peaks     gzipped peak file; 10 columns are read
+  libloc            <- argv[1]
+  fall_tr_crd       <- argv[2]
+  ftraining_tr_crd  <- argv[3]
+  fout              <- argv[4]
+  fisoforms_results <- argv[5]
+  flanking_width    <- as.numeric(argv[6])
+  fchipseq_peaks    <- argv[7]
+
+  .libPaths(c(libloc, .libPaths()))
+  suppressMessages(library(data.table))
+  suppressMessages(library(GenomicRanges))
+
+  rsemdt <- fread(fisoforms_results, header=T, sep="\t",
+                  select=c('transcript_id', 'posterior_mean_count'))
+  setnames(rsemdt, 1:2, c('trid', 'pme_count'))
+
+  intrdt <- fread(fall_tr_crd, header=T, sep="\t")
+  trdt <- merge(intrdt, rsemdt, by='trid', all.x=T)
+  trdt[, tss := ifelse(strand=='+', start, end)]
+  tssdt <- trdt[, list(chrom, tss, trid)]
+  tssdt[, `:=`( start = tss - flanking_width,
+                end   = tss + flanking_width)]
+
+  ## A read failure is fatal: carrying on with a placeholder would only fail
+  ## in setnames() below with a message unrelated to the real cause.
+  pkdt <- tryCatch({
+    data.table(read.table(gzfile(fchipseq_peaks), header=F, sep="\t",
+                          colClasses=c('character', 'numeric',
+                                       'numeric', rep('NULL', 7))))
+  }, error = function(err) {
+    stop(paste0("\nFail to read file: ", fchipseq_peaks, "\n",
+                conditionMessage(err)), call.=F)
+  })
+
+  setnames(pkdt, 1:3, c('chrom', 'start', 'end'))
+
+  has_pk_trids <- getRegionPeakOLTrID(tssdt, pkdt)
+  training_trids <- fread(ftraining_tr_crd, header=T, sep="\t", select='trid'
+                         )[, trid]
+
+  trdt[, `:=`( tss_pk = ifelse(trid %in% has_pk_trids, 1, 0),
+               is_training = ifelse(trid %in% training_trids, 1, 0) )]
+
+  setkey(trdt, trid)
+  trdt <- trdt[intrdt[, trid]] ## keep the order of original trid
+  write.table(trdt, fout, quote=F, sep="\t", col.names=T, row.names=F)
+}
+
+
+## IDs of the transcripts whose region overlaps at least one peak.
+##   regiondt  table with columns chrom, start, end, trid
+##   peakdt    table with columns chrom, start, end
+## Strand is ignored; any overlap counts. Returns the unique trid values.
+getRegionPeakOLTrID <- function(regiondt, peakdt) {
+  region_cols <- regiondt[, list(chrom, start, end, trid)]
+  peak_cols   <- peakdt[, list(chrom, start, end)]
+
+  regions <- makeGRangesFromDataFrame(region_cols, keep.extra.columns=T,
+                                      ignore.strand=T)
+  peaks   <- makeGRangesFromDataFrame(peak_cols, ignore.strand=T)
+
+  regions_with_peak <- subsetByOverlaps(regions, peaks, type='any',
+                                        ignore.strand=T)
+  unique(mcols(regions_with_peak)$trid)
+}
+
+
+selTrainingTr <- function(argv=NA) {
+ ## Select the training set of transcripts by successive filters and write
+ ## the survivors to fout.
+ ## argv: [1] libloc          R library directory, put first on .libPaths()
+ ##       [2] fin_tr          coordinates of all transcripts (tab, header);
+ ##                           tss_mpp, body_mpp and tes_mpp columns are used
+ ##       [3] fin_exon        coordinates of all exons (tab, header)
+ ##       [4] min_mpp         threshold the three *_mpp values must exceed,
+ ##                           e.g. 0.8
+ ##       [5] flanking_width  half-width of the TSS window, e.g. 500
+ ##       [6] fout            output table of selected transcripts
+ libloc <- argv[1]
+ fin_tr <- argv[2]
+ fin_exon <- argv[3]
+ min_mpp <- as.numeric(argv[4])
+ flanking_width <- as.numeric(argv[5])
+ fout <- argv[6]
+
+ .libPaths(c(libloc, .libPaths()))
+ suppressMessages(library(data.table))
+ suppressMessages(library(GenomicRanges))
+
+ alltrdt <- fread(fin_tr, header=T, sep="\t")
+ alltrdt[, tss := ifelse(strand == '+', start, end)]
+ ## filter 1: all three *_mpp values present and strictly above min_mpp
+ highmppdt <- subset(alltrdt, (! is.na(tss_mpp )) & ( tss_mpp > min_mpp ) &
+ (! is.na(body_mpp)) & ( body_mpp > min_mpp ) &
+ (! is.na(tes_mpp )) & ( tes_mpp > min_mpp ) )
+
+ ## select tr that are not nested with other tr regardless of strand
+ nested_trids <- getTrTrOLTrID(highmppdt, alltrdt, oltype='within',
+ ignore_strand=T)
+ not_nested_trdt <- subset(highmppdt, ! trid %in% nested_trids)
+
+ ## select tr that not have exon all nested with the union of other tr's exons
+ ## regardless of strand
+ allexondt <- fread(fin_exon, header=T, sep="\t")
+ exon_all_ol_trids <- getExonsAllOLTrID(not_nested_trdt[, trid], allexondt)
+ seltrdt <- subset(not_nested_trdt, ! trid %in% exon_all_ol_trids)
+
+ ## filter 4: drop transcripts whose TSS window (tss +/- flanking_width)
+ ## overlaps a TSS in the full set; presumably getTrTrOLTrID ignores a
+ ## transcript's overlap with itself - verify in that helper
+ seltr_tss_region_dt <- copy(seltrdt)
+ seltr_tss_region_dt[, `:=`( start = tss - flanking_width,
+ end = tss + flanking_width )]
+
+ alltr_tss_dt <- copy(alltrdt)
+ alltr_tss_dt[, `:=`(start = tss, end=tss)]
+
+ tss_region_ol_trids <- getTrTrOLTrID(seltr_tss_region_dt, alltr_tss_dt,
+ 'any', ignore_strand=T)
+
+ outdt <- subset(seltrdt, ! trid %in% tss_region_ol_trids)
+ write.table(outdt, fout, quote=F, sep="\t", col.names=T, row.names=F)
+}
+
+
+# Return the trids among `query_trids` for which every exon lies within an exon
+# of some other transcript (strand ignored).
+#   query_trids: transcript ids to test
+#   allexondt:   exon table; the columns trid and exon_index are used here, the
+#                coordinate columns are consumed by makeGRangesFromDataFrame
+getExonsAllOLTrID <- function(query_trids, allexondt) {
+ allexongrs <- makeGRangesFromDataFrame(allexondt, keep.extra.columns=T,
+ ignore.strand=F)
+ queryexongrs <- subset(allexongrs, mcols(allexongrs)$trid %in% query_trids)
+ ## query exon must fall within the subject exon
+ ol <- findOverlaps(queryexongrs, allexongrs, type='within', ignore.strand=T)
+
+ oldt <- data.table(query = queryHits(ol), subject=subjectHits(ol))
+ oldt[, `:=`( query_trid = mcols(queryexongrs)$trid[query],
+ query_exon_index = mcols(queryexongrs)$exon_index[query],
+ subject_trid = mcols(allexongrs)$trid[subject]
+ )]
+ ## ignore hits between exons of the same transcript
+ oldt <- subset(oldt, query_trid != subject_trid)
+ ## per query transcript: number of distinct exons that had such a hit
+ nolexondt <- oldt[, list(nolexon = length(unique(query_exon_index))),
+ by=query_trid]
+ ## per query transcript: total number of distinct exons
+ subexondt <- subset(allexondt, trid %in% nolexondt[, query_trid])
+ subnexondt <- subexondt[, list(nexon = length(unique(exon_index))), by=trid]
+ setnames(nolexondt, 1, 'trid')
+ nolexondt <- merge(nolexondt, subnexondt, by='trid', all.x=T)
+ ## all exons covered <=> the two counts agree
+ exon_all_ol_trids <- subset(nolexondt, nolexon == nexon)[, trid]
+
+ return(exon_all_ol_trids)
+}
+
+
+# Return the unique trids in `querydt` that overlap a row of `subjectdt`
+# carrying a different trid.
+#   querydt, subjectdt: data.tables with columns chrom, strand, start, end, trid
+#   oltype:             passed to findOverlaps as `type`
+#   ignore_strand:      passed to findOverlaps as `ignore.strand`
+getTrTrOLTrID <- function(querydt, subjectdt, oltype, ignore_strand) {
+  toGRanges <- function(dt) {
+    makeGRangesFromDataFrame(dt[, list(chrom, strand, start, end, trid)],
+                             keep.extra.columns=T, ignore.strand=F)
+  }
+  query_gr <- toGRanges(querydt)
+  subject_gr <- toGRanges(subjectdt)
+
+  hits <- findOverlaps(query_gr, subject_gr, type=oltype,
+                       ignore.strand=ignore_strand)
+
+  hit_dt <- data.table(query=queryHits(hits), subject=subjectHits(hits))
+  hit_dt[, `:=`(query_trid = mcols(query_gr)$trid[query],
+                subject_trid = mcols(subject_gr)$trid[subject] )]
+
+  ## discard pairs where both sides are the same transcript
+  other_tr_hits <- subset(hit_dt, query_trid != subject_trid)
+  unique(other_tr_hits[, query_trid])
+}
+
+
+# Draw one Dirichlet-multinomial sample: class probabilities come from
+# rdirichlet(alpha), then n trials are distributed by a single rmultinom draw.
+# Returns the per-class counts as a plain vector.
+rdirichlet_multinomial <- function(alpha, n) {
+  class_probs <- rdirichlet(alpha)
+  one_draw <- rmultinom(1, n, class_probs)
+  as.vector(one_draw)
+}
+
+
+# Draw one vector from Dirichlet(alpha): independent gamma variates with shapes
+# `alpha`, normalised to sum to one.
+rdirichlet <- function(alpha) {
+  gamma_draws <- rgamma(length(alpha), alpha)
+  total <- sum(gamma_draws)
+  gamma_draws / total
+}
+
+
+# Fit one alpha per level of `partition` by maximising
+# partitioned_log_likelihood with box-constrained L-BFGS-B.
+# Returns the optim() result; callers read $par and $value.
+getFitByMLDM <- function(counts, partition) {
+  n_alpha <- nlevels(partition)
+
+  ## fnscale=-1 turns optim's minimisation into maximisation
+  optim(par = rep(1, n_alpha),
+        fn = partitioned_log_likelihood,
+        gr = partitioned_log_likelihood_gradient,
+        counts = counts,
+        partition = partition,
+        method = "L-BFGS-B",
+        lower = 0.0001,
+        upper = 1e+4,
+        control = list(fnscale=-1))
+}
+
+
+# Draw one Dirichlet-multinomial sample of sum(counts) trials, where each item
+# gets the alpha in `par` that corresponds to its level in `partition`.
+getSampleByDM <- function(par, counts, partition) {
+  alpha_by_level <- setNames(par, levels(partition))
+
+  ## a factor used as an index selects by its integer codes
+  alpha_per_item <- alpha_by_level[partition]
+  rdirichlet_multinomial(alpha_per_item, sum(counts))
+}
+
+
+# Dirichlet-multinomial log-likelihood of `counts` when every item in the same
+# level of `partition` shares one alpha.
+#   alpha:     one value per level of `partition`
+#   counts:    one count per item
+#   partition: factor assigning each item to a level
+# The result is a 1x1 matrix because of the %*% products.
+partitioned_log_likelihood <- function(alpha, counts, partition) {
+  level_sizes <- table(partition)
+  total_count <- sum(counts)
+  alpha_total <- level_sizes %*% alpha
+
+  lgamma(total_count + 1) - sum(lgamma(counts + 1)) +
+    lgamma(alpha_total) -
+    lgamma(total_count + alpha_total) +
+    sum(lgamma(counts + alpha[partition])) -
+    level_sizes %*% lgamma(alpha)
+}
+
+
+# Gradient of partitioned_log_likelihood with respect to `alpha`; one entry per
+# level of `partition`.
+partitioned_log_likelihood_gradient <- function(alpha, counts, partition) {
+  level_sizes <- table(partition)
+  total_count <- sum(counts)
+  alpha_total <- sum(level_sizes * alpha)
+
+  ## term shared by every level
+  shared_term <- digamma(alpha_total) - digamma(total_count + alpha_total)
+  ## per-level sum over the items assigned to that level
+  per_level_data <- tapply(digamma(counts + alpha[partition]), partition, sum)
+
+  level_sizes * (shared_term - digamma(alpha)) + per_level_data
+}
+
+
+#
+# create a new partition from the given partition and data
+# the interior breaks of the old partition are kept
+# the lower and upper bounds are taken from the given data
+#
+createPartitionForNewData <- function(partition, data){
+  ## levels are parsed as "(lo,hi]" labels; recover each interval's lower bound
+  level_labels <- levels(partition)
+  lower_bounds <- as.numeric( sub("\\((.+),.*", "\\1", level_labels) )
+
+  ## outermost breaks sit just outside the range of `data`
+  margin <- 1.0e-4
+  value_range <- range(data)
+  lower_bounds[1] <- value_range[1] - margin
+  cut_points <- c(lower_bounds, value_range[2] + margin)
+
+  cut(data, cut_points)
+}
+
+
+#getSampleAndPriorByOnePartition <- function(trndt, tstdt) {
+# trn_partition <- factor(rep(0, nrow(trndt)))
+# fit <- getFitByMLDM(trndt[, pme_count], trn_partition)
+
+##cat('priors:', format(fit$par, digits=2, nsmall=3), "\n")
+#
+# tst_partition <- factor(rep(0, nrow(tstdt)))
+# prior <- fit$par[tst_partition]
+
+# outdt <- tstdt[, list(trid, pme_count)]
+# outdt[, `:=`(partition = tst_partition,
+# sample = getSampleByDM(fit$par, pme_count, tst_partition),
+# prior = prior)]
+
+##cat("training set's partition:")
+##print(table(trn_partition))
+##cat("testing set's prior:")
+##print(table(prior))
+
+# return(outdt)
+#}
+
+
+# Partition transcripts by their tss_pk value, fit one Dirichlet-multinomial
+# alpha per partition on the training set and apply it to the testing set.
+#   trndt: training data.table with columns tss_pk, pme_count
+#   tstdt: testing data.table with columns trid, pme_count, tss_pk
+# Returns a data.table with columns trid, pme_count, partition, sample, prior
+# and loglikelihood (the optimised objective value, same on every row).
+getSampleAndPriorByTSSPeak <- function(trndt, tstdt) {
+  trn_partition <- factor(trndt[, tss_pk])
+  fit <- getFitByMLDM(trndt[, pme_count], trn_partition)
+
+  ## fit$par is ordered by the training levels and is indexed below by the
+  ## integer codes of the testing factor, so the testing factor must use the
+  ## same levels; otherwise a testing set lacking one training level would be
+  ## given the alpha of a different partition
+  tst_partition <- factor(tstdt[, tss_pk], levels=levels(trn_partition))
+  prior <- fit$par[tst_partition]
+
+  outdt <- tstdt[, list(trid, pme_count)]
+  outdt[, `:=`(partition     = tst_partition,
+               sample        = getSampleByDM(fit$par, pme_count, tst_partition),
+               prior         = prior,
+               loglikelihood = fit$value )]
+
+  cat("training set's partition:")
+  print(table(trn_partition))
+  cat("testing set's prior:")
+  print(table(prior))
+
+  return(outdt)
+}
+
+
+# Partition transcripts by binning the predictions of a linear model of
+# log10_count, fit one Dirichlet-multinomial alpha per bin on the training set
+# and apply it to the testing set.
+#   trndt: training data.table; must hold pme_count and every term of `frm`
+#   tstdt: testing data.table; must hold trid, pme_count and the predictors
+#   nbin:  number of bins passed to cut(); NULL means 3
+# Returns a data.table with columns trid, pme_count, partition, sample, prior.
+getSampleAndPriorByLM <- function(trndt, tstdt, nbin=NULL) {
+ if (is.null(nbin)) nbin=3
+ frm <- formula( paste0('log10_count~',
+ 'tss_pk + tss_pk:log10_tss_sig + no_tss_pk:log10_tss_sig +',
+ 'body_pk + body_pk:log10_body_sig + no_body_pk:log10_body_sig +',
+ 'tes_pk + tes_pk:log10_tes_sig + no_tes_pk:log10_tes_sig +',
+ 'log10_eff_len + log10_GC_ov_mean') )
+
+ ## bin the model's predictions on the training set itself
+ trn_lm <- lm(frm, trndt)
+ trn_prd <- as.vector(predict(trn_lm, trndt))
+ trn_partition <- cut(trn_prd, nbin)
+
+ fit <- getFitByMLDM(trndt[, pme_count], trn_partition)
+
+ ## bin the testing predictions with the breaks recovered from the training bins
+ tst_prd <- as.vector(predict(trn_lm, tstdt))
+ tst_partition <- createPartitionForNewData(trn_partition, tst_prd)
+
+ ## indexing by a factor uses its integer codes: one alpha per testing row
+ prior <- fit$par[tst_partition]
+
+ outdt <- tstdt[, list(trid, pme_count)]
+ outdt[, `:=`(partition = tst_partition,
+ sample = getSampleByDM(fit$par, pme_count, tst_partition),
+ prior = prior)]
+
+ cat("training set's partition:")
+ print(table(trn_partition))
+ cat("testing set's prior:")
+ print(table(prior))
+
+ return(outdt)
+}
+
+
+# Partition transcripts in two steps: transcripts with tss_pk == lm_pk_type are
+# binned by the predictions of a linear model fitted on that subset; all other
+# transcripts (tss_pk == 1 - lm_pk_type) share the single partition 0.
+#   trndt, tstdt: training / testing data.tables holding trid, pme_count,
+#                 tss_pk and every term of `frm`
+#   nbin:         number of bins passed to cut(); NULL means 2
+#   lm_on_wpk:    TRUE fits the model on tss_pk == 1 rows, FALSE on
+#                 tss_pk == 0 rows; NULL means TRUE
+# Returns a data.table with columns trid, pme_count, partition, sample, prior.
+getSampleAndPriorByPeakLM <- function(trndt, tstdt, nbin=NULL, lm_on_wpk=NULL) {
+ if (is.null(nbin)) nbin=2
+ if (is.null(lm_on_wpk)) lm_on_wpk=T
+ lm_pk_type <- ifelse(lm_on_wpk, 1, 0)
+
+ slim_trndt <- trndt[, list(trid, pme_count, tss_pk)]
+ slim_tstdt <- tstdt[, list(trid, pme_count, tss_pk)]
+
+ frm <- formula( paste0('log10_count~',
+ 'log10_tss_sig +',
+ 'body_pk + body_pk:log10_body_sig + no_body_pk:log10_body_sig +',
+ 'tes_pk + tes_pk:log10_tes_sig + no_tes_pk:log10_tes_sig +',
+ 'log10_eff_len + log10_GC_ov_mean') )
+
+ ## fit and bin on the selected training subset only
+ pk_trn_dt <- subset(trndt, tss_pk == lm_pk_type)
+ pk_trn_lm <- lm(frm, pk_trn_dt)
+ pk_trn_prd <- as.vector(predict(pk_trn_lm, pk_trn_dt))
+ pk_trn_partition <- cut(pk_trn_prd, nbin)
+ pk_trn_dt[, partition:=pk_trn_partition]
+
+ ## attach the bins to the full training table; rows outside the subset get NA
+ slim_trndt <- merge(slim_trndt, pk_trn_dt[, list(trid, partition)], by='trid',
+ all=T)
+ ## index by trid in trndt's row order
+ ## NOTE(review): presumably relies on merge() leaving the result keyed by trid
+ slim_trndt <- slim_trndt[trndt[,trid]]
+
+ ## NOTE(review): ifelse() on the factor column appears to yield its integer
+ ## codes, so the LM bins become 1..nbin next to the catch-all 0 -- confirm
+ slim_trndt[, partition:=factor(ifelse(tss_pk==(1-lm_pk_type), 0, partition))]
+
+ fit <- getFitByMLDM(slim_trndt[, pme_count], slim_trndt[, partition])
+
+ ## same steps for the testing set, reusing the training model and its breaks
+ pk_tst_dt <- subset(tstdt, tss_pk == lm_pk_type)
+ pk_tst_prd <- as.vector(predict(pk_trn_lm, pk_tst_dt))
+ pk_tst_partition <- createPartitionForNewData(pk_trn_partition,
+ pk_tst_prd)
+ pk_tst_dt[, partition:=pk_tst_partition]
+
+ slim_tstdt <- merge(slim_tstdt, pk_tst_dt[, list(trid, partition)], by='trid',
+ all=T)
+ slim_tstdt <- slim_tstdt[tstdt[,trid]]
+ slim_tstdt[, partition:=factor(ifelse(tss_pk==(1-lm_pk_type), 0, partition))]
+ prior <- fit$par[slim_tstdt[, partition]]
+
+ outdt <- slim_tstdt[, list(trid, pme_count, partition)]
+ outdt[, `:=`(sample = getSampleByDM(fit$par, pme_count, partition),
+ prior = prior)]
+
+ cat("training set's partition:")
+ print(table(slim_trndt[,partition]))
+ cat("testing set:")
+ print(table(prior))
+ cat("\n")
+
+ return(outdt)
+}
+
+
+# Partition transcripts into three groups: 'w/ pk' for tss_pk == 1, and for
+# tss_pk == 0 either 'no pk, has cnt' or 'no pk, no cnt' depending on whether a
+# logistic regression of has_count (pme_count > 0) predicts a probability
+# above 0.5. One Dirichlet-multinomial alpha is fitted per group on the
+# training set and applied to the testing set.
+#   trndt, tstdt: training / testing data.tables holding trid, pme_count,
+#                 tss_pk and every predictor of `frm`
+# Returns a data.table with columns trid, pme_count, partition, sample, prior.
+getSampleAndPriorByPeakLogitNoPeak <- function(trndt, tstdt) {
+ slim_trndt <- trndt[, list(trid, pme_count, tss_pk)]
+ slim_tstdt <- tstdt[, list(trid, pme_count, tss_pk)]
+
+ setkey(slim_trndt, trid)
+ setkey(slim_tstdt, trid)
+
+ ## the logistic model only concerns transcripts without a TSS peak
+ nopk_trn_dt <- subset(trndt, tss_pk == 0)
+ nopk_tst_dt <- subset(tstdt, tss_pk == 0)
+ nopk_trn_dt[, has_count:=ifelse(pme_count > 0, 1, 0)]
+ nopk_tst_dt[, has_count:=ifelse(pme_count > 0, 1, 0)]
+
+ frm <- formula( paste0('has_count~',
+ 'log10_tss_sig +',
+ 'body_pk + body_pk:log10_body_sig + no_body_pk:log10_body_sig +',
+ 'tes_pk + tes_pk:log10_tes_sig + no_tes_pk:log10_tes_sig +',
+ 'log10_eff_len + log10_GC_ov_mean') )
+ ## fixed level order, so the factor codes index fit$par the same way for the
+ ## training and the testing set
+ prt_levels <- c('no pk, no cnt', 'no pk, has cnt', 'w/ pk')
+
+ nopk_trn_glm <- glm(frm, data=nopk_trn_dt, family='binomial')
+ trn_prob <- predict(nopk_trn_glm, nopk_trn_dt, type='response')
+ nopk_trn_dt[, `:=`(logit_prob=trn_prob,
+ partition=ifelse(trn_prob > 0.5, 'no pk, has cnt',
+ 'no pk, no cnt') )]
+
+ ## attach labels to the full training table; tss_pk == 1 rows get NA here
+ slim_trndt <- merge(slim_trndt,
+ nopk_trn_dt[, list(trid, partition, logit_prob)],
+ by='trid', all=T)
+ ## index by trid in trndt's row order
+ slim_trndt <- slim_trndt[trndt[, trid]]
+
+ slim_trndt[, partition:=factor(ifelse(tss_pk==1, 'w/ pk', partition),
+ levels=prt_levels)]
+
+
+ fit <- getFitByMLDM(slim_trndt[, pme_count], slim_trndt[, partition])
+
+ ## classify the testing no-peak transcripts with the training model
+ tst_prob <- predict(nopk_trn_glm, nopk_tst_dt, type='response')
+ nopk_tst_dt[, `:=`(logit_prob=tst_prob,
+ partition=ifelse(tst_prob > 0.5, 'no pk, has cnt',
+ 'no pk, no cnt'))]
+
+ slim_tstdt <- merge(slim_tstdt,
+ nopk_tst_dt[, list(trid, partition, logit_prob)],
+ by='trid', all=T)
+ slim_tstdt <- slim_tstdt[tstdt[, trid]]
+ slim_tstdt[, partition:=factor(ifelse(tss_pk==1, 'w/ pk', partition),
+ levels=prt_levels)]
+
+ prior <- fit$par[slim_tstdt[, partition]]
+
+ outdt <- slim_tstdt[, list(trid, pme_count, partition)]
+ outdt[, `:=`(sample = getSampleByDM(fit$par, pme_count, partition),
+ prior = prior)]
+
+ cat("training set's partition:")
+ print(table(slim_trndt[,partition]))
+ cat("testing set:")
+ print(table(prior))
+ cat("\n")
+
+ return(outdt)
+}
+
+
+# getSampleAndPriorByLM with the number of bins fixed to 3.
+getSampleAndPriorByLM3 <- function(trndt, tstdt) {
+  getSampleAndPriorByLM(trndt, tstdt, nbin=3)
+}
+
+# getSampleAndPriorByLM with the number of bins fixed to 4.
+getSampleAndPriorByLM4 <- function(trndt, tstdt) {
+  getSampleAndPriorByLM(trndt, tstdt, nbin=4)
+}
+
+# getSampleAndPriorByLM with the number of bins fixed to 5.
+getSampleAndPriorByLM5 <- function(trndt, tstdt) {
+  getSampleAndPriorByLM(trndt, tstdt, nbin=5)
+}
+
+# getSampleAndPriorByLM with the number of bins fixed to 6.
+getSampleAndPriorByLM6 <- function(trndt, tstdt) {
+  getSampleAndPriorByLM(trndt, tstdt, nbin=6)
+}
+
+
+# getSampleAndPriorByPeakLM with nbin=2 and lm_on_wpk=T.
+getSampleAndPriorByNoPeakLM2Peak <- function(trndt, tstdt) {
+  getSampleAndPriorByPeakLM(trndt, tstdt, nbin=2, lm_on_wpk=T)
+}
+
+# getSampleAndPriorByPeakLM with nbin=3 and lm_on_wpk=T.
+getSampleAndPriorByNoPeakLM3Peak <- function(trndt, tstdt) {
+  getSampleAndPriorByPeakLM(trndt, tstdt, nbin=3, lm_on_wpk=T)
+}
+
+# getSampleAndPriorByPeakLM with nbin=4 and lm_on_wpk=T.
+getSampleAndPriorByNoPeakLM4Peak <- function(trndt, tstdt) {
+  getSampleAndPriorByPeakLM(trndt, tstdt, nbin=4, lm_on_wpk=T)
+}
+
+# getSampleAndPriorByPeakLM with nbin=5 and lm_on_wpk=T.
+getSampleAndPriorByNoPeakLM5Peak <- function(trndt, tstdt) {
+  getSampleAndPriorByPeakLM(trndt, tstdt, nbin=5, lm_on_wpk=T)
+}
+
+
+# getSampleAndPriorByPeakLM with nbin=2 and lm_on_wpk=F.
+getSampleAndPriorByPeakLM2NoPeak <- function(trndt, tstdt) {
+  getSampleAndPriorByPeakLM(trndt, tstdt, nbin=2, lm_on_wpk=F)
+}
+
+# getSampleAndPriorByPeakLM with nbin=3 and lm_on_wpk=F.
+getSampleAndPriorByPeakLM3NoPeak <- function(trndt, tstdt) {
+  getSampleAndPriorByPeakLM(trndt, tstdt, nbin=3, lm_on_wpk=F)
+}
+
+# getSampleAndPriorByPeakLM with nbin=4 and lm_on_wpk=F.
+getSampleAndPriorByPeakLM4NoPeak <- function(trndt, tstdt) {
+  getSampleAndPriorByPeakLM(trndt, tstdt, nbin=4, lm_on_wpk=F)
+}
+
+# getSampleAndPriorByPeakLM with nbin=5 and lm_on_wpk=F.
+getSampleAndPriorByPeakLM5NoPeak <- function(trndt, tstdt) {
+  getSampleAndPriorByPeakLM(trndt, tstdt, nbin=5, lm_on_wpk=F)
+}
+
+
+main()
+#selTrainingTr()
+#prepTSSPeakFeatures()
+#genPriorByTSSPeak()
+#prepPeakSignalGCLenFeatures()
+#genPriorByPeakSignalGCLen()
+#prepTSSSignalFeatures()
+#genPriorByCombinedTSSSignals()
+#prepMultiTargetsFeatures()
diff --git a/pRSEM/prsem-calculate-expression b/pRSEM/prsem-calculate-expression
new file mode 100755
index 0000000..0b56904
--- /dev/null
+++ b/pRSEM/prsem-calculate-expression
@@ -0,0 +1,74 @@
+#!/bin/env python
+
+__doc__="""
+
+ pliu 20150510
+
+ run pRSEM to calculate priors and use them for abundance estimation
+"""
+
+import os
+import sys
+import Util
+import Prsem
+
+def main():
+ import Param
+
+ argdict = getCommandLineArguments()
+
+ param = Param.initFromCommandLineArguments(argdict)
+
+ if param.chipseq_peak_file is None:
+ if param.partition_model == 'cmb_lgt':
+ if param.chipseq_bed_files_multi_targets is not None:
+ Prsem.genChIPSeqSignalFilesFromBed(param)
+ elif param.chipseq_read_files_multi_targets is not None:
+ Prsem.genChIPSeqSignalFilesFromReads(param)
+ else:
+ ## IDR peaks will be saved in file param.fidr_chipseq_peaks
+ Prsem.genChIPSeqPeakFileBySPPIDR(param)
+
+
+ if param.partition_model == 'pk':
+ ## no need to calculate signals or body/tes peaks here
+ Prsem.genPriorByTSSPeak(param)
+ elif param.partition_model == 'cmb_lgt':
+ Prsem.genPriorByCombinedTSSSignals(param)
+ else:
+ genPriorByPeakSignalGCLen(param)
+
+ Prsem.runGibbsSampling(param)
+
+
+def getCommandLineArguments():
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--num-threads', type=int)
+ parser.add_argument('--chipseq-target-read-files')
+ parser.add_argument('--chipseq-control-read-files')
+ parser.add_argument('--chipseq-read-files-multi-targets')
+ parser.add_argument('--chipseq-bed-files-multi-targets')
+ parser.add_argument('--cap-stacked-chipseq-reads', action='store_true')
+ parser.add_argument('--n-max-stacked-chipseq-reads', type=int)
+ parser.add_argument('--bowtie-path')
+ parser.add_argument('--chipseq-peak-file')
+ parser.add_argument('--partition-model', )
+ parser.add_argument('--gibbs-burnin', type=int)
+ parser.add_argument('--gibbs-number-of-samples', type=int)
+ parser.add_argument('--gibbs-sampling-gap', type=int)
+ parser.add_argument('--quiet', action='store_true')
+
+ ## need to be in the same order as fed in argument
+ parser.add_argument('ref_name')
+ parser.add_argument('sample_name')
+ parser.add_argument('stat_name')
+ parser.add_argument('imd_name')
+ argdict = vars(parser.parse_args())
+
+ return argdict
+
+
+if __name__=='__main__':
+ main()
diff --git a/pRSEM/prsem-prepare-reference b/pRSEM/prsem-prepare-reference
new file mode 100755
index 0000000..cef6bdb
--- /dev/null
+++ b/pRSEM/prsem-prepare-reference
@@ -0,0 +1,70 @@
+#!/bin/env python
+
+__doc__="""
+
+ pliu 20160309
+
+ run pRSEM to prepare reference genome and select training set isoforms
+"""
+
+import os
+import sys
+import Util
+import Prsem
+
+
+def main():
+ import Gene
+ import Param
+ import Transcript
+
+ argdict = getCommandLineArguments()
+
+ prm = Param.initFromCommandLineArguments(argdict)
+
+ prepBowtieRef(prm)
+
+ prm.transcripts = Transcript.quicklyReadRSEMTI(prm.fti)
+ prm.genes = Gene.constructGenesFromTranscripts(prm.transcripts)
+
+ Prsem.buildTrainingSet(prm)
+
+
+def getCommandLineArguments():
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--num-threads', type=int)
+ parser.add_argument('--bowtie-path')
+ parser.add_argument('--mappability-bigwig-file')
+ parser.add_argument('--quiet', action='store_true')
+
+ ## need to be in the same order as fed in argument
+ parser.add_argument('ref_fasta')
+ parser.add_argument('ref_name')
+
+ argdict = vars(parser.parse_args())
+
+ return argdict
+
+
+def prepBowtieRef(prm):
+ bowtie_build = prm.bowtie_path + '/bowtie-build'
+ #bowtie_inspect = prm.bowtie_path + '/bowtie-inspect'
+ prsem_ref = prm.ref_name + '_prsem'
+
+ ## run bowtie-build for index whole genome for ChIP-seq
+ ## use --offrate=3, which is 2 less than the default value 5
+ ## will trade ~1.5 time memory for 1/4 alignment time for human genome
+ if prm.quiet:
+ Util.runCommand(bowtie_build, '-o', 3, '--quiet', prm.ref_fasta,
+ prsem_ref, quiet=prm.quiet)
+ else:
+ Util.runCommand(bowtie_build, '-o', 3, prm.ref_fasta,
+ prsem_ref, quiet=prm.quiet)
+
+ #Util.runCommandnAndGetOutput(bowtie_inspect, '-s', prsem_ref, quiet=False)
+
+
+if __name__=='__main__':
+ main()
diff --git a/pRSEM/prsem-testing-procedure b/pRSEM/prsem-testing-procedure
new file mode 100755
index 0000000..7e9a0f6
--- /dev/null
+++ b/pRSEM/prsem-testing-procedure
@@ -0,0 +1,104 @@
+#!/bin/env python
+
+__doc__="""
+
+ pliu 20150304
+
+ run pRSEM's testing procedure to calculate:
+ 1. a p-value on whether external data is informative
+ 2. a log-likelihood on read counts of partitioned isoforms' fit to DM model
+"""
+
+import os
+import sys
+import Util
+import Prsem
+
+## Entry point of the testing procedure: build the run parameters from the
+## command line, generate the ChIP-seq derived inputs that are still missing,
+## record in param.targetids which external data sets were used, compute the
+## priors with the selected partition model and append the resulting p-value
+## and log-likelihood via writePvalLL().
+## NOTE(review): this copy of the script lost its indentation, so the block
+## nesting cannot be read off the lines below -- consult the original file.
+def main():
+ import File
+ import Param
+
+ argdict = getCommandLineArguments()
+
+ param = Param.initFromCommandLineArguments(argdict)
+
+ if param.chipseq_peak_file is None:
+ if param.partition_model == 'cmb_lgt':
+ if param.chipseq_bed_files_multi_targets is not None:
+ Prsem.genChIPSeqSignalFilesFromBed(param)
+ elif param.chipseq_read_files_multi_targets is not None:
+ Prsem.genChIPSeqSignalFilesFromReads(param)
+ ## target ids are the sorted keys of param.targetid2fchipseq_alignment
+ param.targetids = sorted(param.targetid2fchipseq_alignment.keys())
+ else:
+ ## IDR peaks will be saved in file param.fidr_chipseq_peaks
+ Prsem.genChIPSeqPeakFileBySPPIDR(param)
+ cse_target = param.chipseqexperiment_target
+ cse_control = param.chipseqexperiment_control
+ ## sorted target replicate names, followed by sorted control replicate names
+ param.targetids = sorted([rep.name for rep in cse_target.reps])
+ if cse_control is not None:
+ param.targetids += sorted([rep.name for rep in cse_control.reps])
+ else:
+ ## a peak file was supplied: its basename is the only target id
+ file_pk = File.initFromFullFileName(param.chipseq_peak_file)
+ param.targetids = [file_pk.basename]
+
+
+ if param.partition_model == 'pk':
+ ## no need to calculate signals or body/tes peaks here
+ Prsem.genPriorByTSSPeak(param)
+ elif param.partition_model == 'cmb_lgt':
+ Prsem.genPriorByCombinedTSSSignals(param)
+ else:
+ Prsem.genPriorByPeakSignalGCLen(param)
+
+ writePvalLL(param)
+
+def getCommandLineArguments():
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--num-threads', type=int)
+ parser.add_argument('--chipseq-target-read-files')
+ parser.add_argument('--chipseq-control-read-files')
+ parser.add_argument('--chipseq-read-files-multi-targets')
+ parser.add_argument('--chipseq-bed-files-multi-targets')
+ parser.add_argument('--cap-stacked-chipseq-reads', action='store_true')
+ parser.add_argument('--n-max-stacked-chipseq-reads', type=int)
+ parser.add_argument('--bowtie-path')
+ parser.add_argument('--chipseq-peak-file')
+ parser.add_argument('--partition-model', )
+ parser.add_argument('--quiet', action='store_true')
+
+ ## need to be in the same order as fed in argument
+ parser.add_argument('ref_name')
+ parser.add_argument('sample_name')
+ parser.add_argument('stat_name')
+ parser.add_argument('imd_name')
+ argdict = vars(parser.parse_args())
+
+ return argdict
+
+
+def writePvalLL(prm):
+ """
+ add p-value and log-likelihood to a file under RSEM's calculate expression dir
+ """
+ existing_lines = []
+ if os.path.exists(prm.fall_pvalLL):
+ existing_lines = Util.readFile(prm.fall_pvalLL)
+ pvalLL_lines = Util.readFile(prm.fpvalLL)
+
+ s_target = ','.join(prm.targetids)
+
+ f_fout = open(prm.fall_pvalLL, 'a')
+ if len(existing_lines) == 0:
+ f_fout.write("partition_model\texternal_data\tp_value\tlog_likelihood\n")
+
+ f_fout.write("%s\t%s\t%s\n" % (prm.partition_model, s_target,
+ pvalLL_lines[1]))
+ f_fout.close()
+
+ sys.stdout.write("\npRSEM testing procedure result is saved in %s\n" %
+ prm.fall_pvalLL)
+
+if __name__=='__main__':
+ main()
diff --git a/parseIt.cpp b/parseIt.cpp
new file mode 100644
index 0000000..b636f8d
--- /dev/null
+++ b/parseIt.cpp
@@ -0,0 +1,230 @@
+/*
+ * Assume any read should have a name other than ""
+ */
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<iostream>
+#include<fstream>
+#include<string>
+#include<map>
+
+#include "utils.h"
+#include "my_assert.h"
+
+#include "GroupInfo.h"
+#include "Transcripts.h"
+
+#include "SingleRead.h"
+#include "SingleReadQ.h"
+#include "PairedEndRead.h"
+#include "PairedEndReadQ.h"
+#include "SingleHit.h"
+#include "PairedEndHit.h"
+
+#include "HitContainer.h"
+#include "SamParser.h"
+
+using namespace std;
+
+bool verbose = true;
+
+int read_type; // 0 SingleRead, 1 SingleReadQ, 2 PairedEndRead, 3 PairedEndReadQ
+READ_INT_TYPE N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number of reads in data
+HIT_INT_TYPE nHits; // # of hits
+READ_INT_TYPE nUnique, nMulti, nIsoMulti;
+char *aux;
+char groupF[STRLEN], tiF[STRLEN];
+char datF[STRLEN], cntF[STRLEN];
+
+GroupInfo gi;
+Transcripts transcripts;
+
+SamParser *parser;
+ofstream hit_out;
+
+int n_os; // number of ostreams
+ostream *cat[3][2]; // cat : category 1-dim 0 N0 1 N1 2 N2; 2-dim 0 mate1 1 mate2
+char readOutFs[3][2][STRLEN];
+
+map<int, READ_INT_TYPE> counter;
+map<int, READ_INT_TYPE>::iterator iter;
+
+// Create the SAM parser for alignF and open one output stream per read
+// category (N0, N1, N2) and mate, using the file names produced by
+// genReadFileNames. Also resets the hit-count histogram.
+void init(const char* imdName, const char* alignF) {
+  parser = new SamParser(alignF, aux, transcripts, imdName);
+
+  memset(cat, 0, sizeof(cat));
+  memset(readOutFs, 0, sizeof(readOutFs));
+
+  int prev_n_os = -1; // n_os seen for the previous category; -1 = none yet
+
+  for (int category = 0; category < 3; ++category) {
+    genReadFileNames(imdName, category, read_type, n_os, readOutFs[category]);
+
+    // every category has to report the same number of output streams
+    assert(prev_n_os < 0 || prev_n_os == n_os);
+    prev_n_os = n_os;
+
+    int mate = 0;
+    while (mate < n_os) {
+      cat[category][mate] = new ofstream(readOutFs[category][mate]);
+      ++mate;
+    }
+  }
+
+  counter.clear();
+}
+
+// Book-keeping for one alignable read whose hits are complete: update the
+// global hit / multi-read totals, write the hits to hit_out and count the
+// read in the histogram keyed by its number of hits.
+template<class HitType>
+static void flushAlignableHits(HitContainer<HitType>& hits) {
+  hits.updateRI();
+  nHits += hits.getNHits();
+  nMulti += hits.calcNumGeneMultiReads(gi);
+  nIsoMulti += hits.calcNumIsoformMultiReads();
+  hits.write(hit_out);
+
+  // operator[] value-initializes a missing entry to 0 before the increment
+  ++counter[hits.getNHits()];
+}
+
+//Do not allow duplicate for unalignable reads and supressed reads in SAM input
+//
+// Consume every entry the parser yields and fill the global statistics
+// (N, nHits, nUnique, nMulti, nIsoMulti, counter), the per-category read
+// files and hit_out.
+// parseNext() return values as handled here:
+//   0, 1, 2 : a new read starts; the value is its category (index into N/cat)
+//   1 or 5  : the parsed hit is appended to the hits of the current read
+//   < 0     : stop
+template<class ReadType, class HitType>
+void parseIt(SamParser *parser) {
+  // record_val & record_read are copies of val & read for record purpose
+  int val, record_val;
+  ReadType read, record_read;
+  HitType hit;
+  HitContainer<HitType> hits;
+
+  nHits = 0;
+  nUnique = nMulti = nIsoMulti = 0;
+  memset(N, 0, sizeof(N));
+
+  READ_INT_TYPE cnt = 0;
+
+  record_val = -2; //indicate no recorded read now
+  while ((val = parser->parseNext(read, hit)) >= 0) {
+    if (val <= 2) { // val >= 0 is already guaranteed by the loop condition
+      // flush out previous read's info if needed
+      if (record_val >= 0) {
+        record_read.write(n_os, cat[record_val]);
+        ++N[record_val];
+      }
+
+      general_assert(record_val == 1 || hits.getNHits() == 0, "Read " + record_read.getName() + " is both unalignable and alignable according to the input file!");
+
+      // flush out previous read's hits if the read is alignable reads
+      if (record_val == 1) flushAlignableHits(hits);
+
+      hits.clear();
+      record_val = val;
+      record_read = read; // no pointer, thus safe
+    }
+
+    if (val == 1 || val == 5) {
+      hits.push_back(hit);
+    }
+
+    ++cnt;
+    if (verbose && (cnt % 1000000 == 0)) { cout<< "Parsed "<< cnt<< " entries"<< endl; }
+  }
+
+  // the last read has not been flushed by the loop
+  if (record_val >= 0) {
+    record_read.write(n_os, cat[record_val]);
+    ++N[record_val];
+  }
+
+  if (record_val == 1) flushAlignableHits(hits);
+
+  nUnique = N[1] - nMulti;
+}
+
+// Close and free every read output stream, remove the files of categories
+// that received no reads, and free the parser.
+void release() {
+  for (int category = 0; category < 3; ++category) {
+    for (int mate = 0; mate < n_os; ++mate) {
+      ofstream *out = (ofstream*)cat[category][mate];
+      out->close();
+      delete cat[category][mate];
+    }
+
+    if (!(N[category] > 0)) {
+      for (int mate = 0; mate < n_os; ++mate) {
+        remove(readOutFs[category][mate]); //delete if the file is empty
+      }
+    }
+  }
+  delete parser;
+}
+
+// rsem-parse-alignments: read the alignment file, write the hits to
+// <imdName>.dat, the reads to the per-category files and the alignment
+// statistics to <statName>.cnt.
+int main(int argc, char* argv[]) {
+  if (argc < 6) {
+    printf("Usage : rsem-parse-alignments refName imdName statName alignF read_type [-t fai_file] [-tag tagName] [-q]\n");
+    exit(-1);
+  }
+
+  read_type = atoi(argv[5]);
+  // the switch below only handles 0..3; anything else used to parse nothing
+  // and still report success
+  if (read_type < 0 || read_type > 3) {
+    fprintf(stderr, "read_type must be 0, 1, 2 or 3, but %s is given!\n", argv[5]);
+    exit(-1);
+  }
+
+  aux = NULL;
+  for (int i = 6; i < argc; ++i) {
+    const bool is_fai = !strcmp(argv[i], "-t");
+    const bool is_tag = !strcmp(argv[i], "-tag");
+
+    // both options read argv[i + 1]; make sure it exists
+    if ((is_fai || is_tag) && i + 1 >= argc) {
+      fprintf(stderr, "Option %s requires an argument!\n", argv[i]);
+      exit(-1);
+    }
+
+    if (is_fai) aux = argv[i + 1];
+    if (is_tag) SamParser::setReadTypeTag(argv[i + 1]);
+    if (!strcmp(argv[i], "-q")) verbose = false;
+  }
+
+  sprintf(groupF, "%s.grp", argv[1]);
+  gi.load(groupF);
+  sprintf(tiF, "%s.ti", argv[1]);
+  transcripts.readFrom(tiF);
+
+  sprintf(datF, "%s.dat", argv[2]);
+  sprintf(cntF, "%s.cnt", argv[3]);
+
+  init(argv[2], argv[4]);
+
+  hit_out.open(datF);
+
+  // Reserve a 100-byte first line (99 blanks + newline). The real header is
+  // only known after parsing and is written over the blanks further down,
+  // so it must fit into 99 characters.
+  string firstLine(99, ' ');
+  firstLine.append(1, '\n'); //May be dangerous!
+  hit_out<<firstLine;
+
+  switch(read_type) {
+  case 0 : parseIt<SingleRead, SingleHit>(parser); break;
+  case 1 : parseIt<SingleReadQ, SingleHit>(parser); break;
+  case 2 : parseIt<PairedEndRead, PairedEndHit>(parser); break;
+  case 3 : parseIt<PairedEndReadQ, PairedEndHit>(parser); break;
+  }
+
+  // go back and fill in the reserved header line
+  hit_out.seekp(0, ios_base::beg);
+  hit_out<<N[1]<<" "<<nHits<<" "<<read_type;
+
+  hit_out.close();
+
+  //cntF for statistics of alignments file
+  ofstream fout(cntF);
+  fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl;
+  fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl;
+  fout<<nHits<<" "<<read_type<<endl;
+  // histogram: number of hits -> number of reads; N0 reads are listed under 0
+  // and N2 reads under "Inf"
+  fout<<"0\t"<<N[0]<<endl;
+  for (iter = counter.begin(); iter != counter.end(); iter++) {
+    fout<<iter->first<<'\t'<<iter->second<<endl;
+  }
+  fout<<"Inf\t"<<N[2]<<endl;
+  fout.close();
+
+  release();
+
+  if (verbose) { printf("Done!\n"); }
+
+  return 0;
+}
diff --git a/preRef.cpp b/preRef.cpp
new file mode 100644
index 0000000..0105f7d
--- /dev/null
+++ b/preRef.cpp
@@ -0,0 +1,90 @@
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cctype>
+#include<string>
+#include<fstream>
+#include<sstream>
+#include<cassert>
+
+#include "utils.h"
+#include "Refs.h"
+#include "PolyARules.h"
+#include "RefSeqPolicy.h"
+#include "AlignerRefSeqPolicy.h"
+
+using namespace std;
+
+bool verbose = true;
+
+int M;
+
+RefSeqPolicy refp;
+AlignerRefSeqPolicy aligner_refp;
+PolyARules rules;
+Refs refs;
+
+ofstream fout;
+char refF[STRLEN], idxF[STRLEN], n2g_idxF[STRLEN];
+
+int polyAChoice, polyALen;
+char exceptionF[STRLEN];
+bool quiet; // verbose = !quiet;
+
+// always generate references for aligners, default convert all N into G
+//
+// rsem-preref: build the reference sequences from a FASTA file with the
+// requested polyA padding and write <refName>.seq, <refName>.idx.fa and
+// <refName>.n2g.idx.fa (the latter with sequences passed through
+// aligner_refp.convert).
+int main(int argc, char* argv[]) {
+
+  if (argc < 4) {
+    printf("USAGE : rsem-preref refFastaF polyAChoice refName [-l polyALen] [-f exceptionF] [-q]\n\n");
+    printf("  refFastaF: a FASTA format file contains all reference transcripts\n");
+    printf("  polyAChoice: choice for polyA tail padding.It is a number from {0,1,2}\n");
+    printf("    0: pad polyA tail\n");
+    printf("    1: do not pad polyA tail at all\n");
+    printf("    2: pad polyA tail for all references but those in exceptionF\n");
+    // keep this text in sync with the polyALen default assigned below
+    printf("  -l: polyALen: specify the length of polyA tail you want to pad. Default is 125\n");
+    printf("  -f: exceptionF: file contains a list of exception reference ids. IDs starts from 1. Must set if polyAChoice = 2\n");
+    printf("  -q: quiet\n");
+    exit(-1);
+  }
+
+  polyAChoice = atoi(argv[2]);
+
+  polyALen = 125;
+  quiet = false;
+  memset(exceptionF, 0, sizeof(exceptionF));
+
+  for (int i = 4; i < argc; i++) {
+    const bool is_len = !strcmp(argv[i], "-l");
+    const bool is_exception = !strcmp(argv[i], "-f");
+
+    // both options read argv[i + 1]; make sure it exists
+    if ((is_len || is_exception) && i + 1 >= argc) {
+      fprintf(stderr, "Option %s requires an argument!\n", argv[i]);
+      exit(-1);
+    }
+
+    if (is_len) { polyALen = atoi(argv[i + 1]); }
+    if (is_exception) {
+      // exceptionF is a fixed-size buffer; refuse names that do not fit
+      if (strlen(argv[i + 1]) >= sizeof(exceptionF)) {
+        fprintf(stderr, "The file name given to -f is too long!\n");
+        exit(-1);
+      }
+      strcpy(exceptionF, argv[i + 1]);
+    }
+    if (!strcmp(argv[i], "-q")) { quiet = true; }
+  }
+
+  verbose = !quiet;
+
+  //make references
+  rules = PolyARules(polyAChoice, polyALen, exceptionF);
+  refs.makeRefs(argv[1], refp, rules);
+  M = refs.getM();
+
+  //save references
+  sprintf(refF, "%s.seq", argv[3]);
+  refs.saveRefs(refF);
+
+  // '\n' instead of endl below: same bytes, without a flush per record
+  sprintf(idxF, "%s.idx.fa", argv[3]);
+  fout.open(idxF);
+  for (int i = 1; i <= M; i++) {
+    fout<< ">"<< refs.getRef(i).getName()<< '\n'<< refs.getRef(i).getSeq()<< '\n';
+  }
+  fout.close();
+  if (verbose) printf("%s is generated!\n", idxF);
+
+  sprintf(n2g_idxF, "%s.n2g.idx.fa", argv[3]);
+  fout.open(n2g_idxF);
+  for (int i = 1; i <= M; i++) {
+    fout<< ">"<< refs.getRef(i).getName()<< '\n'<< aligner_refp.convert(refs.getRef(i).getSeq())<< '\n';
+  }
+  fout.close();
+  if (verbose) printf("%s is generated!\n", n2g_idxF);
+
+  return 0;
+}
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
new file mode 100755
index 0000000..1ed66ea
--- /dev/null
+++ b/rsem-calculate-expression
@@ -0,0 +1,1651 @@
+#!/usr/bin/env perl
+
+use Getopt::Long qw(:config no_auto_abbrev);
+use Pod::Usage;
+use File::Basename;
+use FindBin;
+use lib $FindBin::RealBin;
+use rsem_perl_utils qw(runCommand collectResults showVersionInfo getSAMTOOLS hasPolyA);
+
+use Env qw(@PATH);
+
+@PATH = ($FindBin::RealBin, "$FindBin::RealBin/" . getSAMTOOLS(), @PATH);
+
+use strict;
+use warnings;
+
+# Default values for every command-line option and for the script-wide state.
+# Most of these are overwritten by the GetOptions() call that follows.
+#const
+my $BURNIN = 200; # --gibbs-burnin
+my $NCV = 1000; # --gibbs-number-of-samples
+my $SAMPLEGAP = 1; # --gibbs-sampling-gap
+my $CONFIDENCE = 0.95; # --ci-credibility-level
+my $NSPC = 50; # --ci-number-of-samples-per-count-vector
+
+my $NMB = 1024; # default, --ci-memory
+
+my $status = 0;
+
+my $read_type = 1; # default, single end with qual
+
+
+my $strandedness = "none"; # none, forward, reverse
+my $probF = undef; # deprecated
+my $strand_specific = undef; # deprecated
+
+my $bowtie = 0; # Bowtie is on if !$is_alignment and !$bowtie2 and !$star and !$hisat2_hca
+my $bowtie_path = "";
+my $C = 2; # --bowtie-n
+my $E = 99999999; # --bowtie-e
+my $L = 25; # --seed-length
+my $maxHits = 200; # --bowtie-m
+my $chunkMbs = 0; # 0 = use bowtie default
+my $phred33 = 0;
+my $phred64 = 0;
+my $solexa = 0;
+
+my $is_alignment = 0;
+my $faiF = "";
+my $tagName = "XM";
+
+my $minL = 1; # --fragment-length-min
+my $maxL = 1000; # --fragment-length-max
+my $mean = -1; # --fragment-length-mean
+my $sd = 0; # --fragment-length-sd
+
+my $estRSPD = 0;
+my $B = 20; # --num-rspd-bins
+
+my $nThreads = 1;
+
+
+my $genBamF = 1; # default is generating transcript bam file
+my $genGenomeBamF = 0;
+my $sampling = 0;
+
+my $sort_bam_by_coordinate = 0;
+my $sort_bam_by_read_name = 0;
+my $sort_bam_memory = "1G"; # default as 1G per thread
+
+my $calcPME = 0;
+my $calcCI = 0;
+my $single_cell_prior = 0;
+my $quiet = 0;
+my $help = 0;
+
+my $paired_end = 0;
+my $no_qual = 0;
+my $keep_intermediate_files = 0;
+
+
+my $bowtie2 = 0;
+my $bowtie2_path = "";
+my $bowtie2_mismatch_rate = 0.1;
+my $bowtie2_k = 200;
+my $bowtie2_sensitivity_level = "sensitive"; # must be one of "very_fast", "fast", "sensitive", "very_sensitive"
+
+my $star = 0;
+my $star_path = "";
+my $star_gzipped_read_file = 0;
+my $star_bzipped_read_file = 0;
+my $star_output_genome_bam = 0;
+
+my $hisat2_hca = 0;
+my $hisat2_path = "";
+
+
+my $seed = "NULL"; # "NULL" means no --seed was given
+
+my $appendNames = 0;
+
+my $version = 0;
+
+my $mTime = 0; # --time: record wall-clock time of each stage
+my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
+
+my $mate1_list = "";
+my $mate2_list = "";
+my $inpF = "";
+
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ('') x 7;
+my $gap = 32; # first argument passed to rsem-build-read-index
+
+my $alleleS = 0; # set later when both $refName.ta and $refName.gt exist
+
+# pRSEM options
+my $run_prsem = 0;
+my $chipseq_target_read_files = '';
+my $chipseq_control_read_files = '';
+my $chipseq_peak_file = '';
+my $partition_model = 'pk';
+my $chipseq_read_files_multi_targets = ''; ## read files for multiple targets
+ ## delimited by comma
+my $chipseq_bed_files_multi_targets = ''; ## BED files for multiple targets
+ ## delimited by comma
+my $cap_stacked_chipseq_reads = 0; ## for multiple targets, remove redundant
+ ## reads aligned to the same position
+my $n_max_stacked_chipseq_reads = 5; ## as above
+
+
+# Parse the command line into the variables declared above. Abbreviations are
+# disabled (Getopt::Long is imported with no_auto_abbrev), so every option must
+# be spelled out in full. Any parse error prints the full manual and exits
+# with status 2.
+GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
+ "temporary-folder=s" => \$temp_dir,
+ "no-qualities" => \$no_qual,
+ "paired-end" => \$paired_end,
+ "strandedness=s" => \$strandedness,
+ "alignments" => \$is_alignment,
+ "fai=s" => \$faiF,
+ "tag=s" => \$tagName,
+ "seed-length=i" => \$L,
+ "bowtie-path=s" => \$bowtie_path,
+ "bowtie-n=i" => \$C,
+ "bowtie-e=i" => \$E,
+ "bowtie-m=i" => \$maxHits,
+ "bowtie-chunkmbs=i" => \$chunkMbs,
+ "phred33-quals" => \$phred33,
+ "phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
+ "solexa-quals" => \$solexa,
+ "bowtie2" => \$bowtie2,
+ "bowtie2-path=s" => \$bowtie2_path,
+ "bowtie2-mismatch-rate=f" => \$bowtie2_mismatch_rate,
+ "bowtie2-k=i" => \$bowtie2_k,
+ "bowtie2-sensitivity-level=s" => \$bowtie2_sensitivity_level,
+ "star" => \$star,
+ "star-path=s" => \$star_path,
+ "star-gzipped-read-file" => \$star_gzipped_read_file,
+ "star-bzipped-read-file" => \$star_bzipped_read_file,
+ "star-output-genome-bam" => \$star_output_genome_bam,
+ "hisat2-hca" => \$hisat2_hca,
+ "hisat2-path=s" => \$hisat2_path,
+ "fragment-length-min=i" => \$minL,
+ "fragment-length-max=i" => \$maxL,
+ "fragment-length-mean=f" => \$mean,
+ "fragment-length-sd=f" => \$sd,
+ "estimate-rspd" => \$estRSPD,
+ "num-rspd-bins=i" => \$B,
+ "p|num-threads=i" => \$nThreads,
+ "append-names" => \$appendNames,
+ "sampling-for-bam" => \$sampling,
+ # transcript BAM output is on by default; this switch turns it off
+ "no-bam-output" => sub { $genBamF = 0; },
+ "output-genome-bam" => \$genGenomeBamF,
+ "sort-bam-by-coordinate" => \$sort_bam_by_coordinate,
+ "sort-bam-by-read-name" => \$sort_bam_by_read_name,
+ "sort-bam-memory-per-thread=s" => \$sort_bam_memory,
+ "single-cell-prior" => \$single_cell_prior,
+ "calc-pme" => \$calcPME,
+ "gibbs-burnin=i" => \$BURNIN,
+ "gibbs-number-of-samples=i" => \$NCV,
+ "gibbs-sampling-gap=i", \$SAMPLEGAP,
+ "calc-ci" => \$calcCI,
+ "ci-credibility-level=f" => \$CONFIDENCE,
+ "ci-memory=i" => \$NMB,
+ "ci-number-of-samples-per-count-vector=i" => \$NSPC,
+ "seed=i" => \$seed,
+ "run-pRSEM" => \$run_prsem,
+ "chipseq-target-read-files=s" => \$chipseq_target_read_files,
+ ## delimited by comma if more than one
+ "chipseq-control-read-files=s" => \$chipseq_control_read_files,
+ ## delimited by comma if more than one
+ "chipseq-read-files-multi-targets=s" => \$chipseq_read_files_multi_targets,
+ ## delimited by comma
+ "chipseq-bed-files-multi-targets=s" => \$chipseq_bed_files_multi_targets,
+ ## delimited by comma
+ "cap-stacked-chipseq-reads" => \$cap_stacked_chipseq_reads,
+ "n-max-stacked-chipseq-reads=i" => \$n_max_stacked_chipseq_reads,
+ "chipseq-peak-file=s" => \$chipseq_peak_file,
+ "partition-model=s" => \$partition_model,
+ "time" => \$mTime,
+
+ # deprecated
+ "strand-specific" => \$strand_specific,
+ "forward-prob=f" => \$probF,
+ # --sam and --bam are kept as aliases that set the same flag as --alignments
+ "sam|bam" => \$is_alignment,
+
+ # help
+ "version" => \$version,
+ "q|quiet" => \$quiet,
+ "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
+
+# --help and --version are handled before any argument validation
+pod2usage(-verbose => 2) if ($help == 1);
+&showVersionInfo($FindBin::RealBin) if ($version == 1);
+
+#check parameters and options
+# Every failed check prints the message followed by the full manual and exits
+# with status 2.
+
+if ($is_alignment) {
+ # Alignment input always takes exactly three positional arguments, and aligner
+ # options must still hold their default values.
+ # NOTE(review): --bowtie-chunkmbs, --star-gzipped-read-file,
+ # --star-bzipped-read-file, --hisat2-hca and --hisat2-path are not part of the
+ # test below, so they are silently accepted together with --alignments.
+ pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
+ pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m, --phred33-quals, --phred64-quals, --solexa-quals, --bowtie2, --bowtie2-path, --bowtie2-mismatch-rate, --bowtie2-k, --bowtie2-sensitivity-level, --star, --star-path, and --star-output-genome-bam cannot be set if input is SAM/BAM/CRAM format!", -exitval => 2, -verbose => 2) if ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200 || $phred33 || $phred64 || $solexa || $bowtie2 || $bowtie2_path ne "" || $bowtie2_mismatch_rate != 0.1 || $bowtie2_k != 200 || $bowtie2_sensitivity_level ne "sensitive" || $star || $star_path ne "" || $star_output_genome_bam);
+}
+else {
+ # Bowtie is the fallback aligner when no other aligner was requested
+ if (!$bowtie2 && !$star && !$hisat2_hca) { $bowtie = 1; }
+
+ # three positional arguments for single-end reads, four for paired-end reads
+ pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (!$paired_end && scalar(@ARGV) != 3 || $paired_end && scalar(@ARGV) != 4);
+ pod2usage(-msg => "If --no-qualities is set, neither --phred33-quals, --phred64-quals or --solexa-quals can be active!", -exitval => 2, -verbose => 2) if ($no_qual && ($phred33 + $phred64 + $solexa > 0));
+ pod2usage(-msg => "Only one of --phred33-quals, --phred64-quals, and --solexa-quals can be active!", -exitval => 2, -verbose => 2) if ($phred33 + $phred64 + $solexa > 1);
+ # options of an aligner that is not selected must keep their default values
+ pod2usage(-msg => "--bowtie-path, --bowtie-n, --bowtie-e, --bowtie-m cannot be set if bowtie aligner is not used!", -exitval => 2, -verbose => 2) if (!$bowtie && ($bowtie_path ne "" || $C != 2 || $E != 99999999 || $maxHits != 200));
+ pod2usage(-msg => "--bowtie2-path, --bowtie2-mismatch-rate, --bowtie2-k and --bowtie2-sensitivity-level cannot be set if bowtie2 aligner is not used!", -exitval => 2, -verbose => 2) if (!$bowtie2 && ($bowtie2_path ne "" || $bowtie2_mismatch_rate != 0.1 || $bowtie2_k != 200 || $bowtie2_sensitivity_level ne "sensitive"));
+ pod2usage(-msg => "--star-path, --star-gzipped-read-file, --star-bzipped-read-file and --star-output-genome-bam cannot be set if STAR aligner is not used!", -exitval => 2, -verbose => 2) if (!$star && ($star_path ne "" || $star_gzipped_read_file || $star_bzipped_read_file || $star_output_genome_bam));
+ pod2usage(-msg => "--hisat2-path cannot be set if HISAT2 aligner is not used!", -exitval => 2, -verbose => 2) if (!$hisat2_hca && ($hisat2_path ne ""));
+ pod2usage(-msg => "Mismatch rate must be within [0, 1]!", -exitval => 2, -verbose => 2) if ($bowtie2 && ($bowtie2_mismatch_rate < 0.0 || $bowtie2_mismatch_rate > 1.0));
+ pod2usage(-msg => "Sensitivity level must be one of \"very_fast\", \"fast\", \"sensitive\", and \"very_sensitive\"!", -exitval => 2, -verbose => 2) if ($bowtie2 && (($bowtie2_sensitivity_level ne "very_fast") && ($bowtie2_sensitivity_level ne "fast") && ($bowtie2_sensitivity_level ne "sensitive") && ($bowtie2_sensitivity_level ne "very_sensitive")));
+
+ if ($faiF ne "") { print "Warning: There is no need to set --fai if you ask RSEM to align reads for you.\n" }
+}
+
+# range checks that apply to both input modes
+pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -verbose => 2) if ($minL < 1);
+pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL);
+pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
+pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
+pod2usage(-msg => "Seed length should be at least 5!\n", -exitval => 2, -verbose => 2) if ($L < 5);
+pod2usage(-msg => "--sampling-for-bam cannot be specified if --no-bam-output is specified!\n", -exitval => 2, -verbose => 2) if ($sampling && !$genBamF);
+pod2usage(-msg => "--output-genome-bam cannot be specified if --no-bam-output is specified!\n", -exitval => 2, -verbose => 2) if ($genGenomeBamF && !$genBamF);
+# $seed stays the string "NULL" unless --seed was given
+pod2usage(-msg => "The seed for random number generator must be a non-negative 32bit integer!\n", -exitval => 2, -verbose => 2) if (($seed ne "NULL") && ($seed < 0 || $seed > 0xffffffff));
+pod2usage(-msg => "The credibility level should be within (0, 1)!\n", -exitval => 2, -verbose => 2) if ($CONFIDENCE <= 0.0 || $CONFIDENCE >= 1.0);
+
+
+# Validate the pRSEM-specific options. Problems are collected in $msg and
+# reported once through pod2usage (exit status 2); a later assignment to $msg
+# replaces an earlier one.
+if ( $run_prsem ) {
+ my $msg = '';
+ # At least one of the four input sets listed in the message must be complete.
+ if ( ( $chipseq_peak_file eq '' ) &&
+ ( ( $chipseq_target_read_files eq '' ) ||
+ ( $chipseq_control_read_files eq '' ) ||
+ ( $bowtie_path eq '' ) ) &&
+ ( ( $chipseq_read_files_multi_targets eq '' ) ||
+ ( $bowtie_path eq '' ) ) &&
+ ( $chipseq_bed_files_multi_targets eq '' )
+ ) {
+ $msg = "please define one set of the following options to run pRSEM:\n" .
+ "1. --chipseq-peak-file <string>\n" .
+ "2. --chipseq-target-read-files <string> and\n" .
+ " --chipseq-control-read-files <string> and\n" .
+ " --bowtie-path <path>\n" .
+ "3. --chipseq-read-files-multi-targets <string> and\n" .
+ " --bowtie-path <path>\n" .
+ "4. --chipseq-bed-files-multi-targets <string>\n";
+ }
+
+ # partition models accepted by --partition-model
+ my @prsem_partition_models = (
+ 'pk', 'pk_lgtnopk',
+ 'lm3', 'lm4', 'lm5', 'lm6',
+ 'nopk_lm2pk', 'nopk_lm3pk', 'nopk_lm4pk', 'nopk_lm5pk',
+ 'pk_lm2nopk', 'pk_lm3nopk', 'pk_lm4nopk', 'pk_lm5nopk',
+ 'cmb_lgt'
+ );
+
+ # lookup table used for the membership test below
+ my %prtmdl2one = ();
+ foreach my $prtmdl (@prsem_partition_models) {
+ $prtmdl2one{$prtmdl} = 1;
+ }
+
+ if ( exists $prtmdl2one{$partition_model} ) {
+ # 'cmb_lgt' needs one of the multi-target inputs
+ if ( ( $partition_model eq 'cmb_lgt' ) &&
+ ( ( $chipseq_read_files_multi_targets eq '' ) &&
+ ( $chipseq_bed_files_multi_targets eq '' ) ) ) {
+ $msg = 'either --chipseq-read-files-multi-targets <string> or ' .
+ '--chipseq-bed-files-multi-targets <string> needs to be ' .
+ "defined for pRSEM's partition model: '$partition_model'";
+ # every model other than 'pk' and 'cmb_lgt' needs target reads, control
+ # reads and a Bowtie path
+ } elsif ( ( $partition_model ne 'pk' ) &&
+ ( $partition_model ne 'cmb_lgt' ) &&
+ ( ( $chipseq_target_read_files eq '' ) ||
+ ( $chipseq_control_read_files eq '' ) ||
+ ( $bowtie_path eq '' ) ) ) {
+ $msg = '--chipseq-target-read-files <string> and ' .
+ '--chipseq-control-read-files <string> and ' .
+ '--bowtie-path <path> need to be defined for ' .
+ "pRSEM's partition model: '$partition_model'";
+ }
+ } else {
+ $msg = '--partition-model <string> must be one of [' .
+ join(', ', @prsem_partition_models) . "]";
+ }
+
+ if ( $msg ne '' ) {
+ pod2usage(-msg => "$msg\n", -exitval => 2, -verbose => 2);
+ }
+
+ # Supplying multi-target input forces the 'cmb_lgt' model, overriding
+ # whatever --partition-model was given.
+ if ( ( $partition_model ne 'cmb_lgt' ) &&
+ ( ( $chipseq_read_files_multi_targets ne '' ) ||
+ ( $chipseq_bed_files_multi_targets ne '' ) ) ) {
+ print "\nCombining signals from multiple sources, partition model is set to 'cmb_lgt'\n\n";
+ $partition_model = 'cmb_lgt';
+ }
+}
+
+
+# A seed shorter than 25 is accepted, but the user is warned about it.
+print "Warning: the seed length set is less than 25! This is only allowed if the references are not added poly(A) tails.\n" if ($L < 25);
+
+# Forward-strand probability: an explicit (deprecated) --forward-prob wins;
+# otherwise it is derived from --strandedness, with the deprecated
+# --strand-specific switch treated like 'forward' when strandedness is 'none'.
+unless (defined($probF)) {
+    my $is_forward = ($strandedness eq "forward") || ($strandedness eq "none" && defined($strand_specific));
+    $probF = $is_forward ? 1.0 : (($strandedness eq "reverse") ? 0.0 : 0.5);
+}
+
+if ($probF < 0 || $probF > 1) {
+    pod2usage(-msg => "Forward probability should be in [0, 1]!", -exitval => 2, -verbose => 2);
+}
+
+# read_type: 0 = single-end without qualities, 1 = single-end with qualities,
+#            2 = paired-end without qualities, 3 = paired-end with qualities
+$read_type = ($paired_end ? 2 : 0) + ($no_qual ? 0 : 1);
+
+# Positional arguments. The argument count (3 or 4) was validated earlier.
+if (scalar(@ARGV) != 3) {
+    ($mate1_list, $mate2_list, $refName, $sampleName) = @ARGV[0 .. 3];
+}
+elsif ($is_alignment) {
+    ($inpF, $refName, $sampleName) = @ARGV;
+}
+else {
+    ($mate1_list, $refName, $sampleName) = @ARGV;
+}
+
+# The .ta and .gt reference files must either both exist or both be absent;
+# allele-specific mode is enabled when both are present.
+{
+    my $has_ta = (-e "$refName.ta");
+    my $has_gt = (-e "$refName.gt");
+    if ($has_ta xor $has_gt) {
+        print "Allele-specific expression related reference files are corrupted!\n";
+        exit(-1);
+    }
+    $alleleS = $has_ta && $has_gt;
+}
+
+# hasPolyA is only consulted when STAR was requested
+if ($star && (&hasPolyA("$refName.seq"))) {
+    pod2usage(-msg => "RSEM reference cannot contain poly(A) tails if you want to use STAR aligner!", -exitval => 2, -verbose => 2);
+}
+
+# --output-genome-bam requires genome information in the reference: the second
+# field on the first line of $refName.ti must be 0.
+if ($genGenomeBamF) {
+    # Fail loudly when the .ti file is missing or unreadable instead of reading
+    # from an unopened handle and silently passing the check.
+    if (!open(INPUT, "$refName.ti")) { print "Fail to open file $refName.ti!\n"; exit(-1); }
+    my $line = <INPUT>;
+    close(INPUT);
+    if (!defined($line)) { print "File $refName.ti is empty!\n"; exit(-1); }
+    chomp($line);
+    my ($M, $type) = split(/ /, $line);
+    # a first line without a second field keeps its previous treatment (check passes)
+    pod2usage(-msg => "No genome information provided, so genome bam file cannot be generated!\n", -exitval => 2, -verbose => 2) if (defined($type) && $type != 0);
+}
+
+# $sampleToken is the part of $sampleName after the last '/'
+my $pos = rindex($sampleName, '/');
+$sampleToken = ($pos < 0) ? $sampleName : substr($sampleName, $pos + 1);
+
+# working folders: temporary files (unless --temporary-folder was given) and statistics
+$temp_dir = "$sampleName.temp" if ($temp_dir eq "");
+$stat_dir = "$sampleName.stat";
+
+foreach my $folder ($temp_dir, $stat_dir) {
+    unless ((-d $folder) || mkdir($folder)) { print "Fail to create folder $folder.\n"; exit(-1); }
+}
+
+# file-name prefixes inside the two folders
+$imdName = "$temp_dir/$sampleToken";
+$statName = "$stat_dir/$sampleToken";
+
+# when RSEM aligns reads that carry qualities and no encoding was chosen, assume phred33
+$phred33 = 1 unless ($is_alignment || $no_qual || ($phred33 + $phred64 + $solexa != 0));
+
+my ($mate_minL, $mate_maxL) = (1, $maxL);
+
+# turn every user-supplied aligner directory into a prefix ending in '/'
+foreach my $path_ref (\$bowtie_path, \$bowtie2_path, \$star_path, \$hisat2_path) {
+    $$path_ref .= "/" if ($$path_ref ne "");
+}
+
+# Build and run the aligner command line when RSEM aligns the reads itself.
+# Exactly one of Bowtie, Bowtie 2, STAR or HISAT2 is used; the result ends up
+# in $imdName.bam, which then becomes $inpF.
+my $command = "";
+
+if (!$is_alignment) {
+ if ($bowtie) {
+ $command = $bowtie_path."bowtie";
+ # -f: FASTA input, -q: FASTQ input
+ if ($no_qual) { $command .= " -f"; }
+ else { $command .= " -q"; }
+
+ if ($phred33) { $command .= " --phred33-quals"; }
+ elsif ($phred64) { $command .= " --phred64-quals"; }
+ elsif ($solexa) { $command .= " --solexa-quals"; }
+
+ $command .= " -n $C -e $E -l $L";
+ # fragment length bounds only apply to paired-end reads
+ if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
+ if ($chunkMbs > 0) { $command .= " --chunkmbs $chunkMbs"; }
+
+ # stranded data: align to one strand only
+ if ($probF == 1.0) { $command .= " --norc"; }
+ elsif ($probF == 0.0) { $command .= " --nofw"; }
+
+ $command .= " -p $nThreads -a -m $maxHits -S";
+ if ($quiet) { $command .= " --quiet"; }
+
+ $command .= " $refName";
+ if ($read_type == 0 || $read_type == 1) {
+ $command .= " $mate1_list";
+ }
+ else {
+ $command .= " -1 $mate1_list -2 $mate2_list";
+ }
+
+ # pipe to samtools to generate a BAM file
+ $command .= " 2> $sampleName.log | samtools view -b -o $imdName.bam -";
+ } elsif ($bowtie2) {
+ $command = $bowtie2_path."bowtie2";
+ if ($no_qual) { $command .= " -f"; }
+ else { $command .= " -q"; }
+
+ if ($phred33) { $command .= " --phred33"; }
+ elsif ($phred64) { $command .= " --phred64"; }
+ elsif ($solexa) { $command .= " --solexa-quals"; }
+
+ # the sensitivity level was validated earlier, so the final else is "very_sensitive"
+ if ($bowtie2_sensitivity_level eq "very_fast") { $command .= " --very-fast"; }
+ elsif ($bowtie2_sensitivity_level eq "fast") { $command .= " --fast"; }
+ elsif ($bowtie2_sensitivity_level eq "sensitive") { $command .= " --sensitive"; }
+ else { $command .= " --very-sensitive"; }
+
+ $command .= " --dpad 0 --gbar 99999999 --mp 1,1 --np 1 --score-min L,0,-$bowtie2_mismatch_rate";
+
+ if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL --no-mixed --no-discordant"; }
+
+ if ($probF == 1.0) { $command .= " --norc"; }
+ elsif ($probF == 0.0) { $command .= " --nofw"; }
+
+ $command .= " -p $nThreads -k $bowtie2_k";
+ if ($quiet) { $command .= " --quiet"; }
+
+ $command .= " -x $refName";
+ if ($read_type == 0 || $read_type == 1) {
+ $command .= " -U $mate1_list";
+ }
+ else {
+ $command .= " -1 $mate1_list -2 $mate2_list";
+ }
+
+ # pipe to samtools to generate a BAM file
+ $command .= " 2> $sampleName.log | samtools view -b -o $imdName.bam -";
+ } elsif ($star) {
+ ## align reads by STAR
+ ## the STAR genome directory is the directory containing the RSEM reference
+ my $star_genome_path = dirname($refName);
+ $command = "$star_path"."STAR" .
+ ## ENCODE3 pipeline parameters
+ " --genomeDir $star_genome_path " .
+ " --outSAMunmapped Within " .
+ " --outFilterType BySJout " .
+ " --outSAMattributes NH HI AS NM MD " .
+ " --outFilterMultimapNmax 20 " .
+ " --outFilterMismatchNmax 999 " .
+ " --outFilterMismatchNoverLmax 0.04 " .
+ " --alignIntronMin 20 " .
+ " --alignIntronMax 1000000 " .
+ " --alignMatesGapMax 1000000 " .
+ " --alignSJoverhangMin 8 " .
+ " --alignSJDBoverhangMin 1 " .
+ " --sjdbScore 1 " .
+ " --runThreadN $nThreads " .
+ ##
+
+ ## different than ENCODE3 pipeline
+ ## do not allow using shared memory
+ " --genomeLoad NoSharedMemory " .
+ ##
+
+ ## different than ENCODE3 pipeline, which sorts output BAM
+ ## no need to do it here to save time and memory
+ " --outSAMtype BAM Unsorted " .
+ ##
+
+ ## unlike ENCODE3, we don't output bedGraph files
+
+ " --quantMode TranscriptomeSAM ".
+ " --outSAMheaderHD \@HD VN:1.4 SO:unsorted ".
+
+ ## define output file prefix
+ " --outFileNamePrefix $imdName ";
+ ##
+
+ if ( $star_gzipped_read_file ) {
+ $command .= " --readFilesCommand zcat ";
+ } elsif ( $star_bzipped_read_file ) {
+ $command .= " --readFilesCommand bzip2 -c ";
+ }
+
+ if ( $read_type == 0 || $read_type == 1 ) {
+ $command .= " --readFilesIn $mate1_list ";
+ } else {
+ $command .= " --readFilesIn $mate1_list $mate2_list";
+ }
+
+ } elsif ($hisat2_hca) {
+ $command = $hisat2_path."hisat2";
+ if ($no_qual) { $command .= " -f"; }
+ else { $command .= " -q"; }
+
+ if ($phred33) { $command .= " --phred33"; }
+ elsif ($phred64) { $command .= " --phred64"; }
+ elsif ($solexa) { $command .= " --solexa-quals"; }
+
+ # fixed parameter set; HISAT2 writes its own summary to $sampleName.log
+ $command .= " --rg-id=$sampleToken --rg SM:$sampleToken --rg LB:$sampleToken --rg PL:ILLUMINA --rg PU:$sampleToken" .
+ " --new-summary --summary-file $sampleName.log --met-file $sampleName.hisat2.met.txt --met 5" .
+ " --mp 1,1 --np 1 --score-min L,0,-0.1 --rdg 99999999,99999999 --rfg 99999999,99999999" .
+ " --no-spliced-alignment --no-softclip --seed 12345";
+
+ if ($read_type == 2 || $read_type == 3) { $command .= " --no-mixed --no-discordant"; }
+
+ if ($probF == 1.0) { $command .= " --norc"; }
+ elsif ($probF == 0.0) { $command .= " --nofw"; }
+
+ if ($quiet) { $command .= " --quiet"; }
+
+ $command .= " -p $nThreads -k 10 --secondary";
+
+ $command .= " -x $refName";
+ if ($read_type == 0 || $read_type == 1) {
+ $command .= " -U $mate1_list";
+ }
+ else {
+ $command .= " -1 $mate1_list -2 $mate2_list";
+ }
+
+ # pipe to samtools to generate a BAM file
+ $command .= " | samtools view -b -o $imdName.bam -";
+ } else {
+ print "Impossible --- unknown aligner!!!\n"; exit(-1);
+ }
+
+ if ($mTime) { $time_start = time(); }
+
+ &runCommand($command);
+
+ if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
+
+ $inpF = "$imdName.bam";
+
+ # STAR writes its own file names under the $imdName prefix; move them to
+ # the names the rest of the pipeline expects.
+ if ( $star ) {
+ my $star_tr_bam = $imdName . "Aligned.toTranscriptome.out.bam";
+ rename $star_tr_bam, $inpF or die "can't rename $star_tr_bam to $inpF: $!\n";
+ rmdir $imdName . "_STARtmp/";
+ my $star_genome_bam = $imdName . "Aligned.out.bam";
+ my $rsem_star_genome_bam = $sampleName.'.STAR.genome.bam';
+ # keep STAR's genome BAM only when --star-output-genome-bam was given
+ if ( $star_output_genome_bam ) {
+ rename $star_genome_bam, $rsem_star_genome_bam or die "can't move $star_genome_bam to $rsem_star_genome_bam: $!\n";
+ } else {
+ unlink $star_genome_bam or die "can't remove $star_genome_bam: $!\n";
+ }
+ rename $imdName."Log.final.out", $sampleName.".log" or die "Cannot rename ${imdName}Log.final.out to $sampleName.log: $!\n";
+ }
+}
+
+# Optionally sort the alignments by read name (samtools sort -n) before parsing.
+if ( $sort_bam_by_read_name ) {
+ my $sorted_bam = "$imdName.sorted.bam";
+ $command = "samtools sort -n -@ $nThreads -m $sort_bam_memory -o $sorted_bam $inpF";
+ &runCommand($command);
+ # the unsorted file is deleted only when RSEM produced it itself;
+ # a user-supplied alignment file is left untouched
+ if (!$is_alignment) {
+ $command = "rm -f $inpF";
+ &runCommand($command);
+ }
+ $inpF = $sorted_bam;
+}
+
+# timing of the expression-estimation stage starts here
+if ($mTime) { $time_start = time(); }
+
+# Parse the alignments; outputs are written under the $imdName and $statName prefixes.
+$command = "rsem-parse-alignments $refName $imdName $statName $inpF $read_type";
+if ($faiF ne "") { $command .= " -t $faiF"; }
+if ($tagName ne "") { $command .= " -tag $tagName"; }
+if ($quiet) { $command .= " -q"; }
+
+&runCommand($command);
+
+# Read the first line of the count file written by rsem-parse-alignments. Its
+# second field is tested against 0 to decide whether any read was aligned.
+my $inpCntF = "$statName.cnt";
+my $local_status = open(INPUT, $inpCntF);
+# open returns undef on failure, so test truthiness; report the file that
+# actually failed to open (the count file, not the alignment file)
+if (!$local_status) { print "Fail to open file $inpCntF!\n"; exit(-1); }
+my $line = <INPUT>;
+close(INPUT);
+if (!defined($line)) { print "File $inpCntF is empty!\n"; exit(-1); }
+chomp($line);
+my @Ns = split(/ /, $line);
+my $no_aligned = ($Ns[1] == 0);
+
+# Index the alignable reads. The second argument tells rsem-build-read-index
+# whether the reads have qualities (1, .fq files) or not (0, .fa files).
+if (!$no_aligned) {
+    $command = "rsem-build-read-index $gap";
+    if ($read_type == 0) { $command .= " 0 $quiet $imdName\_alignable.fa"; }
+    elsif ($read_type == 1) { $command .= " 1 $quiet $imdName\_alignable.fq"; }
+    elsif ($read_type == 2) { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; }
+    elsif ($read_type == 3) { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; }
+    else { print "Impossible! read_type is not in [0,1,2,3]!\n"; exit(-1); }
+    &runCommand($command);
+}
+
+# Write the model parameter file $imdName.mparams, one setting per line.
+my $doesOpen = open(OUTPUT, ">$imdName.mparams");
+if ($doesOpen == 0) { print "Cannot generate $imdName.mparams!\n"; exit(-1); }
+print OUTPUT "$minL $maxL\n"; # fragment length range
+print OUTPUT "$probF\n"; # forward-strand probability
+print OUTPUT "$estRSPD\n"; # --estimate-rspd flag
+print OUTPUT "$B\n"; # --num-rspd-bins
+print OUTPUT "$mate_minL $mate_maxL\n";
+print OUTPUT "$mean $sd\n"; # fragment length mean and standard deviation
+print OUTPUT "$L\n"; # seed length
+close(OUTPUT);
+
+# When --seed is given, derive three sub-seeds from it: $seeds[0] for
+# rsem-run-em, $seeds[1] for rsem-run-gibbs and $seeds[2] for
+# rsem-calculate-credibility-intervals.
+my @seeds = ();
+if ($seed ne "NULL") {
+    srand($seed);
+    for (my $i = 0; $i < 3; $i++) {
+        # 2 ** 32 rather than 1 << 32: the shift is done in native integer
+        # width and does not yield 2^32 on a perl built with 32-bit integers
+        push(@seeds, int(rand(2 ** 32)));
+    }
+}
+
+# Run the EM algorithm, collect its results and post-process the BAM output.
+$command = "rsem-run-em $refName $read_type $sampleName $imdName $statName -p $nThreads";
+if ($genBamF) {
+ $command .= " -b $inpF";
+ # the value after the BAM name says whether a FAI file follows (1) or not (0)
+ if ($faiF ne "") { $command .= " 1 $faiF"; }
+ else { $command .= " 0"; }
+ if ($sampling) { $command .= " --sampling"; }
+ if ($seed ne "NULL") { $command .= " --seed $seeds[0]"; }
+}
+# --gibbs-out is requested whenever a later Gibbs-based stage will run
+if ($calcPME || $calcCI) { $command .= " --gibbs-out"; }
+if ($appendNames) { $command .= " --append-names"; }
+if ($quiet) { $command .= " -q"; }
+
+&runCommand($command);
+
+# allele-level results exist only for allele-specific references
+if ($alleleS) {
+ &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+}
+else {
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+}
+
+if ($genBamF) {
+ # convert the transcript BAM to genome coordinates
+ if ($genGenomeBamF) {
+ $command = "rsem-tbam2gbam $refName $sampleName.transcript.bam $sampleName.genome.bam";
+ &runCommand($command);
+ }
+
+ # coordinate-sorted and indexed copies of the BAM files
+ if ($sort_bam_by_coordinate) {
+ $command = "samtools sort -@ $nThreads -m $sort_bam_memory -o $sampleName.transcript.sorted.bam $sampleName.transcript.bam";
+ &runCommand($command);
+ $command = "samtools index $sampleName.transcript.sorted.bam";
+ &runCommand($command);
+
+ if ($genGenomeBamF) {
+ $command = "samtools sort -@ $nThreads -m $sort_bam_memory -o $sampleName.genome.sorted.bam $sampleName.genome.bam";
+ &runCommand($command);
+ $command = "samtools index $sampleName.genome.sorted.bam";
+ &runCommand($command);
+ }
+ }
+}
+
+if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
+
+if ($mTime) { $time_start = time(); }
+
+# With no aligned reads the Gibbs, credibility-interval and pRSEM stages are
+# skipped: clean up, write the timing file if requested, and exit successfully.
+if ($no_aligned) {
+ print "Since no aligned reads, further steps will not be performed!\n";
+ if (!$keep_intermediate_files) {
+ &runCommand("rm -rf $temp_dir", "Fail to delete the temporary folder!");
+ }
+ if ($mTime) {
+ open(OUTPUT, ">$sampleName.time");
+ print OUTPUT "Aligning reads: $time_alignment s.\n";
+ print OUTPUT "Estimating expression levels: $time_rsem s.\n";
+ close(OUTPUT);
+ }
+ exit(0);
+}
+
+# Result levels refreshed after each Bayesian stage, as
+# [collectResults level, intermediate-file suffix, results-file infix].
+# The allele level is included only for allele-specific references.
+my @bayes_levels = (
+    [ "isoform", "iso_res", "isoforms" ],
+    [ "gene", "gene_res", "genes" ],
+);
+unshift(@bayes_levels, [ "allele", "allele_res", "alleles" ]) if ($alleleS);
+
+# Gibbs sampling runs for --calc-pme as well as for --calc-ci.
+if ($calcPME || $calcCI) {
+    $command = join(" ", "rsem-run-gibbs", $refName, $imdName, $statName, $BURNIN, $NCV, $SAMPLEGAP);
+    $command .= " -p $nThreads";
+    $command .= " --seed $seeds[1]" if ($seed ne "NULL");
+    $command .= " --pseudo-count 0.1" if ($single_cell_prior);
+    $command .= " -q" if ($quiet);
+    &runCommand($command);
+
+    # move the previous results into the temporary folder, then rebuild them
+    foreach my $entry (@bayes_levels) {
+        my $infix = $entry->[2];
+        system("mv $sampleName.$infix.results $imdName.$infix.results.bak1");
+    }
+    foreach my $entry (@bayes_levels) {
+        my ($level, $suffix, $infix) = @$entry;
+        &collectResults($level, "$imdName.$suffix", "$sampleName.$infix.results");
+    }
+}
+
+# Credibility intervals: same backup-and-rebuild cycle with a .bak2 suffix.
+if ($calcCI) {
+    $command = join(" ", "rsem-calculate-credibility-intervals", $refName, $imdName, $statName, $CONFIDENCE, $NCV, $NSPC, $NMB);
+    $command .= " -p $nThreads";
+    $command .= " --seed $seeds[2]" if ($seed ne "NULL");
+    $command .= " --pseudo-count 0.1" if ($single_cell_prior);
+    $command .= " -q" if ($quiet);
+    &runCommand($command);
+
+    foreach my $entry (@bayes_levels) {
+        my $infix = $entry->[2];
+        system("mv $sampleName.$infix.results $imdName.$infix.results.bak2");
+    }
+    foreach my $entry (@bayes_levels) {
+        my ($level, $suffix, $infix) = @$entry;
+        &collectResults($level, "$imdName.$suffix", "$sampleName.$infix.results");
+    }
+}
+
+# end of the Gibbs / credibility-interval stage, for --time
+if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; }
+
+if ($mTime) { $time_start = time(); }
+
+## To-do: only run gibbs sampling once, either for pRSEM or uniform prior 1
+## Run pRSEM and replace the isoform/gene results with its estimates. The
+## previous results are kept in the stat folder with a "_uniform_prior_1" infix.
+if ( $run_prsem ) {
+ $command = "$FindBin::RealBin/pRSEM/prsem-calculate-expression " .
+ " --num-threads $nThreads " .
+ " --partition-model $partition_model " .
+ " --gibbs-burnin $BURNIN " .
+ " --gibbs-number-of-samples $NCV " .
+ " --gibbs-sampling-gap $SAMPLEGAP ";
+ ####
+
+ ## Input selection: a peak file takes precedence, then multi-target input
+ ## (model 'cmb_lgt'), then single-source target/control read files.
+ ## ChIP-seq peak file from single source
+ if ( $chipseq_peak_file ne '') { ## only for partition model pk
+ ## need to add sanity check!!
+ $command .= " --chipseq-peak-file $chipseq_peak_file";
+ } elsif ( $partition_model eq 'cmb_lgt' ) { ## multi-sources
+ if ( $chipseq_bed_files_multi_targets ne '' ) { ## use bed over read
+ $command .= ' --chipseq-bed-files-multi-targets ' .
+ $chipseq_bed_files_multi_targets;
+ } elsif ( $chipseq_read_files_multi_targets ne '' ) {
+ $command .= ' --chipseq-read-files-multi-targets ' .
+ $chipseq_read_files_multi_targets .
+ " --bowtie-path $bowtie_path" ;
+ }
+ if ( $cap_stacked_chipseq_reads ) {
+ $command .= ' --cap-stacked-chipseq-reads ' .
+ " --n-max-stacked-chipseq-reads $n_max_stacked_chipseq_reads";
+ }
+ } else { ## ChIP-seq reads files from single source
+ $command .= " --chipseq-target-read-files $chipseq_target_read_files " .
+ " --bowtie-path $bowtie_path" ;
+ if ( $chipseq_control_read_files ne '' ) {
+ $command .= " --chipseq-control-read-files $chipseq_control_read_files";
+ }
+ }
+
+ if ( $quiet ) {
+ $command .= ' --quiet ';
+ }
+
+ $command .= " $refName $sampleName $statName $imdName";
+ &runCommand($command);
+
+ ## collect pRSEM results
+ my $fiso_res = "$imdName.iso_res";
+ my $fgene_res = "$imdName.gene_res";
+
+ my $fprsem_iso_res = "${imdName}_prsem.iso_res";
+ my $fprsem_gene_res = "${imdName}_prsem.gene_res";
+
+ ## Build the pRSEM result files from the first and last lines of the
+ ## intermediate files.
+ ## NOTE(review): presumably the trailing lines are the ones appended by
+ ## pRSEM and the leading lines are shared columns -- confirm against the
+ ## .iso_res/.gene_res layout.
+ system("head -8 $fiso_res > $fprsem_iso_res" );
+ system("tail -5 $fiso_res >> $fprsem_iso_res" );
+
+ system("head -7 $fgene_res > $fprsem_gene_res");
+ system("tail -4 $fgene_res >> $fprsem_gene_res");
+
+ my $fstat_iso_results = "${statName}_uniform_prior_1.isoforms.results";
+ my $fstat_gene_results = "${statName}_uniform_prior_1.genes.results";
+
+ my $fiso_results = "${sampleName}.isoforms.results";
+ my $fgene_results = "${sampleName}.genes.results";
+
+ ## move the existing results aside before writing the pRSEM ones
+ rename $fiso_results, $fstat_iso_results or die
+ "can't rename $fiso_results to $fstat_iso_results: $!\n";
+
+ rename $fgene_results, $fstat_gene_results or die
+ "can't rename $fgene_results to $fstat_gene_results: $!\n";
+
+ collectResults("isoform", $fprsem_iso_res, $fiso_results);
+ collectResults("gene", $fprsem_gene_res, $fgene_results);
+}
+
+
+# Remove the temporary folder unless --keep-intermediate-files was given.
+if (!$keep_intermediate_files) {
+ &runCommand("rm -rf $temp_dir", "Fail to delete the temporary folder!");
+}
+
+if ($mTime) { $time_end = time(); }
+
+# With --time, write the per-stage wall-clock times (in seconds) to $sampleName.time.
+if ($mTime) {
+ open(OUTPUT, ">$sampleName.time");
+ print OUTPUT "Aligning reads: $time_alignment s.\n";
+ print OUTPUT "Estimating expression levels: $time_rsem s.\n";
+ print OUTPUT "Calculating credibility intervals: $time_ci s.\n";
+# my $time_del = $time_end - $time_start;
+# print OUTPUT "Delete: $time_del s.\n";
+ close(OUTPUT);
+}
+
+__END__
+
+=head1 NAME
+
+rsem-calculate-expression - Estimate gene and isoform expression from RNA-Seq data.
+
+=head1 SYNOPSIS
+
+ rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name
+ rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name
+ rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<upstream_read_file(s)>
+
+Comma-separated list of files containing single-end reads or upstream reads for paired-end data. By default, these files are assumed to be in FASTQ format. If the --no-qualities option is specified, then FASTA format is expected.
+
+=item B<downstream_read_file(s)>
+
+Comma-separated list of files containing downstream reads which are paired with the upstream reads. By default, these files are assumed to be in FASTQ format. If the --no-qualities option is specified, then FASTA format is expected.
+
+=item B<input>
+
+SAM/BAM/CRAM formatted input file. If "-" is specified for the filename, the input is instead assumed to come from standard input. RSEM requires all alignments of the same read to be grouped together. For paired-end reads, RSEM also requires the two mates of any alignment to be adjacent. In addition, RSEM does not allow the SEQ and QUAL fields to be empty. See the Description section for how to make the input file obey RSEM's requirements.
+
+=item B<reference_name>
+
+The name of the reference used. The user must have run 'rsem-prepare-reference' with this reference_name before running this program.
+
+=item B<sample_name>
+
+The name of the sample analyzed. All output files are prefixed by this name (e.g., sample_name.genes.results)
+
+=back
+
+=head1 BASIC OPTIONS
+
+=over
+
+=item B<--paired-end>
+
+Input reads are paired-end reads. (Default: off)
+
+=item B<--no-qualities>
+
+Input reads do not contain quality scores. (Default: off)
+
+=item B<--strandedness> <none|forward|reverse>
+
+This option defines the strandedness of the RNA-Seq reads. It recognizes three values: 'none', 'forward', and 'reverse'. 'none' refers to non-strand-specific protocols. 'forward' means all (upstream) reads are derived from the forward strand. 'reverse' means all (upstream) reads are derived from the reverse strand. If 'forward'/'reverse' is set, the '--norc'/'--nofw' Bowtie/Bowtie 2 option will also be enabled to avoid aligning reads to the opposite strand. For Illumina TruSeq Stranded protocols, please use 'reverse'. (Default: 'none')
+
+=item B<-p/--num-threads> <int>
+
+Number of threads to use. Both Bowtie/Bowtie2, expression estimation and 'samtools sort' will use this many threads. (Default: 1)
+
+=item B<--alignments>
+
+Input file contains alignments in SAM/BAM/CRAM format. The exact file format will be determined automatically. (Default: off)
+
+=item B<--fai> <file>
+
+If the header section of input alignment file does not contain reference sequence information, this option should be turned on. <file> is a FAI format file containing each reference sequence's name and length. Please refer to the SAM official website for the details of FAI format. (Default: off)
+
+=item B<--bowtie2>
+
+Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM does not handle indel, local and discordant alignments, the Bowtie2 parameters are set in a way to avoid those alignments. In particular, we use options '--sensitive --dpad 0 --gbar 99999999 --mp 1,1 --np 1 --score-min L,0,-0.1' by default. The last parameter of '--score-min', '-0.1', is the negative of maximum mismatch rate. This rate can be set by option '--bowtie2-mismatch-rate'. If reads are paired-end, we additionally use options '--no-mixed' and '--no-discordant'. (Default: off)
+
+=item B<--star>
+
+Use STAR to align reads. Alignment parameters are from ENCODE3's STAR-RSEM pipeline. To save computational time and memory resources, STAR's Output BAM file is unsorted. It is stored in RSEM's temporary directory with name as 'sample_name.bam'. Each STAR job will have its own private copy of the genome in memory. (Default: off)
+
+=item B<--hisat2-hca>
+
+Use HISAT2 to align reads to the transcriptome according to Human Cell Atlas SMART-Seq2 pipeline. In particular, we use HISAT parameters "-k 10 --secondary --rg-id=$sampleToken --rg SM:$sampleToken --rg LB:$sampleToken --rg PL:ILLUMINA --rg PU:$sampleToken --new-summary --summary-file $sampleName.log --met-file $sampleName.hisat2.met.txt --met 5 --mp 1,1 --np 1 --score-min L,0,-0.1 --rdg 99999999,99999999 --rfg 99999999,99999999 --no-spliced-alignment --no-softclip --seed 12345". If inputs are paired-end reads, we additionally use parameters "--no-mixed --no-discordant". (Default: off)
+
+=item B<--append-names>
+
+If gene_name/transcript_name is available, append it to the end of gene_id/transcript_id (separated by '_') in files 'sample_name.isoforms.results' and 'sample_name.genes.results'. (Default: off)
+
+=item B<--seed> <uint32>
+
+Set the seed for the random number generators used in calculating posterior mean estimates and credibility intervals. The seed must be a non-negative 32 bit integer. (Default: off)
+
+=item B<--single-cell-prior>
+
+By default, RSEM uses Dirichlet(1) as the prior to calculate posterior mean estimates and credibility intervals. However, far fewer genes are expressed in single cell RNA-Seq data. Thus, if you want to compute posterior mean estimates and/or credibility intervals and you have single-cell RNA-Seq data, you are recommended to turn on this option. Then RSEM will use Dirichlet(0.1) as the prior, which encourages the sparsity of the expression levels. (Default: off)
+
+=item B<--calc-pme>
+
+Run RSEM's collapsed Gibbs sampler to calculate posterior mean estimates. (Default: off)
+
+=item B<--calc-ci>
+
+Calculate 95% credibility intervals and posterior mean estimates. The credibility level can be changed by setting '--ci-credibility-level'. (Default: off)
+
+=item B<-q/--quiet>
+
+Suppress the output of logging information. (Default: off)
+
+=item B<-h/--help>
+
+Show help information.
+
+=item B<--version>
+
+Show version information.
+
+=back
+
+=head1 OUTPUT OPTIONS
+
+=over
+
+=item B<--sort-bam-by-read-name>
+
+Sort BAM file aligned under transcript coordinate by read name. Setting this option on will produce deterministic maximum likelihood estimations from independent runs. Note that sorting will take a long time and lots of memory. (Default: off)
+
+=item B<--no-bam-output>
+
+Do not output any BAM file. (Default: off)
+
+=item B<--sampling-for-bam>
+
+When RSEM generates a BAM file, instead of outputting all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off)
+
+=item B<--output-genome-bam>
+
+Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' will be generated. (Default: off)
+
+=item B<--sort-bam-by-coordinate>
+
+Sort RSEM generated transcript and genome BAM files by coordinates and build associated indices. (Default: off)
+
+=item B<--sort-bam-memory-per-thread> <string>
+
+Set the maximum memory per thread that can be used by 'samtools sort'. <string> represents the memory and accepts suffixes 'K/M/G'. RSEM will pass <string> to the '-m' option of 'samtools sort'. Note that the default used here is different from the default used by samtools. (Default: 1G)
+
+=back
+
+=head1 ALIGNER OPTIONS
+
+=over
+
+=item B<--seed-length> <int>
+
+Seed length used by the read aligner. Providing the correct value is important for RSEM. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. Any read whose length, or (for paired-end reads) the length of at least one of its mates, is less than this value will be ignored. If poly(A) tails are not added to the references, the minimum allowed value is 5, otherwise, the minimum allowed value is 25. Note that this script will only check if the value >= 5 and give a warning message if the value < 25 but >= 5. (Default: 25)
+
+=item B<--phred33-quals>
+
+Input quality scores are encoded as Phred+33. This option is used by Bowtie, Bowtie 2 and HISAT2. (Default: on)
+
+=item B<--phred64-quals>
+
+Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). This option is used by Bowtie, Bowtie 2 and HISAT2. (Default: off)
+
+=item B<--solexa-quals>
+
+Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). This option is used by Bowtie, Bowtie 2 and HISAT2. (Default: off)
+
+=item B<--bowtie-path> <path>
+
+The path to the Bowtie executables. (Default: the path to the Bowtie executables is assumed to be in the user's PATH environment variable)
+
+=item B<--bowtie-n> <int>
+
+(Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, Default: 2)
+
+=item B<--bowtie-e> <int>
+
+(Bowtie parameter) max sum of mismatch quality scores across the alignment. (Default: 99999999)
+
+=item B<--bowtie-m> <int>
+
+(Bowtie parameter) suppress all alignments for a read if > <int> valid alignments exist. (Default: 200)
+
+=item B<--bowtie-chunkmbs> <int>
+
+(Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use Bowtie's default)
+
+=item B<--bowtie2-path> <path>
+
+(Bowtie 2 parameter) The path to the Bowtie 2 executables. (Default: the path to the Bowtie 2 executables is assumed to be in the user's PATH environment variable)
+
+=item B<--bowtie2-mismatch-rate> <double>
+
+(Bowtie 2 parameter) The maximum mismatch rate allowed. (Default: 0.1)
+
+=item B<--bowtie2-k> <int>
+
+(Bowtie 2 parameter) Find up to <int> alignments per read. (Default: 200)
+
+=item B<--bowtie2-sensitivity-level> <string>
+
+(Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end mode. This option controls how hard Bowtie 2 tries to find alignments. <string> must be one of "very_fast", "fast", "sensitive" and "very_sensitive". The four candidates correspond to Bowtie 2's "--very-fast", "--fast", "--sensitive" and "--very-sensitive" options. (Default: "sensitive" - use Bowtie 2's default)
+
+=item B<--star-path> <path>
+
+The path to STAR's executable. (Default: the path to STAR executable is assumed to be in user's PATH environment variable)
+
+=item B<--star-gzipped-read-file>
+
+(STAR parameter) Input read file(s) is compressed by gzip. (Default: off)
+
+=item B<--star-bzipped-read-file>
+
+(STAR parameter) Input read file(s) is compressed by bzip2. (Default: off)
+
+=item B<--star-output-genome-bam>
+
+(STAR parameter) Save the BAM file from STAR alignment under genomic coordinate to 'sample_name.STAR.genome.bam'. This file is NOT sorted by genomic coordinate. In this file, according to STAR's manual, 'paired ends of an alignment are always adjacent, and multiple alignments of a read are adjacent as well'. (Default: off)
+
+=item B<--hisat2-path> <path>
+
+The path to HISAT2's executable. (Default: the path to HISAT2 executable is assumed to be in user's PATH environment variable)
+
+=back
+
+=head1 ADVANCED OPTIONS
+
+=over
+
+=item B<--tag> <string>
+
+The name of the optional field used in the SAM input for identifying a read with too many valid alignments. The field should have the format <tagName>:i:<value>, where a <value> bigger than 0 indicates a read with too many alignments. (Default: "")
+
+=item B<--fragment-length-min> <int>
+
+Minimum read/insert length allowed. This is also the value for the Bowtie/Bowtie2 -I option. (Default: 1)
+
+=item B<--fragment-length-max> <int>
+
+Maximum read/insert length allowed. This is also the value for the Bowtie/Bowtie 2 -X option. (Default: 1000)
+
+=item B<--fragment-length-mean> <double>
+
+(single-end data only) The mean of the fragment length distribution, which is assumed to be a Gaussian. (Default: -1, which disables use of the fragment length distribution)
+
+=item B<--fragment-length-sd> <double>
+
+(single-end data only) The standard deviation of the fragment length distribution, which is assumed to be a Gaussian. (Default: 0, which assumes that all fragments are of the same length, given by the rounded value of B<--fragment-length-mean>)
+
+=item B<--estimate-rspd>
+
+Set this option if you want to estimate the read start position distribution (RSPD) from data. Otherwise, RSEM will use a uniform RSPD. (Default: off)
+
+=item B<--num-rspd-bins> <int>
+
+Number of bins in the RSPD. Only relevant when '--estimate-rspd' is specified. Use of the default setting is recommended. (Default: 20)
+
+=item B<--gibbs-burnin> <int>
+
+The number of burn-in rounds for RSEM's Gibbs sampler. Each round passes over the entire data set once. If RSEM can use multiple threads, multiple Gibbs samplers will start at the same time and all samplers share the same burn-in number. (Default: 200)
+
+=item B<--gibbs-number-of-samples> <int>
+
+The total number of count vectors RSEM will collect from its Gibbs samplers. (Default: 1000)
+
+=item B<--gibbs-sampling-gap> <int>
+
+The number of rounds between two successive count vectors RSEM collects. If the count vector after round N is collected, the count vector after round N + <int> will also be collected. (Default: 1)
+
+=item B<--ci-credibility-level> <double>
+
+The credibility level for credibility intervals. (Default: 0.95)
+
+=item B<--ci-memory> <int>
+
+Maximum size (in memory, MB) of the auxiliary buffer used for computing credibility intervals (CI). (Default: 1024)
+
+=item B<--ci-number-of-samples-per-count-vector> <int>
+
+The number of read generating probability vectors sampled per sampled count vector. The credibility intervals are calculated by first sampling P(C | D) and then sampling P(Theta | C) for each sampled count vector. This option controls how many Theta vectors are sampled per sampled count vector. (Default: 50)
+
+=item B<--keep-intermediate-files>
+
+Keep temporary files generated by RSEM. RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted. Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
+
+=item B<--temporary-folder> <string>
+
+Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
+
+=item B<--time>
+
+Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)
+
+=back
+
+=head1 PRIOR-ENHANCED RSEM OPTIONS
+
+=over
+
+=item B<--run-pRSEM>
+
+Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e. isoform's initial pseudo-count for RSEM's Gibbs sampling, will be learned from input RNA-seq data and an external data set. When pRSEM needs and only needs ChIP-seq peak information to partition isoforms (e.g. in pRSEM's default partition model), either ChIP-seq peak file (with the '--chipseq-peak-file' option) or ChIP-seq FASTQ files for target and input and the path for Bowtie executables are required (with the '--chipseq-target-read-files <string>', '--chipseq-control-read-files <string>', and '--bowtie-path <path>' options), otherwise, ChIP-seq FASTQ files for target and control and the path to Bowtie executables are required. (Default: off)
+
+=item B<--chipseq-peak-file> <string>
+
+Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. BED6+4, format. This file is used when running prior-enhanced RSEM in the default two-partition model. It partitions isoforms by whether they have ChIP-seq overlapping with their transcription start site region or not. Each partition will have its own prior parameter learned from a training set. This file can be either gzipped or ungzipped. (Default: "")
+
+=item B<--chipseq-target-read-files> <string>
+
+Comma-separated full path of FASTQ read file(s) for ChIP-seq target. This option is used when running prior-enhanced RSEM. It provides information to calculate ChIP-seq peaks and signals. The file(s) can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie-path <path>' and '--chipseq-control-read-files <string>' must be defined when this option is specified. (Default: "")
+
+=item B<--chipseq-control-read-files> <string>
+
+Comma-separated full path of FASTQ read file(s) for ChIP-seq control. This option is used when running prior-enhanced RSEM. It provides information to call ChIP-seq peaks. The file(s) can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie-path <path>' and '--chipseq-target-read-files <string>' must be defined when this option is specified. (Default: "")
+
+=item B<--chipseq-read-files-multi-targets> <string>
+
+Comma-separated full path of FASTQ read files for multiple ChIP-seq targets. This option is used when running prior-enhanced RSEM, where prior is learned from multiple complementary data sets. It provides information to calculate ChIP-seq signals. All files can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--bowtie-path <path>' must be defined and the option '--partition-model <string>' will be set to 'cmb_lgt' automatically. (Default: "")
+
+=item B<--chipseq-bed-files-multi-targets> <string>
+
+Comma-separated full path of BED files for multiple ChIP-seq targets. This option is used when running prior-enhanced RSEM, where prior is learned from multiple complementary data sets. It provides information of ChIP-seq signals and must have at least the first six BED columns. All files can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--partition-model <string>' will be set to 'cmb_lgt' automatically. (Default: "")
+
+=item B<--cap-stacked-chipseq-reads>
+
+Keep a maximum number of ChIP-seq reads that aligned to the same genomic interval. This option is used when running prior-enhanced RSEM, where prior is learned from multiple complementary data sets. This option is only in use when either '--chipseq-read-files-multi-targets <string>' or '--chipseq-bed-files-multi-targets <string>' is specified. (Default: off)
+
+=item B<--n-max-stacked-chipseq-reads> <int>
+
+The maximum number of stacked ChIP-seq reads to keep. This option is used when running prior-enhanced RSEM, where prior is learned from multiple complementary data sets. This option is only in use when the option '--cap-stacked-chipseq-reads' is set. (Default: 5)
+
+=item B<--partition-model> <string>
+
+A keyword to specify the partition model used by prior-enhanced RSEM. It must be one of the following keywords:
+
+=over 2
+
+=item - B<pk>
+
+Partitioned by whether an isoform has a ChIP-seq peak overlapping with its transcription start site (TSS) region. The TSS region is defined as [TSS-500bp, TSS+500bp]. For simplicity, we refer to this type of peak as 'TSS peak' when explaining other keywords.
+
+=item - B<pk_lgtnopk>
+
+First partitioned by TSS peak. Then, for isoforms in the 'no TSS peak' set, a logistic model is employed to further classify them into two partitions.
+
+=item - B<lm3>, B<lm4>, B<lm5>, or B<lm6>
+
+Based on their ChIP-seq signals, isoforms are classified into 3, 4, 5, or 6 partitions by a linear regression model.
+
+=item - B<nopk_lm2pk>, B<nopk_lm3pk>, B<nopk_lm4pk>, or B<nopk_lm5pk>
+
+First partitioned by TSS peak. Then, for isoforms in the 'with TSS peak' set, a linear regression model is employed to further classify them into 2, 3, 4, or 5 partitions.
+
+=item - B<pk_lm2nopk>, B<pk_lm3nopk>, B<pk_lm4nopk>, or B<pk_lm5nopk>
+
+First partitioned by TSS peak. Then, for isoforms in the 'no TSS peak' set, a linear regression model is employed to further classify them into 2, 3, 4, or 5 partitions.
+
+=item - B<cmb_lgt>
+
+Using a logistic regression to combine TSS signals from multiple complementary data sets and partition training set isoforms into 'expressed' and 'not expressed'. This partition model is only in use when either '--chipseq-read-files-multi-targets <string>' or '--chipseq-bed-files-multi-targets <string>' is specified.
+
+=back
+
+Parameters for all the above models are learned from a training set. For detailed explanations, please see prior-enhanced RSEM's paper. (Default: 'pk')
+
+=back
+
+=head1 DEPRECATED OPTIONS
+
+=over
+
+The options in this section are deprecated. They are here only for compatibility reasons and may be removed in future releases.
+
+=back
+
+=over
+
+=item B<--sam>
+
+Inputs are alignments in SAM format. (Default: off)
+
+=item B<--bam>
+
+Inputs are alignments in BAM format. (Default: off)
+
+=item B<--strand-specific>
+
+Equivalent to '--strandedness forward'. (Default: off)
+
+=item B<--forward-prob> <double>
+
+Probability of generating a read from the forward strand of a transcript. Set to 1 for a strand-specific protocol where all (upstream) reads are derived from the forward strand, 0 for a strand-specific protocol where all (upstream) read are derived from the reverse strand, or 0.5 for a non-strand-specific protocol. (Default: off)
+
+=back
+
+=head1 DESCRIPTION
+
+In its default mode, this program aligns input reads against a reference transcriptome with Bowtie and calculates expression values using the alignments. RSEM assumes the data are single-end reads with quality scores, unless the '--paired-end' or '--no-qualities' options are specified. Alternatively, users can use STAR to align reads using the '--star' option. RSEM has provided options in 'rsem-prepare-reference' to prepare STAR's genome indices. Users may use an alternative aligner by specifying '--alignments', and providing an alignment file in SAM/BAM/CRAM format. However, users should make sure that they align against the indices generated by 'rsem-prepare-reference' and the alignment file satisfies the requirements mentioned in ARGUMENTS section.
+
+One simple way to make the alignment file satisfy RSEM's requirements is to use the 'convert-sam-for-rsem' script. This script accepts SAM/BAM/CRAM files as input and outputs a BAM file. For example, type the following command to convert a SAM file, 'input.sam', to a ready-for-use BAM file, 'input_for_rsem.bam':
+
+ convert-sam-for-rsem input.sam input_for_rsem
+
+For details, please refer to 'convert-sam-for-rsem's documentation page.
+
+=head1 NOTES
+
+1. Users must run 'rsem-prepare-reference' with the appropriate reference before using this program.
+
+2. For single-end data, it is strongly recommended that the user provide the fragment length distribution parameters (--fragment-length-mean and --fragment-length-sd). For paired-end data, RSEM will automatically learn a fragment length distribution from the data.
+
+3. Some aligner parameters have default values different from their original settings.
+
+4. With the '--calc-pme' option, posterior mean estimates will be calculated in addition to maximum likelihood estimates.
+
+5. With the '--calc-ci' option, 95% credibility intervals and posterior mean estimates will be calculated in addition to maximum likelihood estimates.
+
+6. The temporary directory and all intermediate files will be removed when RSEM finishes unless '--keep-intermediate-files' is specified.
+
+With the '--run-pRSEM' option and associated options (see section 'PRIOR-ENHANCED RSEM OPTIONS' above for details), prior-enhanced RSEM will be running. Prior parameters will be learned from supplied external data set(s) and assigned as initial pseudo-counts for isoforms in the corresponding partition for Gibbs sampling.
+
+=head1 OUTPUT
+
+=over
+
+=item B<sample_name.isoforms.results>
+
+File containing isoform level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+
+transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
+
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
+
+'transcript_id' is the transcript name of this transcript. 'gene_id'
+is the gene name of the gene which this transcript belongs to (denote
+this gene as its parent gene). If no gene information is provided,
+'gene_id' and 'transcript_id' are the same.
+
+'length' is this transcript's sequence length (poly(A) tail is not
+counted). 'effective_length' counts only the positions that can
+generate a valid fragment. If no poly(A) tail is added,
+'effective_length' is equal to transcript length - mean fragment
+length + 1. If one transcript's effective length is less than 1, this
+transcript's both effective length and abundance estimates are set to
+0.
+
+'expected_count' is the sum, over all reads, of the posterior
+probability that each read comes from this transcript. Because 1) each read
+aligning to this transcript has a probability of being generated from
+background noise; 2) RSEM may filter some alignable low quality reads,
+the sum of expected counts for all transcripts is generally less than
+the total number of reads aligned.
+
+'TPM' stands for Transcripts Per Million. It is a relative measure of
+transcript abundance. The sum of all transcripts' TPM is 1
+million. 'FPKM' stands for Fragments Per Kilobase of transcript per
+Million mapped reads. It is another relative measure of transcript
+abundance. If we define l_bar to be the mean transcript length in a
+sample, which can be calculated as
+
+l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through every transcript),
+
+the following equation holds:
+
+FPKM_i = 10^3 / l_bar * TPM_i.
+
+We can see that the sum of FPKM is not a constant across samples.
+
+'IsoPct' stands for isoform percentage. It is the percentage of this
+transcript's abundance over its parent gene's abundance. If its parent
+gene has only one isoform or the gene information is not provided,
+this field will be set to 100.
+
+'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean
+estimates calculated by RSEM's Gibbs
+sampler. 'posterior_standard_deviation_of_count' is the posterior
+standard deviation of counts. 'IsoPct_from_pme_TPM' is the isoform
+percentage calculated from 'pme_TPM' values.
+
+'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' and
+'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95%
+credibility intervals for TPM and FPKM values. The bounds are
+inclusive (i.e. [l, u]).
+
+'TPM_coefficient_of_quartile_variation' and
+'FPKM_coefficient_of_quartile_variation' are coefficients of quartile
+variation (CQV) for TPM and FPKM values. CQV is a robust way of
+measuring the ratio between the standard deviation and the mean. It is
+defined as
+
+CQV := (Q3 - Q1) / (Q3 + Q1),
+
+where Q1 and Q3 are the first and third quartiles.
+
+=item B<sample_name.genes.results>
+
+File containing gene level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+
+gene_id transcript_id(s) length effective_length expected_count TPM FPKM [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
+
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
+
+'transcript_id(s)' is a comma-separated list of transcript_ids
+belonging to this gene. If no gene information is provided, 'gene_id'
+and 'transcript_id(s)' are identical (the 'transcript_id').
+
+A gene's 'length' and 'effective_length' are
+defined as the weighted average of its transcripts' lengths and
+effective lengths (weighted by 'IsoPct'). A gene's abundance estimates
+are just the sum of its transcripts' abundance estimates.
+
+=item B<sample_name.alleles.results>
+
+Only generated when the RSEM references are built with allele-specific
+transcripts.
+
+This file contains allele level expression estimates for
+allele-specific expression calculation. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+
+allele_id transcript_id gene_id length effective_length expected_count TPM FPKM AlleleIsoPct AlleleGenePct [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation]
+
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
+
+'allele_id' is the allele-specific name of this allele-specific transcript.
+
+'AlleleIsoPct' stands for allele-specific percentage on isoform
+level. It is the percentage of this allele-specific transcript's
+abundance over its parent transcript's abundance. If its parent
+transcript has only one allele variant form, this field will be set to
+100.
+
+'AlleleGenePct' stands for allele-specific percentage on gene
+level. It is the percentage of this allele-specific transcript's
+abundance over its parent gene's abundance.
+
+'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have
+similar meanings. They are calculated based on posterior mean
+estimates.
+
+Please note that if this file is present, the fields 'length' and
+'effective_length' in 'sample_name.isoforms.results' should be
+interpreted similarly as the corresponding definitions in
+'sample_name.genes.results'.
+
+=item B<sample_name.transcript.bam>
+
+Only generated when --no-bam-output is not specified.
+
+'sample_name.transcript.bam' is a BAM-formatted file of read
+alignments in transcript coordinates. The MAPQ field of each alignment
+is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the
+posterior probability of that alignment being the true mapping of a
+read. In addition, RSEM pads a new tag ZW:f:value, where value is a
+single precision floating number representing the posterior
+probability. Because this file contains all alignment lines produced
+by bowtie or user-specified aligners, it can also be used as a
+replacement of the aligner generated BAM/SAM file.
+
+=item B<sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
+
+Only generated when --no-bam-output is not specified and --sort-bam-by-coordinate is specified.
+
+'sample_name.transcript.sorted.bam' and
+'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
+indices generated by samtools (included in RSEM package).
+
+=item B<sample_name.genome.bam>
+
+Only generated when --no-bam-output is not specified and --output-genome-bam is specified.
+
+'sample_name.genome.bam' is a BAM-formatted file of read alignments in
+genomic coordinates. Alignments of reads that have identical genomic
+coordinates (i.e., alignments to different isoforms that share the
+same genomic region) are collapsed into one alignment. The MAPQ field
+of each alignment is set to min(100, floor(-10 * log10(1.0 - w) +
+0.5)), where w is the posterior probability of that alignment being
+the true mapping of a read. In addition, RSEM pads a new tag
+ZW:f:value, where value is a single precision floating number
+representing the posterior probability. If an alignment is spliced, a
+XS:A:value tag is also added, where value is either '+' or '-'
+indicating the strand of the transcript it aligns to.
+
+=item B<sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai>
+
+Only generated when --no-bam-output is not specified, and --sort-bam-by-coordinate and --output-genome-bam are specified.
+
+'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the
+sorted BAM file and indices generated by samtools (included in RSEM package).
+
+=item B<sample_name.time>
+
+Only generated when --time is specified.
+
+It contains time (in seconds) consumed by aligning reads, estimating expression levels and calculating credibility intervals.
+
+=item B<sample_name.log>
+
+Only generated when --alignments is not specified.
+
+It captures alignment statistics outputted from the user-specified aligner.
+
+=item B<sample_name.stat>
+
+This is a folder instead of a file. All model related statistics are stored in this folder. 'rsem-plot-model' can generate plots using this folder.
+
+'sample_name.stat/sample_name.cnt' contains alignment statistics. The format and meanings of each field are described in 'cnt_file_description.txt' under RSEM directory.
+
+'sample_name.stat/sample_name.model' stores RNA-Seq model parameters learned from the data. The format and meanings of each field of this file are described in 'model_file_description.txt' under RSEM directory.
+
+The following four output files will be generated only by prior-enhanced RSEM
+
+=over 2
+
+=item - 'sample_name.stat/sample_name_prsem.all_tr_features'
+
+It stores isoform features for deriving and assigning pRSEM prior. The first line is a header and the rest is one isoform per line. The description for each column is:
+
+=over 2
+
+=item * B<trid>: transcript ID from input annotation
+
+=item * B<geneid>: gene ID from input annotation
+
+=item * B<chrom>: isoform's chromosome name
+
+=item * B<strand>: isoform's strand name
+
+=item * B<start>: isoform's end with the lowest genomic loci
+
+=item * B<end>: isoform's end with the highest genomic loci
+
+=item * B<tss_mpp>: average mappability of [TSS-500bp, TSS+500bp], where TSS is isoform's transcription start site, i.e. 5'-end
+
+=item * B<body_mpp>: average mappability of (TSS+500bp, TES-500bp), where TES is isoform's transcription end site, i.e. 3'-end
+
+=item * B<tes_mpp>: average mappability of [TES-500bp, TES+500bp]
+
+=item * B<pme_count>: isoform's fragment or read count from RSEM's posterior mean estimates
+
+=item * B<tss>: isoform's TSS loci
+
+=item * B<tss_pk>: equal to 1 if isoform's [TSS-500bp, TSS+500bp] region overlaps with a RNA Pol II peak; 0 otherwise
+
+=item * B<is_training>: equal to 1 if isoform is in the training set where Pol II prior is learned; 0 otherwise
+
+=back
+
+=item - 'sample_name.stat/sample_name_prsem.all_tr_prior'
+
+It stores prior parameters for every isoform. This file does not have a header. Each line contains a prior parameter and an isoform's transcript ID delimited by ` # `.
+
+=item - 'sample_name.stat/sample_name_uniform_prior_1.isoforms.results'
+
+RSEM's posterior mean estimates on the isoform level with an initial pseudo-count of one for every isoform. It is in the same format as the 'sample_name.isoforms.results'.
+
+=item - 'sample_name.stat/sample_name_uniform_prior_1.genes.results'
+
+RSEM's posterior mean estimates on the gene level with an initial pseudo-count of one for every isoform. It is in the same format as the 'sample_name.genes.results'.
+
+=back
+
+When learning prior from multiple external data sets in prior-enhanced RSEM, two additional output files will be generated.
+
+=over 2
+
+=item - 'sample_name.stat/sample_name.pval_LL'
+
+It stores a p-value and a log-likelihood. The p-value indicates whether the combination of multiple complementary data sets is informative for RNA-seq quantification. The log-likelihood shows how well pRSEM's Dirichlet-multinomial model fits the read counts of partitioned training set isoforms.
+
+=item - 'sample_name.stat/sample_name.lgt_mdl.RData'
+
+It stores an R object named 'glmmdl', which is a logistic regression model on the training set isoforms and multiple external data sets.
+
+=back
+
+In addition, extra columns will be added to 'sample_name.stat/all_tr_features'
+
+=over 2
+
+=item * B<is_expr>: equal to 1 if isoform has an abundance >= 1 TPM and a non-zero read count from RSEM's posterior mean estimates; 0 otherwise
+
+=item * B<"$external_data_set_basename">: log10 of external data's signal at [TSS-500, TSS+500]. Signal is the number of reads aligned within that interval and normalized to RPKM by read depth and interval length. It will be set to -4 if no read aligned to that interval.
+
+There are multiple columns like this one, where each represents an external data set.
+
+=item * B<prd_expr_prob>: predicted probability from logistic regression model on whether this isoform is expressed or not. A probability higher than 0.5 is considered as expressed
+
+=item * B<partition>: group index, to which this isoform is partitioned
+
+=item * B<prior>: prior parameter for this isoform
+
+=back
+
+=back
+
+=head1 EXAMPLES
+
+Assume the path to the bowtie executables is in the user's PATH environment variable. Reference files are under '/ref' with name 'mouse_125'.
+
+1) '/data/mmliver.fq', single-end reads with quality scores. Quality scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 threads and generate a genome BAM file. In addition, we want to append gene/transcript names to the result files:
+
+ rsem-calculate-expression --phred64-quals \
+ -p 8 \
+ --append-names \
+ --output-genome-bam \
+ /data/mmliver.fq \
+ /ref/mouse_125 \
+ mmliver_single_quals
+
+2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', stranded paired-end reads with quality scores. Suppose the library is prepared using TruSeq Stranded Kit, which means the first mate should map to the reverse strand. Quality scores are in SANGER format. We want to use 8 threads and do not generate a genome BAM file:
+
+ rsem-calculate-expression -p 8 \
+ --paired-end \
+ --strandedness reverse \
+ /data/mmliver_1.fq \
+ /data/mmliver_2.fq \
+ /ref/mouse_125 \
+ mmliver_paired_end_quals
+
+3) '/data/mmliver.fa', single-end reads without quality scores. We want to use 8 threads:
+
+ rsem-calculate-expression -p 8 \
+ --no-qualities \
+ /data/mmliver.fa \
+ /ref/mouse_125 \
+ mmliver_single_without_quals
+
+4) Data are the same as 1). This time we assume the bowtie executables are under '/sw/bowtie'. We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals. We allow RSEM to use 1GB of memory for CI calculation:
+
+ rsem-calculate-expression --bowtie-path /sw/bowtie \
+ --phred64-quals \
+ --fragment-length-mean 150.0 \
+ --fragment-length-sd 35.0 \
+ -p 8 \
+ --output-genome-bam \
+ --calc-ci \
+ --ci-memory 1024 \
+ /data/mmliver.fq \
+ /ref/mouse_125 \
+ mmliver_single_quals
+
+5) '/data/mmliver_paired_end_quals.bam', BAM-formatted alignments for paired-end reads with quality scores. We want to use 8 threads:
+
+ rsem-calculate-expression --paired-end \
+ --alignments \
+ -p 8 \
+ /data/mmliver_paired_end_quals.bam \
+ /ref/mouse_125 \
+ mmliver_paired_end_quals
+
+6) '/data/mmliver_1.fq.gz' and '/data/mmliver_2.fq.gz', paired-end reads with quality scores and read files are compressed by gzip. We want to use STAR to align reads and assume STAR executable is '/sw/STAR'. Suppose we want to use 8 threads and do not generate a genome BAM file:
+
+ rsem-calculate-expression --paired-end \
+ --star \
+ --star-path /sw/STAR \
+ --gzipped-read-file \
+ --paired-end \
+ -p 8 \
+ /data/mmliver_1.fq.gz \
+ /data/mmliver_2.fq.gz \
+ /ref/mouse_125 \
+ mmliver_paired_end_quals
+
+
+7) In the above example, suppose we want to run prior-enhanced RSEM instead. Assuming we want to learn priors from a ChIP-seq peak file '/data/mmliver.narrowPeak.gz':
+
+ rsem-calculate-expression --star \
+ --star-path /sw/STAR \
+ --gzipped-read-file \
+ --paired-end \
+ --calc-pme \
+ --run-pRSEM \
+ --chipseq-peak-file /data/mmliver.narrowPeak.gz \
+ -p 8 \
+ /data/mmliver_1.fq.gz \
+ /data/mmliver_2.fq.gz \
+ /ref/mouse_125 \
+ mmliver_paired_end_quals
+
+8) Similar to the example in 7), suppose we want to use the partition model 'pk_lm2nopk' (partitioning isoforms by Pol II TSS peak first and then partitioning 'no TSS peak' isoforms into two bins by a linear regression model), and we want to partition isoforms by RNA Pol II's ChIP-seq read files '/data/mmliver_PolIIRep1.fq.gz' and '/data/mmliver_PolIIRep2.fq.gz', and the control ChIP-seq read files '/data/mmliver_ChIPseqCtrl.fq.gz'. Also, assuming Bowtie's executables are under '/sw/bowtie/':
+
+ rsem-calculate-expression --star \
+ --star-path /sw/STAR \
+ --gzipped-read-file \
+ --paired-end \
+ --calc-pme \
+ --run-pRSEM \
+ --chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \
+ --chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \
+ --partition-model pk_lm2nopk \
+ --bowtie-path /sw/bowtie \
+ -p 8 \
+ /data/mmliver_1.fq.gz \
+ /data/mmliver_2.fq.gz \
+ /ref/mouse_125 \
+ mmliver_paired_end_quals
+
+9) Similar to the example in 8), suppose we want to derive prior from four histone modification ChIP-seq read data sets: '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz', '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming Bowtie's executables are under '/sw/bowtie/':
+
+ rsem-calculate-expression --star \
+ --star-path /sw/STAR \
+ --gzipped-read-file \
+ --paired-end \
+ --calc-pme \
+ --run-pRSEM \
+ --partition-model cmb_lgt \
+ --chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \
+ --bowtie-path /sw/bowtie \
+ -p 8 \
+ /data/mmliver_1.fq.gz \
+ /data/mmliver_2.fq.gz \
+ /ref/mouse_125 \
+ mmliver_paired_end_quals
+
+=cut
diff --git a/rsem-control-fdr b/rsem-control-fdr
new file mode 100755
index 0000000..6736aaf
--- /dev/null
+++ b/rsem-control-fdr
@@ -0,0 +1,117 @@
+#!/usr/bin/env perl
+
+use Getopt::Long;
+use Pod::Usage;
+use strict;
+
+my $hard = 0;
+my $soft = 0;
+my $help = 0;
+
+GetOptions("hard-threshold" => \$hard,
+           "soft-threshold" => \$soft,
+           "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
+
+pod2usage(-verbose => 2) if ($help == 1);
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
+pod2usage(-msg => "--hard-threshold and --soft-threshold cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($hard && $soft);
+
+# Neither method requested: fall back to the hard threshold.
+if ($hard == 0 && $soft == 0) { $hard = 1; }
+
+my $fdr = $ARGV[1];
+
+# Three-argument open on lexical handles, with explicit failure reporting.
+# Without the checks an unreadable input was misreported as a missing PPDE column.
+open(my $input, "<", $ARGV[0]) or die "Error: Cannot open $ARGV[0] for reading: $!\n";
+open(my $output, ">", $ARGV[2]) or die "Error: Cannot open $ARGV[2] for writing: $!\n";
+
+my $header = <$input>;
+if (!defined($header)) { print STDERR "Error: $ARGV[0] is empty!\n"; exit(-1); }
+chomp($header);
+my @columns = split(/\t/, $header);
+
+# Locate the quoted "PPDE" column in the header line.
+my $pos = 0;
+while ($pos <= $#columns && $columns[$pos] ne "\"PPDE\"") { ++$pos; }
+if ($pos > $#columns) { print STDERR "Error: Cannot find column PPDE!\n"; exit(-1); }
+# Data rows are read one field to the right of the header position
+# (presumably rows start with a row-name field the header lacks -- confirm against rsem-run-ebseq output).
+++$pos;
+
+print $output "$header\n";
+
+# $n counts reported rows; $sum accumulates their PPEE (1 - PPDE) for the soft threshold.
+my ($n, $sum) = (0, 0);
+while (my $line = <$input>) {
+    chomp($line);
+    my @fields = split(/\t/, $line);
+    my $ppee = 1.0 - $fields[$pos];
+    if ($hard) {
+        # Hard threshold: stop at the first row whose own PPEE exceeds the target FDR.
+        if ($ppee > $fdr) { last; }
+        ++$n;
+        print $output "$line\n";
+    }
+    else {
+        # Soft threshold: stop once including this row would push the mean PPEE above the target FDR.
+        if ($sum + $ppee > $fdr * ($n + 1)) { last; }
+        ++$n;
+        $sum += $ppee;
+        print $output "$line\n";
+    }
+}
+
+print "There are $n genes/transcripts reported at FDR = $fdr.\n";
+
+close($input);
+close($output) or die "Error: Cannot finish writing $ARGV[2]: $!\n";
+
+__END__
+
+=head1 NAME
+
+rsem-control-fdr - Filter EBSeq output for statistical significance.
+
+=head1 SYNOPSIS
+
+rsem-control-fdr [options] input_file fdr_rate output_file
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<input_file>
+
+This should be the main result file generated by 'rsem-run-ebseq', which contains all genes/transcripts and their associated statistics.
+
+=item B<fdr_rate>
+
+The desired false discovery rate (FDR).
+
+=item B<output_file>
+
+This file is a subset of the 'input_file'. It only contains the genes/transcripts called as differentially expressed (DE). When more than 2 conditions exist, DE is defined as not all conditions are equally expressed. Because statistical significance does not necessarily mean biological significance, users should also refer to the fold changes to decide which genes/transcripts are biologically significant. When more than two conditions exist, this file will not contain fold change information and users need to calculate it from 'input_file.condmeans' by themselves.
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<--hard-threshold>
+
+Use hard threshold method to control FDR. If this option is set, only those genes/transcripts with their PPDE >= 1 - fdr_rate are called as DE. (Default: on)
+
+=item B<--soft-threshold>
+
+Use soft threshold method to control FDR. If this option is set, this program will try to report as many genes/transcripts as possible, as long as their average PPDE >= 1 - fdr_rate. This option is equivalent to using EBSeq's 'crit_fun' for FDR control. (Default: off)
+
+=item B<-h/--help>
+
+Show help information.
+
+=back
+
+=head1 DESCRIPTION
+
+This program controls the false discovery rate and reports differentially expressed genes/transcripts.
+
+=head1 EXAMPLES
+
+We assume that we have 'GeneMat.results' as input. We want to control FDR at 0.05 using hard threshold method and name the output file as 'GeneMat.de.txt':
+
+ rsem-control-fdr GeneMat.results 0.05 GeneMat.de.txt
+
+=cut
diff --git a/rsem-gen-transcript-plots b/rsem-gen-transcript-plots
new file mode 100755
index 0000000..d43fe62
--- /dev/null
+++ b/rsem-gen-transcript-plots
@@ -0,0 +1,173 @@
+#!/usr/bin/env Rscript
+
+### Some constants
+
+
+# Fixed page grid used by generate_a_page() when IDs are plotted individually (non-composite mode).
+nrow_per_page = 3 # if input_list is composed of transcript ids
+ncol_per_page = 2 # if input_list is composed of transcript ids
+# Number of plots plot_individual() places on one page.
+num_plots_per_page = nrow_per_page * ncol_per_page # if input_list is composed of transcript/allele ids
+
+
+### Load program arguments
+
+
+# Terminate the script with exit status 1, after writing errmsg to stderr, when expr does not hold.
+assert = function(expr, errmsg) {
+  violated = !expr
+  if (violated) {
+    cat(errmsg, "\n", sep = "", file = stderr())
+    quit(save = "no", status = 1)
+  }
+}
+
+# Exactly six positional arguments are required; anything else prints the usage line and exits.
+args = commandArgs(TRUE)
+assert(length(args) == 6, "Usage: rsem-gen-transcript-plots sample_name input_list is_allele_specific id_type<0,allele;1,isoform;2,gene> show_uniq output_plot_file")
+
+sample_name = args[1]
+input_list = args[2]
+# The three flags below arrive as strings and are converted to numbers; they are later used as conditions.
+alleleS = as.numeric(args[3])
+id_type = as.numeric(args[4])
+show_uniq = as.numeric(args[5])
+output_plot_file = args[6]
+
+
+### Load read depth files
+
+
+# Read a tab-separated read depth table and use its first column as the row names.
+load_read_depth = function(file) {
+  tbl = read.table(file, sep = "\t", stringsAsFactors = FALSE)
+  first_col = tbl[, 1]
+  rownames(tbl) = first_col
+  tbl
+}
+
+# Read depth over all reads: M rows, ord_depth is the row permutation that sorts them by column 1 (the ID).
+readdepth = load_read_depth(sprintf("%s.transcript.readdepth", sample_name))
+M = dim(readdepth)[1]
+ord_depth = order(readdepth[,1])
+
+# all2uniq[i] is the row of readdepth_uniq holding the same ID as row i of readdepth; it stays empty unless show_uniq is set.
+all2uniq = c()
+if (show_uniq) {
+ readdepth_uniq = load_read_depth(sprintf("%s.uniq.transcript.readdepth", sample_name))
+ ord_uniq_depth = order(readdepth_uniq[,1])
+ # After sorting both tables by ID, columns 1 (IDs) and 2 (lengths) must agree row by row.
+ assert(sum(readdepth[ord_depth,1] != readdepth_uniq[ord_uniq_depth,1]) == 0, "transcript/allele IDS in read depth and unique read depth files are not the same!")
+ assert(sum(readdepth[ord_depth,2] != readdepth_uniq[ord_uniq_depth,2]) == 0, "transcript lengths in read depth and unique read depth files are not the same!")
+ all2uniq[ord_depth] = ord_uniq_depth
+}
+
+cat("Loading read depth files is done!\n")
+
+
+### Build Gene-Isoform/Gene-Allele map and maps between IDs and ID_NAMEs
+
+
+# TRUE when a equals the leading nchar(a) characters of b, i.e. a is a prefix of b.
+id_equal = function(a, b) {
+  b_prefix = substr(b, 1, nchar(a))
+  a == b_prefix
+}
+
+
+# Expression table: '<sample>.alleles.results' for allele-specific runs, '<sample>.isoforms.results' otherwise.
+expr_data = read.delim(sprintf("%s.%s.results", sample_name, ifelse(alleleS, "alleles", "isoforms")), stringsAsFactors = FALSE)
+assert(M == dim(expr_data)[1], "The number of transcripts/alleles contained in the expression file is not equal to the number in the readdepth file!")
+ord_expr = order(expr_data[,1])
+
+# Row by row in sorted order, each read depth ID must be a prefix of the matching expression ID (see id_equal).
+assert(sum(sapply(1:M, function(i) { !id_equal(readdepth[ord_depth[i], 1], expr_data[ord_expr[i], 1]) })) == 0, "Transcript/Allele IDs in the expression file is not exactly the same as the ones in the readdepth file!")
+
+# expr2depth[i] is the readdepth row with the same sorted rank as row i of expr_data; entries are named by expression ID.
+expr2depth = c() # from id_name to pos
+expr2depth[ord_expr] = ord_depth
+names(expr2depth) = expr_data[,1]
+
+# Composite mode: one page per group of transcripts/alleles (id_type 2 without alleles, id_type 1 or 2 with alleles).
+is_composite = (!alleleS && (id_type == 2)) || (alleleS && (id_type > 0))
+
+if (is_composite) {
+ # Group readdepth positions by expr_data column 3 (allele-specific with id_type 2) or column 2 (all other composite cases).
+ tmp_df = data.frame(expr2depth, expr_data[,ifelse(alleleS && id_type == 2, 3, 2)], stringsAsFactors = F)
+ # tmp_agg column 1 holds the group ID, column 2 the positions collected for that group.
+ tmp_agg = aggregate(tmp_df[1], tmp_df[2], function(x) { x })
+}
+
+cat("Building transcript to gene map is done!\n")
+
+
+### Load and transfer IDs
+
+
+# One requested ID per line of input_list.
+ids = scan(file = input_list, what = "", sep = "\n", strip.white = T)
+assert(length(ids) > 0, "You should provide at least one ID.")
+poses = c()
+
+if (is_composite) {
+ # Partial matching against group IDs: charmatch yields 0 for ambiguous matches and -1 (nomatch) for none.
+ poses = charmatch(ids, tmp_agg[,1], nomatch = -1)
+} else {
+ # Exact match against expression IDs first, translated to readdepth rows ...
+ poses = match(ids, expr_data[,1])
+ idx = !is.na(poses)
+ poses[idx] = expr2depth[poses[idx]]
+ # ... then retry the remaining IDs directly against the readdepth IDs.
+ poses[!idx] = match(ids[!idx], readdepth[,1], nomatch = -1)
+}
+
+# Anything below 1 (no match or ambiguous match) is reported and dropped.
+err_idx = poses < 1
+if (sum(err_idx) > 0) {
+ cat("Warning: The following IDs are not in the RSEM indices and thus ignored: ")
+ cat(ids[err_idx], sep = ", ")
+ cat("\n")
+}
+
+ids = ids[!err_idx]
+poses = poses[!err_idx]
+
+assert(length(poses) > 0, "There is no valid ID. Stopped.")
+
+
+### Generate plots
+
+# Draw the per-position read depth of one transcript/allele on the current device.
+# pos is a number indexing the position in readdepth; when show_uniq is set,
+# all2uniq[pos] selects the matching row of readdepth_uniq.
+make_a_plot = function(pos) {
+  tid = readdepth[pos, 1] # ID shown in the plot title and in the warning below
+  len = readdepth[pos, 2]
+  depths = readdepth[pos, 3]
+
+  # A missing depth string is treated as zero depth at every position.
+  if (is.na(depths)) wiggle = rep(0, len) else wiggle = as.numeric(unlist(strsplit(depths, split = " ")))
+
+  if (!show_uniq) {
+    plot(wiggle, type = "h")
+  } else {
+    depths = readdepth_uniq[all2uniq[pos], 3]
+    if (is.na(depths)) wiggle_uniq = rep(0, len) else wiggle_uniq = as.numeric(unlist(strsplit(depths, split = " ")))
+    if (len != sum(wiggle >= wiggle_uniq)) {
+      # The original message referenced an undefined variable 'id', so this branch failed instead of warning.
+      cat("Warning: ", ifelse(alleleS, "allele-specific transcript", "transcript"), " ", tid, " has position(s) that read covarege with multireads is smaller than read covarge without multireads.\n", " The 1-based position(s) is(are) : ", which(wiggle < wiggle_uniq), ".\n", " This may be due to floating point arithmetics.\n", sep = "")
+    }
+    # Stacked bars: unique-read depth in black, the remaining depth in red.
+    heights = rbind(wiggle_uniq, wiggle - wiggle_uniq)
+    barplot(heights, space = 0, border = NA, names.arg = 1:len, col = c("black", "red"))
+  }
+  title(main = tid)
+}
+
+# Lay out one page and draw a plot for every position in poses (a vector of numbers).
+# Composite pages use a near-square grid sized to the group and carry 'title' in the outer margin;
+# individual pages use the fixed nrow_per_page x ncol_per_page grid.
+generate_a_page = function(poses, title = NULL) {
+  n = length(poses)
+  if (is_composite) {
+    ncol = floor(sqrt(n))
+    nrow = ceiling(n / ncol)
+  } else {
+    ncol = ncol_per_page
+    nrow = nrow_per_page
+  }
+
+  par(mfrow = c(nrow, ncol), mar = c(2, 2, 2, 2))
+  if (is_composite) par(oma = c(0, 0, 3, 0))
+  sapply(poses, make_a_plot)
+  if (is_composite) mtext(title, outer = TRUE, line = 1)
+}
+
+# Draw page i (1-based) of the individually plotted IDs: at most num_plots_per_page entries of poses.
+plot_individual = function(i) {
+  first = (i - 1) * num_plots_per_page + 1
+  last = min(i * num_plots_per_page, n)
+  page_poses = poses[first:last]
+  generate_a_page(page_poses)
+}
+
+# Draw one page for a composite ID (a gene id, or a transcript id for allele-specific expression).
+# pos is a row of tmp_agg: column 1 is used as the page title, column 2 supplies the positions to plot.
+plot_composite = function(pos) {
+  members = tmp_agg[pos, 2][[1]]
+  page_title = tmp_agg[pos, 1]
+  generate_a_page(members, page_title)
+}
+
+
+# All pages go to a single PDF device.
+pdf(output_plot_file)
+
+if (!is_composite) {
+ # n is read by plot_individual(); ub is the number of pages needed for n plots.
+ n = length(ids)
+ ub = (n - 1) %/% num_plots_per_page + 1
+ dumbvar = sapply(1:ub, plot_individual)
+} else {
+ # One page per requested composite ID.
+ dumbvar = sapply(poses, plot_composite)
+}
+
+cat("Plots are generated!\n")
+
+# Assigning the result keeps dev.off() from printing the device it returns.
+dev.off.output = dev.off()
diff --git a/rsem-generate-data-matrix b/rsem-generate-data-matrix
new file mode 100755
index 0000000..26390c1
--- /dev/null
+++ b/rsem-generate-data-matrix
@@ -0,0 +1,77 @@
+#!/usr/bin/env perl
+
+use strict;
+
+# At least one result file is required; otherwise print usage and exit.
+if (scalar(@ARGV) == 0) {
+ print "Usage: rsem-generate-data-matrix sampleA.[alleles/genes/isoforms].results sampleB.[alleles/genes/isoforms].results ... > output_name.matrix\n";
+ print "All result files should have the same file type. The 'expected_count' columns of every result file are extracted to form the data matrix.\n";
+ exit(-1);
+}
+
+# 0-based index of the column loadData extracts; allele-level files use index 5 instead of 4.
+my $offsite = 4; # for new file formats
+if ($ARGV[0] =~ /alleles.results$/) { $offsite = 5; }
+
+my $line;
+# $n: number of input files; $M: number of data rows per file (-1 until the first file is loaded).
+my $n = scalar(@ARGV);
+my $M = -1;
+# One array reference per output column.
+my @matrix = ();
+
+# Load one result file.
+# Arguments: 0, file_name; 1, reference of expected count array; 2, reference of transcript_id/gene_id array.
+# Appends the quoted first field of every data row to the ID array and field $offsite to the count array;
+# exits with an error if no data row was read.
+sub loadData {
+    my ($file, $ec_ref, $id_ref) = @_;
+
+    open(INPUT, $file);
+    my $record = <INPUT>; # The first line contains only column names
+    while ($record = <INPUT>) {
+        chomp($record);
+        my @cols = split(/\t/, $record);
+        push(@$id_ref, "\"$cols[0]\"");
+        push(@$ec_ref, $cols[$offsite]);
+    }
+    close(INPUT);
+
+    if (scalar(@$ec_ref) == 0) {
+        print STDERR "Nothing is detected! $file may not exist or is empty.\n";
+        exit(-1);
+    }
+}
+
+# Verify that a newly loaded sample lists exactly the same IDs as the first sample.
+# Arguments: 0, M; 1, reference of @ids_arr; 2, reference of @ids.
+# Returns 1 when @ids has exactly M entries and each equals the corresponding entry of @ids_arr, 0 otherwise.
+sub check {
+    my ($size, $expected, $actual) = @_;
+
+    # A sample with extra rows used to pass because only the first M ids were compared,
+    # and its surplus rows were then silently dropped from the matrix.
+    return 0 if scalar(@$actual) != $size;
+
+    for my $i (0 .. $size - 1) {
+        return 0 if $expected->[$i] ne $actual->[$i];
+    }
+    return 1;
+}
+
+# IDs of the first sample; every later sample is compared against them.
+my @ids_arr = ();
+
+for (my $i = 0; $i < $n; $i++) {
+ my (@ids, @ecs) = ();
+ &loadData($ARGV[$i], \@ecs, \@ids);
+
+ if ($M < 0) {
+ # First file: remember its row count and IDs as the reference.
+ $M = scalar(@ids);
+ @ids_arr = @ids;
+ }
+ elsif (!&check($M, \@ids_arr, \@ids)) {
+ print STDERR "Number of lines among samples are not equal!\n";
+ exit(-1);
+ }
+
+ # Column header: the quoted file name with a leading "./" removed.
+ my $colname;
+ if (substr($ARGV[$i], 0, 2) eq "./") { $colname = substr($ARGV[$i], 2); }
+ else { $colname = $ARGV[$i]; }
+ $colname = "\"$colname\"";
+ @ecs = ($colname, @ecs);
+ push(@matrix, \@ecs);
+}
+
+# The ID column goes first, with an empty header cell.
+@ids_arr = ("", @ids_arr);
+@matrix = (\@ids_arr, @matrix);
+
+# @matrix holds $n + 1 columns of $M + 1 cells each; print it row by row, tab separated.
+for (my $i = 0; $i <= $M; $i++) {
+ for (my $j = 0; $j < $n; $j++) { print "$matrix[$j][$i]\t"; }
+ print "$matrix[$n][$i]\n";
+}
diff --git a/rsem-generate-ngvector b/rsem-generate-ngvector
new file mode 100755
index 0000000..4eac284
--- /dev/null
+++ b/rsem-generate-ngvector
@@ -0,0 +1,104 @@
+#!/usr/bin/env perl
+
+use Getopt::Long;
+use Pod::Usage;
+
+use FindBin;
+use lib $FindBin::RealBin;
+use rsem_perl_utils;
+
+use Env qw(@PATH);
+@PATH = ("$FindBin::RealBin/EBSeq", @PATH);
+
+use strict;
+
+# k-mer length handed to the clustering-info tool; overridable with -k.
+my $k = 25;
+my $help = 0;
+
+GetOptions("k=i" => \$k,
+           "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
+
+pod2usage(-verbose => 2) if ($help == 1);
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
+
+my ($fasta_file, $output_name) = @ARGV;
+my $ump_file = "$output_name.ump";
+
+# Step 1: write clustering info computed from the FASTA file to '<output_name>.ump'.
+my $command = "rsem-for-ebseq-calculate-clustering-info $k $fasta_file $ump_file";
+&runCommand($command);
+
+# Step 2: turn that file into '<output_name>.ngvec'.
+$command = "rsem-for-ebseq-generate-ngvector-from-clustering-info $ump_file $output_name.ngvec";
+&runCommand($command);
+
+__END__
+
+=head1 NAME
+
+rsem-generate-ngvector - Create Ng vector for EBSeq based only on transcript sequences.
+
+=head1 SYNOPSIS
+
+rsem-generate-ngvector [options] input_fasta_file output_name
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<input_fasta_file>
+
+The fasta file containing all reference transcripts. The transcripts must be in the same order as those in expression value files. Thus, 'reference_name.transcripts.fa' generated by 'rsem-prepare-reference' should be used.
+
+=item B<output_name>
+
+The name of all output files. The Ng vector will be stored as 'output_name.ngvec'.
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<-k> <int>
+
+k mer length. See description section. (Default: 25)
+
+=item B<-h/--help>
+
+Show help information.
+
+=back
+
+=head1 DESCRIPTION
+
+This program generates the Ng vector required by EBSeq for isoform level differential expression analysis based on reference sequences only. EBSeq can take variance due to read mapping ambiguity into consideration by grouping isoforms with parent gene's number of isoforms. However, for de novo assembled transcriptome, it is hard to obtain an accurate gene-isoform relationship. Instead, this program groups isoforms by using measures on read mapping ambiguity directly. First, it calculates the 'unmappability' of each transcript. The 'unmappability' of a transcript is the ratio between the number of k mers with at least one perfect match to other transcripts and the total number of k mers of this transcript, where k is a parameter. Then, Ng vector is generated by applying Kmeans algorithm to the 'unmappability' values with number of clusters set as 3. 'rsem-generate-ngvector' will make sure the mean 'unmappability' scores for clusters are in ascending order. All transcripts whose lengths are less than k are assigned to cluster 3.
+
+If your reference is a de novo assembled transcript set, you should run 'rsem-generate-ngvector' first. Then load the resulting 'output_name.ngvec' into R. For example, you can use
+
+ NgVec <- scan(file="output_name.ngvec", what=0, sep="\n")
+
+. After that, replace 'IsoNgTrun' with 'NgVec' in the second line of section 3.2.5 (Page 10) of EBSeq's vignette:
+
+ IsoEBres=EBTest(Data=IsoMat, NgVector=NgVec, ...)
+
+This program only needs to run once per RSEM reference.
+
+=head1 OUTPUT
+
+=over
+
+=item B<output_name.ump>
+
+'unmappability' scores for each transcript. This file contains two columns. The first column is transcript name and the second column is 'unmappability' score.
+
+=item B<output_name.ngvec>
+
+Ng vector generated by this program.
+
+=back
+
+=head1 EXAMPLES
+
+Suppose the reference sequences file is '/ref/mouse_125/mouse_125.transcripts.fa' and we set the output_name as 'mouse_125':
+
+ rsem-generate-ngvector /ref/mouse_125/mouse_125.transcripts.fa mouse_125
+
+=cut
diff --git a/rsem-gff3-to-gtf b/rsem-gff3-to-gtf
new file mode 100755
index 0000000..ce6e1fd
--- /dev/null
+++ b/rsem-gff3-to-gtf
@@ -0,0 +1,309 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2016
+# Bo Li (University of California, Berkeley)
+# bli25@berkeley.edu
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+# USA
+
+import os
+import sys
+import argparse
+from operator import itemgetter
+
+# GFF3 feature types (third column) grouped by the category Feature.gen_type_dict() assigns to them.
+# Types not listed in any table map to no category, and the main loop skips those lines.
+type_gene = ["gene", "snRNA_gene", "transposable_element_gene", "ncRNA_gene", "telomerase_RNA_gene",
+ "rRNA_gene", "tRNA_gene", "snoRNA_gene", "mt_gene", "miRNA_gene", "lincRNA_gene", "RNA", "VD_gene_segment"]
+type_transcript = ["transcript", "primary_transcript", "mRNA", "ncRNA", "tRNA", "rRNA", "snRNA", "snoRNA", "miRNA",
+ "pseudogenic_transcript", "lincRNA", "NMD_transcript_variant", "aberrant_processed_transcript",
+ "nc_primary_transcript", "processed_pseudogene", "mRNA_TE_gene"]
+# Every type listed here contributes an interval that is merged into its parent transcript's exons.
+type_exon = ["exon", "CDS", "five_prime_UTR", "three_prime_UTR", "UTR", "noncoding_exon", "pseudogenic_exon"]
+
+# can be either gene or transcript, need special treatment
+# (the main loop treats such a record as a transcript when it has a Parent attribute, as a gene otherwise)
+type_gene_or_transcript = ["pseudogene", "V_gene_segment", "C_gene_segment", "J_gene_segment", "processed_transcript"]
+
+
+class HelpOnErrorParser(argparse.ArgumentParser):
+ def error(self, msg):
+ sys.stderr.write("{0}: error: {1}\n\n".format(os.path.basename(sys.argv[0]), msg))
+ self.print_help()
+ sys.exit(-1)
+
+
+def my_assert(bool, msg):
+ if not bool:
+ sys.stderr.write(msg + "\n")
+ try:
+ os.remove(args.output_GTF_file)
+ except OSError:
+ pass
+ sys.exit(-1)
+
+
+class Feature:
+ # def gen_type_dict():
+ def gen_type_dict(self):
+ my_dict = {}
+ for my_type in type_gene:
+ my_dict[my_type] = "gene"
+ for my_type in type_transcript:
+ my_dict[my_type] = "transcript"
+ for my_type in type_exon:
+ my_dict[my_type] = "exon"
+
+ for my_type in type_gene_or_transcript:
+ my_dict[my_type] = "gene_or_transcript"
+
+ return my_dict
+
+ # type_dict = gen_type_dict()
+
+ def __init__(self):
+ self.type_dict = self.gen_type_dict()
+
+ def parse(self, line, line_no):
+ """ line should be free of leading and trailing spaces """
+
+ self.line = line
+ self.line_no = line_no
+
+ fields = line.split('\t')
+ my_assert(len(fields) == 9, "Line {0} does not have 9 fields:\n{1}".format(self.line_no, self.line))
+
+ self.seqid = fields[0]
+ self.source = fields[1]
+ self.original_type = fields[2]
+ self.feature_type = self.type_dict.get(fields[2], None)
+ self.start = int(fields[3])
+ self.end = int(fields[4])
+ self.strand = fields[6]
+ self.attributes = fields[8][:-1] if len(fields[8]) > 0 and fields[8][-1] == ';' else fields[8]
+
+ def parseAttributes(self):
+ self.attribute_dict = {}
+ for attribute in self.attributes.split(';'):
+ fields = attribute.split('=')
+ my_assert(len(fields) == 2, "Fail to parse attribute {0} of line {1}:\n{2}".format(attribute, self.line_no, self.line))
+ tag, value = fields
+ if tag == "Parent":
+ self.attribute_dict[tag] = value.split(',')
+ else:
+ self.attribute_dict[tag] = value
+
+ def getAttribute(self, tag, required = False):
+ value = self.attribute_dict.get(tag, None)
+ my_assert(not required or value != None, "Line {0} does not have attribute {1}:\n{2}".format(self.line_no, tag, self.line))
+ return value
+
+
+class Transcript:
+ def __init__(self, tid, feature):
+ self.tid = tid
+ self.tname = self.ttype = None
+ self.gid = self.gname = None
+ self.setT = False # if a transcript feature has been set
+
+ self.seqid = feature.seqid
+ # self.source = feature.source
+ self.source = None
+ self.strand = feature.strand
+
+ self.intervals = []
+
+ def setTranscript(self, feature):
+ my_assert(not self.setT,
+ "Transcript {0} appears multiple times! Last occurrence is at line {1}:\n{2}".format(self.tid, feature.line_no, feature.line))
+ self.setT = True
+ parents = feature.getAttribute("Parent", True)
+ my_assert(len(parents) == 1, "Transcript {0} at line {1} has more than one parents:\n{2}".format(self.tid, feature.line_no, feature.line))
+ self.gid = parents[0]
+ self.tname = feature.getAttribute("Name")
+ self.ttype = feature.original_type
+ self.source = feature.source
+
+ def addExon(self, feature):
+ self.intervals.append((feature.start, feature.end))
+
+ def merge(self):
+ self.intervals.sort(key = itemgetter(0))
+ self.results = []
+ cstart, cend = self.intervals[0]
+ for start, end in self.intervals[1:]:
+ if cend + 1 >= start:
+ cend = max(cend, end)
+ else:
+ self.results.append((cstart, cend))
+ cstart = start
+ cend = end
+ self.results.append((cstart, cend))
+
+ def __iter__(self):
+ self.index = 0
+ return self
+
+ def next(self):
+ if self.index == len(self.results):
+ raise StopIteration
+ interval = self.results[self.index]
+ self.index += 1
+ return interval
+
+ def __next__(self):
+ if self.index == len(self.results):
+ raise StopIteration
+ interval = self.results[self.index]
+ self.index += 1
+ return interval
+
+
+def getTranscript(tid, feature):
+ assert tid != None
+
+ pos = tid2pos.get(tid, None)
+ if pos == None:
+ transcript = Transcript(tid, feature)
+ tid2pos[tid] = len(transcripts)
+ transcripts.append(transcript)
+ else:
+ my_assert(pos >= 0,
+ "Line {0} describes an already processed Transcript {1}:\n{2}".format(feature.line_no, tid, feature.line))
+ transcript = transcripts[pos]
+ my_assert(transcript.seqid == feature.seqid and transcript.strand == feature.strand,
+ "Line {0}'s seqid/strand is not consistent with other records of transcript {1}:\n{2}".format(
+ feature.line_no, tid, feature.line))
+
+ return transcript
+
+def flush_out(fout):
+ global transcripts
+ global tid2pos
+ global num_trans
+ global patterns
+
+ for transcript in transcripts:
+ tid2pos[transcript.tid] = -1
+ if not transcript.setT or len(transcript.intervals) == 0 or (len(patterns) > 0 and transcript.ttype not in patterns):
+ continue
+
+ my_assert(transcript.gid in gid2gname,
+ "Cannot recognize transcript {0}'s parent {1}, a gene feature might be missing.".format(transcript.tid, transcript.gid))
+
+ transcript.gname = gid2gname[transcript.gid]
+
+ transcript.merge()
+
+ output_string = "{0}\t{1}\texon\t{{0}}\t{{1}}\t.\t{2}\t.\tgene_id \"{3}\"; transcript_id \"{4}\";".format(
+ transcript.seqid, transcript.source, transcript.strand, transcript.gid, transcript.tid)
+ if transcript.gname != None:
+ output_string += " gene_name \"{0}\";".format(transcript.gname)
+ if transcript.tname != None:
+ output_string += " transcript_name \"{0}\";".format(transcript.tname)
+ output_string += "\n"
+
+ for start, end in transcript:
+ fout.write(output_string.format(start, end))
+
+ num_trans += 1
+
+ transcripts = []
+
+
+
+# Command line: two positional files plus three optional switches.
+parser = HelpOnErrorParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter, description = "Convert GFF3 files to GTF files.")
+parser.add_argument("input_GFF3_file", help = "Input GFF3 file.")
+parser.add_argument("output_GTF_file", help = "Output GTF file.")
+parser.add_argument("--make-genes-as-transcripts", help = "GFF3 file does not contain transcripts, make each gene as a transcript.", action = "store_true")
+parser.add_argument("--RNA-patterns", help = "Types of RNAs to be extracted, e.g. mRNA,rRNA", metavar = "<patterns>")
+parser.add_argument("--extract-sequences", help = "If GFF3 file contains reference sequences, extract them to the specified file", metavar = "<output.fa>")
+args = parser.parse_args()
+
+# Transcript types to keep; an empty set means no filtering (see flush_out).
+patterns = set()
+if args.RNA_patterns != None:
+ patterns = set(args.RNA_patterns.split(','))
+
+line_no = 0
+# A single Feature object is reused for every input line.
+feature = Feature()
+
+# gene ID -> value of the gene's Name attribute (None if the gene has no Name)
+gid2gname = {}
+
+# transcript ID -> index into transcripts, or -1 once the transcript has been flushed
+tid2pos = {}
+# transcripts buffered since the last flush
+transcripts = []
+
+# number of transcripts written to the GTF file so far
+num_trans = 0
+
+# set once the "##FASTA" line is reached
+reachFASTA = False
+
+with open(args.input_GFF3_file) as fin:
+ fout = open(args.output_GTF_file, "w")
+
+ for line in fin:
+ line = line.strip()
+ line_no += 1
+ if line_no % 100000 == 0:
+ print("Loaded {0} lines".format(line_no))
+
+ if line.startswith("##FASTA"):
+ reachFASTA = True
+ break
+
+ if line.startswith("###"):
+ flush_out(fout)
+ continue
+
+ if line.startswith("#"):
+ continue
+
+ feature.parse(line, line_no)
+ if feature.feature_type == None:
+ continue
+ feature.parseAttributes()
+
+ if feature.feature_type == "gene_or_transcript":
+ parent = feature.getAttribute("Parent")
+ if parent == None:
+ feature.feature_type = "gene"
+ else:
+ feature.feature_type = "transcript"
+
+ if feature.feature_type == "gene":
+ gid = feature.getAttribute("ID", True)
+ my_assert(gid not in gid2gname,
+ "Gene {0} appears multiple times! Last occurrence is at line {1}:\n{2}".format(gid, feature.line_no, feature.line))
+ gid2gname[gid] = feature.getAttribute("Name")
+
+ if args.make_genes_as_transcripts:
+ feature.feature_type = feature.original_type = "transcript"
+ feature.attribute_dict["Parent"] = [feature.attribute_dict["ID"]]
+
+ if feature.feature_type == "transcript":
+ transcript = getTranscript(feature.getAttribute("ID", True), feature)
+ transcript.setTranscript(feature)
+
+ if feature.feature_type == "exon":
+ for parent in feature.getAttribute("Parent", True):
+ transcript = getTranscript(parent, feature)
+ transcript.addExon(feature)
+
+ flush_out(fout)
+ fout.close()
+
+ print("GTF file is successully generated.")
+ print("There are {0} transcripts contained in the generated GTF file.".format(num_trans))
+
+ if reachFASTA and args.extract_sequences != None:
+ with open(args.extract_sequences, "w") as fout:
+ for line in fin:
+ fout.write(line)
+ print("FASTA file is successfully generated.")
diff --git a/rsem-plot-model b/rsem-plot-model
new file mode 100755
index 0000000..3908e22
--- /dev/null
+++ b/rsem-plot-model
@@ -0,0 +1,169 @@
+#!/usr/bin/env Rscript
+
+# Exactly two arguments are required: the sample name and the output plot file.
+argv <- commandArgs(TRUE)
+if (length(argv) != 2) {
+ cat("Usage: rsem-plot-model sample_name output_plot_file\n")
+ q(status = 1)
+}
+
+# token is the last "/"-separated component of sample_name; it is the file name
+# prefix used inside the "<sample_name>.stat" directory.
+strvec <- strsplit(argv[1], split = "/")[[1]]
+token <- strvec[length(strvec)]
+
+stat.dir <- paste(argv[1], ".stat", sep = "")
+if (!file.exists(stat.dir)) {
+ cat("Error: directory does not exist: ", stat.dir, "\n", sep = "")
+ q(status = 1)
+}
+modelF <- paste(stat.dir, "/", token, ".model", sep = "")
+cntF <- paste(stat.dir, "/", token, ".cnt", sep = "")
+
+# All plots produced below go into this PDF device.
+pdf(argv[2])
+
+# The model file is read sequentially through this connection; every
+# readLines() call below advances the read position.
+con <- file(modelF, open = "r")
+
+# model type and forward probability
+# Four lines are consumed here; only the first one (the model type) is kept.
+model_type <- as.numeric(readLines(con, n = 4)[1])
+
+# fragment length distribution
+# Three lines are consumed: the first holds the bounds (vec[1], vec[2]), the
+# second the probabilities for lengths vec[1] + 1 .. vec[2]; the third is unused.
+strvec <- readLines(con, n = 3)
+vec <- as.numeric(strsplit(strvec[1], split = " ")[[1]])
+maxL <- vec[2] # maxL used for Profile
+x <- (vec[1] + 1) : vec[2]
+y <- as.numeric(strsplit(strvec[2], split = " ")[[1]])
+# which.max() returns a single index even when several lengths tie for the
+# maximum, so the subtitle and the marker line always describe exactly one mode.
+mode_len = which.max(y) + vec[1]
+mean <- weighted.mean(x, y)
+std <- sqrt(weighted.mean((x - mean)^2, y))
+plot(x, y, type = "h",
+     main = "Fragment Length Distribution",
+     sub = sprintf("Mode = %d, Mean = %.1f, and Std = %.1f", mode_len, mean, std),
+     xlab = "Fragment Length",
+     ylab = "Probability")
+abline(v = mode_len, col = "red", lty = "dashed")
+
+# mate length distribution
+# For model types 0 and 1 a flag line is read that says whether a mate length
+# distribution follows; for the other model types it is assumed to be present.
+if (model_type == 0 || model_type == 1) bval <- as.numeric(readLines(con, n = 1)[1]) else bval <- 1
+
+if (bval == 1) {
+    # Two lines: the bounds (vec[1], vec[2]) and the probabilities for
+    # lengths vec[1] + 1 .. vec[2].
+    list <- strsplit(readLines(con, n = 2), split = " ")
+    vec <- as.numeric(list[[1]])
+    maxL <- vec[2]
+    x <- (vec[1] + 1) : vec[2]
+    y <- as.numeric(list[[2]])
+    # which.max() returns a single index even when several lengths tie for the
+    # maximum, so the subtitle always reports exactly one mode.
+    mode_len = which.max(y) + vec[1]
+    mean <- weighted.mean(x, y)
+    std <- sqrt(weighted.mean((x - mean)^2, y))
+    plot(x, y, type = "h",
+         main = "Read Length Distribution",
+         sub = sprintf("Mode = %d, Mean = %.1f, and Std = %.1f", mode_len, mean, std),
+         xlab = "Read Length",
+         ylab = "Probability")
+}
+# Consume one more line; its content is not used.
+strvec <- readLines(con, n = 1)
+
+# RSPD
+# bval is a flag read from the model file; the distribution is plotted only
+# when it equals 1.
+bval <- as.numeric(readLines(con, n = 1)[1])
+if (bval == 1) {
+ # Next two lines: bin_size (used as the number of bar labels), then the values
+ # plotted as one bar per bin.
+ bin_size <- as.numeric(readLines(con, n = 1)[1])
+ y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]])
+ # Shrink the axis labels for this plot only, then restore the default.
+ par(cex.axis = 0.7)
+ barplot(y, space = 0, names.arg = 1:bin_size, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability")
+ par(cex.axis = 1.0)
+}
+# Consume one more line; its content is not used.
+strvec <- readLines(con, n = 1)
+
+# plot sequencing errors
+# Model types 1 and 3 take the quality-score branch (QProfile); all other model
+# types take the position-based branch (Profile).
+if (model_type == 1 || model_type == 3) {
+ # skip QD
+ # N is read from the file; it sets both how many lines are skipped here and
+ # how many records the loop below reads.
+ N <- as.numeric(readLines(con, n = 1)[1])
+ readLines(con, n = N + 1)
+ readLines(con, n = 1) # for the blank line
+
+ # QProfile
+ readLines(con, n = 1)
+
+ x <- c()
+ peA <- c() # probability of sequencing error given reference base is A
+ peC <- c()
+ peG <- c()
+ peT <- c()
+
+ for (i in 1 : N) {
+ # Each record is read as 6 lines; only the first 4 are used, one per
+ # reference base in the order A, C, G, T.
+ strvec <- readLines(con, n = 6)
+ list <- strsplit(strvec[1:4], split = " ")
+
+ vecA <- as.numeric(list[[1]])
+ vecC <- as.numeric(list[[2]])
+ vecG <- as.numeric(list[[3]])
+ vecT <- as.numeric(list[[4]])
+
+ # Skip records whose values sum to (almost) zero.
+ if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) next
+ # Record i is plotted at x = i - 1.
+ x <- c(x, (i - 1))
+ # 1 - (entry for the matching base) is converted to a Phred-style value
+ # with -10 * log10(); a base whose row sums to (almost) zero becomes NA.
+ peA <- c(peA, ifelse(sum(vecA) < 1e-8, NA, -10 * log10(1.0 - vecA[1])))
+ peC <- c(peC, ifelse(sum(vecC) < 1e-8, NA, -10 * log10(1.0 - vecC[2])))
+ peG <- c(peG, ifelse(sum(vecG) < 1e-8, NA, -10 * log10(1.0 - vecG[3])))
+ peT <- c(peT, ifelse(sum(vecT) < 1e-8, NA, -10 * log10(1.0 - vecT[4])))
+ }
+
+ matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4,
+ main = "Observed Quality vs. Phred Quality Score",
+ xlab = "Phred Quality Score",
+ ylab = "Observed Quality")
+ legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
+} else {
+ # Profile
+ readLines(con, n = 1)
+
+ x <- c()
+ peA <- c() # probability of sequencing error given reference base is A
+ peC <- c()
+ peG <- c()
+ peT <- c()
+
+ # One 6-line record is read per position, 1 .. maxL (maxL was set while
+ # reading the length distributions above).
+ for (i in 1: maxL) {
+ strvec <- readLines(con, n = 6)
+ list <- strsplit(strvec[1:4], split = " ")
+
+ vecA <- as.numeric(list[[1]])
+ vecC <- as.numeric(list[[2]])
+ vecG <- as.numeric(list[[3]])
+ vecT <- as.numeric(list[[4]])
+
+ # Skip positions whose values sum to (almost) zero.
+ if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) next
+ x <- c(x, i)
+ # Error percentage = (1 - entry for the matching base) * 100.
+ peA <- c(peA, ifelse(sum(vecA) < 1e-8, NA, (1.0 - vecA[1]) * 100))
+ peC <- c(peC, ifelse(sum(vecC) < 1e-8, NA, (1.0 - vecC[2]) * 100))
+ peG <- c(peG, ifelse(sum(vecG) < 1e-8, NA, (1.0 - vecG[3]) * 100))
+ peT <- c(peT, ifelse(sum(vecT) < 1e-8, NA, (1.0 - vecT[4]) * 100))
+ }
+
+ matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4, main = "Position vs. Percentage Sequence Error", xlab = "Position", ylab = "Percentage of Sequencing Error")
+ legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
+}
+
+close(con)
+
+# Alignment statistics
+# The first three lines of the .cnt file are skipped. Each remaining row is
+# (number of alignments, number of reads); the last row is plotted as a
+# separate final bar, which the pie chart labels "Filtered".
+pair <- read.table(file = cntF, skip = 3, sep = "\t")
+
+stat_len = dim(pair)[1]
+upper_bound = pair[stat_len - 1, 1]
+my_labels = append(0:upper_bound, pair[stat_len, 1])
+my_heights = rep(0, upper_bound + 2)
+# Bar k + 1 holds the number of reads with k alignments; counts that are absent
+# from the file stay 0.
+counted_rows = 1:(stat_len - 1)
+my_heights[pair[counted_rows, 1] + 1] = pair[counted_rows, 2]
+my_heights[upper_bound + 2] = pair[stat_len, 2]
+my_colors = c("green", "blue", rep("dimgrey", max(upper_bound - 1, 0)), "red")
+
+barplot(my_heights, names.arg = my_labels,
+        col = my_colors, border = NA,
+        xlab = "Number of alignments per read",
+        ylab = "Number of reads",
+        main = "Alignment statistics")
+
+# Multi-reads are the bars for 2 .. upper_bound alignments. With upper_bound < 2
+# there are none; 3:(upper_bound + 1) would then count downwards and wrongly
+# pick up the "Unique" and "Filtered" bars.
+multi_reads = if (upper_bound >= 2) sum(my_heights[3:(upper_bound + 1)]) else 0
+pie_values = c(my_heights[1], my_heights[2], multi_reads, my_heights[upper_bound + 2])
+pie_names = c("Unalignable", "Unique", "Multi", "Filtered")
+pie_labels = sprintf("%s %.0f%%", pie_names, pie_values * 100.0 / sum(pie_values))
+# Draw the pie chart as an inset on top of the bar plot, then restore the full figure region.
+par(fig = c(0.4, 1, 0.35, 0.95), new = T)
+pie(pie_values, labels = pie_labels, col = c("green", "blue", "dimgrey", "red"), clockwise = T, init.angle = 270, cex = 0.8)
+par(fig = c(0, 1, 0, 1))
+
+dev.off.output <- dev.off()
diff --git a/rsem-plot-transcript-wiggles b/rsem-plot-transcript-wiggles
new file mode 100755
index 0000000..5c3b3cd
--- /dev/null
+++ b/rsem-plot-transcript-wiggles
@@ -0,0 +1,172 @@
+#!/usr/bin/env perl
+
+use Getopt::Long;
+use Pod::Usage;
+
+use FindBin;
+use lib $FindBin::RealBin;
+use rsem_perl_utils qw(runCommand getSAMTOOLS);
+
+use Env qw(@PATH);
+@PATH = ($FindBin::RealBin, "$FindBin::RealBin/" . getSAMTOOLS(), @PATH);
+
+use strict;
+
+
+# Option defaults:
+#   $gene_list       - 1 if the input list holds gene ids
+#   $transcript_list - 1 if the input list holds transcript ids (allele-specific runs only)
+#   $show_unique     - 1 to draw stacked plots that separate unique reads from multi-reads
+my ($gene_list, $transcript_list, $show_unique, $help) = (0, 0, 0, 0);
+my ($p, $mem) = (1, "1G");
+
+GetOptions("gene-list" => \$gene_list,
+           "transcript-list" => \$transcript_list,
+           "show-unique" => \$show_unique,
+           "p|num-threads=i" => \$p,
+           "memory-per-thread=s" => \$mem,
+           "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
+
+if ($help == 1) {
+    pod2usage(-verbose => 2);
+}
+if (scalar(@ARGV) != 3) {
+    pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2);
+}
+
+# The run is allele-specific when RSEM produced an .alleles.results file for this sample.
+my $alleleS = (-e "$ARGV[0].alleles.results") ? 1 : 0;
+
+if (!$alleleS && $transcript_list) {
+    pod2usage(-msg => "--transcript-list cannot be set if allele-specific reference is not built!", -exitval => 2, -verbose => 2);
+}
+if ($gene_list && $transcript_list) {
+    pod2usage(-msg => "--gene-list and --transcript-list cannot be set at the same time!", -exitval => 2, -verbose => 2);
+}
+
+my $command = "";
+
+# Run $cmd only when the file it is expected to produce does not exist yet.
+sub run_unless_exists {
+    my ($target, $cmd) = @_;
+    return if (-e $target);
+    $command = $cmd;
+    &runCommand($command);
+}
+
+run_unless_exists("$ARGV[0].transcript.sorted.bam",
+                  "samtools sort -@ $p -m $mem -o $ARGV[0].transcript.sorted.bam $ARGV[0].transcript.bam");
+run_unless_exists("$ARGV[0].transcript.readdepth",
+                  "rsem-bam2readdepth $ARGV[0].transcript.sorted.bam $ARGV[0].transcript.readdepth");
+
+if ($show_unique) {
+    run_unless_exists("$ARGV[0].uniq.transcript.bam",
+                      "rsem-get-unique $p $ARGV[0].transcript.bam $ARGV[0].uniq.transcript.bam");
+    run_unless_exists("$ARGV[0].uniq.transcript.sorted.bam",
+                      "samtools sort -@ $p -m $mem -o $ARGV[0].uniq.transcript.sorted.bam $ARGV[0].uniq.transcript.bam");
+    run_unless_exists("$ARGV[0].uniq.transcript.readdepth",
+                      "rsem-bam2readdepth $ARGV[0].uniq.transcript.sorted.bam $ARGV[0].uniq.transcript.readdepth");
+}
+
+# id_type passed to rsem-gen-transcript-plots:
+#   2 when --gene-list is set; otherwise 0 for an allele-specific run without
+#   --transcript-list; otherwise 1.
+my $id_type;
+if ($gene_list) {
+    $id_type = 2;
+}
+elsif ($alleleS && !$transcript_list) {
+    $id_type = 0;
+}
+else {
+    $id_type = 1;
+}
+
+$command = "rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $alleleS $id_type $show_unique $ARGV[2]";
+&runCommand($command);
+
+__END__
+
+=head1 NAME
+
+rsem-plot-transcript-wiggles - Generate PDF wiggle plots from transcript or gene ids
+
+=head1 SYNOPSIS
+
+ rsem-plot-transcript-wiggles [options] sample_name input_list output_plot_file
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<sample_name>
+
+The name of the sample analyzed.
+
+=item B<input_list>
+
+A list of transcript ids or gene ids. But it cannot be a mixture of transcript & gene ids. Each id occupies one line without extra spaces.
+
+=item B<output_plot_file>
+
+The file name of the pdf file which contains all plots.
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<--gene-list>
+
+The input-list is a list of gene ids. (Default: off)
+
+=item B<--transcript-list>
+
+The input-list is a list of transcript ids. This option can only be turned on if allele-specific expression is calculated. (Default: off)
+
+=item B<--show-unique>
+
+Show the wiggle plots as stacked bar plots. See description section for details. (Default: off)
+
+=item B<-p/--num-threads> <int>
+
+Set the number of threads we can use. (Default: 1)
+
+=item B<--memory-per-thread> <string>
+
+Set the maximum allowable memory per thread. <string> represents the memory and accepts suffixes 'K/M/G'. (Default: 1G)
+
+=item B<-h/--help>
+
+Show help information.
+
+=back
+
+=head1 DESCRIPTION
+
+This program generates transcript wiggle plots and outputs them in a pdf file. This program can accept either a list of transcript ids or gene ids (if transcript to gene mapping information is provided) and has two modes of showing wiggle plots. If '--show-unique' is not specified, the wiggle plot for each transcript is a histogram where each position has the expected read depth at this position as its height. If '--show-unique' is specified, for each transcript a stacked bar plot is generated. For each position, the read depth of unique reads, which have only one alignment, is shown in black. The read depth of multi-reads, which align to more than one place, is shown in red on top of the read depth of unique reads. This program will use some files RSEM generated previously, so please do not delete/move any file 'rsem-calculate-expression' generated. If allele-specific expression is calculated, the basic unit for plotting is an allele-specific transcript and plots can be grouped by either transcript ids (--transcript-list) or gene ids (--gene-list).
+
+=head1 OUTPUT
+
+=over
+
+=item B<output_plot_file>
+
+This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page displays at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page displays a gene. The gene's id is shown at the top and all its transcripts' wiggle plots are shown on this page. The arrangement of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth. If allele-specific expression is calculated, the basic unit becomes an allele-specific transcript and transcript ids and gene ids can be used to group allele-specific transcripts.
+
+=item B<sample_name.transcript.sorted.bam and sample_name.transcript.readdepth>
+
+If these files do not exist, 'rsem-plot-transcript-wiggles' will automatically generate them.
+
+=item B<sample_name.uniq.transcript.bam, sample_name.uniq.transcript.sorted.bam and sample_name.uniq.transcript.readdepth>
+
+If '--show-unique' option is specified and these files do not exist, 'rsem-plot-transcript-wiggles' will automatically generate them.
+
+=back
+
+=head1 EXAMPLES
+
+Suppose sample_name and output_plot_file are set to 'mmliver_single_quals' and 'output.pdf' respectively. input_list is set to 'transcript_ids.txt' if transcript ids are provided, and is set to 'gene_ids.txt' if gene ids are provided.
+
+1) Transcript ids are provided and we just want normal wiggle plots:
+
+ rsem-plot-transcript-wiggles mmliver_single_quals transcript_ids.txt output.pdf
+
+2) Gene ids are provided and we want to show stacked bar plots:
+
+ rsem-plot-transcript-wiggles --gene-list --show-unique mmliver_single_quals gene_ids.txt output.pdf
+
+=cut
diff --git a/rsem-prepare-reference b/rsem-prepare-reference
new file mode 100755
index 0000000..b6ddc47
--- /dev/null
+++ b/rsem-prepare-reference
@@ -0,0 +1,508 @@
+#!/usr/bin/env perl
+
+use Getopt::Long qw(:config no_auto_abbrev);
+use Pod::Usage;
+use File::Basename;
+use FindBin;
+use lib $FindBin::RealBin;
+use rsem_perl_utils;
+
+use Env qw(@PATH);
+@PATH = ($FindBin::RealBin, @PATH);
+
+use strict;
+use warnings;
+
+# NOTE(review): $status is not referenced in the visible part of this script.
+my $status;
+
+# Annotation, mapping and poly(A) options. Each default below is used unless
+# the matching command-line option overrides it in GetOptions().
+my $gtfF = "";
+my $gff3F = "";
+my $gff3_RNA_patterns = "";
+my $gff3_genes_as_transcripts = 0;
+my $gtf_sources = "None";
+my $mappingF = "";
+my $polyAChoice = 1; # 0, --polyA, add polyA tails for all isoforms; 1, default, no polyA tails; 2, --no-polyA-subset
+my $polyA = 0; # for option --polyA, default off
+my $polyALen = 125;
+my $subsetFile = "";
+my $prep_prsem = 0; ## if prepare reference for pRSEM
+my $mappability_bigwig_file = '';
+my $quiet = 0;
+my $help = 0;
+
+my $alleleMappingF = "";
+
+my $nthreads = 1; # number of threads to build aligner's indices
+
+# Aligner selection flags and the optional directories holding their executables.
+my $bowtie = 0;
+my $bowtie_path = "";
+
+my $bowtie2 = 0;
+my $bowtie2_path = "";
+
+my $star = 0;
+my $star_path = '';
+my $star_sjdboverhang = 100;
+
+my $hisat2_hca = 0;
+my $hisat2_path = '';
+
+
+
+# Parse the command line; an unknown or malformed option prints the full manual and exits with status 2.
+GetOptions("gtf=s" => \$gtfF,
+ "gff3=s" => \$gff3F,
+ "gff3-RNA-patterns=s" => \$gff3_RNA_patterns,
+ "gff3-genes-as-transcripts" => \$gff3_genes_as_transcripts,
+ "trusted-sources=s" => \$gtf_sources,
+ "transcript-to-gene-map=s" => \$mappingF,
+ "allele-to-gene-map=s" => \$alleleMappingF,
+ "polyA" => \$polyA,
+ "polyA-length=i" => \$polyALen,
+ "no-polyA-subset=s" => \$subsetFile,
+ "bowtie" => \$bowtie,
+ "bowtie-path=s" => \$bowtie_path,
+ "bowtie2" => \$bowtie2,
+ "bowtie2-path=s" => \$bowtie2_path,
+ "star" => \$star,
+ "star-path=s" => \$star_path,
+ "star-sjdboverhang=i" => \$star_sjdboverhang,
+ "hisat2-hca" =>\$hisat2_hca,
+ "hisat2-path=s" => \$hisat2_path,
+ "p|num-threads=i" => \$nthreads,
+ "prep-pRSEM" => \$prep_prsem, ## bool if prepare reference for pRSEM
+ 'mappability-bigwig-file=s' => \$mappability_bigwig_file,
+ "q|quiet" => \$quiet,
+ "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
+
+# Reject option combinations that contradict each other, and require exactly
+# two positional arguments (reference_fasta_file(s) and reference_name).
+pod2usage(-verbose => 2) if ($help == 1);
+pod2usage(-msg => "--transcript-to-gene-map and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($mappingF ne "") && ($alleleMappingF ne ""));
+pod2usage(-msg => "--gtf and --gff3 are mutually exclusive!", -exitval => 2, -verbose => 2) if (($gtfF ne "") && ($gff3F ne ""));
+pod2usage(-msg => "--gtf/--gff3 and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if ((($gtfF ne "") || ($gff3F ne "")) && ($alleleMappingF ne ""));
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
+pod2usage(-msg => "No poly(A) tail should be added if --star is set!", -exitval => 2, -verbose => 2) if ($star && $polyA);
+
+# pRSEM reference preparation needs both a Bowtie path and a mappability file.
+# Every missing requirement is appended to $msg, so the user sees all problems
+# at once and an earlier message is never overwritten by a later one.
+if ( $prep_prsem ) {
+    my $msg = '';
+    if ($bowtie_path eq '' ) {
+        $msg .= "--bowtie-path <path> needs to be specified for preparing pRSEM references\n";
+    }
+
+    if ($mappability_bigwig_file eq '') {
+        $msg .= "--mappability-bigwig-file <string> needs to be specified for preparing pRSEM references\n";
+    }
+
+    if ( $msg ne '' ) {
+        pod2usage(-msg => $msg, -exitval => 2, -verbose => 1 );
+    }
+}
+
+# Warn about aligner paths that will not be used. --bowtie-path is also required
+# by --prep-pRSEM, so it is only flagged when neither --bowtie nor --prep-pRSEM
+# is set.
+if (!$bowtie && !$prep_prsem && ($bowtie_path ne "")) { print "Warning: If Bowtie is not used, no need to set --bowtie-path option!\n"; }
+if (!$bowtie2 && ($bowtie2_path ne "")) { print "Warning: If Bowtie 2 is not used, no need to set --bowtie2-path option!\n"; }
+if (!$star && ($star_path ne "")) { print "Warning: If STAR is not used, no need to set --star-path option!\n"; }
+if (!$hisat2_hca && ($hisat2_path ne "")) { print "Warning: If HISAT2 is not used, no need to set --hisat2-path option!\n"; }
+
+# The first argument is either a comma-separated list of FASTA files or a single
+# directory, in which case every *.fa and *.fasta file inside it is used.
+my @list = split(/,/, $ARGV[0]);
+my $size = scalar(@list);
+
+if ($size == 1 && (-d $list[0])) {
+    my $dir = $list[0];
+    @list = (<$dir/*.fa>, <$dir/*.fasta>);
+    $size = scalar(@list);
+}
+
+pod2usage(-msg => "reference_fasta_file(s) is empty! Please check if you provide the correct folder name or file suffixes!", -exitval => 2, -verbose => 2) if ($size <= 0);
+
+# With --polyA: 2 when a no-poly(A) subset file is given, otherwise 0.
+# Without --polyA the default of 1 (no tails) is kept.
+if ($polyA) {
+    $polyAChoice = ($subsetFile ne "") ? 2 : 0;
+}
+
+# Turn each non-empty aligner directory into a prefix that can be put directly
+# in front of an executable name.
+if ($bowtie_path ne "") { $bowtie_path .= "/"; }
+if ($bowtie2_path ne "") { $bowtie2_path .= "/"; }
+if ($star_path ne "") { $star_path .= "/"; }
+if ($hisat2_path ne "") { $hisat2_path .= "/"; }
+
+my $command = "";
+
+# With --gff3, convert the annotation to '<reference_name>.gtf' first and use
+# that file as the GTF input for the steps below. An existing file of that name
+# is never overwritten.
+if ($gff3F ne "") {
+    $gtfF = "$ARGV[1].gtf";
+    pod2usage(-msg => "A file with the name $gtfF already exists! GFF3-to-GTF conversion failed!", -exitval => 2, -verbose => 2) if (-e $gtfF);
+    $command = "rsem-gff3-to-gtf";
+    if ($gff3_RNA_patterns ne "") {
+        $command .= " --RNA-patterns $gff3_RNA_patterns";
+    }
+    if ($gff3_genes_as_transcripts) {
+        $command .= " --make-genes-as-transcripts";
+    }
+    $command .= " $gff3F $gtfF";
+    &runCommand($command);
+}
+
+# @list is interpolated into the commands below; make sure its elements are
+# joined by single spaces.
+$"=" ";
+
+if ($gtfF ne "") {
+    # Genome + annotation: extract the transcript sequences described by the GTF.
+    # Spaces inside the trusted-sources value are backslash-escaped so that it
+    # stays a single shell word.
+    $gtf_sources =~ s/ /\\ /g;
+    $command = "rsem-extract-reference-transcripts $ARGV[1] $quiet $gtfF $gtf_sources";
+    $command .= ($mappingF ne "") ? " 1 $mappingF" : " 0";
+}
+else {
+    # No annotation: the FASTA files already contain the transcripts.
+    # Mapping mode: 1 = transcript-to-gene map, 2 = allele-to-gene map, 0 = none.
+    $command = "rsem-synthesis-reference-transcripts $ARGV[1] $quiet";
+    if ($mappingF ne "") {
+        $command .= " 1 $mappingF";
+    }
+    elsif ($alleleMappingF ne "") {
+        $command .= " 2 $alleleMappingF";
+    }
+    else {
+        $command .= " 0";
+    }
+}
+$command .= " @list";
+&runCommand($command);
+
+# Build the index FASTA files; the poly(A) length is passed unless tails are
+# disabled, and the subset file only for --no-polyA-subset.
+$command = "rsem-preref $ARGV[1].transcripts.fa $polyAChoice $ARGV[1]";
+$command .= " -l $polyALen" unless ($polyAChoice == 1);
+$command .= " -f $subsetFile" if ($polyAChoice == 2);
+$command .= " -q" if ($quiet);
+
+&runCommand($command);
+
+# Bowtie indices are built from the N-to-G converted index FASTA.
+if ($bowtie) {
+    $command = "${bowtie_path}bowtie-build -f";
+    $command .= " -q" if ($quiet);
+    $command .= " $ARGV[1].n2g.idx.fa $ARGV[1]";
+
+    &runCommand($command);
+}
+
+# Bowtie 2 indices are built from the plain index FASTA.
+if ($bowtie2) {
+    $command = "${bowtie2_path}bowtie2-build -f";
+    $command .= " --threads $nthreads" if ($nthreads > 1);
+    $command .= " -q" if ($quiet);
+    $command .= " $ARGV[1].idx.fa $ARGV[1]";
+
+    &runCommand($command);
+}
+
+# STAR indexes the genome FASTA files together with the GTF annotation, so an
+# annotation is mandatory here.
+if ($star) {
+    if ($gtfF eq "") {
+        pod2usage(-msg => "Sorry, if you want RSEM run STAR for you, you must provide the genome sequence and associated GTF annotation.", -exitval => 2, -verbose => 2);
+    }
+
+    # The genome index is written into the directory part of reference_name.
+    my $out_star_genome_path = dirname($ARGV[1]);
+    $command = join("",
+                    $star_path, "STAR ",
+                    " --runThreadN $nthreads ",
+                    " --runMode genomeGenerate ",
+                    " --genomeDir $out_star_genome_path ",
+                    " --genomeFastaFiles @list ",
+                    " --sjdbGTFfile $gtfF ",
+                    " --sjdbOverhang $star_sjdboverhang ",
+                    " --outFileNamePrefix $ARGV[1]");
+    &runCommand($command);
+}
+
+# HISAT2 indices are built from the plain index FASTA.
+if ($hisat2_hca) {
+    $command = "${hisat2_path}hisat2-build -f";
+    $command .= " -p $nthreads" if ($nthreads > 1);
+    $command .= " -q" if ($quiet);
+    $command .= " $ARGV[1].idx.fa $ARGV[1]";
+    &runCommand($command);
+}
+
+# Prepare the additional reference files needed by prior-enhanced RSEM (pRSEM).
+if ( $prep_prsem ) {
+    $command = "$FindBin::RealBin/pRSEM/prsem-prepare-reference " .
+               " --num-threads $nthreads " .
+               " --bowtie-path $bowtie_path " .
+               " --mappability-bigwig-file $mappability_bigwig_file";
+
+    if ( $quiet ) {
+        $command .= ' --quiet ';
+    }
+
+    ## prepare reference only for chromosomes listed in GTF file, rather than
+    ## all chromosomes in the genome directory, because that's where we want to
+    ## call peaks and derive priors. This is different from RNA-seq, because
+    ## where peaks called totally depends on reference, there's no option to
+    ## control it later.
+    #my $ref_fas = join(',', @list);
+    my %chrom2exists = ();
+    # Three-argument open with a lexical handle: the file name is never parsed
+    # for mode characters and no global bareword handle is left behind.
+    open(my $gtf_fh, "<", $gtfF) or die "cannot open $gtfF: $!\n";
+    while (my $line = <$gtf_fh>) {
+        # Comment and blank lines carry no chromosome name.
+        next if ($line =~ /^\s*(#|$)/);
+        my @words = split(/\t/, $line);
+        $chrom2exists{$words[0]} = 1;
+    }
+    close($gtf_fh);
+
+    # Keep only the FASTA files whose base name (without .fa/.fasta) is a
+    # chromosome that appears in the GTF file.
+    my @reflist = ();
+    my @suffixlist = ('.fa', '.fasta');
+    foreach my $fullname (@list) {
+        my $basename = basename($fullname, @suffixlist);
+        if ( exists $chrom2exists{$basename} ) {
+            push @reflist, $fullname;
+        }
+    }
+    my $ref_fas = join(',', @reflist);
+
+    $command .= " $ref_fas $ARGV[1]";
+    &runCommand($command);
+}
+
+__END__
+
+=head1 NAME
+
+rsem-prepare-reference - Prepare transcript references for RSEM and optionally build BOWTIE/BOWTIE2/STAR/HISAT2(transcriptome) indices.
+
+=head1 SYNOPSIS
+
+ rsem-prepare-reference [options] reference_fasta_file(s) reference_name
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<reference_fasta_file(s)>
+
+Either a comma-separated list of Multi-FASTA formatted files OR a directory name. If a directory name is specified, RSEM will read all files with suffix ".fa" or ".fasta" in this directory. The files should contain either the sequences of transcripts or an entire genome, depending on whether the '--gtf' option is used.
+
+=item B<reference_name>
+
+The name of the reference used. RSEM will generate several reference-related files that are prefixed by this name. This name can contain path information (e.g. '/ref/mm9').
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<--gtf> <file>
+
+If this option is on, RSEM assumes that 'reference_fasta_file(s)' contains the sequence of a genome, and will extract transcript reference sequences using the gene annotations specified in <file>, which should be in GTF format.
+
+If this and '--gff3' options are off, RSEM will assume 'reference_fasta_file(s)' contains the reference transcripts. In this case, RSEM assumes that name of each sequence in the Multi-FASTA files is its transcript_id.
+
+(Default: off)
+
+=item B<--gff3> <file>
+
+The annotation file is in GFF3 format instead of GTF format. RSEM will first convert it to GTF format with the file name 'reference_name.gtf'. Please make sure that 'reference_name.gtf' does not exist. (Default: off)
+
+=item B<--gff3-RNA-patterns> <pattern>
+
+<pattern> is a comma-separated list of transcript categories, e.g. "mRNA,rRNA". Only transcripts that match the <pattern> will be extracted. (Default: "mRNA")
+
+=item B<--gff3-genes-as-transcripts>
+
+This option is designed for untypical organisms, such as viruses, whose GFF3 files only contain genes. RSEM will assume each gene as a unique transcript when it converts the GFF3 file into GTF format.
+
+=item B<--trusted-sources> <sources>
+
+<sources> is a comma-separated list of trusted sources, e.g. "ENSEMBL,HAVANA". Only transcripts coming from these sources will be extracted. If this option is off, all sources are accepted. (Default: off)
+
+=item B<--transcript-to-gene-map> <file>
+
+Use information from <file> to map from transcript (isoform) ids to gene ids.
+Each line of <file> should be of the form:
+
+gene_id transcript_id
+
+with the two fields separated by a tab character.
+
+If you are using a GTF file for the "UCSC Genes" gene set from the UCSC Genome Browser, then the "knownIsoforms.txt" file (obtained from the "Downloads" section of the UCSC Genome Browser site) is of this format.
+
+If this option is off, then the mapping of isoforms to genes depends on whether the '--gtf' option is specified. If '--gtf' is specified, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene.
+
+(Default: off)
+
+=item B<--allele-to-gene-map> <file>
+
+Use information from <file> to provide gene_id and transcript_id information for each allele-specific transcript.
+Each line of <file> should be of the form:
+
+gene_id transcript_id allele_id
+
+with the fields separated by a tab character.
+
+This option is designed for quantifying allele-specific expression. It is only valid if '--gtf' option is not specified. allele_id should be the sequence names presented in the Multi-FASTA-formatted files.
+
+(Default: off)
+
+=item B<--polyA>
+
+Add poly(A) tails to the end of all reference isoforms. The length of poly(A) tail added is specified by '--polyA-length' option. STAR aligner users may not want to use this option. (Default: do not add poly(A) tail to any of the isoforms)
+
+=item B<--polyA-length> <int>
+
+The length of the poly(A) tails to be added. (Default: 125)
+
+=item B<--no-polyA-subset> <file>
+
+Only meaningful if '--polyA' is specified. Do not add poly(A) tails to those transcripts listed in <file>. <file> is a file containing a list of transcript_ids. (Default: off)
+
+=item B<--bowtie>
+
+Build Bowtie indices. (Default: off)
+
+=item B<--bowtie-path> <path>
+
+The path to the Bowtie executables. (Default: the path to Bowtie executables is assumed to be in the user's PATH environment variable)
+
+=item B<--bowtie2>
+
+Build Bowtie 2 indices. (Default: off)
+
+=item B<--bowtie2-path> <path>
+
+The path to the Bowtie 2 executables. (Default: the path to Bowtie 2 executables is assumed to be in the user's PATH environment variable)
+
+=item B<--star>
+
+Build STAR indices. (Default: off)
+
+=item B<--star-path> <path>
+
+The path to STAR's executable. (Default: the path to STAR executable is assumed to be in user's PATH environment variable)
+
+=item B<--star-sjdboverhang> <int>
+
+Length of the genomic sequence around annotated junction. It is only used for STAR to build splice junctions database and not needed for Bowtie or Bowtie2. It will be passed as the --sjdbOverhang option to STAR. According to STAR's manual, its ideal value is max(ReadLength)-1, e.g. for 2x101 paired-end reads, the ideal value is 101-1=100. In most cases, the default value of 100 will work as well as the ideal value. (Default: 100)
+
+=item B<--hisat2-hca>
+
+Build HISAT2 indices on the transcriptome according to Human Cell Atlas (HCA) SMART-Seq2 pipeline. (Default: off)
+
+=item B<--hisat2-path> <path>
+
+The path to the HISAT2 executables. (Default: the path to HISAT2 executables is assumed to be in the user's PATH environment variable)
+
+=item B<-p/--num-threads> <int>
+
+Number of threads to use for building STAR, Bowtie 2, and HISAT2 indices. It is also passed to the pRSEM reference preparation step. (Default: 1)
+
+=item B<-q/--quiet>
+
+Suppress the output of logging information. (Default: off)
+
+=item B<-h/--help>
+
+Show help information.
+
+=back
+
+=head1 PRIOR-ENHANCED RSEM OPTIONS
+
+=over
+
+=item B<--prep-pRSEM>
+
+A Boolean indicating whether to prepare reference files for pRSEM, including building Bowtie indices for a genome and selecting training set isoforms. The index files will be used for aligning ChIP-seq reads in prior-enhanced RSEM and the training set isoforms will be used for learning prior. A path to Bowtie executables and a mappability file in bigWig format are required when this option is on. Currently, Bowtie2 is not supported for prior-enhanced RSEM. (Default: off)
+
+=item B<--mappability-bigwig-file> <string>
+
+Full path to a whole-genome mappability file in bigWig format. This file is required for running prior-enhanced RSEM. It is used for selecting a training set of isoforms for prior-learning. This file can be either downloaded from UCSC Genome Browser or generated by GEM (Derrien et al., 2012, PLoS One). (Default: "")
+
+=back
+
+=head1 DESCRIPTION
+
+This program extracts/preprocesses the reference sequences for RSEM and prior-enhanced RSEM. It can optionally build Bowtie indices (with '--bowtie' option) and/or Bowtie 2 indices (with '--bowtie2' option) using their default parameters. It can also optionally build STAR indices (with '--star' option) using parameters from ENCODE3's STAR-RSEM pipeline. For prior-enhanced RSEM, it can build Bowtie genomic indices and select training set isoforms (with options '--prep-pRSEM' and '--mappability-bigwig-file <string>'). If an alternative aligner is to be used, indices for that particular aligner can be built from either 'reference_name.idx.fa' or 'reference_name.n2g.idx.fa' (see OUTPUT for details). This program is used in conjunction with the 'rsem-calculate-expression' program.
+
+=head1 OUTPUT
+
+This program will generate 'reference_name.grp', 'reference_name.ti', 'reference_name.transcripts.fa', 'reference_name.seq', 'reference_name.chrlist' (if '--gtf' is on), 'reference_name.idx.fa', 'reference_name.n2g.idx.fa', optional Bowtie/Bowtie 2 index files, and optional STAR index files.
+
+'reference_name.grp', 'reference_name.ti', 'reference_name.seq', and 'reference_name.chrlist' are used by RSEM internally.
+
+B<'reference_name.transcripts.fa'> contains the extracted reference transcripts in Multi-FASTA format. Poly(A) tails are not added and it may contain lower case bases in its sequences if the corresponding genomic regions are soft-masked.
+
+B<'reference_name.idx.fa' and 'reference_name.n2g.idx.fa'> are used by aligners to build their own indices. In these two files, all sequence bases are converted into upper case. In addition, poly(A) tails are added if '--polyA' option is set. The only difference between 'reference_name.idx.fa' and 'reference_name.n2g.idx.fa' is that 'reference_name.n2g.idx.fa' in addition converts all 'N' characters to 'G' characters. This conversion is in particular desired for aligners (e.g. Bowtie) that do not allow reads to overlap with 'N' characters in the reference sequences. Otherwise, 'reference_name.idx.fa' should be used to build the aligner's index files. RSEM uses 'reference_name.idx.fa' to build Bowtie 2 indices and 'reference_name.n2g.idx.fa' to build Bowtie indices. For visualizing the transcript-coordinate-based BAM files generated by RSEM in IGV, 'reference_name.idx.fa' should be imported as a "genome" (see Visualization section in README.md for details).
+
+If the whole genome is indexed for prior-enhanced RSEM, all the index files will be generated with prefix as 'reference_name_prsem'. Selected isoforms for training set are listed in the file 'reference_name_prsem.training_tr_crd'
+
+
+=head1 EXAMPLES
+
+1) Suppose we have mouse RNA-Seq data and want to use the UCSC mm9 version of the mouse genome. We have downloaded the UCSC Genes transcript annotations in GTF format (as mm9.gtf) using the Table Browser and the knownIsoforms.txt file for mm9 from the UCSC Downloads. We also have all chromosome files for mm9 in the directory '/data/mm9'. We want to put the generated reference files under '/ref' with name 'mouse_0'. We do not add any poly(A) tails. Please note that GTF files generated from UCSC's Table Browser do not contain isoform-gene relationship information. For the UCSC Genes annotation, this information can be obtained from the knownIsoforms.txt file. Suppose we want to build Bowtie indices and Bowtie executables are found in '/sw/bowtie'.
+
+There are two ways to write the command:
+
+ rsem-prepare-reference --gtf mm9.gtf \
+ --transcript-to-gene-map knownIsoforms.txt \
+ --bowtie \
+ --bowtie-path /sw/bowtie \
+ /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \
+ /ref/mouse_0
+
+OR
+
+ rsem-prepare-reference --gtf mm9.gtf \
+ --transcript-to-gene-map knownIsoforms.txt \
+ --bowtie \
+ --bowtie-path /sw/bowtie \
+ /data/mm9 \
+ /ref/mouse_0
+
+2) Suppose we also want to build Bowtie 2 indices in the above example and Bowtie 2 executables are found in '/sw/bowtie2', the command will be:
+
+ rsem-prepare-reference --gtf mm9.gtf \
+ --transcript-to-gene-map knownIsoforms.txt \
+ --bowtie \
+ --bowtie-path /sw/bowtie \
+ --bowtie2 \
+ --bowtie2-path /sw/bowtie2 \
+ /data/mm9 \
+ /ref/mouse_0
+
+3) Suppose we want to build STAR indices in the above example and save index files under '/ref' with name 'mouse_0'. Assuming STAR executable is '/sw/STAR', the command will be:
+
+ rsem-prepare-reference --gtf mm9.gtf \
+ --transcript-to-gene-map knownIsoforms.txt \
+ --star \
+ --star-path /sw/STAR \
+ -p 8 \
+ /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \
+ /ref/mouse_0
+
+OR
+
+ rsem-prepare-reference --gtf mm9.gtf \
+ --transcript-to-gene-map knownIsoforms.txt \
+ --star \
+ --star-path /sw/STAR \
+ -p 8 \
+ /data/mm9 \
+ /ref/mouse_0
+
+STAR genome index files will be saved under '/ref/'.
+
+4) Suppose we want to prepare references for prior-enhanced RSEM in the above example. In this scenario, both STAR and Bowtie are required to build genomic indices - STAR for RNA-seq reads and Bowtie for ChIP-seq reads. Assume their executables are under '/sw/STAR' and '/sw/Bowtie', respectively, and that the mappability file for the mouse genome is '/data/mm9.bigWig'. The command will be:
+
+ rsem-prepare-reference --gtf mm9.gtf \
+ --transcript-to-gene-map knownIsoforms.txt \
+ --star \
+ --star-path /sw/STAR \
+ -p 8 \
+ --prep-pRSEM \
+ --bowtie-path /sw/Bowtie \
+ --mappability-bigwig-file /data/mm9.bigWig \
+ /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \
+ /ref/mouse_0
+
+OR
+
+ rsem-prepare-reference --gtf mm9.gtf \
+ --transcript-to-gene-map knownIsoforms.txt \
+ --star \
+ --star-path /sw/STAR \
+ -p 8 \
+ --prep-pRSEM \
+ --bowtie-path /sw/Bowtie \
+ --mappability-bigwig-file /data/mm9.bigWig \
+ /data/mm9 \
+ /ref/mouse_0
+
+Both STAR and Bowtie's index files will be saved under '/ref/'. Bowtie files will have name prefix 'mouse_0_prsem'
+
+5) Suppose we only have transcripts from EST tags stored in 'mm9.fasta' and isoform-gene information stored in 'mapping.txt'. We want to add 125bp long poly(A) tails to all transcripts. The reference_name is set as 'mouse_125'. In addition, we do not want to build Bowtie/Bowtie 2 indices, and will use an alternative aligner to align reads against either 'mouse_125.idx.fa' or 'mouse_125.n2g.idx.fa':
+
+ rsem-prepare-reference --transcript-to-gene-map mapping.txt \
+ --polyA \
+ mm9.fasta \
+ mouse_125
+
+
+=cut
diff --git a/rsem-refseq-extract-primary-assembly b/rsem-refseq-extract-primary-assembly
new file mode 100755
index 0000000..8289e70
--- /dev/null
+++ b/rsem-refseq-extract-primary-assembly
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# Copy only the FASTA records whose header line mentions "Primary Assembly".
+from sys import argv, exit
+
+if len(argv) != 3:
+    print("Usage: rsem-refseq-extract-primary-assembly input_top_level_assembly.fna output_primary_assembly.fna")
+    exit(-1)
+
+writeOut = True  # lines seen before the first header are copied through
+with open(argv[1]) as fin:
+    with open(argv[2], "w") as fout:
+        for line in fin:
+            line = line.strip()
+            if line.startswith('>'):  # unlike line[0], this is safe on blank lines
+                writeOut = line.rfind("Primary Assembly") >= 0
+            if writeOut:
+                fout.write(line + "\n")
+
diff --git a/rsem-run-ebseq b/rsem-run-ebseq
new file mode 100755
index 0000000..485600a
--- /dev/null
+++ b/rsem-run-ebseq
@@ -0,0 +1,128 @@
+#!/usr/bin/env perl
+
+use Getopt::Long;
+use Pod::Usage;
+
+use FindBin;
+use lib $FindBin::RealBin;
+use rsem_perl_utils;
+
+use Env qw(@PATH);
+@PATH = ("$FindBin::RealBin/EBSeq", @PATH);
+
+use strict;
+
+my ($ngvF, $help) = ("", 0);
+# Parse the command line; an unrecognised option prints the full manual and exits.
+GetOptions("ngvector=s" => \$ngvF, "h|help" => \$help)
+    or pod2usage(-exitval => 2, -verbose => 2);
+if ($help == 1) { pod2usage(-verbose => 2); }
+if (scalar(@ARGV) != 3) {
+    pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2);
+}
+# "#" is what gets passed on when no ngvector file is given, so it cannot be a file name.
+if ($ngvF eq "#") {
+    pod2usage(-msg => "ngvector file cannot be named as #! # is reserved for other purpose!", -exitval => 2, -verbose => 2);
+}
+
+my @conditions = split(/,/, $ARGV[1]);
+if (scalar(@conditions) < 2) {
+    pod2usage(-msg => "At least 2 conditions are required for differential expression analysis!", -exitval => 2, -verbose => 2);
+}
+
+$ngvF = "#" if ($ngvF eq "");
+my $command = "rsem-for-ebseq-find-DE $FindBin::RealBin/EBSeq $ngvF $ARGV[0] $ARGV[2] " . join(" ", @conditions);
+&runCommand($command);
+
+__END__
+
+=head1 NAME
+
+rsem-run-ebseq - Wrapper for EBSeq to perform differential expression analysis.
+
+=head1 SYNOPSIS
+
+rsem-run-ebseq [options] data_matrix_file conditions output_file
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<data_matrix_file>
+
+This file is a m by n matrix. m is the number of genes/transcripts and n is the number of total samples. Each element in the matrix represents the expected count for a particular gene/transcript in a particular sample. Users can use 'rsem-generate-data-matrix' to generate this file from expression result files.
+
+=item B<conditions>
+
+Comma-separated list of values representing the number of replicates for each condition. For example, "3,3" means the data set contains 2 conditions and each condition has 3 replicates. "2,3,3" means the data set contains 3 conditions, with 2, 3, and 3 replicates for each condition respectively.
+
+=item B<output_file>
+
+Output file name.
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<--ngvector> <file>
+
+This option provides the grouping information required by EBSeq for isoform-level differential expression analysis. The file can be generated by 'rsem-generate-ngvector'. Turning this option on is highly recommended for isoform-level differential expression analysis. (Default: off)
+
+=item B<-h/--help>
+
+Show help information.
+
+=back
+
+=head1 DESCRIPTION
+
+This program is a wrapper over EBSeq. It performs differential expression analysis and can work on two or more conditions. All genes/transcripts and their associated statistics are reported in one output file. This program does not control the false discovery rate or call differentially expressed genes/transcripts. Please use 'rsem-control-fdr' to control the false discovery rate after this program is finished.
+
+=head1 OUTPUT
+
+=over
+
+=item B<output_file>
+
+This file reports the calculated statistics for all genes/transcripts. It is written as a matrix with row and column names. The row names are the genes'/transcripts' names. The column names are for the reported statistics.
+
+If there are only 2 different conditions among the samples, four statistics (columns) will be reported for each gene/transcript. They are "PPEE", "PPDE", "PostFC" and "RealFC". "PPEE" is the posterior probability (estimated by EBSeq) that a gene/transcript is equally expressed. "PPDE" is the posterior probability that a gene/transcript is differentially expressed. "PostFC" is the posterior fold change (condition 1 over condition 2) for a gene/transcript. It is defined as the ratio between posterior mean expression estimates of the gene/transcript for each condition. "RealFC" is the real fold change (condition 1 over condition 2) for a gene/transcript. It is the ratio of the normalized within condition 1 mean count over normalized within condition 2 mean count for the gene/transcript. Fold changes are calculated using EBSeq's 'PostFC' function. The genes/transcripts are reported in descending order of their "PPDE" values.
+
+If there are more than 2 different conditions among the samples, the output format is different. For differential expression analysis with more than 2 conditions, EBSeq will enumerate all possible expression patterns (on which conditions are equally expressed and which conditions are not). Suppose there are k different patterns; the first k columns of the output file give the posterior probability that each expression pattern is true. Patterns are defined in a separate file, 'output_file.pattern'. The k+1 column gives the maximum a posteriori (MAP) expression pattern for each gene/transcript. The k+2 column gives the posterior probability that not all conditions are equally expressed (column name "PPDE"). The genes/transcripts are reported in descending order of their "PPDE" column values. For details on how EBSeq works for more than 2 conditions, please refer to EBSeq's manual.
+
+=item B<output_file.normalized_data_matrix>
+
+This file contains the median normalized version of the input data matrix.
+
+=item B<output_file.pattern>
+
+This file is only generated when there are more than 2 conditions. It defines all possible expression patterns over the conditions using a matrix with names. Each row of the matrix refers to a different expression pattern and each column gives the expression status of a different condition. Two conditions are equally expressed if and only if their statuses are the same.
+
+=item B<output_file.condmeans>
+
+This file is only generated when there are more than 2 conditions. It gives the normalized mean count value for each gene/transcript at each condition. It is formatted as a matrix with names. Each row represents a gene/transcript and each column represents a condition. The order of genes/transcripts is the same as 'output_file'. This file can be used to calculate fold changes between conditions which users are interested in.
+
+=back
+
+=head1 EXAMPLES
+
+1) We're interested in isoform-level differential expression analysis and there are two conditions. Each condition has 5 replicates. We have already collected the data matrix as 'IsoMat.txt' and generated ngvector as 'ngvector.ngvec':
+
+ rsem-run-ebseq --ngvector ngvector.ngvec IsoMat.txt 5,5 IsoMat.results
+
+The results will be in 'IsoMat.results' and 'IsoMat.results.normalized_data_matrix' contains the normalized data matrix.
+
+2) We're interested in gene-level analysis and there are 3 conditions. The first condition has 3 replicates and the other two have 4 replicates each. The data matrix is named as 'GeneMat.txt':
+
+ rsem-run-ebseq GeneMat.txt 3,4,4 GeneMat.results
+
+Four files, 'GeneMat.results', 'GeneMat.results.normalized_data_matrix', 'GeneMat.results.pattern', and 'GeneMat.results.condmeans', will be generated.
+
+=cut
+
+
+
+
+
diff --git a/rsem-run-prsem-testing-procedure b/rsem-run-prsem-testing-procedure
new file mode 100755
index 0000000..10dffcb
--- /dev/null
+++ b/rsem-run-prsem-testing-procedure
@@ -0,0 +1,324 @@
+#!/usr/bin/env perl
+
+use Getopt::Long;
+use Pod::Usage;
+use File::Basename;
+use FindBin;
+use lib $FindBin::RealBin;
+use rsem_perl_utils qw(runCommand collectResults showVersionInfo);
+
+use Env qw(@PATH);
+@PATH = ($FindBin::RealBin, "$FindBin::RealBin/sam", @PATH);
+
+use strict;
+use warnings;
+
+# Option defaults; GetOptions below overwrites them with whatever the user
+# supplies on the command line.
+my ($status, $nThreads) = (0, 1);
+my ($quiet, $help, $version, $keep_intermediate_files) = (0) x 4;
+my $bowtie_path = "";
+
+# Reference/sample names and the paths built from them further down
+# ($temp_dir can also be set through --temporary-folder).
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ('') x 7;
+
+# ChIP-seq inputs and the partition model name (default 'pk').
+my ($chipseq_target_read_files, $chipseq_control_read_files) = ('') x 2;
+my $chipseq_read_files_multi_targets = ''; ## read files for multiple targets
+my $chipseq_bed_files_multi_targets = ''; ## BED files for multiple targets
+my $chipseq_peak_file = '';
+my $partition_model = 'pk';
+
+GetOptions(
+    ## general options
+    "h|help" => \$help,
+    "version" => \$version,
+    "q|quiet" => \$quiet,
+    "p|num-threads=i" => \$nThreads,
+    "bowtie-path=s" => \$bowtie_path,
+    "temporary-folder=s" => \$temp_dir,
+    "keep-intermediate-files" => \$keep_intermediate_files,
+
+    ## external data; the four *-files options take comma-delimited lists
+    'partition-model=s' => \$partition_model,
+    'chipseq-peak-file=s' => \$chipseq_peak_file,
+    'chipseq-target-read-files=s' => \$chipseq_target_read_files,
+    'chipseq-control-read-files=s' => \$chipseq_control_read_files,
+    'chipseq-read-files-multi-targets=s' => \$chipseq_read_files_multi_targets,
+    'chipseq-bed-files-multi-targets=s' => \$chipseq_bed_files_multi_targets,
+) or pod2usage(-exitval => 2, -verbose => 2);
+
+# --help shows the manual; --version prints the version string
+# (showVersionInfo is handed the install directory).
+if ($help == 1) { pod2usage(-verbose => 2); }
+if ($version == 1) { &showVersionInfo($FindBin::RealBin); }
+
+#check parameters and options
+## Rules enforced by this block (a violation prints the message plus the manual via pod2usage):
+##  1. at least one complete set of ChIP-seq inputs must be given;
+##  2. --partition-model must name a supported model;
+##  3. model 'cmb_lgt' requires multi-target read or BED files;
+##  4. multi-target input switches the model to 'cmb_lgt'.
+{
+    my $msg = '';
+
+    ## the two read-based input sets are complete only together with --bowtie-path
+    my $has_single_target_reads = ( $chipseq_target_read_files ne '' ) &&
+                                  ( $chipseq_control_read_files ne '' ) &&
+                                  ( $bowtie_path ne '' );
+    my $has_multi_target_reads = ( $chipseq_read_files_multi_targets ne '' ) &&
+                                 ( $bowtie_path ne '' );
+    my $has_multi_target_beds = ( $chipseq_bed_files_multi_targets ne '' );
+
+    if ( ( $chipseq_peak_file eq '' ) &&
+         !$has_single_target_reads &&
+         !$has_multi_target_reads &&
+         !$has_multi_target_beds ) {
+        $msg = "please define one set of the following options to run pRSEM's testing procedure:\n" .
+               "1. --chipseq-peak-file <file>\n" .
+               "2. --chipseq-target-read-files <file> and\n" .
+               " --chipseq-control-read-files <file> and\n" .
+               " --bowtie-path <path>\n" .
+               "3. --chipseq-read-files-multi-targets <files> and\n" .
+               " --bowtie-path <path>\n" .
+               "4. --chipseq-bed-files-multi-targets <files>\n";
+    }
+
+    ## partition models this testing procedure can run
+    my @prsem_partition_models = ( 'pk', 'cmb_lgt' );
+    my $is_supported_model =
+        grep { $_ eq $partition_model } @prsem_partition_models;
+
+    ## a problem with the partition model replaces the message set above;
+    ## an unsupported model is reported before anything else
+    if ( !$is_supported_model ) {
+        my $s_prt_mdls = join(', ', @prsem_partition_models);
+        $msg = "\n--partition-model <string> must be one of [$s_prt_mdls]\n" .
+               "pRSEM's testing procedure only supports the above partition models";
+    } elsif ( ( $partition_model eq 'cmb_lgt' ) &&
+              ( $chipseq_read_files_multi_targets eq '' ) &&
+              ( $chipseq_bed_files_multi_targets eq '' ) ) {
+        $msg = 'either --chipseq-read-files-multi-targets <files> or ' .
+               '--chipseq-bed-files-multi-targets <files> needs to be ' .
+               "defined for pRSEM's partition model: '$partition_model'";
+    }
+
+    ## any message collected so far is fatal
+    if ( $msg ne '' ) {
+        pod2usage(-msg => "$msg\n", -exitval => 2, -verbose => 2);
+    }
+
+    ## multi-target input always runs under the combined model
+    if ( ( $partition_model ne 'cmb_lgt' ) &&
+         ( ( $chipseq_read_files_multi_targets ne '' ) ||
+           ( $chipseq_bed_files_multi_targets ne '' ) ) ) {
+        print "\nCombining signals from multiple sources, partition model is set to 'cmb_lgt'\n\n";
+        $partition_model = 'cmb_lgt';
+    }
+}
+
+
+# reference_name and sample_name are both mandatory; reject any other argument count up front.
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
+($refName, $sampleName) = @ARGV;
+
+# sampleToken is the part of sample_name after the last '/'.
+my $pos = rindex($sampleName, '/');
+$sampleToken = ($pos < 0) ? $sampleName : substr($sampleName, $pos + 1);
+
+if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
+$stat_dir = "$sampleName.stat";
+
+if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
+if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
+
+$imdName = "$temp_dir/$sampleToken";
+$statName = "$stat_dir/$sampleToken";
+
+if ($bowtie_path ne "") { $bowtie_path .= "/"; }
+
+my $command = "";
+
+{
+    ## Collect the command in pieces; they are concatenated without a separator below.
+    my @pieces = ( "$FindBin::RealBin/pRSEM/prsem-testing-procedure ",
+                   " --num-threads $nThreads ",
+                   " --partition-model $partition_model " );
+
+    if ( $chipseq_peak_file ne '' ) {
+        ## a ChIP-seq peak file from a single source is used whenever one is given
+        push @pieces, " --chipseq-peak-file $chipseq_peak_file";
+    } elsif ( $partition_model ne 'cmb_lgt' ) {
+        ## ChIP-seq read files from a single source
+        push @pieces, " --chipseq-target-read-files $chipseq_target_read_files ",
+                      " --bowtie-path $bowtie_path";
+        push @pieces, " --chipseq-control-read-files $chipseq_control_read_files"
+            if ( $chipseq_control_read_files ne '' );
+    } elsif ( $chipseq_bed_files_multi_targets ne '' ) {
+        ## multiple sources: BED files are used in preference to read files
+        push @pieces, ' --chipseq-bed-files-multi-targets ',
+                      $chipseq_bed_files_multi_targets;
+    } elsif ( $chipseq_read_files_multi_targets ne '' ) {
+        push @pieces, ' --chipseq-read-files-multi-targets ',
+                      $chipseq_read_files_multi_targets,
+                      " --bowtie-path $bowtie_path";
+    }
+
+    ## optional flag, then the four positional arguments
+    push @pieces, ' --quiet ' if ( $quiet );
+    push @pieces, " $refName $sampleName $statName $imdName";
+
+    ## every piece already carries its own spacing
+    $command = join('', @pieces);
+    &runCommand($command);
+}
+
+
+# Unless the user asked to keep them, remove the temporary folder and everything inside it.
+&runCommand("rm -rf $temp_dir", "Fail to delete the temporary folder!")
+    unless ($keep_intermediate_files);
+
+__END__
+
+=head1 NAME
+
+rsem-run-prsem-testing-procedure
+
+=head1 SYNOPSIS
+
+rsem-run-prsem-testing-procedure [options] reference_name sample_name
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<reference_name>
+
+The name of the reference used. Users must run 'rsem-prepare-reference' with this reference_name and with '--prep-pRSEM' specified before running this program.
+
+=item B<sample_name>
+
+The name of the sample analyzed. Users must run 'rsem-calculate-expression' with this sample_name and with RSEM's default Gibbs sampling performed before running this program.
+
+=back
+
+=head1 BASIC OPTIONS
+
+=over
+
+=item B<--bowtie-path> <string>
+
+The path to the Bowtie executables. (Default: the path to the Bowtie executables is assumed to be in the user's PATH environment variable)
+
+=item B<--chipseq-target-read-files> <string>
+
+Comma-separated full path of FASTQ read file(s) for ChIP-seq target. This option provides information to calculate ChIP-seq peaks and signals. The file(s) can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie-path <path>' and '--chipseq-control-read-files <string>' must be defined when this option is specified. (Default: "")
+
+=item B<--chipseq-control-read-files> <string>
+
+Comma-separated full path of FASTQ read file(s) for ChIP-seq control. This option provides information to call ChIP-seq peaks. The file(s) can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie-path <path>' and '--chipseq-target-read-files <string>' must be defined when this option is specified. (Default: "")
+
+
+=item B<--chipseq-read-files-multi-targets> <string>
+
+Comma-separated full path of FASTQ read files for multiple ChIP-seq targets. This option is used when prior is learned from multiple complementary data sets. It provides information to calculate ChIP-seq signals. All files can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--bowtie-path <path>' must be defined and the option '--partition-model <string>' will be set to 'cmb_lgt' automatically. (Default: "")
+
+=item B<--chipseq-bed-files-multi-targets> <string>
+
+Comma-separated full path of BED files for multiple ChIP-seq targets. This option is used when prior is learned from multiple complementary data sets. It provides information of ChIP-seq signals and must have at least the first six BED columns. All files can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--partition-model <string>' will be set to 'cmb_lgt' automatically. (Default: "")
+
+=item B<--chipseq-peak-file> <string>
+
+Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. BED6+4 format. This file is used in prior-enhanced RSEM's default two-partition model. It partitions isoforms by whether they have ChIP-seq overlapping with their transcription start site region or not. Each partition will have its own prior parameter learned from a training set. This file can be either gzipped or ungzipped. (Default: "")
+
+=item B<--partition-model> <string>
+
+A keyword to specify the partition model. It must be either 'pk' or 'cmb_lgt'. For details, please see the help document of 'rsem-calculate-expression'.
+
+=item B<-p|--num-threads> <int>
+
+Number of threads to use. (Default: 1)
+
+=item B<--version>
+
+Show version information.
+
+=item B<-q|--quiet>
+
+Suppress the output of logging information. (Default: off)
+
+=item B<-h|--help>
+
+Show help information.
+
+=back
+
+=head1 ADVANCED OPTIONS
+
+=over
+
+=item B<--keep-intermediate-files>
+
+Keep temporary files generated by RSEM and this testing procedure. RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. By default, after this test is finished, the temporary directory is deleted. Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
+
+=item B<--temporary-folder> <string>
+
+Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
+
+=back
+
+=head1 DESCRIPTION
+
+This program provides users a p-value and a log-likelihood to determine whether external data set(s) is informative and how informative it is for RNA-seq quantification. It is used in conjunction with prior-enhanced RSEM to let user select the most effective external data set(s).
+
+Users can run this program repetitively with different external data. All p-values and log-likelihoods will be saved in an output file 'sample_name.pval_LL'.
+
+=head1 NOTES
+
+Users must run 'rsem-prepare-reference' with the appropriate reference and with the option '--prep-pRSEM' before using this program.
+
+Users must run 'rsem-calculate-expression' with the option '--calc-pme' before using this program.
+
+The temporary directory and all intermediate files will be removed when RSEM finishes unless '--keep-intermediate-files' is specified.
+
+=head1 OUTPUT
+
+=over
+
+=item B<sample_name.pval_LL>
+
+This file contains the partition model's name, basename(s) of external data set file(s), p-value, and log-likelihood, delimited by tabs. When this program is run repeatedly, output will be appended to the end of this file without removing previous results.
+
+=back
+
+The following output files are the same as the ones generated by 'rsem-calculate-expression' with prior-enhanced RSEM. Please refer to the help document of 'rsem-calculate-expression' for details.
+
+=over 2
+
+=item B<sample_name.stat/sample_name.all_tr_features>
+
+=item B<sample_name.stat/sample_name.all_tr_prior>
+
+=item B<sample_name.stat/sample_name.lgt_mdl.RData>
+
+=item B<sample_name.stat/sample_name.pval_LL>
+
+=item B<sample_name.stat/sample_name_uniform_prior_1.isoforms.results>
+
+=item B<sample_name.stat/sample_name_uniform_prior_1.genes.results>
+
+=back
+
+=head1 EXAMPLE
+
+Assuming RSEM reference files are under '/ref' with name 'mouse_125' and expression files are under '/expr' with name 'mouse_125'. Suppose we want to derive prior from four histone modification ChIP-seq read data sets: '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz', '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming Bowtie's executables are under '/sw/bowtie/' and we want to use 16 cores:
+
+ rsem-run-prsem-testing-procedure --partition-model cmb_lgt \
+ --chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \
+ --bowtie-path /sw/bowtie \
+ --num-threads 16 \
+ /ref/mouse_125 \
+ /expr/mouse_125
+
+=cut
diff --git a/rsem_perl_utils.pm b/rsem_perl_utils.pm
new file mode 100644
index 0000000..53f7d71
--- /dev/null
+++ b/rsem_perl_utils.pm
@@ -0,0 +1,109 @@
+#!/usr/bin/env perl
+
+package rsem_perl_utils;
+
+use strict;
+
+require Exporter;
+our @ISA = qw(Exporter);
+our @EXPORT = qw(runCommand);
+our @EXPORT_OK = qw(runCommand collectResults showVersionInfo getSAMTOOLS hasPolyA);
+
+my $version = "RSEM v1.3.3"; # Update version info here; printed by showVersionInfo and must match the release
+my $samtools = "samtools-1.3"; # If update to another version of SAMtools, need to change this
+
+# runCommand(command[, err_msg]): echo a shell command, run it, and exit(-1) if it fails.
+sub runCommand {
+    print $_[0]."\n";
+    my $status = system($_[0]);
+    # $? is -1 when the command could not be started at all; $! then holds the reason.
+    if ($? == -1) {
+        my @arr = split(/[ \t]+/, $_[0]);
+        print "$arr[0] : $!!\n";
+        print "Please check if you have compiled the associated codes by typing related \"make\" commands and/or made related executables ready to use.\n";
+        exit(-1);
+    }
+    # Any non-zero status is fatal; the optional err_msg is printed before the generic hint.
+    if ($status != 0) {
+        my $errmsg = "";
+        if (scalar(@_) > 1) { $errmsg .= $_[1]."\n"; }
+        $errmsg .= "\"$_[0]\" failed! Please check if you provide correct parameters/options for the pipeline!\n";
+        print $errmsg;
+        exit(-1);
+    }
+    print "\n";
+}
+
+my @allele_title = ("allele_id", "transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "AlleleIsoPct", "AlleleGenePct", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "AlleleIsoPct_from_pme_TPM", "AlleleGenePct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "TPM_coefficient_of_quartile_variation", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound", "FPKM_coefficient_of_quartile_variation"); # header columns used by collectResults for type "allele"
+# Header columns used by collectResults for type "isoform".
+my @transcript_title = ("transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "IsoPct", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "IsoPct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "TPM_coefficient_of_quartile_variation", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound", "FPKM_coefficient_of_quartile_variation");
+# Header columns used by collectResults for type "gene".
+my @gene_title = ("gene_id", "transcript_id(s)", "length", "effective_length", "expected_count", "TPM", "FPKM", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "TPM_coefficient_of_quartile_variation", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound", "FPKM_coefficient_of_quartile_variation");
+
+# collectResults(type, inpF, outF)
+# Each line of inpF holds the tab-separated values of one output column. The table is
+# written to outF transposed, below a header row taken from the title list selected by
+# type ("allele", "isoform" or "gene").
+sub collectResults {
+    my ($type, $inpF, $outF) = @_;
+    my @results = ();
+
+    # Three-argument open on lexical handles: the file names are used verbatim and no
+    # global bareword handle is shared with other code.
+    my $in_fh;
+    if (!open($in_fh, "<", $inpF)) { print "Fail to open file $inpF!\n"; exit(-1); }
+
+    while (my $line = <$in_fh>) {
+        chomp($line);
+        my @local_arr = split(/\t/, $line);
+        push(@results, \@local_arr);
+    }
+
+    close($in_fh);
+
+    # Without any input line there is no first line to size the table from.
+    my $n = scalar(@results);
+    if ($n == 0) { print "File $inpF is empty!\n"; exit(-1); }
+    my $m = scalar(@{$results[0]});
+
+    my $out_fh;
+    if (!open($out_fh, ">", $outF)) { print "Fail to create file $outF!\n"; exit(-1); }
+
+    my @out_arr = ();
+    for (my $i = 0; $i < $n; $i++) {
+        if ($type eq "allele") { push(@out_arr, $allele_title[$i]); }
+        elsif ($type eq "isoform") { push(@out_arr, $transcript_title[$i]); }
+        elsif ($type eq "gene") { push(@out_arr, $gene_title[$i]); }
+        else { print "A bug on 'collectResults' is detected!\n"; exit(-1); }
+    }
+    # join() keeps the tab separator local instead of changing the global $" for callers.
+    print $out_fh join("\t", @out_arr)."\n";
+
+    # Row i of the output collects element i of every input line.
+    for (my $i = 0; $i < $m; $i++) {
+        @out_arr = ();
+        for (my $j = 0; $j < $n; $j++) { push(@out_arr, $results[$j][$i]); }
+        print $out_fh join("\t", @out_arr)."\n";
+    }
+
+    close($out_fh);
+}
+
+sub showVersionInfo {
+    # Print the module's version string and end the program with status 0; arguments are not used.
+    print "Current version: " . $version . "\n"; exit(0);
+}
+
+sub getSAMTOOLS {
+    my $name = $samtools; return $name; # the SAMtools identifier configured at the top of this module
+}
+
+sub hasPolyA { # true when field 1 of the file's first line is numerically less than field 2
+    open(my $fh, "<", $_[0]) or return !1; # an unreadable file yields false instead of an error
+    my $line = <$fh>; close($fh);
+    return !1 unless defined($line); # empty file: nothing to compare
+    chomp($line); my ($fullLen, $totLen) = split(/ /, $line);
+    return $fullLen < $totLen;
+}
+
+1;
diff --git a/samValidator.cpp b/samValidator.cpp
new file mode 100644
index 0000000..e067966
--- /dev/null
+++ b/samValidator.cpp
@@ -0,0 +1,193 @@
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <set>
+#include <algorithm>
+
+#include <stdint.h>
+#include "htslib/sam.h"
+#include "sam_utils.h"
+
+#include "utils.h"
+#include "my_assert.h"
+
+using namespace std;
+
+samFile *in; // input alignment file, opened in main() from argv[1]
+bam_hdr_t *header; // header read from `in`
+bam1_t *b, *b2; // current record and, for paired-end data, the record read right after it
+
+set<string> used; // read names whose run of alignments has already ended
+
+bool isValid; // overall verdict; set to false as soon as one check fails
+
+bool check_read(bam1_t *b, bam_hdr_t *header) {
+    // Checks one aligned record: rejects CIGAR operations RSEM cannot use and alignments that leave the reference.
+    const uint32_t* cigar = bam_get_cigar(b);
+    for (uint32_t i = 0; i < b->core.n_cigar; ++i) {
+        const char op = bam_cigar_opchr(cigar[i]);
+        if (op == 'N') {
+            printf("\nSkipped region is detected (cigar N) for read %s!\nTo use RSEM, please align your reads to a set of transcript sequences instead of a genome.\n", bam_get_qname(b));
+            return false;
+        }
+        if (op == 'I' || op == 'D') {
+            printf("\nIndel alignment is detected (cigar %c) for read %s!\nRSEM currently does not support indel alignments.\n", op, bam_get_qname(b));
+            return false;
+        }
+        if (op == 'S' || op == 'H' || op == 'P') {
+            printf("\nClipping or padding is detected (cigar %c) for read %s!\nRSEM currently does not support clipping or padding.\n", op, bam_get_qname(b));
+            return false;
+        }
+    }
+    // A reference id outside the header would make the target_len/target_name lookups below read out of bounds.
+    if (b->core.tid < 0 || b->core.tid >= header->n_targets) {
+        printf("\nRead %s is not aligned to any sequence listed in the header!\n", bam_get_qname(b));
+        return false;
+    }
+    // Widen to long long before comparing and printing: the position types differ in width between htslib releases.
+    const long long start = b->core.pos, stop = bam_endpos(b), len = header->target_len[b->core.tid];
+    if (start < 0 || stop > len) {
+        if (bam_is_paired(b)) printf("\nMate %d of paired-end read %s", (bam_is_read1(b) ? 1 : 2), bam_get_qname(b));
+        else printf("\nRead %s", bam_get_qname(b));
+        printf(" aligns to [%lld, %lld) of transcript %s, which exceeds the transcript's boundary [0, %lld)!\n",
+            start, stop, header->target_name[b->core.tid], len);
+        return false;
+    }
+    return true;
+}
+
+int main(int argc, char* argv[]) {
+    if (argc != 2) {
+        printf("Usage: rsem-sam-validator <input.sam/input.bam/input.cram>\n");
+        exit(-1);
+    }
+    // Scans every record of the file named by argv[1], stops at the first violation and prints the verdict.
+    in = sam_open(argv[1], "r");
+    general_assert(in != 0, "Cannot open input file!");
+    header = sam_hdr_read(in);
+    general_assert(header != 0, "Cannot load SAM header!");
+    used.clear();
+    b = bam_init1(); b2 = bam_init1();
+
+    isValid = true;
+
+    HIT_INT_TYPE cnt = 0;
+    string cqname(""), qname;
+    int creadlen = 0, readlen = 0, creadlen2 = 0, readlen2 = 0;
+    int ispaired = -1; // -1 = not known yet; an int because plain char may be unsigned, which breaks the -1 test
+    // Any failed check clears isValid and `continue`s to the loop condition, which then ends the scan.
+    printf("."); fflush(stdout);
+    do {
+        int ret = sam_read1(in, header, b);
+        if (ret == -1) break; // end of input
+        else if (ret < 0) { printf("\nFail to read an alignment record! The input file may be truncated or corrupted.\n"); isValid = false; continue; }
+        assert(b->core.l_qseq > 0);
+
+        qname = bam_get_canonical_name(b);
+
+        // the first record decides whether the data are paired-end; every later record must agree
+        if (ispaired == -1) ispaired = bam_is_paired(b);
+        else {
+            isValid = (ispaired == bam_is_paired(b));
+            if (!isValid) {
+                printf("\nWe detected both single-end and paired-end reads in the data!\nRSEM currently does not support a mixture of single-end/paired-end reads.\n");
+                continue;
+            }
+        }
+
+        if (ispaired) {
+            isValid = (sam_read1(in, header, b2) >= 0) && (qname == bam_get_canonical_name(b2)) && bam_is_paired(b2);
+            if (!isValid) {
+                printf("\nOnly find one mate for paired-end read %s!\nPlease make sure that the two mates of a paired-end read are adjacent to each other.\n", bam_get_qname(b));
+                continue;
+            }
+
+            assert(b2->core.l_qseq > 0);
+
+            isValid = (bam_is_read1(b) && bam_is_read2(b2)) || (bam_is_read1(b2) && bam_is_read2(b));
+            if (!isValid) {
+                printf("\nThe two mates of paired-end read %s are marked as both mate1 or both mate2!\n", bam_get_qname(b));
+                continue;
+            }
+
+            int value = int(bam_is_mapped(b)) + int(bam_is_mapped(b2));
+            isValid = (value != 1);
+            if (!isValid) {
+                printf("\nPaired-end read %s has an alignment with only one mate aligned!\n", bam_get_qname(b));
+                printf("Currently RSEM does not handle mixed alignments for paired-end reads.\n");
+                continue;
+            }
+            // from here on b is mate 1 and b2 is mate 2
+            if (!bam_is_read1(b)) { bam1_t *tmp = b; b = b2; b2 = tmp; }
+
+            if (value == 2) {
+                isValid = (b->core.tid == b2->core.tid);
+                if (!isValid) {
+                    printf("\nPaired-end read %s has a discordant alignment (two mates aligned to different reference sequences)!\n", bam_get_qname(b));
+                    printf("Mate 1 aligns to %s and mate 2 aligns to %s\n", header->target_name[b->core.tid], header->target_name[b2->core.tid]);
+                    printf("Currently RSEM does not handle discordant alignments.\n");
+                    continue;
+                }
+
+                int strandedness = (int(bam_is_rev(b)) << 1) + int(bam_is_rev(b2));
+                isValid = (strandedness == 1 || strandedness == 2);
+                if (!isValid) {
+                    printf("\nPaired-end read %s has an alignment in which two mates aligned to the same strand!\n", bam_get_qname(b));
+                    printf("Its two mates aligned to %s in %s direction.\n", header->target_name[b->core.tid], (strandedness == 0 ? "forward" : "reverse"));
+                    continue;
+                }
+
+                bam1_t *tb = (b->core.pos < b2->core.pos ? b : b2);
+                isValid = tb->core.pos >= 0 && tb->core.pos + abs(tb->core.isize) <= header->target_len[tb->core.tid];
+                if (!isValid) {
+                    printf("\nPaired-end read %s aligns to [%lld, %lld) of transcript %s, which exceeds the transcript's boundary [0, %lld)!\n",
+                        bam_get_qname(b), static_cast<long long>(tb->core.pos), static_cast<long long>(tb->core.pos + abs(tb->core.isize)), header->target_name[tb->core.tid], static_cast<long long>(header->target_len[tb->core.tid]));
+                    continue;
+                }
+                isValid = check_read(b, header);
+                if (!isValid) continue;
+                isValid = check_read(b2, header);
+                if (!isValid) continue;
+            }
+
+            readlen = b->core.l_qseq;
+            readlen2 = b2->core.l_qseq;
+        }
+        else {
+            if (bam_is_mapped(b)) {
+                isValid = check_read(b, header);
+                if (!isValid) continue;
+            }
+            readlen = b->core.l_qseq;
+        }
+        // alignments of one read must be adjacent in the file and agree on the read/mate lengths
+        if (cqname != qname) {
+            isValid = used.find(qname) == used.end();
+            if (!isValid) { printf("\nThe alignments of read %s are not grouped together!\n", qname.c_str()); continue; }
+            used.insert(cqname);
+            cqname = qname;
+            creadlen = readlen;
+            if (ispaired) creadlen2 = readlen2;
+        }
+        else {
+            assert(cqname != "");
+            isValid = (creadlen == readlen && (!ispaired || creadlen2 == readlen2));
+            if (!isValid) { printf("\nRead %s have alignments showing different read/mate lengths!\n", qname.c_str()); continue; }
+        }
+
+        ++cnt;
+        if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
+
+    } while(isValid);
+
+    bam_destroy1(b); bam_destroy1(b2);
+    bam_hdr_destroy(header);
+    sam_close(in);
+
+    if (isValid) printf("\nThe input file is valid!\n");
+    else printf("The input file is not valid!\n");
+
+    return 0;
+}
diff --git a/sam_utils.h b/sam_utils.h
new file mode 100644
index 0000000..94db821
--- /dev/null
+++ b/sam_utils.h
@@ -0,0 +1,210 @@
+#ifndef SAM_UTILS_H_
+#define SAM_UTILS_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<vector>
+#include<string>
+#include<fstream>
+
+#include<stdint.h>
+
+#include "htslib/sam.h"
+
+#include "my_assert.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+
+
+/******************************************************/
+
+// These functions are adapted from the samtools source code because the originals are not exposed through sam.h/bam.h
+
+// Size in bytes of a fixed-width BAM auxiliary value, keyed by its one-letter type code.
+// Type codes that are not listed here yield 0.
+inline int bam_aux_type2size(char x) {
+  switch (x) {
+    case 'A': case 'c': case 'C':
+      return 1;
+    case 's': case 'S':
+      return 2;
+    case 'i': case 'I': case 'f':
+      return 4;
+    case 'd':
+      return 8;
+    default:
+      return 0;
+  }
+}
+
+// Make sure the data buffer of b can hold at least b->l_data bytes.
+// Nothing happens when the current capacity (b->m_data) is already large enough.
+// Otherwise the capacity is set to b->l_data, rounded up by htslib's kroundup32 macro,
+// and the buffer is reallocated to that size.
+// The process exits with an error message if the reallocation fails; the old buffer is
+// never overwritten with a NULL pointer.
+inline void expand_data_size(bam1_t *b) {
+  if (b->m_data >= b->l_data) return;
+
+  b->m_data = b->l_data;
+  kroundup32(b->m_data);
+
+  // Keep the result in a temporary: assigning it to b->data directly would lose the
+  // original buffer when realloc returns NULL
+  uint8_t *grown = (uint8_t*)realloc(b->data, b->m_data);
+  if (grown == NULL) {
+    fprintf(stderr, "Fail to allocate %lu bytes for a BAM record!\n", (unsigned long)b->m_data);
+    exit(-1);
+  }
+  b->data = grown;
+}
+
+/******************************************************/
+
+// These functions are specially designed for RSEM
+
+// Characters regarded as whitespace when a read name is cut down to its first word.
+// The pointer itself is const as well, which gives the variable internal linkage, so this
+// header can be included from several translation units without duplicate symbols.
+const char* const whitespaces = " \t\n\r\f\v";
+
+// Flag predicates: each one tests a single bit of the record's flag field.
+inline bool bam_is_paired(const bam1_t* b) { return (b->core.flag & BAM_FPAIRED) != 0; }
+inline bool bam_is_proper(const bam1_t* b) { return (b->core.flag & BAM_FPROPER_PAIR) != 0; }
+inline bool bam_is_mapped(const bam1_t* b) { return (b->core.flag & BAM_FUNMAP) == 0; }
+inline bool bam_is_unmapped(const bam1_t* b) { return (b->core.flag & BAM_FUNMAP) != 0; }
+inline bool bam_is_read1(const bam1_t* b) { return (b->core.flag & BAM_FREAD1) != 0; }
+inline bool bam_is_read2(const bam1_t* b) { return (b->core.flag & BAM_FREAD2) != 0; }
+
+// Return the read name of b cut at its first whitespace character.
+// Only the first whitespace-delimited word is kept, so names still match when an
+// aligner leaves extra words in the read name string.
+inline std::string bam_get_canonical_name(const bam1_t* b) {
+  const char* full_name = bam_get_qname(b);
+  // Length of the leading run without any whitespace character; this is the whole
+  // name when it contains no whitespace at all
+  const size_t word_len = std::strcspn(full_name, whitespaces);
+  return std::string(full_name, word_len);
+}
+
+// Current RSEM only accepts ungapped alignments.
+// Returns true when the record has exactly one CIGAR operation, that operation is a
+// match ('M', '=' or 'X'), and its length equals the read length.
+inline bool bam_check_cigar(bam1_t *b) {
+  // Test the count first so that the CIGAR array is never read when it is empty
+  if (b->core.n_cigar != 1) return false;
+
+  const uint32_t cigar = *bam_get_cigar(b);
+  const int op = bam_cigar_op(cigar);
+  const int32_t oplen = bam_cigar_oplen(cigar);
+
+  return (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) && (b->core.l_qseq == oplen);
+}
+
+// Convert the probability that an alignment is correct into a Phred-scaled quality,
+// i.e. -10 * log10(1 - val) rounded to the nearest integer.
+// The result is capped at 100 when the error probability is at most 1e-10, and is 0 when
+// the error probability is 1 or more (val <= 0), which keeps the cast below in range.
+// Declared inline because the definition lives in a header.
+inline uint8_t bam_prb_to_mapq(double val) {
+  const double err = 1.0 - val;
+  if (err <= 1e-10) return 100;
+  if (err >= 1.0) return 0;
+  return (uint8_t)(-10 * log10(err) + .5); // round it
+}
+
+// Return the sequence of the read as it was sequenced.
+// For a record flagged as reverse strand the stored bases are walked from last to first
+// and complemented; otherwise they are decoded in storage order.
+// Only the 4-bit codes 1, 2, 4, 8 and 15 (A, C, G, T and N) are accepted; any other code
+// trips an assertion.
+inline std::string bam_get_read_seq(const bam1_t* b) {
+  // Letters indexed by the 4-bit base code; 0 marks a code that is not accepted
+  static const char fwd_letters[16] = { 0, 'A', 'C', 0, 'G', 0, 0, 0, 'T', 0, 0, 0, 0, 0, 0, 'N' };
+  static const char rev_letters[16] = { 0, 'T', 'G', 0, 'C', 0, 0, 0, 'A', 0, 0, 0, 0, 0, 0, 'N' };
+
+  uint8_t *packed = bam_get_seq(b);
+  const int len = b->core.l_qseq;
+  const bool reversed = bam_is_rev(b);
+  const char *letters = (reversed ? rev_letters : fwd_letters);
+
+  std::string readseq = "";
+  char base = 0;
+  for (int k = 0; k < len; ++k) {
+    const int idx = (reversed ? len - 1 - k : k);
+    const char letter = letters[bam_seqi(packed, idx)];
+    if (letter == 0) assert(false);
+    else base = letter;
+    readseq.append(1, base);
+  }
+
+  return readseq;
+}
+
+// Return the quality string of the read as it was sequenced.
+// Each stored quality value is shifted by 33 to form a printable character; for a record
+// flagged as reverse strand the values are emitted from last to first.
+inline std::string bam_get_qscore(const bam1_t* b) {
+  const uint8_t *qual = bam_get_qual(b);
+  const int len = b->core.l_qseq;
+  const bool reversed = bam_is_rev(b);
+
+  std::string qscore = "";
+  for (int k = 0; k < len; ++k) {
+    const int idx = (reversed ? len - 1 - k : k);
+    qscore.append(1, (char)(qual[idx] + 33));
+  }
+
+  return qscore;
+}
+
+//convert transcript coordinate to chromosome coordinate and generate CIGAR string
+// Inputs:  transcript - supplies the length, the strand and the list of intervals used below
+//          sp, ep     - start and end of the region in transcript coordinates; the region
+//                       length is computed as ep - sp + 1, so both ends are inclusive
+// Outputs: pos        - 0-based start position of the region on the chromosome
+//          n_cigar    - number of CIGAR operations generated by this call
+//          data       - the generated operations are appended here; existing content is kept
+// Parts of the region that fall outside [1, length] are reported as insertions.
+// NOTE(review): the function is defined in a header without `inline`; including this header
+// from more than one translation unit of one program would presumably give duplicate definitions.
+void tr2chr(const Transcript& transcript, int sp, int ep, int& pos, int& n_cigar, std::vector<uint32_t>& data) {
+ int length = transcript.getLength();
+ char strand = transcript.getStrand();
+ const std::vector<Interval>& structure = transcript.getStructure();
+
+ int s, i;
+ int oldlen, curlen;
+
+ uint32_t operation;
+
+ n_cigar = 0;
+ s = structure.size();
+
+ // On the minus strand the region is mirrored, so the rest of the code can walk the
+ // intervals in their stored order
+ if (strand == '-') {
+ int tmp = sp;
+ sp = length - ep + 1;
+ ep = length - tmp + 1;
+ }
+
+ // Region entirely outside the transcript: emit one insertion covering all of it,
+ // anchored at the end of the last interval or just before the first one
+ if (ep < 1 || sp > length) { // a read which align to polyA tails totally!
+ pos = (sp > length ? structure[s - 1].end : structure[0].start - 1); // 0 based
+
+ n_cigar = 1;
+ operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP;
+ data.push_back(operation);
+
+ return;
+ }
+
+ // Region starts before the transcript: the overhanging 1 - sp bases become a leading insertion
+ if (sp < 1) {
+ n_cigar++;
+ operation = (1 - sp) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP;
+ data.push_back(operation);
+ sp = 1;
+ }
+
+ oldlen = curlen = 0;
+
+ // Find the interval that contains sp; oldlen/curlen are the cumulative interval lengths
+ // before and including interval i
+ for (i = 0; i < s; i++) {
+ oldlen = curlen;
+ curlen += structure[i].end - structure[i].start + 1;
+ if (curlen >= sp) break;
+ }
+ assert(i < s);
+ pos = structure[i].start + (sp - oldlen - 1) - 1; // 0 based
+
+ // While the region extends past the current interval: emit a match up to the end of the
+ // interval, then a reference skip over the gap to the next interval
+ while (curlen < ep && i < s) {
+ n_cigar++;
+ operation = (curlen - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH;
+ data.push_back(operation);
+ ++i;
+ // No further interval: the loop condition (i < s) ends the loop on the next test
+ if (i >= s) continue;
+ n_cigar++;
+ operation = (structure[i].start - structure[i - 1].end - 1) << BAM_CIGAR_SHIFT | BAM_CREF_SKIP;
+ data.push_back(operation);
+
+ oldlen = curlen;
+ sp = oldlen + 1;
+ curlen += structure[i].end - structure[i].start + 1;
+ }
+
+ // Either the region runs past the last interval (trailing insertion of ep - length bases)
+ // or it ends inside interval i (final match)
+ if (i >= s) {
+ n_cigar++;
+ operation = (ep - length) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP;
+ data.push_back(operation);
+ }
+ else {
+ n_cigar++;
+ operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH;
+ data.push_back(operation);
+ }
+}
+
+#endif /* SAM_UTILS_H_ */
diff --git a/sampling.h b/sampling.h
new file mode 100644
index 0000000..7e445cf
--- /dev/null
+++ b/sampling.h
@@ -0,0 +1,67 @@
+#ifndef SAMPLING
+#define SAMPLING
+
+#include<ctime>
+#include<cstdio>
+#include<cassert>
+#include<vector>
+#include<set>
+
+#include "boost/random.hpp"
+
+// Type of the seeds drawn and handed out by engineFactory
+typedef unsigned int seedType;
+// Random engine used throughout this header
+typedef boost::random::mt19937 engine_type;
+// Distributions used together with engine_type
+typedef boost::random::uniform_01<> uniform_01_dist;
+typedef boost::random::gamma_distribution<> gamma_dist;
+// Generators that bind a distribution to a reference to an engine (note the engine_type&)
+typedef boost::random::variate_generator<engine_type&, uniform_01_dist> uniform_01_generator;
+typedef boost::random::variate_generator<engine_type&, gamma_dist> gamma_generator;
+
+// Hands out random engines that are seeded differently from one another.
+// init() creates a single "seed engine"; every call to new_engine() draws seeds from it
+// until it finds one that has not been handed out before, and returns a newly allocated
+// engine built from that seed.
+class engineFactory {
+public:
+  // Create the seed engine from the current time; a previous seed engine is released first.
+  static void init() { reset(new engine_type(time(NULL))); }
+  // Create the seed engine from an explicit seed; a previous seed engine is released first.
+  static void init(seedType seed) { reset(new engine_type(seed)); }
+
+  // Release the seed engine. The pointer is cleared, so calling finish() again is harmless.
+  static void finish() { reset(NULL); }
+
+  // Return a newly allocated engine; the caller owns it and has to delete it.
+  // init() must have been called before.
+  static engine_type *new_engine() {
+    assert(seedEngine != NULL);
+
+    static std::set<seedType> seedSet; // seeds handed out so far
+    seedType seed;
+
+    do {
+      seed = (*seedEngine)();
+    } while (seedSet.find(seed) != seedSet.end());
+    seedSet.insert(seed);
+
+    return new engine_type(seed);
+  }
+
+private:
+  // Replace the seed engine, deleting the one held so far (deleting NULL is a no-op)
+  static void reset(engine_type *replacement) {
+    delete seedEngine;
+    seedEngine = replacement;
+  }
+
+  static engine_type *seedEngine;
+};
+
+engine_type* engineFactory::seedEngine = NULL;
+
+// Draw an index from the distribution described by the cumulative array arr[0 .. len - 1].
+// A number is drawn in [0, arr[len - 1]) and the first index whose cumulative value is
+// greater than that number is returned (intervals are half open: [,) ).
+// If arr[len - 1] == 0.0 no such index exists: the values are printed and the assertion
+// fails. One possibility for that case would be to sample uniformly from 0...len-1.
+int sample(uniform_01_generator& rg, std::vector<double>& arr, int len) {
+  const double prb = rg() * arr[len - 1];
+
+  // Binary search; lo ends up at the first position whose value is not <= prb
+  int lo = 0;
+  for (int hi = len - 1; lo <= hi; ) {
+    const int mid = (lo + hi) / 2;
+    if (arr[mid] <= prb) lo = mid + 1;
+    else hi = mid - 1;
+  }
+
+  if (lo >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); }
+  assert(lo < len);
+
+  return lo;
+}
+
+#endif
diff --git a/scanForPairedEndReads.cpp b/scanForPairedEndReads.cpp
new file mode 100644
index 0000000..c494f86
--- /dev/null
+++ b/scanForPairedEndReads.cpp
@@ -0,0 +1,137 @@
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<vector>
+#include<algorithm>
+
+#include <stdint.h>
+#include "htslib/sam.h"
+#include "sam_utils.h"
+
+#include "utils.h"
+#include "my_assert.h"
+
+using namespace std;
+
+// Number of threads requested on the command line; used for writing the output when > 1
+int nThreads;
+// Input alignment file and output BAM file
+samFile *in, *out;
+// Header read from the input and written unchanged to the output
+bam_hdr_t *header;
+// Record buffers; b holds the record read most recently
+bam1_t *b, *b2;
+// Copies of the records of the read being processed:
+// arr_both            - mapped records flagged as properly paired
+// arr_partial_1 / _2  - remaining records flagged as first / second mate
+// arr_partial_unknown - remaining records flagged as neither mate
+vector<bam1_t*> arr_both, arr_partial_1, arr_partial_2, arr_partial_unknown;
+
+// Store a copy of b in the list that matches its flags: arr_both when the record is mapped
+// and properly paired, otherwise the list for first mates, second mates, or records that
+// carry neither mate flag.
+inline void add_to_appropriate_arr(bam1_t *b) {
+  vector<bam1_t*> *bucket;
+
+  if (bam_is_mapped(b) && bam_is_proper(b)) bucket = &arr_both;
+  else if (bam_is_read1(b)) bucket = &arr_partial_1;
+  else if (bam_is_read2(b)) bucket = &arr_partial_2;
+  else bucket = &arr_partial_unknown;
+
+  bucket->push_back(bam_dup1(b));
+}
+
+// Code for the combination of mate and strand flags: 1 when the first-mate bit and the
+// reverse-strand bit are either both set or both clear, 0 when exactly one of them is set.
+char get_pattern_code(uint32_t flag) {
+  const bool first_mate = (flag & BAM_FREAD1) != 0;
+  const bool reversed = (flag & BAM_FREVERSE) != 0;
+  return (first_mate == reversed) ? 1 : 0;
+}
+
+// Ordering used to sort the fully aligned records of one read.
+// Records are compared by target id, then by the smaller of (pos, mpos), then by the larger
+// of (pos, mpos), and finally by the pattern code of their mate and strand flags.
+bool less_than(bam1_t *a, bam1_t *b) {
+  if (a->core.tid != b->core.tid) return a->core.tid < b->core.tid;
+
+  const int32_t a_left = min(a->core.pos, a->core.mpos);
+  const int32_t b_left = min(b->core.pos, b->core.mpos);
+  if (a_left != b_left) return a_left < b_left;
+
+  const int32_t a_right = max(a->core.pos, a->core.mpos);
+  const int32_t b_right = max(b->core.pos, b->core.mpos);
+  if (a_right != b_right) return a_right < b_right;
+
+  return get_pattern_code(a->core.flag) < get_pattern_code(b->core.flag);
+}
+
+// Write one record to the output file; the program stops when the write fails.
+static void write_record(bam1_t *rec) {
+  general_assert(sam_write1(out, header, rec) >= 0, "Fail to write an alignment to the output BAM file!");
+}
+
+// Write the last record of arr, free it and remove it from arr.
+// Does nothing when arr is empty, so lists of unequal lengths can be paired up safely.
+static void write_back_and_release(vector<bam1_t*>& arr) {
+  if (arr.empty()) return;
+  write_record(arr.back());
+  bam_destroy1(arr.back());
+  arr.pop_back();
+}
+
+// rsem-scan-for-paired-end-reads: copy the input alignments to a BAM file, regrouping the
+// records of every paired-end read.
+// Records that share a read name are expected to be adjacent in the input. For a paired-end
+// read the properly paired records are sorted with less_than and written first; the other
+// records are then written two at a time, a first mate followed by a second mate where
+// possible. Records of single-end reads are copied as they are.
+int main(int argc, char* argv[]) {
+  if (argc != 4) {
+    printf("Usage: rsem-scan-for-paired-end-reads number_of_threads input.[sam/bam/cram] output.bam\n");
+    exit(-1);
+  }
+
+  nThreads = atoi(argv[1]);
+  in = sam_open(argv[2], "r");
+  general_assert(in != 0, "Cannot open " + cstrtos(argv[2]) + " !");
+  header = sam_hdr_read(in);
+  general_assert(header != 0, "Cannot load SAM header!");
+  out = sam_open(argv[3], "wb");
+  general_assert(out != 0, "Cannot open " + cstrtos(argv[3]) + " !");
+  general_assert(sam_hdr_write(out, header) >= 0, "Fail to write the header of the output BAM file!");
+  if (nThreads > 1) general_assert(hts_set_threads(out, nThreads) == 0, "Fail to create threads for writing the BAM file!");
+
+  b = bam_init1(); b2 = bam_init1();
+
+  string qname;
+  bool go_on = (sam_read1(in, header, b) >= 0);
+  bool isPaired;
+  HIT_INT_TYPE cnt = 0;
+
+  printf("."); fflush(stdout);
+
+  while (go_on) {
+    qname = bam_get_canonical_name(b);
+    isPaired = bam_is_paired(b);
+
+    if (isPaired) {
+      // Collect every record of this read
+      add_to_appropriate_arr(b);
+      while ((go_on = (sam_read1(in, header, b) >= 0)) && (qname == bam_get_canonical_name(b))) {
+        general_assert_1(bam_is_paired(b), "Read " + qname + " is detected as both single-end and paired-end read!");
+        add_to_appropriate_arr(b);
+      }
+
+      general_assert_1(arr_both.size() % 2 == 0, "Number of first and second mates in read " + qname + "'s full alignments (both mates are aligned) are not matched!");
+      general_assert_1((arr_partial_1.size() + arr_partial_2.size() + arr_partial_unknown.size()) % 2 == 0, "Number of first and second mates in read " + qname + "'s partial alignments (at most one mate is aligned) are not matched!");
+
+      if (!arr_both.empty()) {
+        sort(arr_both.begin(), arr_both.end(), less_than);
+        for (size_t i = 0; i < arr_both.size(); i++) { write_record(arr_both[i]); bam_destroy1(arr_both[i]); }
+        arr_both.clear();
+      }
+
+      // Pair a first mate with a second mate while both are available; a mate without a
+      // partner is followed by a record of unknown mate instead. The parity check above
+      // does not guarantee that such a record exists (e.g. two first mates and nothing
+      // else), which is why the helper tolerates an empty list.
+      while (!arr_partial_1.empty() || !arr_partial_2.empty()) {
+        const bool have_first = !arr_partial_1.empty();
+        const bool have_second = !arr_partial_2.empty();
+
+        if (have_first) write_back_and_release(arr_partial_1);
+        if (have_second) write_back_and_release(arr_partial_2);
+        if (!(have_first && have_second)) write_back_and_release(arr_partial_unknown);
+      }
+
+      while (!arr_partial_unknown.empty()) write_back_and_release(arr_partial_unknown);
+    }
+    else {
+      // Single-end read: copy its records unchanged. The canonical name is used for the
+      // comparison, as in the paired-end branch, because qname holds the canonical name.
+      write_record(b);
+      while ((go_on = (sam_read1(in, header, b) >= 0)) && (qname == bam_get_canonical_name(b))) {
+        write_record(b);
+      }
+    }
+
+    ++cnt;
+    if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
+  }
+
+  printf("\nFinished!\n");
+
+  bam_destroy1(b); bam_destroy1(b2);
+  bam_hdr_destroy(header);
+
+  sam_close(in);
+  // Buffered output is flushed on close, so a failure here means an incomplete file
+  general_assert(sam_close(out) >= 0, "Fail to close the output BAM file!");
+
+  return 0;
+}
diff --git a/simul.h b/simul.h
new file mode 100644
index 0000000..dc6ec8b
--- /dev/null
+++ b/simul.h
@@ -0,0 +1,45 @@
+#ifndef SIMUL_H_
+#define SIMUL_H_
+
+#include<cassert>
+
+#include "boost/random.hpp"
+
+// Random sampler used for simulation: a mt19937 engine plus a generator of uniform
+// numbers bound to that engine.
+class simul {
+public:
+
+  // Seed the engine and bind the uniform generator to it
+  simul(unsigned int seed) : engine(seed), rg(engine, boost::random::uniform_01<>()) {
+  }
+
+  // Draw an index from the distribution described by arr[0 .. len - 1].
+  // interval : [,)
+  // A number in [0, arr[len - 1]) is drawn and the first index whose value exceeds it is
+  // returned. If no index qualifies, arr[len - 1] is asserted to be 0.0 and an index is
+  // drawn uniformly from 0 ... len - 1 instead.
+  int sample(double* arr, int len) {
+    const double prb = random() * arr[len - 1];
+
+    int lo = 0;
+    for (int hi = len - 1; lo <= hi; ) {
+      const int mid = (lo + hi) / 2;
+      if (arr[mid] <= prb) lo = mid + 1;
+      else hi = mid - 1;
+    }
+
+    if (lo < len) return lo;
+
+    assert(arr[len - 1] == 0.0);
+    return int(random() * len);
+  }
+
+  // Next number from the uniform generator
+  double random() { return rg(); };
+
+private:
+  // engine has to be declared before rg: rg keeps a reference to it
+  boost::random::mt19937 engine;
+  boost::random::variate_generator<boost::random::mt19937&, boost::random::uniform_01<> > rg;
+};
+
+#endif /* SIMUL_H_ */
+
diff --git a/simulation.cpp b/simulation.cpp
new file mode 100644
index 0000000..3b406b5
--- /dev/null
+++ b/simulation.cpp
@@ -0,0 +1,225 @@
+#include<cmath>
+#include<ctime>
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<iostream>
+#include<fstream>
+#include<sstream>
+#include<vector>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "Read.h"
+#include "SingleRead.h"
+#include "SingleReadQ.h"
+#include "PairedEndRead.h"
+#include "PairedEndReadQ.h"
+
+#include "Model.h"
+#include "SingleModel.h"
+#include "SingleQModel.h"
+#include "PairedEndModel.h"
+#include "PairedEndQModel.h"
+
+#include "Refs.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+
+#include "WriteResults.h"
+
+#include "simul.h"
+
+using namespace std;
+
+// Print progress information; cleared by the -q option
+bool verbose = true;
+
+// Whether the reference is allele-specific, and the number of tab-separated columns that
+// precede the TPM column in the results file (6 if allele-specific, 5 otherwise)
+bool alleleS;
+int OFFSITE;
+
+// Number of reads to simulate
+READ_INT_TYPE N;
+// Model type read from the model file (0..3) and number of reference sequences
+int model_type, M;
+
+Refs refs;
+Transcripts transcripts;
+
+// eel is filled by calcExpectedEffectiveLengths
+vector<double> eel;
+// theta[0] is the noise fraction given on the command line, theta[1..M] the sampling
+// weights of the transcripts; counts[sid] is the number of reads simulated from sid
+vector<double> theta, counts;
+
+// Output read files: number of files, their streams and their names
+int n_os;
+ostream *os[2];
+char outReadF[2][STRLEN];
+
+char refName[STRLEN];
+char refF[STRLEN], tiF[STRLEN];
+
+// Random sampler handed to the model; created in main
+simul *sampler;
+
+// Open the output read file(s) for the given model type and store them in os[]:
+//   0: outFN.fa                 1: outFN.fq
+//   2: outFN_1.fa, outFN_2.fa   3: outFN_1.fq, outFN_2.fq
+// n_os is set to the number of files. The program stops with a message when the type is
+// unknown, when a file name does not fit into its buffer, or when a file cannot be opened.
+void genOutReadStreams(int type, char *outFN) {
+  general_assert(type >= 0 && type <= 3, "Unknown model type " + itos(type) + "!");
+
+  const bool paired = (type == 2 || type == 3);
+  const char *ext = ((type == 0 || type == 2) ? "fa" : "fq");
+
+  n_os = (paired ? 2 : 1);
+
+  for (int i = 0; i < n_os; i++) {
+    // snprintf never writes past the buffer; its return value tells whether the name fit
+    const int written = (paired ? snprintf(outReadF[i], STRLEN, "%s_%d.%s", outFN, i + 1, ext)
+                                : snprintf(outReadF[i], STRLEN, "%s.%s", outFN, ext));
+    general_assert(written >= 0 && written < STRLEN, "Output name " + cstrtos(outFN) + " is too long!");
+
+    ofstream *fout = new ofstream(outReadF[i]);
+    general_assert(fout->is_open(), "Cannot open " + cstrtos(outReadF[i]) + " for writing!");
+    os[i] = fout;
+  }
+}
+
+// Simulate N reads with the model stored in modelF.
+// The sampling weight of transcript i is its TPM value, taken from column OFFSITE + 1 of
+// line i + 1 of resultsF, times its expected effective length; the weights are scaled so
+// that they sum to 1 - theta[0]. Every simulated read is written to the output streams
+// and counted in counts[sid].
+// The program stops with a message when resultsF cannot be opened, has fewer than M
+// records, or has a record with too few columns.
+template<class ReadType, class ModelType>
+void simulate(char* modelF, char* resultsF) {
+  ModelType model(&refs);
+  ReadType read;
+  int sid;
+
+  model.read(modelF);
+
+  //calculate eel
+  calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
+
+  //generate theta vector
+  ifstream fin(resultsF);
+  general_assert(fin.is_open(), "Cannot open " + cstrtos(resultsF) + "! It may not exist.");
+
+  string line;
+  double denom = 0.0;
+  getline(fin, line); // read the first line, which is just column names
+  for (int i = 1; i <= M; i++) {
+    const bool has_line = !getline(fin, line).fail();
+    general_assert(has_line, "File " + cstrtos(resultsF) + " contains fewer than " + itos(M) + " records!");
+
+    // Skip the OFFSITE columns in front of the TPM column. A missing tab is reported
+    // instead of letting npos + 1 wrap around to the start of the line.
+    size_t pos = 0;
+    for (int j = 0; j < OFFSITE; j++) {
+      const size_t tab = line.find('\t', pos);
+      general_assert(tab != string::npos, "Line " + itos(i + 1) + " of " + cstrtos(resultsF) + " has too few columns!");
+      pos = tab + 1;
+    }
+    size_t pos2 = line.find('\t', pos);
+    if (pos2 == string::npos) pos2 = line.length();
+
+    const double tpm = atof(line.substr(pos, pos2 - pos).c_str());
+    theta[i] = tpm * eel[i]; // during simulation, there is no check for effL < 0. The reason is for that case, eel[i] here = 0 and therefore no chance to sample from it
+    denom += theta[i];
+  }
+  assert(denom > EPSILON);
+  fin.close();
+  for (int i = 1; i <= M; i++) theta[i] = theta[i] / denom * (1.0 - theta[0]);
+
+  READ_INT_TYPE resimulation_count = 0;
+
+  //simulating...
+  model.startSimulation(sampler, theta);
+  for (READ_INT_TYPE i = 0; i < N; i++) {
+    // repeat until the model reports success for read i
+    while (!model.simulate(i, read, sid)) { ++resimulation_count; }
+    read.write(n_os, os);
+    ++counts[sid];
+    if ((i + 1) % 1000000 == 0 && verbose) cout<<"GEN "<< i + 1<< endl;
+  }
+  model.finishSimulation();
+
+  cout<< "Total number of resimulation is "<< resimulation_count<< endl;
+}
+
+// Close and free the n_os output streams stored in os[].
+void releaseOutReadStreams() {
+  int idx = 0;
+  while (idx < n_os) {
+    // the streams in os[] are accessed as ofstream objects here in order to close them
+    ofstream *fout = static_cast<ofstream*>(os[idx]);
+    fout->close();
+    delete os[idx];
+    ++idx;
+  }
+}
+
+// rsem-simulate-reads: simulate reads from a learned model and an expression profile.
+// See the usage text below for the meaning of the arguments and the outputs.
+// Every input check is carried out unconditionally; nothing with a side effect is placed
+// inside assert(), so builds with NDEBUG behave the same.
+int main(int argc, char* argv[]) {
+  bool quiet = false;
+  FILE *fi = NULL;
+
+  if (argc < 7 || argc > 10) {
+    printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [--seed seed] [-q]\n\n");
+    printf("Parameters:\n\n");
+    printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n");
+    printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n");
+    printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. If the RSEM references built are aware of allele-specific transcripts, 'sample_name.alleles.results' should be used instead.\n");
+    printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n");
+    printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n");
+    printf("output_name: Prefix for all output files.\n");
+    printf("--seed seed: Set seed for the random number generator used in simulation. The seed should be a 32-bit unsigned integer.\n");
+    printf("-q: Set it will stop outputting intermediate information.\n\n");
+    printf("Outputs:\n\n");
+    printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n");
+    printf("output_name.sim.alleles.results: Allele-specific expression levels estimated by counting where each simulated read comes from.\n\n");
+    printf("output_name.fa if single-end without quality score;\noutput_name.fq if single-end with quality score;\noutput_name_1.fa & output_name_2.fa if paired-end without quality score;\noutput_name_1.fq & output_name_2.fq if paired-end with quality score.\n\n");
+    printf("Format of the header line: Each simulated read's header line encodes where it comes from. The header line has the format:\n\n");
+    printf("\t{>/@}_rid_dir_sid_pos[_insertL]\n\n");
+    printf("{>/@}: Either '>' or '@' must appear. '>' appears if FASTA files are generated and '@' appears if FASTQ files are generated\n");
+    printf("rid: Simulated read's index, numbered from 0\n");
+    printf("dir: The direction of the simulated read. 0 refers to forward strand ('+') and 1 refers to reverse strand ('-')\n");
+    printf("sid: Represent which transcript this read is simulated from. It ranges between 0 and M, where M is the total number of transcripts. If sid=0, the read is simulated from the background noise. Otherwise, the read is simulated from a transcript with index sid. Transcript sid's transcript name can be found in the 'transcript_id' column of the 'sample_name.isoforms.results' file (at line sid + 1, line 1 is for column names)\n");
+    printf("pos: The start position of the simulated read in strand dir of transcript sid. It is numbered from 0\n");
+    printf("insertL: Only appear for paired-end reads. It gives the insert length of the simulated read.\n\n");
+    printf("Example:\n\n");
+    printf("Suppose we want to simulate 50 millon single-end reads with quality scores and use the parameters learned from [Example](#example). In addition, we set theta0 as 0.2 and output_name as 'simulated_reads'. The command is:\n\n");
+    printf("\trsem-simulate-reads /ref/mouse_125 mmliver_single_quals.stat/mmliver_single_quals.model mmliver_single_quals.isoforms.results 0.2 50000000 simulated_reads\n");
+    exit(-1);
+  }
+
+  // Optional arguments
+  sampler = NULL;
+  for (int i = 7; i < argc; i++) {
+    if (!strcmp(argv[i], "-q")) quiet = true;
+    if (!strcmp(argv[i], "--seed")) {
+      general_assert(i + 1 < argc, "Option --seed requires a value!");
+      istringstream reader(argv[i + 1]);
+      unsigned int seed;
+      const bool seed_ok = !(reader >> seed).fail();
+      general_assert(seed_ok, "Cannot parse seed " + cstrtos(argv[i + 1]) + "!");
+      delete sampler; // a repeated --seed replaces the sampler created for the earlier one
+      sampler = new simul(seed);
+      ++i; // the seed value has been consumed
+    }
+  }
+
+  verbose = !quiet;
+  if (sampler == NULL) sampler = new simul(time(NULL));
+
+  // refName, refF and tiF are STRLEN bytes each; the longest suffix appended below is ".seq"
+  general_assert(strlen(argv[1]) + 5 <= (size_t)STRLEN, "Reference name " + cstrtos(argv[1]) + " is too long!");
+  strcpy(refName, argv[1]);
+  alleleS = isAlleleSpecific(refName);
+  OFFSITE = (alleleS ? 6: 5);
+
+  //load basic files
+  sprintf(refF, "%s.seq", argv[1]);
+  refs.loadRefs(refF);
+  M = refs.getM();
+  sprintf(tiF, "%s.ti", argv[1]);
+  transcripts.readFrom(tiF);
+
+  //read model type from modelF
+  fi = fopen(argv[2], "r");
+  if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[2]); exit(-1); }
+  const int n_fields = fscanf(fi, "%d", &model_type);
+  fclose(fi);
+  general_assert(n_fields == 1, "Cannot read the model type from " + cstrtos(argv[2]) + "!");
+  general_assert(model_type >= 0 && model_type <= 3, "Unknown model type " + itos(model_type) + "!");
+
+  theta.assign(M + 1, 0.0);
+  theta[0] = atof(argv[4]);
+
+  // N is a 64-bit count; parsing it through a stream keeps values that do not fit in an int
+  istringstream n_reader(argv[5]);
+  const bool n_ok = !(n_reader >> N).fail();
+  general_assert(n_ok, "Cannot parse the number of reads " + cstrtos(argv[5]) + "!");
+
+  genOutReadStreams(model_type, argv[6]);
+
+  counts.assign(M + 1, 0.0);
+
+  switch(model_type) {
+  case 0: simulate<SingleRead, SingleModel>(argv[2], argv[3]); break;
+  case 1: simulate<SingleReadQ, SingleQModel>(argv[2], argv[3]); break;
+  case 2: simulate<PairedEndRead, PairedEndModel>(argv[2], argv[3]); break;
+  case 3: simulate<PairedEndReadQ, PairedEndQModel>(argv[2], argv[3]); break;
+  }
+
+  writeResultsSimulation(M, refName, argv[6], transcripts, eel, counts);
+  releaseOutReadStreams();
+  delete sampler;
+
+  return 0;
+}
diff --git a/synthesisRef.cpp b/synthesisRef.cpp
new file mode 100644
index 0000000..266cd8b
--- /dev/null
+++ b/synthesisRef.cpp
@@ -0,0 +1,227 @@
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<fstream>
+#include<sstream>
+#include<map>
+#include<vector>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+
+using namespace std;
+
+// Print progress information; set from the "quiet" command line argument
+bool verbose = true;
+
+// Number of sequences read from the reference FASTA files
+int M;
+
+// Sequence of every reference entry, keyed by sequence name
+map<string, string> name2seq;
+map<string, string>::iterator iter;
+
+Transcripts transcripts(1); // no genome, just transcript set
+// Names of the output files written by writeResults
+char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN];
+char gtF[STRLEN], taF[STRLEN]; // group info between gene and transcript, transcript and allele
+
+// 0: no mapping file, 1: mapping file given, 2: allele-specific mapping file given
+int hasMappingFile;
+char mappingFile[STRLEN];
+
+// Filled by loadMappingInfo; main looks up gene_id by sequence name in this table
+map<string, string> mi_table; // mapping info table
+map<string, string>::iterator mi_iter; //mapping info table's iterator
+
+map<string, string> mi_table2; // allele_id to transcript_id
+map<string, string>::iterator mi_iter2; // corresponding iterator
+
+// Load the mapping file into mi_table (and mi_table2 for allele-specific references).
+//   file_type 1: each line holds "value key";        mi_table[key] = value
+//   file_type 2: each line holds "value value2 key"; mi_table[key] = value and
+//                mi_table2[key] = value2
+// Lines that are blank or start with '#' are skipped. The program stops with a message
+// when the file cannot be opened, the type is unknown, or a line has too few fields.
+void loadMappingInfo(int file_type, char* mappingF) {
+  ifstream fin(mappingF);
+  string line, key, value, value2;
+
+  general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist.");
+  general_assert(file_type == 1 || file_type == 2, "Unknown mapping file type " + itos(file_type) + "!");
+
+  const bool allele_specific = (file_type == 2);
+
+  mi_table.clear();
+  if (allele_specific) mi_table2.clear();
+
+  int line_no = 0;
+  while (getline(fin, line)) {
+    ++line_no;
+    line = cleanStr(line);
+    // The blank test comes first so that line[0] is only read on a non-empty line
+    if (line.find_first_not_of(" \t\r\n\f\v") == string::npos || line[0] == '#') continue;
+
+    istringstream strin(line);
+    bool parsed;
+    if (allele_specific) parsed = !(strin >> value >> value2 >> key).fail();
+    else parsed = !(strin >> value >> key).fail();
+    // Without this check a short line would store an empty or stale key
+    general_assert(parsed, "Line " + itos(line_no) + " of " + cstrtos(mappingF) + " has too few fields!");
+
+    mi_table[key] = value;
+    if (allele_specific) mi_table2[key] = value2;
+  }
+
+  fin.close();
+}
+
+// Open path for writing on fout; the program stops with a message when that fails.
+static void openOutputFile(ofstream& fout, char* path) {
+  fout.open(path);
+  general_assert(fout.is_open(), "Cannot open " + cstrtos(path) + " for writing!");
+}
+
+// Write the reference files derived from the loaded transcripts:
+//   refName.ti              transcript information
+//   refName.grp             index of the first transcript of every gene, followed by M + 1
+//   refName.gt, refName.ta  (option == 2 only) per gene, the index into the .ta list of its
+//                           first transcript; and the index of the first entry of every
+//                           transcript, followed by M + 1
+//   refName.transcripts.fa  the sequences in transcript order
+// Transcripts are expected to be ordered so that entries of one gene (and of one
+// transcript) are adjacent: a new group starts whenever the id changes.
+void writeResults(int option, char* refName) {
+  ofstream fout;
+  string cur_gene_id, cur_transcript_id, name;
+  vector<int> gi, gt, ta;
+  const bool allele_specific = (option == 2);
+
+  sprintf(tiF, "%s.ti", refName);
+  transcripts.writeTo(tiF);
+  if (verbose) { printf("Transcript Information File is generated!\n"); }
+
+  cur_gene_id = ""; gi.clear();
+  if (allele_specific) { cur_transcript_id = ""; gt.clear(); ta.clear(); }
+  for (int i = 1; i <= M; i++) {
+    const Transcript& transcript = transcripts.getTranscriptAt(i);
+    if (cur_gene_id != transcript.getGeneID()) {
+      gi.push_back(i);
+      if (allele_specific) gt.push_back((int)ta.size());
+      cur_gene_id = transcript.getGeneID();
+    }
+    if (allele_specific && (cur_transcript_id != transcript.getTranscriptID())) {
+      ta.push_back(i);
+      cur_transcript_id = transcript.getTranscriptID();
+    }
+  }
+  // sentinels that close the last group
+  gi.push_back(M + 1);
+  if (allele_specific) { gt.push_back((int)ta.size()); ta.push_back(M + 1); }
+
+  sprintf(groupF, "%s.grp", refName);
+  openOutputFile(fout, groupF);
+  for (size_t i = 0; i < gi.size(); i++) fout<< gi[i]<< '\n';
+  fout.close();
+  if (verbose) { printf("Group File is generated!\n"); }
+
+  if (allele_specific) {
+    sprintf(gtF, "%s.gt", refName);
+    openOutputFile(fout, gtF);
+    for (size_t i = 0; i < gt.size(); i++) fout<< gt[i]<< '\n';
+    fout.close();
+    sprintf(taF, "%s.ta", refName);
+    openOutputFile(fout, taF);
+    for (size_t i = 0; i < ta.size(); i++) fout<< ta[i]<< '\n';
+    fout.close();
+    if (verbose) { printf("Allele-specific group files are generated!\n"); }
+  }
+
+  sprintf(refFastaF, "%s.transcripts.fa", refName);
+  openOutputFile(fout, refFastaF);
+  for (int i = 1; i <= M; i++) {
+    name = transcripts.getTranscriptAt(i).getSeqName();
+    iter = name2seq.find(name);
+    general_assert(iter != name2seq.end(), "Cannot recognize sequence ID " + name + "!");
+    fout<<">"<<name<<'\n';
+    fout<<iter->second<<'\n';
+  }
+  fout.close();
+
+  if (verbose) {
+    printf("Extracted Sequences File is generated!\n");
+  }
+}
+
+// Position of the character being checked, used by check() in its error message:
+// the FASTA file name, the line number and the 0-based position within the line
+// (the message prints pos + 1).
+struct CursorPos {
+ char *filename;
+ int line_no, pos;
+} cursor;
+
+// Validate and normalise one character of a FASTA sequence line.
+// The program stops with a message naming the position held in `cursor` when c is not a
+// letter. Upper-case letters other than A, C, G, T become 'N' and lower-case letters other
+// than a, c, g, t become 'n'; every other letter is returned unchanged.
+inline char check(char c) {
+  // The <cctype> classification functions require a value representable as unsigned char;
+  // passing a negative char (a byte >= 0x80 where char is signed) is undefined behaviour
+  const unsigned char uc = (unsigned char)c;
+
+  general_assert(isalpha(uc), "FASTA file " + cstrtos(cursor.filename) + " contains an unknown character, " + \
+    ctos(c) + " (ASCII code " + itos(c) + "), at line " + itos(cursor.line_no) + ", position " + itos(cursor.pos + 1) + "!");
+  if (isupper(uc) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
+  if (islower(uc) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
+  return c;
+}
+
+// synthesisRef: build an RSEM reference from one or more FASTA files of transcript
+// sequences, optionally with a file that maps the sequences to genes (and, for
+// allele-specific references, to transcripts).
+// Every FASTA entry becomes one transcript on the '+' strand covering [1, sequence length].
+int main(int argc, char* argv[]) {
+  if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) {
+    printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n");
+    exit(-1);
+  }
+
+  verbose = !atoi(argv[2]);
+
+  if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); }
+
+  // allele-specific
+  if (hasMappingFile == 2) { transcripts.setType(2); }
+
+  // index of the first FASTA file on the command line
+  int start = hasMappingFile ? 5 : 4;
+
+  ifstream fin;
+  string line, gseq;
+  string seqname, gene_id, transcript_id;
+  int seqlen, len;
+
+  vector<Interval> vec;
+
+  M = 0;
+  name2seq.clear();
+  for (int i = start; i < argc; i++) {
+    // The previous file was read up to its end, which leaves eofbit/failbit set; older
+    // standard libraries do not reset these flags in open(), so reset them explicitly
+    fin.clear();
+    fin.open(argv[i]);
+    general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");
+
+    cursor.filename = argv[i]; cursor.line_no = cursor.pos = 0;
+
+    getline(fin, line);
+    // `line` holds a header line at the top of every iteration
+    while ((fin) && !line.empty() && (line[0] == '>')) {
+      istringstream strin(line.substr(1));
+      strin>>seqname;
+      ++cursor.line_no;
+
+      // Collect the sequence lines up to the next header or the end of the file
+      gseq = ""; seqlen = 0;
+      while((getline(fin, line)) && (line.empty() || line[0] != '>')) {
+        ++cursor.line_no;
+        len = line.length();
+        for (cursor.pos = 0; cursor.pos < len; ++cursor.pos) line[cursor.pos] = check(line[cursor.pos]);
+        seqlen += len;
+        gseq += line;
+      }
+      general_assert(seqlen > 0, "Sequence " + seqname + " in " + cstrtos(argv[i]) + " is empty!");
+      name2seq[seqname] = gseq;
+
+      // Without a mapping file the sequence name serves as transcript and gene id
+      transcript_id = seqname;
+      gene_id = seqname;
+
+      if (hasMappingFile) {
+        mi_iter = mi_table.find(seqname);
+        general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!");
+        gene_id = mi_iter->second;
+        if (hasMappingFile == 2) {
+          mi_iter2 = mi_table2.find(seqname);
+          general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!");
+          transcript_id = mi_iter2->second;
+        }
+      }
+
+      vec.clear();
+      vec.push_back(Interval(1, seqlen));
+      transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, ""));
+      ++M;
+
+      if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); }
+    }
+    fin.close();
+  }
+
+  if (M < 1) {
+    fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
+    exit(-1);
+  }
+
+  assert(M == transcripts.getM());
+  transcripts.sort();
+
+  writeResults(hasMappingFile, argv[1]);
+
+  return 0;
+}
diff --git a/tbam2gbam.cpp b/tbam2gbam.cpp
new file mode 100644
index 0000000..cc9d595
--- /dev/null
+++ b/tbam2gbam.cpp
@@ -0,0 +1,36 @@
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+
+#include "utils.h"
+#include "Transcripts.h"
+#include "BamConverter.h"
+
+using namespace std;
+
+// Number of threads passed to the converter; set with -p, 1 by default
+int nThreads;
+// Names of the transcript information file and the chromosome list file of the reference
+char tiF[STRLEN], chr_list[STRLEN];
+// Transcripts loaded from tiF
+Transcripts transcripts;
+
+// rsem-tbam2gbam: convert a BAM file in transcript coordinates into a BAM file in genome
+// coordinates, using the reference's .ti and .chrlist files.
+int main(int argc, char* argv[]) {
+  // The only optional argument is "-p number_of_threads". It is checked explicitly rather
+  // than with assert() so that the check is also present in builds with NDEBUG.
+  const bool bad_option = (argc == 6 && strcmp(argv[4], "-p") != 0);
+  if ((argc != 4 && argc != 6) || bad_option) {
+    printf("Usage: rsem-tbam2gbam reference_name unsorted_transcript_bam_input genome_bam_output [-p number_of_threads]\n");
+    exit(-1);
+  }
+
+  nThreads = 1; // default is 1
+  if (argc == 6) nThreads = atoi(argv[5]);
+
+  // snprintf never writes past the buffers; its return value tells whether the name fit
+  const int ti_len = snprintf(tiF, STRLEN, "%s.ti", argv[1]);
+  const int chr_len = snprintf(chr_list, STRLEN, "%s.chrlist", argv[1]);
+  if (ti_len < 0 || ti_len >= STRLEN || chr_len < 0 || chr_len >= STRLEN) {
+    fprintf(stderr, "Reference name %s is too long!\n", argv[1]);
+    exit(-1);
+  }
+  transcripts.readFrom(tiF);
+
+  printf("Start converting:\n");
+  BamConverter bc(argv[2], argv[3], chr_list, transcripts, nThreads, assemble_command(argc, argv));
+  bc.process();
+  printf("Genome bam file is generated!\n");
+
+  return 0;
+}
+
diff --git a/utils.h b/utils.h
new file mode 100644
index 0000000..131b93d
--- /dev/null
+++ b/utils.h
@@ -0,0 +1,166 @@
+#ifndef UTILS
+#define UTILS
+
+#include<cmath>
+#include<ctime>
+#include<cstdio>
+#include<cctype>
+#include<cstdlib>
+#include<cstring>
+#include<cassert>
+#include<string>
+#include<vector>
+#include<stdint.h>
+
+// 64-bit unsigned counter types (HIT_INT_TYPE is used e.g. to count BAM records).
+typedef uint64_t HIT_INT_TYPE;
+typedef uint64_t READ_INT_TYPE;
+
+// Size of the fixed char buffers that hold file names (see genReadFileNames).
+const int STRLEN = 10005 ;
+const double EPSILON = 1e-300;
+const double MINEEL = 1.0;
+const double ORIVALVE = 0.1;
+const int RANGE = 201;
+const int OLEN = 25; // overlap length, number of bases must not be in poly(A) tails
+const int NBITS = 32; // use unsigned int, 32 bits per variable
+
+const int MAX_WARNS = 50; // Display at most 50 warnings of the same type
+
+extern bool verbose; // show detail intermediate outputs
+
+// True when a lies strictly inside (-1e-8, 1e-8); NaN is never considered zero.
+inline bool isZero(double a) {
+  const double tolerance = 1e-8;
+  return -tolerance < a && a < tolerance;
+}
+// Stricter variant of isZero: true when a lies strictly inside (-1e-30, 1e-30).
+inline bool isLongZero(double a) {
+  const double tolerance = 1e-30;
+  return -tolerance < a && a < tolerance;
+}
+
+// Number of entries in the base lookup tables below: they cover character codes 0..127.
+// The original note assumed plain char ranges over -128..127; NOTE(review): plain char
+// is unsigned on some platforms, where codes 128..255 are not negative.
+const int CHAR_RANGE = 128;
+
+// Builds the forward-strand lookup table: A/a -> 0, C/c -> 1, G/g -> 2,
+// T/t -> 3, N/n -> 4, and -1 for every other character code below CHAR_RANGE.
+static std::vector<int> init_base2id() {
+  static const char upper[] = "ACGTN";
+  static const char lower[] = "acgtn";
+
+  std::vector<int> table(CHAR_RANGE, -1);
+  for (int id = 0; upper[id] != '\0'; ++id) {
+    table[upper[id]] = id;
+    table[lower[id]] = id;
+  }
+  return table;
+}
+
+static const std::vector<int> base2id = init_base2id();
+
+// Maps a nucleotide letter (either case) to its id in base2id.
+// Prints an error to stderr and exits with -1 for any character that has no id.
+inline int get_base_id(char c) {
+  // Index through unsigned char: the former "c < 0" test only rejected
+  // high-bit bytes where plain char is signed; where it is unsigned those
+  // bytes would index past the end of the table.
+  const unsigned char idx = static_cast<unsigned char>(c);
+  if (idx >= base2id.size() || base2id[idx] < 0) {
+    fprintf(stderr, "Found unknown sequence letter %c at function get_base_id!\n", c);
+    exit(-1);
+  }
+  return base2id[idx];
+}
+
+// Builds the reverse-complement lookup table: each base gets the id of its
+// complement (A/a -> 3, C/c -> 2, G/g -> 1, T/t -> 0, N/n -> 4); all other
+// character codes below CHAR_RANGE map to -1.
+static std::vector<int> init_rbase2id() {
+  static const char upper[] = "TGCAN";
+  static const char lower[] = "tgcan";
+
+  std::vector<int> table(CHAR_RANGE, -1);
+  for (int id = 0; upper[id] != '\0'; ++id) {
+    table[upper[id]] = id;
+    table[lower[id]] = id;
+  }
+  return table;
+}
+
+static const std::vector<int> rbase2id = init_rbase2id();
+
+// Maps a nucleotide letter (either case) to the id of its complement in rbase2id.
+// Prints an error to stderr and exits with -1 for any character that has no id.
+inline int get_rbase_id(char c) {
+  // Index through unsigned char: the former "c < 0" test only rejected
+  // high-bit bytes where plain char is signed; where it is unsigned those
+  // bytes would index past the end of the table.
+  const unsigned char idx = static_cast<unsigned char>(c);
+  if (idx >= rbase2id.size() || rbase2id[idx] < 0) {
+    fprintf(stderr, "Found unknown sequence letter %c at function get_rbase_id!\n", c);
+    exit(-1);
+  }
+  return rbase2id[idx];
+}
+
+// Returns the complementary base of c, preserving case (n/N map to themselves).
+// Any other character is reported on stderr and terminates the program with -1.
+inline char getOpp(char c) {
+  static const char bases[]       = "acgtnACGTN";
+  static const char complements[] = "tgcanTGCAN";
+
+  // Stop at the terminator so that '\0' is never treated as a known base.
+  for (int i = 0; bases[i] != '\0'; ++i) {
+    if (bases[i] == c) return complements[i];
+  }
+
+  fprintf(stderr, "Found unknown sequence letter %c!\n", c);
+  exit(-1);
+}
+
+// Inverse of the base id mapping: 0..4 -> 'A', 'C', 'G', 'T', 'N'.
+// Any other id is reported on stderr and terminates the program with -1.
+inline char getCharacter(int id) {
+  static const char letters[] = "ACGTN";
+
+  if (id >= 0 && id <= 4) return letters[id];
+
+  fprintf(stderr, "Found unknown id %d!\n", id);
+  exit(-1);
+}
+
+// Builds the single-bit masks: entry i has only bit i set, for i in [0, NBITS).
+static std::vector<unsigned int> init_mask_code() {
+  std::vector<unsigned int> vec(NBITS);
+  // Shift an unsigned literal: "1 << 31" shifts into the sign bit of a signed int.
+  for (int i = 0; i < NBITS; i++) vec[i] = 1u << i;
+  return vec;
+}
+
+static std::vector<unsigned int> mask_codes = init_mask_code();
+
+// Returns a copy of str with leading and trailing whitespace (as classified by
+// isspace) removed; returns an empty string when str is empty or all whitespace.
+inline std::string cleanStr(const std::string& str) {
+  std::string::size_type fr = 0;
+  std::string::size_type to = str.length(); // one past the last kept character
+
+  // isspace() requires a value representable as unsigned char (or EOF);
+  // passing a negative plain char (any byte >= 0x80) is undefined behaviour.
+  while (fr < to && isspace(static_cast<unsigned char>(str[fr]))) ++fr;
+  while (to > fr && isspace(static_cast<unsigned char>(str[to - 1]))) --to;
+
+  return str.substr(fr, to - fr);
+}
+
+// Generates the output read file name(s) for a given tag and read type.
+//   readFN    : file name prefix
+//   tagType   : 0 -> "un", 1 -> "alignable", 2 -> "max"
+//   read_type : 0 or 2 -> suffix "fa", otherwise "fq";
+//               0 or 1 -> one file, otherwise two files ("_1" and "_2")
+//   s         : set to the number of file names written (1 or 2)
+//   readFs    : receives the names, each buffer holding STRLEN chars
+// Exits with -1 on an unknown tagType. Names that do not fit are truncated
+// instead of overflowing the buffer.
+inline void genReadFileNames(const char* readFN, int tagType, int read_type, int& s, char readFs[][STRLEN]){
+  // A table of pointers avoids re-initialising a 3 x STRLEN stack array on every call.
+  static const char* const tags[] = {"un", "alignable", "max"};
+  const int n_tags = sizeof(tags) / sizeof(tags[0]);
+
+  if (tagType < 0 || tagType >= n_tags) {
+    fprintf(stderr, "Found unknown tag type %d at function genReadFileNames!\n", tagType);
+    exit(-1);
+  }
+
+  const char* tag = tags[tagType];
+  const char* suffix = (read_type == 0 || read_type == 2) ? "fa" : "fq";
+
+  if (read_type == 0 || read_type == 1) {
+    s = 1;
+    snprintf(readFs[0], STRLEN, "%s_%s.%s", readFN, tag, suffix);
+  }
+  else {
+    s = 2;
+    snprintf(readFs[0], STRLEN, "%s_%s_1.%s", readFN, tag, suffix);
+    snprintf(readFs[1], STRLEN, "%s_%s_2.%s", readFN, tag, suffix);
+  }
+}
+
+// Prints the time elapsed between a (start) and b (end) to stdout as
+// "Time Used for <program_name> : H h MM m SS s".
+inline void printTimeUsed(const time_t& a, const time_t& b, const char* program_name) {
+  const time_t elapsed = b - a;
+
+  const int hours   = elapsed / 3600;
+  const int minutes = elapsed % 3600 / 60;
+  const int seconds = elapsed % 60;
+
+  printf("Time Used for %s : %d h %02d m %02d s\n", program_name, hours, minutes, seconds);
+}
+
+// Rebuilds the command line as one string: argv[0] followed by each remaining
+// argument, separated by single spaces. No quoting or escaping is applied.
+inline std::string assemble_command(int argc, char* argv[]) {
+  std::string command(argv[0]);
+  for (char** arg = argv + 1; arg < argv + argc; ++arg) {
+    command += ' ';
+    command += *arg;
+  }
+  return command;
+}
+
+#endif
diff --git a/wiggle.cpp b/wiggle.cpp
new file mode 100644
index 0000000..032a869
--- /dev/null
+++ b/wiggle.cpp
@@ -0,0 +1,139 @@
+#include <cstring>
+#include <cstdlib>
+#include <cassert>
+#include <iostream>
+
+#include <stdint.h>
+#include "htslib/sam.h"
+#include "sam_utils.h"
+
+#include "utils.h"
+#include "my_assert.h"
+#include "wiggle.h"
+
+// When true, add_bam_record_to_wiggle gives every alignment weight 1.0 instead of
+// reading the weight from the record's "ZW" tag. Defaults to false.
+bool no_fractional_weight = false;
+
+// Adds one alignment's weight to every reference position it covers.
+//   b      : BAM record; its weight is 1.0 when no_fractional_weight is set,
+//            otherwise the value of its "ZW" tag (records without the tag are ignored)
+//   wiggle : read_depth is updated in place; positions outside read_depth are skipped
+void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) {
+  double w;
+
+  if (no_fractional_weight) w = 1.0;
+  else {
+    uint8_t *p_tag = bam_aux_get(b, "ZW");
+    if (p_tag == NULL) return;
+    w = bam_aux2f(p_tag);
+  }
+
+  // 64-bit position: core.pos is 64-bit in recent htslib releases, so a plain
+  // int could truncate it.
+  int64_t pos = b->core.pos;
+  const int64_t depth_size = (int64_t)wiggle.read_depth.size();
+  uint32_t *p = bam_get_cigar(b);
+
+  for (int i = 0; i < (int)b->core.n_cigar; ++i, ++p) {
+    const int op = bam_cigar_op(*p);
+    const int op_len = bam_cigar_oplen(*p);
+
+    // 'M', '=' and 'X' all align read bases to reference bases, so each of
+    // them contributes depth (previously '=' and 'X' only advanced pos).
+    if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+      for (int j = 0; j < op_len; ++j, ++pos) {
+        // Guard against alignments that run past the end of the reference
+        // (or an empty depth vector) instead of writing out of bounds.
+        if (pos >= 0 && pos < depth_size) wiggle.read_depth[pos] += w;
+      }
+    }
+    else if (bam_cigar_type(op) & 2) pos += op_len; // op consumes reference only
+  }
+}
+
+// Streams a BAM file and hands one Wiggle per reference sequence to processor.
+//   bam_filename : input file; records of the same reference are expected to be
+//                  adjacent, because a wiggle is flushed whenever the tid changes
+//   processor    : receives every reference with alignments (with read depths),
+//                  then every reference without alignments (with empty read_depth)
+// Aborts through general_assert if the file cannot be opened, its header cannot
+// be loaded, or reading stops on an error rather than at end of file.
+void build_wiggles(const std::string& bam_filename,
+                   WiggleProcessor& processor) {
+
+  samFile *bam_in = sam_open(bam_filename.c_str(), "r");
+  general_assert(bam_in != NULL, "Cannot open " + bam_filename + "!");
+
+  bam_hdr_t *header = sam_hdr_read(bam_in);
+  general_assert(header != 0, "Cannot load SAM header!");
+
+  // One flag per reference sequence; a vector releases itself on every exit path.
+  std::vector<bool> used(header->n_targets, false);
+
+  int cur_tid = -1; //current tid;
+  HIT_INT_TYPE cnt = 0;
+  bam1_t *b = bam_init1();
+  Wiggle wiggle;
+  int ret;
+  while ((ret = sam_read1(bam_in, header, b)) >= 0) {
+    // A negative tid cannot index the header arrays, so skip such records too.
+    if (bam_is_unmapped(b) || b->core.tid < 0) continue;
+
+    if (b->core.tid != cur_tid) {
+      if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); }
+      cur_tid = b->core.tid;
+      wiggle.name = header->target_name[cur_tid];
+      wiggle.length = header->target_len[cur_tid];
+      wiggle.read_depth.assign(wiggle.length, 0.0);
+    }
+    add_bam_record_to_wiggle(b, wiggle);
+    ++cnt;
+    if (cnt % 1000000 == 0) std::cout<< cnt<< std::endl;
+  }
+  // sam_read1 returns -1 at end of file and a smaller value on error; do not
+  // silently treat a truncated or corrupted file as a complete one.
+  general_assert(ret == -1, "Error occurred while reading " + bam_filename + ", the file may be truncated or corrupted!");
+
+  if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); }
+
+  // References without any alignment are still reported, with no depth data.
+  for (int32_t i = 0; i < header->n_targets; i++)
+    if (!used[i]) {
+      wiggle.name = header->target_name[i];
+      wiggle.length = header->target_len[i];
+      wiggle.read_depth.clear();
+      processor.process(wiggle);
+    }
+
+  bam_destroy1(b);
+  bam_hdr_destroy(header);
+  sam_close(bam_in);
+}
+
+// Opens output_filename for writing and emits the UCSC "track" header line,
+// using track_name as both the track name and its description.
+// Aborts through general_assert if the file cannot be opened.
+UCSCWiggleTrackWriter::UCSCWiggleTrackWriter(const std::string& output_filename,
+                                             const std::string& track_name) {
+  fo = fopen(output_filename.c_str(), "w");
+  // Without this check a failed fopen would pass a NULL stream to fprintf.
+  general_assert(fo != NULL, "Cannot open " + output_filename + "!");
+  fprintf(fo, "track type=wiggle_0 name=\"%s\" description=\"%s\" visibility=full\n",
+          track_name.c_str(),
+          track_name.c_str());
+}
+
+// Closes the output file. fclose(NULL) is undefined, so skip it when the
+// stream was never opened successfully.
+UCSCWiggleTrackWriter::~UCSCWiggleTrackWriter() {
+  if (fo != NULL) fclose(fo);
+}
+
+// Writes one "fixedStep" section per maximal run of positions whose depth is
+// at least 0.0095 (values that would not print as 0.00 with two decimals).
+// The start coordinate is 1-based; depths are printed one per line as "%.2f".
+// A wiggle with an empty read_depth produces no output.
+void UCSCWiggleTrackWriter::process(const Wiggle& wiggle) {
+  if (wiggle.read_depth.empty()) return;
+
+  const double min_depth = 0.0095;
+
+  size_t i = 0;
+  while (i < wiggle.length) {
+    // Skip positions that are below the threshold.
+    if (!(wiggle.read_depth[i] >= min_depth)) { ++i; continue; }
+
+    // Extend the run as far as the depth stays at or above the threshold.
+    const size_t run_start = i;
+    while (i < wiggle.length && wiggle.read_depth[i] >= min_depth) ++i;
+
+    fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", wiggle.name.c_str(), (int)run_start + 1);
+    for (size_t j = run_start; j < i; ++j) fprintf(fo, "%.2f\n", wiggle.read_depth[j]);
+  }
+}
+
+// Binds the writer to stream. Only a reference is stored (stream_ is a
+// std::ostream&), so the stream must stay alive while the writer is used.
+ReadDepthWriter::ReadDepthWriter(std::ostream& stream)
+ : stream_(stream) {
+}
+
+// Writes one line per wiggle: "<name>\t<length>\t" followed by either "NA"
+// (when read_depth is empty) or the per-position depths separated by spaces.
+void ReadDepthWriter::process(const Wiggle& wiggle) {
+  stream_ << wiggle.name << '\t';
+  stream_ << wiggle.length << '\t';
+
+  if (wiggle.read_depth.empty()) {
+    stream_ << "NA\n";
+    return;
+  }
+
+  // First value without a separator, every later value preceded by one space.
+  if (wiggle.length > 0) stream_ << wiggle.read_depth[0];
+  for (size_t pos = 1; pos < wiggle.length; ++pos) {
+    stream_ << ' ' << wiggle.read_depth[pos];
+  }
+  stream_ << '\n';
+}
diff --git a/wiggle.h b/wiggle.h
new file mode 100644
index 0000000..f94a0be
--- /dev/null
+++ b/wiggle.h
@@ -0,0 +1,49 @@
+#ifndef WIGGLE_H_
+#define WIGGLE_H_
+
+#include <cstdio>
+#include <string>
+#include <vector>
+#include <ostream>
+
+extern bool no_fractional_weight; // if no_fractional_weight == true, each alignment counts as weight 1
+
+// Per-reference read depth track passed to WiggleProcessor::process.
+struct Wiggle {
+ std::string name; // reference sequence name
+ std::vector<double> read_depth; // depth per position; left empty for references without alignments
+ size_t length; // reference sequence length; NOTE(review): not initialised by default construction
+};
+
+// Interface for consumers of the wiggles produced by build_wiggles.
+class WiggleProcessor {
+public:
+ virtual ~WiggleProcessor() {}
+ // Called once per reference sequence with its completed wiggle.
+ virtual void process(const Wiggle& wiggle) = 0;
+};
+
+// WiggleProcessor that writes the depths to a file in UCSC wiggle
+// ("fixedStep") track format.
+// NOTE(review): the class owns a FILE* that its destructor closes, but copying
+// is not disabled; a copied instance would close the same stream twice.
+class UCSCWiggleTrackWriter : public WiggleProcessor {
+public:
+ // Opens output_filename and writes the track header line named track_name.
+ UCSCWiggleTrackWriter(const std::string& output_filename,
+ const std::string& track_name);
+
+ // Closes the output file.
+ ~UCSCWiggleTrackWriter();
+
+ // Appends the sections for one reference sequence to the file.
+ void process(const Wiggle& wiggle);
+
+private:
+ FILE *fo; // output file, opened in the constructor
+};
+
+// WiggleProcessor that writes one text line per reference sequence
+// (name, length and per-position depths) to an output stream.
+class ReadDepthWriter : public WiggleProcessor {
+public:
+ // Keeps a reference to stream; the stream must outlive this writer.
+ ReadDepthWriter(std::ostream& stream);
+
+ // Writes the line for one reference sequence.
+ void process(const Wiggle& wiggle);
+
+private:
+ std::ostream& stream_; // destination stream, not owned
+};
+
+// Reads the alignments in bam_filename and calls processor.process once per
+// reference sequence listed in the file's header.
+void build_wiggles(const std::string& bam_filename,
+ WiggleProcessor& processor);
+
+#endif