From 128e374c9225968cac7696f6a5dcff811c431b62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Sat, 9 Jul 2022 19:26:20 +0200
Subject: Import abpoa_1.4.1.orig.tar.gz

[dgit import orig abpoa_1.4.1.orig.tar.gz]
---
 .gitignore                 |   75 ++
 .gitmodules                |    3 +
 .travis.yml                |    8 +
 BLOSUM62.mtx               |   29 +
 CMakeLists.txt             |   49 ++
 HOXD70.mtx                 |    8 +
 LICENSE                    |   21 +
 MANIFEST.in                |   21 +
 Makefile                   |  128 ++++
 PAM250.mtx                 |   29 +
 README.md                  |  273 +++++++
 abpoa.pc.in                |    9 +
 example.c                  |  168 +++++
 include/abpoa.h            |  223 ++++++
 include/simd_instruction.h |  633 ++++++++++++++++
 pog.png                    |  Bin 0 -> 87750 bytes
 python/README.md           |  100 +++
 python/cabpoa.pxd          |  185 +++++
 python/example.py          |   90 +++
 python/pyabpoa.pyx         |  226 ++++++
 setup.py                   |   70 ++
 src/abpoa.c                |  220 ++++++
 src/abpoa.h                |  223 ++++++
 src/abpoa_align.c          |  503 +++++++++++++
 src/abpoa_align.h          |  129 ++++
 src/abpoa_graph.c          |  743 +++++++++++++++++++
 src/abpoa_graph.h          |   61 ++
 src/abpoa_output.c         |  921 ++++++++++++++++++++++++
 src/abpoa_output.h         |   15 +
 src/abpoa_plot.c           |  121 ++++
 src/abpoa_seed.c           |  745 +++++++++++++++++++
 src/abpoa_seed.h           |   25 +
 src/abpoa_seq.c            |  660 +++++++++++++++++
 src/abpoa_seq.h            |   23 +
 src/kalloc.c               |  205 ++++++
 src/kalloc.h               |   38 +
 src/kdq.h                  |  128 ++++
 src/khash.h                |  615 ++++++++++++++++
 src/kseq.h                 |  247 +++++++
 src/ksort.h                |  153 ++++
 src/kstring.c              |  250 +++++++
 src/kstring.h              |  277 +++++++
 src/kvec.h                 |  105 +++
 src/simd_abpoa_align.c     | 1716 ++++++++++++++++++++++++++++++++++++++++++++
 src/simd_abpoa_align.h     |   20 +
 src/simd_check.c           |   89 +++
 src/simd_instruction.h     |  633 ++++++++++++++++
 src/utils.c                |  407 +++++++++++
 src/utils.h                |  277 +++++++
 sub_example.c              |  128 ++++
 test_data/heter.fa         |   30 +
 test_data/seq.fa           |   20 +
 test_data/test.fa          |    8 +
 53 files changed, 12083 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 .travis.yml
 create mode 100644 BLOSUM62.mtx
 create mode 100644 CMakeLists.txt
 create mode 100644 HOXD70.mtx
 create mode 100644 LICENSE
 create mode 100644 MANIFEST.in
 create mode 100644 Makefile
 create mode 100644 PAM250.mtx
 create mode 100644 README.md
 create mode 100644 abpoa.pc.in
 create mode 100644 example.c
 create mode 100644 include/abpoa.h
 create mode 100644 include/simd_instruction.h
 create mode 100644 pog.png
 create mode 100644 python/README.md
 create mode 100644 python/cabpoa.pxd
 create mode 100644 python/example.py
 create mode 100644 python/pyabpoa.pyx
 create mode 100644 setup.py
 create mode 100644 src/abpoa.c
 create mode 100644 src/abpoa.h
 create mode 100644 src/abpoa_align.c
 create mode 100644 src/abpoa_align.h
 create mode 100644 src/abpoa_graph.c
 create mode 100644 src/abpoa_graph.h
 create mode 100644 src/abpoa_output.c
 create mode 100644 src/abpoa_output.h
 create mode 100644 src/abpoa_plot.c
 create mode 100644 src/abpoa_seed.c
 create mode 100644 src/abpoa_seed.h
 create mode 100644 src/abpoa_seq.c
 create mode 100644 src/abpoa_seq.h
 create mode 100644 src/kalloc.c
 create mode 100644 src/kalloc.h
 create mode 100644 src/kdq.h
 create mode 100644 src/khash.h
 create mode 100644 src/kseq.h
 create mode 100644 src/ksort.h
 create mode 100644 src/kstring.c
 create mode 100644 src/kstring.h
 create mode 100644 src/kvec.h
 create mode 100644 src/simd_abpoa_align.c
 create mode 100644 src/simd_abpoa_align.h
 create mode 100644 src/simd_check.c
 create mode 100644 src/simd_instruction.h
 create mode 100644 src/utils.c
 create mode 100644 src/utils.h
 create mode 100644 sub_example.c
 create mode 100644 test_data/heter.fa
 create mode 100644 test_data/seq.fa
 create mode 100644 test_data/test.fa

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2e6bb14
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,75 @@
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+.idea/*
+build/*
+
+# dot
+abpoa.dot
+abpoa.dot.pdf
+
+
+# readme
+README.html
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+lib/*
+*.lib
+*.a
+*.la
+*.lo
+
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+bin/*
+example
+sub_example
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Debug files
+*.dSYM/
+*.su
+
+#data
+data/*
+test_data/cons.fa
+*.dot
+*.png
+*.pdf
+
+#tags
+tags
+cscope.*
+
+# python
+dist/*
+pyabpoa.egg-info/*
+python/build/*
+python/dist/*
+python/example.png
+python/pyabpoa.c
+python/pyabpoa.egg-info/*
+python/src/*
+
+# eval file
+evaluation/msa_abPOA
+evaluation/msa_spoa
+#evaluation/racon_abPOA
+#evaluation/racon_spoa
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..75c8cc0
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "include/simde"]
+	path = include/simde
+	url = https://github.com/simd-everywhere/simde.git
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..a0f7845
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,8 @@
+matrix:
+  include:
+    - language: c
+      compiler: gcc
+      script: make
+    - language: c
+      compiler: clang
+      script: make
diff --git a/BLOSUM62.mtx b/BLOSUM62.mtx
new file mode 100644
index 0000000..2387c15
--- /dev/null
+++ b/BLOSUM62.mtx
@@ -0,0 +1,29 @@
+# Entries for the BLOSUM62 matrix at a scale of ln(2)/2.0.
+   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  J  Z  X  *  O  U
+A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1 -1 -1 -4 -4 -4
+R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1 -2  0 -1 -4 -4 -4
+N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  4 -3  0 -1 -4 -4 -4
+D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4 -3  1 -1 -4 -4 -4
+C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -1 -3 -1 -4 -4 -4
+Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0 -2  4 -1 -4 -4 -4
+E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1 -3  4 -1 -4 -4 -4
+G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -4 -2 -1 -4 -4 -4
+H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0 -3  0 -1 -4 -4 -4
+I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3  3 -3 -1 -4 -4 -4
+L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4  3 -3 -1 -4 -4 -4
+K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0 -3  1 -1 -4 -4 -4
+M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3  2 -1 -1 -4 -4 -4
+F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3  0 -3 -1 -4 -4 -4
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -3 -1 -1 -4 -4 -4
+S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0 -2  0 -1 -4 -4 -4
+T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1 -1 -1 -4 -4 -4
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -2 -2 -1 -4 -4 -4
+Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -1 -2 -1 -4 -4 -4
+V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3  2 -2 -1 -4 -4 -4
+B -2 -1  4  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4 -3  0 -1 -4 -4 -4
+J -1 -2 -3 -3 -1 -2 -3 -4 -3  3  3 -3  2  0 -3 -2 -1 -2 -1  2 -3  3 -3 -1 -4 -4 -4
+Z -1  0  0  1 -3  4  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -2 -2 -2  0 -3  4 -1 -4 -4 -4
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4 -4 -4
+* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1 -4 -4
+O -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1 -4
+U -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..9e4b642
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,49 @@
+cmake_minimum_required(VERSION 3.2)
+project(abpoa LANGUAGES C VERSION 1.4.1) # release version (was 3.0.0)
+set(abPOA_VERSION ${PROJECT_VERSION}) # spelling expanded by abpoa.pc.in (@abPOA_VERSION@)
+
+include(GNUInstallDirs)
+find_package(ZLIB REQUIRED)
+find_package(Threads REQUIRED)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
+
+# -march=native ties the binaries to the build host; pass -DABPOA_NATIVE_ARCH=OFF to opt out
+option(ABPOA_NATIVE_ARCH "Compile with -march=native (host-specific SIMD)" ON)
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+if(ABPOA_NATIVE_ARCH)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") # SIMD
+endif()
+
+# build abPOA as a static library by default
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared")
+set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+
+add_library(abpoa
+    src/abpoa_align.c
+    src/abpoa_graph.c
+    src/abpoa_output.c
+    src/abpoa_plot.c
+    src/abpoa_seed.c
+    src/abpoa_seq.c
+    src/kalloc.c
+    src/kstring.c
+    src/simd_abpoa_align.c
+    src/simd_check.c
+    src/utils.c)
+
+add_executable(abpoa_bin src/abpoa.c)
+target_link_libraries(abpoa_bin PRIVATE abpoa ZLIB::ZLIB Threads::Threads m)
+set_target_properties(abpoa_bin PROPERTIES OUTPUT_NAME abpoa)
+target_include_directories(abpoa PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+
+install(TARGETS abpoa DESTINATION ${CMAKE_INSTALL_LIBDIR})
+install(TARGETS abpoa_bin DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.h")
+# configure and install pkg-config file
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/abpoa.pc.in ${CMAKE_CURRENT_BINARY_DIR}/abpoa-1.pc @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/abpoa-1.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
diff --git a/HOXD70.mtx b/HOXD70.mtx
new file mode 100644
index 0000000..d3de679
--- /dev/null
+++ b/HOXD70.mtx
@@ -0,0 +1,8 @@
+# all five kinds of bases need to be included
+# do not forget to set gap-open/extension penalty with -O/-E
+    A       C       G       T       N
+A   91      -114    -31     -123    0
+C   -114    100     -125    -31     0
+G   -31     -125    100     -114    0
+T   -123    -31     -114    91      0
+N   0       0       0       0       0
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8629edf
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2020 Yan Gao
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..26d8a80
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,21 @@
+include src/abpoa_align.h
+include src/abpoa_graph.h
+include src/abpoa.h
+include src/abpoa_output.h
+include src/abpoa_seed.h
+include src/abpoa_seq.h
+include src/kalloc.h
+include src/kdq.h
+include src/khash.h
+include src/kseq.h
+include src/ksort.h
+include src/kstring.h
+include src/kvec.h
+include src/simd_abpoa_align.h
+include src/simd_instruction.h
+include src/utils.h
+recursive-include include/ *.h
+include python/cabpoa.pxd
+include python/pyabpoa.c
+include python/pyabpoa.pyx
+include python/README.md
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1284a68
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,128 @@
+# CC is not set here (make's default or a command-line CC=... applies). CFLAGS below is reassigned by the gdb conditional.
+EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
+CFLAGS      = -Wall -O3 $(EXTRA_FLAGS)
+
+SIMD_FLAG   = -march=native
+# ARM overrides of SIMD_FLAG; they also define __AVX2__, presumably to select the AVX2 code path through SIMDe (confirm).
+ifneq ($(armv7),) # armv7 set: ARMv7 with NEON
+	SIMD_FLAG   =  -march=armv7-a -mfpu=neon -D__AVX2__
+else
+ifneq ($(armv8),) # armv8 set: ARMv8
+ifneq ($(aarch64),) # armv8 and aarch64 set: AArch64
+	SIMD_FLAG   =  -march=armv8-a+simd -D__AVX2__
+else # armv8 set without aarch64: AArch32
+	SIMD_FLAG   =  -march=armv8-a+simd -mfpu=auto -D__AVX2__
+endif
+endif
+endif
+
+# debug set: define __DEBUG__ (picked up through DFLAGS in CFLAGS below)
+ifneq ($(debug),)
+	DFLAGS   =   -D __DEBUG__
+endif
+# gdb set: compile with -g instead of -O3; either branch replaces the CFLAGS default assigned above
+ifneq ($(gdb),)
+	CFLAGS   = -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
+else
+	CFLAGS   = -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
+endif
+
+# pg set: add -pg (gprof) to both the compile flags and the final link of the binary
+ifneq ($(pg),)
+	PG_FLAG  =   -pg
+	CFLAGS  +=   -pg
+endif
+
+LIB     = -lm -lz -lpthread
+ifneq ($(PREFIX),)
+	OUT_PRE_DIR = $(PREFIX)
+else
+	OUT_PRE_DIR = .
+endif
+
+BIN_DIR = $(OUT_PRE_DIR)/bin
+LIB_DIR = $(OUT_PRE_DIR)/lib
+INC_DIR = ./include
+SRC_DIR = ./src
+
+SOURCE = $(SRC_DIR)/abpoa_align.c $(SRC_DIR)/abpoa.c $(SRC_DIR)/abpoa_graph.c $(SRC_DIR)/abpoa_plot.c $(SRC_DIR)/abpoa_seed.c $(SRC_DIR)/abpoa_seq.c $(SRC_DIR)/abpoa_output.c $(SRC_DIR)/kalloc.c $(SRC_DIR)/kstring.c  $(SRC_DIR)/simd_abpoa_align.c $(SRC_DIR)/simd_check.c $(SRC_DIR)/utils.c
+HEADER = $(SRC_DIR)/abpoa_align.h $(SRC_DIR)/abpoa_graph.h $(SRC_DIR)/abpoa.h $(INC_DIR)/abpoa.h $(SRC_DIR)/abpoa_seed.h $(SRC_DIR)/abpoa_seq.h $(SRC_DIR)/abpoa_output.h $(SRC_DIR)/kalloc.h $(SRC_DIR)/kdq.h $(SRC_DIR)/khash.h $(SRC_DIR)/kseq.h $(SRC_DIR)/ksort.h $(SRC_DIR)/kstring.h $(SRC_DIR)/kvec.h $(SRC_DIR)/simd_instruction.h $(INC_DIR)/simd_instruction.h $(SRC_DIR)/simd_abpoa_align.h $(SRC_DIR)/utils.h
+OBJS   = $(SRC_DIR)/abpoa_align.o $(SRC_DIR)/abpoa_graph.o $(SRC_DIR)/abpoa_plot.o $(SRC_DIR)/abpoa_seed.o $(SRC_DIR)/abpoa_seq.o $(SRC_DIR)/abpoa_output.o $(SRC_DIR)/kalloc.o $(SRC_DIR)/kstring.o $(SRC_DIR)/simd_abpoa_align.o $(SRC_DIR)/simd_check.o $(SRC_DIR)/utils.o
+
+# SIMD selection: sse2 / sse41 / avx2 (first one set wins) replace SIMD_FLAG and set py_SIMD_FLAG for the setup.py recipes
+SIMD_CHECK_D = -D __CHECK_SIMD_MAIN__
+
+FLAG_SSE2     = -msse2
+FLAG_SSE41    = -msse4.1
+FLAG_AVX2     = -mavx2
+# FLAG_AVX512F  = -mavx512f   (AVX512 selection is disabled in this Makefile)
+# FLAG_AVX512BW = -mavx512bw  (AVX512 selection is disabled in this Makefile)
+
+ifneq ($(sse2),)
+	SIMD_FLAG=$(FLAG_SSE2)
+	py_SIMD_FLAG = SSE2=1
+else ifneq ($(sse41),)
+	SIMD_FLAG=$(FLAG_SSE41)
+	py_SIMD_FLAG = SSE41=1
+else ifneq ($(avx2),)
+	SIMD_FLAG=$(FLAG_AVX2)
+	py_SIMD_FLAG = AVX2=1
+#else ifneq ($(avx512f),)
+#	SIMD_FLAG=$(FLAG_AVX512F)
+#	py_SIMD_FLAG = AVX512f=1
+#else ifneq ($(avx512bw),)
+#	SIMD_FLAG=$(FLAG_AVX512BW)
+#	py_SIMD_FLAG = AVX512BW=1
+endif
+
+# Generic rule: objects without an explicit rule below are compiled without $(SIMD_FLAG).
+.c.o:
+		$(CC) -c $(CFLAGS) $< -I$(INC_DIR) -o $@
+
+BIN      = $(BIN_DIR)/abpoa
+ifneq ($(gdb),)
+	BIN  = $(BIN_DIR)/gdb_abpoa
+endif
+ABPOALIB = $(LIB_DIR)/libabpoa.a
+EXAMPLE  = example
+
+# Aliases and maintenance targets never name real files ("example" is a real file, see $(EXAMPLE) below).
+.PHONY: all abpoa libabpoa install_py sdist publish_pypi clean clean_py
+all:       $(BIN)
+abpoa:     $(BIN)
+libabpoa:  $(ABPOALIB)
+
+$(BIN):$(SRC_DIR)/abpoa.o $(ABPOALIB)
+	mkdir -p $(BIN_DIR)
+	$(CC) $(CFLAGS) $< -I$(INC_DIR) -L$(LIB_DIR) -labpoa $(LIB) -o $@ $(PG_FLAG)
+
+$(EXAMPLE):example.c $(ABPOALIB)
+	$(CC) $(CFLAGS) $< -o $@ -I$(INC_DIR) -L$(LIB_DIR) -labpoa $(LIB)
+
+$(ABPOALIB):$(OBJS)
+	mkdir -p $(LIB_DIR)
+	$(AR) -csr $@ $(OBJS)
+
+$(SRC_DIR)/abpoa.o:$(SRC_DIR)/abpoa.c $(SRC_DIR)/abpoa.h $(SRC_DIR)/abpoa_graph.h $(SRC_DIR)/abpoa_align.h \
+                   $(SRC_DIR)/abpoa_seq.h $(SRC_DIR)/utils.h $(SRC_DIR)/simd_instruction.h
+	$(CC) -c $(CFLAGS) $(SIMD_FLAG) -I$(INC_DIR) $< -o $@
+
+$(SRC_DIR)/simd_check.o:$(SRC_DIR)/simd_check.c $(SRC_DIR)/simd_instruction.h
+	$(CC) -c $(CFLAGS) $(SIMD_FLAG) -I$(INC_DIR) $< -o $@
+
+$(SRC_DIR)/simd_abpoa_align.o:$(SRC_DIR)/simd_abpoa_align.c $(SRC_DIR)/abpoa_graph.h $(SRC_DIR)/abpoa_align.h $(SRC_DIR)/simd_instruction.h $(SRC_DIR)/utils.h
+	$(CC) -c $(CFLAGS) $(SIMD_FLAG) -I$(INC_DIR) $< -o $@
+
+install_py: python/cabpoa.pxd python/pyabpoa.pyx python/README.md
+	${py_SIMD_FLAG} python setup.py install
+
+sdist: install_py
+	${py_SIMD_FLAG} python setup.py sdist #bdist_wheel
+
+publish_pypi: clean_py sdist
+	twine upload dist/*
+
+clean:
+	rm -f $(SRC_DIR)/*.[oa] $(LIB_DIR)/*.[oa] $(BIN)
+clean_py:
+	rm -rf build/ dist/ pyabpoa.egg-info/ python/pyabpoa.c
diff --git a/PAM250.mtx b/PAM250.mtx
new file mode 100644
index 0000000..c3f76db
--- /dev/null
+++ b/PAM250.mtx
@@ -0,0 +1,29 @@
+# Entries for the PAM250 matrix at a scale of ln(2)/3.0.
+   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  J  Z  X  *  O  U
+A  2 -2  0  0 -2  0  0  1 -1 -1 -2 -1 -1 -3  1  1  1 -6 -3  0  0 -1  0 -1 -8 -8 -8
+R -2  6  0 -1 -4  1 -1 -3  2 -2 -3  3  0 -4  0  0 -1  2 -4 -2 -1 -3  0 -1 -8 -8 -8
+N  0  0  2  2 -4  1  1  0  2 -2 -3  1 -2 -3  0  1  0 -4 -2 -2  2 -3  1 -1 -8 -8 -8
+D  0 -1  2  4 -5  2  3  1  1 -2 -4  0 -3 -6 -1  0  0 -7 -4 -2  3 -3  3 -1 -8 -8 -8
+C -2 -4 -4 -5 12 -5 -5 -3 -3 -2 -6 -5 -5 -4 -3  0 -2 -8  0 -2 -4 -5 -5 -1 -8 -8 -8
+Q  0  1  1  2 -5  4  2 -1  3 -2 -2  1 -1 -5  0 -1 -1 -5 -4 -2  1 -2  3 -1 -8 -8 -8
+E  0 -1  1  3 -5  2  4  0  1 -2 -3  0 -2 -5 -1  0  0 -7 -4 -2  3 -3  3 -1 -8 -8 -8
+G  1 -3  0  1 -3 -1  0  5 -2 -3 -4 -2 -3 -5  0  1  0 -7 -5 -1  0 -4  0 -1 -8 -8 -8
+H -1  2  2  1 -3  3  1 -2  6 -2 -2  0 -2 -2  0 -1 -1 -3  0 -2  1 -2  2 -1 -8 -8 -8
+I -1 -2 -2 -2 -2 -2 -2 -3 -2  5  2 -2  2  1 -2 -1  0 -5 -1  4 -2  3 -2 -1 -8 -8 -8
+L -2 -3 -3 -4 -6 -2 -3 -4 -2  2  6 -3  4  2 -3 -3 -2 -2 -1  2 -3  5 -3 -1 -8 -8 -8
+K -1  3  1  0 -5  1  0 -2  0 -2 -3  5  0 -5 -1  0  0 -3 -4 -2  1 -3  0 -1 -8 -8 -8
+M -1  0 -2 -3 -5 -1 -2 -3 -2  2  4  0  6  0 -2 -2 -1 -4 -2  2 -2  3 -2 -1 -8 -8 -8
+F -3 -4 -3 -6 -4 -5 -5 -5 -2  1  2 -5  0  9 -5 -3 -3  0  7 -1 -4  2 -5 -1 -8 -8 -8
+P  1  0  0 -1 -3  0 -1  0  0 -2 -3 -1 -2 -5  6  1  0 -6 -5 -1 -1 -2  0 -1 -8 -8 -8
+S  1  0  1  0  0 -1  0  1 -1 -1 -3  0 -2 -3  1  2  1 -2 -3 -1  0 -2  0 -1 -8 -8 -8
+T  1 -1  0  0 -2 -1  0  0 -1  0 -2  0 -1 -3  0  1  3 -5 -3  0  0 -1 -1 -1 -8 -8 -8
+W -6  2 -4 -7 -8 -5 -7 -7 -3 -5 -2 -3 -4  0 -6 -2 -5 17  0 -6 -5 -3 -6 -1 -8 -8 -8
+Y -3 -4 -2 -4  0 -4 -4 -5  0 -1 -1 -4 -2  7 -5 -3 -3  0 10 -2 -3 -1 -4 -1 -8 -8 -8
+V  0 -2 -2 -2 -2 -2 -2 -1 -2  4  2 -2  2 -1 -1 -1  0 -6 -2  4 -2  2 -2 -1 -8 -8 -8
+B  0 -1  2  3 -4  1  3  0  1 -2 -3  1 -2 -4 -1  0  0 -5 -3 -2  3 -3  2 -1 -8 -8 -8
+J -1 -3 -3 -3 -5 -2 -3 -4 -2  3  5 -3  3  2 -2 -2 -1 -3 -1  2 -3  5 -2 -1 -8 -8 -8
+Z  0  0  1  3 -5  3  3  0  2 -2 -3  0 -2 -5  0  0 -1 -6 -4 -2  2 -2  3 -1 -8 -8 -8
+X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -8 -8 -8
+* -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8  1 -8 -8
+O -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8  1 -8
+U -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8  1 
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8437abc
--- /dev/null
+++ b/README.md
@@ -0,0 +1,273 @@
+# abPOA: adaptive banded Partial Order Alignment
+[![Latest Release](https://img.shields.io/github/release/yangao07/abPOA.svg?label=Release)](https://github.com/yangao07/abPOA/releases/latest)
+[![Github All Releases](https://img.shields.io/github/downloads/yangao07/abPOA/total.svg?label=Download)](https://github.com/yangao07/abPOA/releases)
+[![BioConda Install](https://img.shields.io/conda/dn/bioconda/abpoa.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/abpoa)
+[![PyPI](https://img.shields.io/pypi/dm/pyabpoa.svg?label=pip%20install)](https://pypi.python.org/pypi/pyabpoa)
+[![Published in Bioinformatics](https://img.shields.io/badge/Published%20in-Bioinformatics-blue.svg)](https://dx.doi.org/10.1093/bioinformatics/btaa963)
+[![GitHub Issues](https://img.shields.io/github/issues/yangao07/abPOA.svg?label=Issues)](https://github.com/yangao07/abPOA/issues)
+[![Build Status](https://img.shields.io/travis/yangao07/abPOA/master.svg?label=Master)](https://travis-ci.org/yangao07/abPOA)
+[![License](https://img.shields.io/badge/License-MIT-black.svg)](https://github.com/yangao07/abPOA/blob/master/LICENSE)
+<!-- [![PyPI](https://img.shields.io/pypi/v/pyabpoa.svg?style=flat)](https://pypi.python.org/pypi/pyabpoa) -->
+## Updates (v1.4.1)
+
+- Take quality score in FASTQ format file as weight (-Q)
+
+## Getting started
+Download the [latest release](https://github.com/yangao07/abPOA/releases):
+```
+wget https://github.com/yangao07/abPOA/releases/download/v1.4.1/abPOA-v1.4.1.tar.gz
+tar -zxvf abPOA-v1.4.1.tar.gz && cd abPOA-v1.4.1
+```
+Make from source and run with test data:
+```
+make; ./bin/abpoa ./test_data/seq.fa > cons.fa
+```
+Or, install via conda and run with test data:
+```
+conda install -c bioconda abpoa
+abpoa ./test_data/seq.fa > cons.fa
+```
+## Table of Contents
+
+- [Introduction](#introduction)
+- [Installation](#install)
+  - [Installing abPOA via conda](#conda)
+  - [Building abPOA from source files](#build)
+  - [Pre-built binary executable file for Linux/Unix](#binary)
+- [General usage](#usage)
+  - [To generate one consensus sequence](#gen_1cons)
+  - [To generate multiple consensus sequences](#gen_mcons)
+  - [To generate row-column multiple sequence alignment](#gen_msa)
+  - [To generate graph information in GFA format](#gen_gfa)
+  - [To align sequence to an existing graph in GFA/MSA format](#aln_to_gfa)
+  - [To generate a plot of the alignment graph](#gen_plot)
+- [Input](#input)
+- [Output](#output)
+  - [Consensus sequence](#cons)
+  - [Row-column multiple sequence alignment](#msa)
+  - [Full graph information](#gfa)
+  - [Plot of alignment graph](#plot)
+- [Algorithm description](#description)
+  - [Adaptive banding](#banding)
+  - [Minimizer-based seeding and partition](#seeding)
+  - [Minimizer-based progressive tree](#tree)
+  - [Multiple consensus sequences](#mcons)
+- [For development](#dev)
+- [Evaluation datasets](#eval)
+- [Contact](#contact)
+
+## <a name="introduction"></a>Introduction
+abPOA is an extended version of [Partial Order Alignment (POA)](https://dx.doi.org/10.1093/bioinformatics/18.3.452)
+that performs adaptive banded dynamic programming (DP) with an SIMD implementation. 
+abPOA can perform multiple sequence alignment (MSA) on a set of input sequences and 
+generate a consensus sequence by applying the [heaviest bundling algorithm](https://dx.doi.org/10.1093/bioinformatics/btg109)
+to the final alignment graph.
+
+abPOA can generate high-quality consensus sequences from error-prone long reads and offer 
+significant speed improvement over existing tools.
+
+abPOA supports three alignment modes (global, local, extension) and flexible scoring schemes that allow linear, affine and convex gap penalties. 
+It currently supports SSE2/SSE4.1/AVX2 vectorization.
+
+For more information, please refer to our [paper](https://dx.doi.org/10.1093/bioinformatics/btaa963) published in Bioinformatics.
+
+## <a name="install"></a>Installation
+
+### <a name="conda"></a>Installing abPOA via conda
+On Linux/Unix and Mac OS, abPOA can be installed via
+```
+conda install -c bioconda abpoa   # install abPOA program
+```
+
+### <a name="build"></a>Building abPOA from source files
+You can also build abPOA from source files. 
+Make sure you have gcc (>=6.4.0) and zlib installed before compiling.
+It is recommended to download the [latest release](https://github.com/yangao07/abPOA/releases).
+```
+wget https://github.com/yangao07/abPOA/releases/download/v1.4.1/abPOA-v1.4.1.tar.gz
+tar -zxvf abPOA-v1.4.1.tar.gz
+cd abPOA-v1.4.1; make
+```
+Or, you can use `git clone` command to download the source code.
+This gives you the latest version of abPOA, which might be still under development.
+```
+git clone --recursive https://github.com/yangao07/abPOA.git
+cd abPOA; make
+```
+
+### <a name="binary"></a>Pre-built binary executable file for Linux/Unix 
+If you encounter any compilation issues, please try the pre-built binary file:
+```
+wget https://github.com/yangao07/abPOA/releases/download/v1.4.1/abPOA-v1.4.1_x64-linux.tar.gz
+tar -zxvf abPOA-v1.4.1_x64-linux.tar.gz
+```
+
+## <a name="usage"></a>General usage
+### <a name="gen_1cons"></a>To generate consensus sequence
+
+```
+abpoa seq.fa > cons.fa
+```
+
+### <a name="gen_mcons"></a>To generate multiple consensus sequences
+
+```
+abpoa heter.fa -d2 > 2cons.fa
+```
+
+### <a name="gen_msa"></a>To generate row-column multiple sequence alignment in FASTA format
+
+```
+abpoa seq.fa -r1 > out.msa
+abpoa seq.fa -r2 > out_cons.msa
+```
+
+### <a name="gen_gfa"></a>To generate graph information in [GFA](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) format
+
+```
+abpoa seq.fa -r3 > out.gfa
+```
+To include the generated consensus sequence as a path in the GFA file:
+```
+abpoa seq.fa -r4 > out.gfa
+```
+
+### <a name="aln_to_gfa"></a>To align sequence to an existing graph in GFA/MSA format
+```
+abpoa -i in.gfa seq.fa -r3 > out.gfa
+abpoa -i in.msa seq.fa -r1 > out.msa
+```
+For GFA input file, `S` and `P` lines are required and are used to reconstruct the alignment graph.
+For MSA input file, which is generally a FASTA format file, `-` in the sequence indicates the alignment gap.
+```
+abpoa seq1.fa -r1 > seq1.msa
+abpoa -i seq1.msa seq2.fa > cons.fa
+```
+
+### <a name="gen_plot"></a>To generate a plot of the alignment graph
+
+```
+abpoa seq.fa -g poa.png > cons.fa
+```
+See [Plot of alignment graph](#plot) for more details about the plot file.
+
+## <a name="input"></a>Input
+abPOA works with FASTA, FASTQ, gzip'd FASTA(.fa.gz) and gzip'd FASTQ(.fq.gz) formats. The input file is 
+expected to contain multiple sequences which will be processed sequentially to perform the iterative
+sequence-to-graph (partial order) alignment.
+
+abPOA can also take a list of filenames as input with option `-l`, where each line is the path to one 
+file containing multiple sequences. Each sequence file is then individually aligned by abPOA to generate a
+consensus sequence.
+
+## <a name="output"></a>Output
+### <a name="cons"></a>Consensus sequence 
+By default, abPOA only outputs the consensus sequence generated from the final alignment graph.
+It is in FASTA format with the name field set as "Consensus_sequence".
+For example:
+```
+>Consensus_sequence
+ACGTGTACACGTTGAC
+```
+
+For diploid input sequences, you may want to generate two or more consensus sequences, simply set `-d/--max-num-cons` as a desired value:
+```
+abpoa heter.fa -d2
+```
+and this gives you two consensus sequences:
+```
+>Consensus_sequence_1
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>Consensus_sequence_2
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+```
+### <a name="msa"></a>Row-column multiple sequence alignment
+abPOA can also output the row-column multiple sequence alignment (RC-MSA) of all the aligned sequences in FASTA format.
+For example:
+```
+>1
+ACGTGTACA-GTTGAC
+>2
+A-G-GTACACGTT-AC
+>3
+A-GTGT-CACGTTGAC
+>4
+ACGTGTACA--TTGAC
+```
+The `-` in the sequence stands for alignment gap. 
+
+### <a name="gfa"></a>Full graph information
+abPOA can output the final alignment graph in GFA format.
+Each segment line (`S` line) represents one node and each link line (`L` line) represents one edge between two nodes.
+The original input sequences and the generated consensus sequence are described as paths in `P` lines.
+
+abPOA outputs two graph-related numbers in the header line (`H` line):
+`NS` and `NL`, which denote the total number of nodes and edges in the GFA file, respectively.
+
+Please refer to the [GFA specification](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) for more details of the GFA format.
+
+### <a name="plot"></a>Plot of alignment graph
+
+abPOA can generate a plot of the final partial order alignment graph with the help of `graphviz dot`. 
+For example:
+
+![pog](https://github.com/yangao07/abPOA/blob/master/pog.png)
+
+The numbers inside the nodes are the node IDs. The numbers on the edges are the edge weights.
+`S` and `E` are the auxiliary start and end nodes that have no sequence bases.
+
+Make sure you have `dot` installed before using abPOA to generate the plot.
+For Linux/Unix systems: `sudo apt-get install graphviz`.
+
+## <a name="description"></a>Algorithm description
+### <a name="banding"></a>Adaptive banding
+To understand how the adaptive banding works, please refer to our [Bioinformatics paper](https://dx.doi.org/10.1093/bioinformatics/btaa963).
+
+### <a name="seeding"></a>Minimizer-based seeding mode
+As abPOA always allocates quadratic size of memory, for very long input sequences (>10 kb), memory usage will be a challenge.
+
+To solve this issue, we develop a minimizer-based seeding and partition method to split the sequence and graph with a small window.
+The full POA DP matrix can be split into several smaller ones and adaptive banded POA can be performed within each small window separately.
+
+In more detail, abPOA extracts all the minimizers from all the input sequences, then all the minimizer hits between each pair of two sequences can be found.
+For each pair of sequences, the minimizer hits are first chained together using relatively stringent criteria to make sure that no big gap exists in the chain.
+This usually leads to several separated local chains of minimizer hits.
+A second round of chaining is then performed on all the local minimizer chains to generate a global chain going through the entire sequence.
+With this global chain, abPOA selects a series of minimizer hits as partition anchors that are at least 500 bp apart from each other (by default, -n/--min-poa-win).
+Within each partitioned window, abPOA performs banded partial order alignment separately and combines all the alignment results at the end.
+
+### <a name="tree"></a>Minimizer-based progressive tree
+Instead of aligning all the sequences in the original order, abPOA can alternatively build a progressive tree to guide the alignment order.
+The generation of the progressive tree is also based on minimizers.
+For each pair of sequences, abPOA calculates their similarity score which is the Jaccard similarity of the minimizers, i.e. the number of minimizer hits divided by the total number of all minimizers from the two sequences.
+With all the similarity scores (minimizer-based Jaccard similarity), abPOA builds the progressive tree in the following way:
+
+1. Pick the first two sequences that have the highest scores. The progressive tree set is initialized as these first two sequences.
+2. For each remaining sequence, sum the scores between the remaining sequence and all the sequences from the current progressive tree set. Pick the one with the highest sum score, and push it to the progressive tree set.
+3. Repeat step 2, until no sequence remains.
+
+Then, abPOA performs partial order alignment following the order of sequences in this progressive tree set.
+
+### <a name="mcons"></a>Multiple consensus sequences
+Since v1.4.1, abPOA supports generating multiple consensus sequences from the final alignment graph (set -d/--max-num-cons as >1).
+
+The general underlying idea is to group input sequences into multiple clusters based on the heterozygous bases in the graph.
+Then, one consensus sequence is separately generated for each cluster of input sequences.
+The minimum allele frequency for each heterozygous base is 0.25 (by default, -q/--min-freq). 
+
+## <a name="dev"></a>For development
+abPOA is not only a stand-alone tool for MSA and consensus calling, it can also work as a programming library. [example.c](example.c) shows how to use the C APIs of abPOA to take a set of sequences as input and perform MSA and consensus calling. Basically, the library file `libabpoa.a` and two header files [abpoa.h](include/abpoa.h) and [simd_instruction.h](include/simd_instruction.h) are needed to make the abPOA library work in your program.
+
+abPOA also provides Python bindings to all the primary C APIs. Refer to [python/README.md](python/README.md) for more details.
+
+## <a name="eval"></a>Evaluation datasets
+The evaluation datasets and scripts used in [abPOA paper](https://dx.doi.org/10.1093/bioinformatics/btaa963) can be found in [abPOA-v1.0.5](https://github.com/yangao07/abPOA/releases/tag/v1.0.5).
+
+## <a name="contact"></a>Contact
+Yan Gao gaoy1@chop.edu
+
+Yi Xing xingyi@chop.edu
+
+Yadong Wang ydwang@hit.edu.cn
+
+[github issues](https://github.com/yangao07/abPOA/issues)
diff --git a/abpoa.pc.in b/abpoa.pc.in
new file mode 100644
index 0000000..d968e36
--- /dev/null
+++ b/abpoa.pc.in
@@ -0,0 +1,9 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: abPOA
+Description: abPOA 
+Version: @abPOA_VERSION@
+
+Libs: -L${libdir} -labpoa
+Cflags: -I${includedir}
diff --git a/example.c b/example.c
new file mode 100644
index 0000000..5711402
--- /dev/null
+++ b/example.c
@@ -0,0 +1,168 @@
+/* example.c libabpoa usage example
+   To compile: 
+gcc -g example.c -I ./include -L ./lib -labpoa -lz -lm -o example
+or:
+gcc -g example.c -I ./include ./lib/libabpoa.a -lz -lm -o example
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "include/abpoa.h"
+
+// Nucleotide encoding table, indexed by ASCII code (values 0..3 map to themselves):
+// A/a=>0, C/c=>1, G/g=>2, T/t/U/u=>3, everything else (including N/n and '-')=>4
+unsigned char nt4_table[256] = {
+       0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4 /*'-'*/, 4, 4,
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
+};
+
+// Decoding table: codes 0,1,2,3=>A,C,G,T; 5,27=>'-'; ASCII 65,97=>A, 67,99=>C, 71,103=>G, 84,85,116,117=>T; else=>N
+const char nt256_table[256] = {
+       'A', 'C', 'G', 'T',  'N', '-', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', '-',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'A', 'N', 'C',  'N', 'N', 'N', 'G',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'T', 'T', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'A', 'N', 'C',  'N', 'N', 'N', 'G',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'T', 'T', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N'
+};
+
+int main(void) { // demo: align 10 hard-coded reads with abPOA, print MSA/consensus, dump the graph plot
+    int i, j, n_seqs = 10;
+    // char seqs[10][100] = {
+    //     "CGTCAATCTATCGAAGCATACGCGGGCAGAGCCGAAGACCTCGGCAATCCA",
+    //     "CCACGTCAATCTATCGAAGCATACGCGGCAGCCGAACTCGACCTCGGCAATCAC",
+    //     "CGTCAATCTATCGAAGCATACGCGGCAGAGCCCGGAAGACCTCGGCAATCAC",
+    //     "CGTCAATGCTAGTCGAAGCAGCTGCGGCAGAGCCGAAGACCTCGGCAATCAC",
+    //     "CGTCAATCTATCGAAGCATTCTACGCGGCAGAGCCGACCTCGGCAATCAC",
+    //     "CGTCAATCTAGAAGCATACGCGGCAAGAGCCGAAGACCTCGGCCAATCAC",
+    //     "CGTCAATCTATCGGTAAAGCATACGCTCTGTAGCCGAAGACCTCGGCAATCAC",
+    //     "CGTCAATCTATCTTCAAGCATACGCGGCAGAGCCGAAGACCTCGGCAATC",
+    //     "CGTCAATGGATCGAGTACGCGGCAGAGCCGAAGACCTCGGCAATCAC",
+    //     "CGTCAATCTAATCGAAGCATACGCGGCAGAGCCGTCTACCTCGGCAATCACGT"
+    //     };
+
+    char seqs[10][100] = {
+        "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT",
+        "CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT",
+        "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT",
+        "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT",
+        "CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT",
+        "CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT",
+        "CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT",
+        "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT",
+        "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT",
+        "CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT"
+        };
+
+    // initialize variables
+    abpoa_t *ab = abpoa_init();
+    abpoa_para_t *abpt = abpoa_init_para();
+
+    // alignment parameters (optional overrides; field names follow abpoa_para_t in abpoa.h)
+    // abpt->align_mode = 0; // 0:global 1:local, 2:extension
+    // abpt->mat_fn = strdup("HOXD70.mtx"); abpt->use_score_matrix = 1; // score matrix instead of constant match/mismatch score
+    // abpt->match = 2;      // match score
+    // abpt->mismatch = 4;   // mismatch penalty
+    // abpt->gap_mode = ABPOA_CONVEX_GAP; // gap penalty mode
+    // abpt->gap_open1 = 4;  // gap open penalty #1
+    // abpt->gap_ext1 = 2;   // gap extension penalty #1
+    // abpt->gap_open2 = 24; // gap open penalty #2
+    // abpt->gap_ext2 = 1;   // gap extension penalty #2
+                             // gap_penalty = min{gap_open1 + gap_len * gap_ext1, gap_open2 + gap_len * gap_ext2}
+    // abpt->wb = 10;        // extra band used in adaptive banded DP
+    // abpt->wf = 0.01;
+
+    // output options
+    abpt->out_msa = 1; // generate Row-Column multiple sequence alignment(RC-MSA), set 0 to disable
+    abpt->out_cons = 1; // generate consensus sequence, set 0 to disable
+    abpt->w = 6; abpt->k = 9; abpt->min_w = 10; // minimizer-based seeding and partition
+    abpt->progressive_poa = 1;
+    abpt->max_n_cons = 2; // to generate 2 consensus sequences
+
+    abpoa_post_set_para(abpt);
+
+    // collect sequence length, transform ACGT to 0123
+    int *seq_lens = (int*)malloc(sizeof(int) * n_seqs);
+    uint8_t **bseqs = (uint8_t**)malloc(sizeof(uint8_t*) * n_seqs);
+    int **weights = (int**)malloc(sizeof(int*) * n_seqs);
+    for (i = 0; i < n_seqs; ++i) {
+        seq_lens[i] = strlen(seqs[i]);
+        bseqs[i] = (uint8_t*)malloc(sizeof(uint8_t) * seq_lens[i]);
+        weights[i] = (int*)malloc(sizeof(int) * seq_lens[i]);
+        for (j = 0; j < seq_lens[i]; ++j) {
+            bseqs[i][j] = nt4_table[(unsigned char)seqs[i][j]]; // unsigned index: plain char may be signed
+            if (j >= 12) weights[i][j] = 2;
+            else weights[i][j] = 0;
+        }
+    }
+
+    // 1. directly output to stdout
+    fprintf(stdout, "=== output to stdout ===\n");
+    abpt->use_qv = 1;
+    // perform abpoa-msa
+    // set weights as NULL if no quality score weights are used
+    abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, bseqs, weights, stdout);
+
+    // 2. output MSA alignment and consensus sequence stored in (abpoa_cons_t *)
+    abpoa_cons_t *abc = ab->abc;
+    fprintf(stdout, "=== stored in variables ===\n");
+    fprintf(stdout, ">Multiple_sequence_alignment\n");
+    for (i = 0; i < abc->n_seq; ++i) {
+        for (j = 0; j < abc->msa_len; ++j) {
+            fprintf(stdout, "%c", nt256_table[abc->msa_base[i][j]]);
+        }
+        fprintf(stdout, "\n");
+    }
+
+    for (i = 0; i < abc->n_cons; ++i) {
+        fprintf(stdout, ">Consensus_sequence");
+        if (abc->n_cons > 1) {
+            fprintf(stdout, "_%d ", i+1);
+            for (j = 0; j < abc->clu_n_seq[i]; ++j) { // output read ids for each cluster/group
+                fprintf(stdout, "%d", abc->clu_read_ids[i][j]);
+                if (j != abc->clu_n_seq[i]-1) fprintf(stdout, ",");
+            }
+        }
+        fprintf(stdout, "\n");
+        for (j = 0; j < abc->cons_len[i]; ++j)
+            fprintf(stdout, "%c", nt256_table[abc->cons_base[i][j]]);
+        fprintf(stdout, "\n");
+    }
+
+    /* generate DOT partial order graph plot */
+    abpt->out_pog = strdup("example.png"); // dump partial order graph to file
+    if (abpt->out_pog != NULL) abpoa_dump_pog(ab, abpt);
+
+    // free seq-related variables
+    for (i = 0; i < n_seqs; ++i) { free(bseqs[i]); free(weights[i]); }
+    free(bseqs); free(seq_lens); free(weights);
+
+    // free abpoa-related variables
+    abpoa_free(ab); abpoa_free_para(abpt);
+    return 0;
+}
diff --git a/include/abpoa.h b/include/abpoa.h
new file mode 100644
index 0000000..87bbdcf
--- /dev/null
+++ b/include/abpoa.h
@@ -0,0 +1,223 @@
+#ifndef ABPOA_H
+#define ABPOA_H
+
+#include <stdint.h>
+#include "simd_instruction.h"
+
+#define ABPOA_GLOBAL_MODE 0
+#define ABPOA_LOCAL_MODE  1
+#define ABPOA_EXTEND_MODE 2
+//#define ABPOA_SEMI_MODE 3
+
+// gap mode
+#define ABPOA_LINEAR_GAP 0
+#define ABPOA_AFFINE_GAP 1
+#define ABPOA_CONVEX_GAP 2
+
+#define ABPOA_EXTRA_B 10
+#define ABPOA_EXTRA_F 0.01
+
+#define ABPOA_CIGAR_STR "MIDXSH"
+#define ABPOA_CMATCH     0
+#define ABPOA_CINS       1
+#define ABPOA_CDEL       2
+#define ABPOA_CDIFF      3
+#define ABPOA_CSOFT_CLIP 4
+#define ABPOA_CHARD_CLIP 5
+
+#define ABPOA_SRC_NODE_ID  0
+#define ABPOA_SINK_NODE_ID 1
+
+#define ABPOA_OUT_CONS     0
+#define ABPOA_OUT_MSA      1
+#define ABPOA_OUT_CONS_MSA 2
+#define ABPOA_OUT_GFA      3
+#define ABPOA_OUT_CONS_GFA 4
+#define ABPOA_OUT_CONS_FQ  5
+
+#define ABPOA_HB 0
+#define ABPOA_HC 1
+
+// NOTE: upper boundary of in_edge_n is pow(2,30)
+// for MATCH/MISMATCH: node_id << 34  | query_id << 4 | op
+// for INSERTION:      query_id << 34 | op_len << 4   | op
+// for DELETION:       node_id << 34  | op_len << 4   | op // op_len is always equal to 1
+// for CLIP            query_id << 34 | op_len << 4   | op 
+#define abpoa_cigar_t uint64_t 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct {
+    int n_cigar, m_cigar; abpoa_cigar_t *graph_cigar;
+    int node_s, node_e, query_s, query_e; // for local and  extension mode
+    int n_aln_bases, n_matched_bases;
+    int32_t best_score; 
+    // uint8_t is_rc:1; // is_rc: best_score is from the reverse complement
+                        // now is_rc is determined based on minimizer-based seeding and chaining
+} abpoa_res_t;
+
+typedef struct { // abPOA parameter set; obtained from abpoa_init_para(), released with abpoa_free_para()
+    int m; int *mat; char *mat_fn; // score matrix
+    int use_score_matrix; // set _mat_ based on score matrix file, then _match_/_mismatch_ is not used.
+    int match, max_mat, mismatch, min_mis, gap_open1, gap_open2, gap_ext1, gap_ext2; int inf_min; // match score, mismatch penalty, gap open/extension penalties #1 and #2
+    // minimizer seeding parameter
+    int k, w, min_w;
+    int wb; float wf; // extra band width for adaptive banded DP (ABPOA_EXTRA_B / ABPOA_EXTRA_F look like the defaults -- TODO confirm)
+    int zdrop, end_bonus; // from minimap2
+    // int simd_flag; // available SIMD instruction
+    // alignment mode
+    uint8_t ret_cigar:1, rev_cigar:1, out_msa:1, out_cons:1, out_gfa:1, out_fq:1, use_read_ids:1, amb_strand:1; // out_msa/out_cons: generate RC-MSA / consensus sequence
+    uint8_t use_qv:1, disable_seeding:1, progressive_poa:1; // use_qv: per-read weights are valid only when set (see abpoa_node_t.read_weight)
+    char *incr_fn, *out_pog; // out_pog: file name the partial order graph plot is dumped to (see example.c)
+    int align_mode, gap_mode, max_n_cons; // align_mode: ABPOA_{GLOBAL,LOCAL,EXTEND}_MODE; gap_mode: ABPOA_{LINEAR,AFFINE,CONVEX}_GAP; max_n_cons: number of consensus sequences to generate
+    double min_freq; // for multiploid data: min. allele frequency of a heterozygous base (README: -q/--min-freq)
+    int verbose; // to control output msg
+
+    // char LogTable65536[65536];
+    // char bit_table16[65536];
+} abpoa_para_t;
+
+typedef struct {
+    int node_id;
+    int in_edge_n, in_edge_m, *in_id;
+    int out_edge_n, out_edge_m, *out_id; int *out_weight;
+    int *read_weight, n_read, m_read; // weight of each read, valid when use_qv=1
+    uint64_t **read_ids; int read_ids_n; // for each edge
+
+    int aligned_node_n, aligned_node_m, *aligned_node_id; // mismatch; aligned node will have same rank
+    // int heaviest_weight, heaviest_out_id; // for consensus
+    uint8_t base; // 0~m
+    // ID, pos ???
+} abpoa_node_t;
+
+typedef struct {
+    abpoa_node_t *node; int node_n, node_m, index_rank_m; 
+    int *index_to_node_id;
+    int *node_id_to_index, *node_id_to_max_pos_left, *node_id_to_max_pos_right, *node_id_to_max_remain, *node_id_to_msa_rank;
+    uint8_t is_topological_sorted:1, is_called_cons:1, is_set_msa_rank:1;
+} abpoa_graph_t;
+
+typedef struct {
+    int n_cons, n_seq, msa_len; // # cons, # of total seq, length of row-column MSA (including gaps)
+    int *clu_n_seq;      // # of reads in each read cluster/group, size: n_cons
+    int **clu_read_ids; // read ids for each cluster/group, size: n_cons * clu_n_seq[i]
+    int *cons_len;       // length of each consensus sequence, size: n_cons
+    int **cons_node_ids; // node id of each consensus, size: n_cons * cons_len[i]
+    uint8_t **cons_base; // sequence base of each consensus, size: n_cons * cons_len[i]
+    uint8_t **msa_base;  // sequence base of RC-MSA, size: (n_seq + n_cons) * msa_len
+    int **cons_cov;      // coverage of each consensus base, size: n_cons * cons_len[i]
+    int **cons_phred_score; // phred score for each consensus base, size: n_cons * cons_len[i]
+} abpoa_cons_t;
+
+typedef struct {
+    int l, m; char *s;
+} abpoa_str_t;
+
+typedef struct {
+    int n_seq, m_seq;
+    abpoa_str_t *seq, *name, *comment, *qual;
+    uint8_t *is_rc;
+} abpoa_seq_t;
+
+typedef struct {
+    SIMDi *s_mem; uint64_t s_msize; // qp, DP_HE, dp_f OR qp, DP_H, dp_f : based on (qlen, num_of_value, m, node_n)
+    int *dp_beg, *dp_end, *dp_beg_sn, *dp_end_sn, rang_m; // if band : based on (node_m)
+} abpoa_simd_matrix_t;
+
+typedef struct { // top-level abPOA handle; obtained from abpoa_init(), released with abpoa_free()
+    abpoa_graph_t *abg; // alignment graph (nodes and index tables)
+    abpoa_seq_t *abs; // input sequences with their names/comments/qualities
+    abpoa_simd_matrix_t *abm; // SIMD DP memory and band ranges
+    abpoa_cons_t *abc; // consensus / RC-MSA results (read by example.c after abpoa_msa())
+} abpoa_t;
+
+// init for abpoa parameters
+abpoa_para_t *abpoa_init_para(void);
+void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mat_fn);
+void abpoa_post_set_para(abpoa_para_t *abpt);
+void abpoa_free_para(abpoa_para_t *abpt);
+
+// init for alignment
+abpoa_t *abpoa_init(void);
+void abpoa_free(abpoa_t *ab);
+
+// perform msa
+int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, int **qual_weights, FILE *out_fp);
+
+int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp);
+
+// clean alignment graph
+void abpoa_reset(abpoa_t *ab, abpoa_para_t *abpt, int qlen);
+
+// restore graph from GFA/FASTA file
+abpoa_t *abpoa_restore_graph(abpoa_t *ab, abpoa_para_t *abpt);
+
+// for development:
+// align a sequence to a graph
+int abpoa_align_sequence_to_graph(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int qlen, abpoa_res_t *res);
+// align a sequence to a graph between beg_node_id and end_node_id (both are excluded)
+void abpoa_subgraph_nodes(abpoa_t *ab, abpoa_para_t *abpt, int inc_beg, int inc_end, int *exc_beg, int *exc_end);
+int abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int qlen, abpoa_res_t *res);
+
+// add a node to a graph
+// para:
+//   base: 0123 for ACGT
+int abpoa_add_graph_node(abpoa_graph_t *abg, uint8_t base);
+
+// add an edge to a graph
+// para:
+//   from_id/to_id: ids of from and to nodes
+//   check_edge: set as 1 if this edge may already exist and only its weight needs to be updated, set as 0 if the edge is new
+//   add_read_id: set as 1 if read_id is used (to use row-column algorithm/generate MSA result/multiple consensus)
+//   read_id: id of sequence
+//   read_ids_n: size of read_id array, each one is 64-bit (1+(tot_read_n-1)/64)
+int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_edge, int w, uint8_t add_read_id, uint8_t add_read_weight, int read_id, int read_ids_n, int tot_read_n);
+
+// add an alignment to a graph
+// para:
+//   query: 0123 for ACGT
+//   qlen: query length
+//   n_cigar/abpoa_cigar: from alignment result (abpoa_res_t)
+//   read_id: id of sequence
+//   tot_read_n: total number of sequence
+int abpoa_add_graph_alignment(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int *weight, int qlen, int *qpos_to_node_id, abpoa_res_t res, int read_id, int tot_read_n, int inc_both_ends);
+int abpoa_add_subgraph_alignment(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int *weight, int qlen, int *qpos_to_node_id, abpoa_res_t res, int read_id, int tot_read_n, int inc_both_ends);
+
+void abpoa_BFS_set_node_index(abpoa_graph_t *abg, int src_id, int sink_id);
+void abpoa_BFS_set_node_remain(abpoa_graph_t *abg, int src_id, int sink_id);
+
+// topological sorting of graph
+void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt);
+
+// generate consensus sequence from graph
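+// para (NOTE(review): list looks stale -- abpoa_generate_consensus() below takes only ab/abpt; only abpoa_output_fx_consensus() has out_fp):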
+//   out_fp: consensus sequence output in FASTA format, set as NULL to disable
+//   cons_seq, cons_l, cons_n: store consensus sequences in variables, set cons_n as NULL to disable. 
+//     cons_seq: store consensus sequences
+//     cons_l: store consensus sequences length
+//     cons_n: store number of consensus sequences
+//     Note: cons_seq and cons_l need to be freed by user.
+void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt);
+void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp);
+
+// generate column multiple sequence alignment from graph
+void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt);
+void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp);
+
+// generate graph in GFA format to _out_fp_
+void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp);
+
+// output cons/msa
+void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp);
+
+// generate DOT graph plot and dump graph into PDF/PNG format file
+void abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/simd_instruction.h b/include/simd_instruction.h
new file mode 100644
index 0000000..41deb16
--- /dev/null
+++ b/include/simd_instruction.h
@@ -0,0 +1,633 @@
+// A header file to get you going with Intel SIMD intrinsic programming.
+// <immintrin.h> is included for SSE2, SSE41, AVX2 and AVX512F, AVX512BW
+// (SSE4.1: floor and blend are available)
+// AVX2: double speed
+
+// do not support AVX512F/AVX512BW 12/20/2021 - Yan Gao
+// AVX512F: quadruple speed
+// AVX512BW: byte and word operation
+
+#include <stdlib.h>
+#include <errno.h>
+
+#pragma once
+#ifndef SIMD_INSTRUCTION_H
+#define SIMD_INSTRUCTION_H
+
+#undef __AVX512F__
+#undef __AVX512BW__
+
+#ifndef USE_SIMDE
+#include <immintrin.h>
+#else // use SIMDE
+#ifdef __AVX512F__
+#include "simde/simde/x86/avx512.h"
+#else
+#ifdef __AVX2__
+#include "simde/simde/x86/avx2.h"
+#else
+#ifdef __SSE4_1__
+#include "simde/simde/x86/sse4.1.h"
+#else
+#include "simde/simde/x86/sse2.h"
+#endif // end of sse41
+#endif // end of AVX2
+#endif // end of 512F
+#endif // end of USE_SIMDE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#define SIMD_SSE      0x1
+#define SIMD_SSE2     0x2
+#define SIMD_SSE3     0x4
+#define SIMD_SSSE3    0x8
+#define SIMD_SSE41    0x10
+#define SIMD_SSE42    0x20
+#define SIMD_AVX      0x40
+#define SIMD_AVX2     0x80
+#define SIMD_AVX512F  0x100
+#define SIMD_AVX512BW 0x200
+
+// #define SIMDFree(x) _mm_free(x)
+// posix_memalign and free
+#define SIMDFree(x) free(x)
+
+// Shift, Blend, ... for 8/16 and 32/64
+#ifdef __AVX512BW__
+// start of AVX512BW
+
+typedef __m512 SIMDf;
+typedef __m512i SIMDi;
+
+#define SIMDStore(x,y) _mm512_store_ps(x,y)
+#define SIMDStorei(x,y) _mm512_store_si512(x,y)
+#define SIMDLoad(x) _mm512_load_ps(x)
+#define SIMDLoadi(x) _mm512_load_si512(x)
+#define SIMDZero _mm512_setzero_si512()
+#define SIMDSetZero() _mm512_setzero_ps()
+#define SIMDSetZeroi() _mm512_setzero_si512()
+#define SIMDSetOne(x) _mm512_set1_ps(x)
+#define SIMDSetOnei8(x) _mm512_set1_epi8(x)
+#define SIMDSetOnei16(x) _mm512_set1_epi16(x)
+#define SIMDSetOnei32(x) _mm512_set1_epi32(x)
+#define SIMDSetOnei64(x) _mm512_set1_epi64(x)
+#define SIMDAdd(x,y) _mm512_add_ps(x,y)
+#define SIMDAddi8(x,y) _mm512_add_epi8(x,y)
+#define SIMDAddi16(x,y) _mm512_add_epi16(x,y)
+#define SIMDAddi32(x,y) _mm512_add_epi32(x,y)
+#define SIMDAddi64(x,y) _mm512_add_epi64(x,y)
+#define SIMDSub(x,y) _mm512_sub_ps(x,y)
+#define SIMDSubi8(x,y) _mm512_sub_epi8(x,y)
+#define SIMDSubi16(x,y) _mm512_sub_epi16(x,y)
+#define SIMDSubi32(x,y) _mm512_sub_epi32(x,y)
+#define SIMDSubi64(x,y) _mm512_sub_epi64(x,y)
+#define SIMDMul(x,y) _mm512_mul_ps(x,y)
+#define SIMDMuli32(x,y) _mm512_mul_epi32(x,y)
+#define SIMDAnd(x,y) _mm512_and_ps(x,y)
+#define SIMDAndi(x,y) _mm512_and_si512(x,y)
+#define SIMDAndNot(x,y) _mm512_andnot_ps(x,y)
+#define SIMDAndNoti(x,y) _mm512_andnot_si512(x,y)
+#define SIMDOr(x,y) _mm512_or_ps(x,y)
+#define SIMDOri(x,y) _mm512_or_si512(x,y)
+#define SIMDShiftLeft(x,n) /* shift x left by n bytes across the four 128-bit lanes; whole expansion is parenthesized */ \
+    ((n) < 16 ? \
+    _mm512_alignr_epi8(x, _mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)), x, _MM_SHUFFLE(2,1,0,2)), (16-(n))) : \
+    ((n) < 32 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)), x, _MM_SHUFFLE(2,1,0,2)), _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), (32-(n))) : \
+    ((n) < 48 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _mm512_shuffle_i64x2(SIMDZero, _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _MM_SHUFFLE(2,0,0,0)), (48-(n))) : \
+    _mm512_bslli_epi128(_mm512_shuffle_i64x2(SIMDZero,  _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _MM_SHUFFLE(2,0,0,0)), ((n)-48)))))
+/*
+static inline SIMDi SIMDShiftLeft(SIMDi x, const int n) { // x=a|b|c|d
+    SIMDi tmp1,tmp2;
+    if (n < 16) {
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)); // tmp1=0|0|c|d
+        tmp2 = _mm512_shuffle_i64x2(tmp1, x, _MM_SHUFFLE(2,1,0,2)); // tmp2=b|c|d|0
+        return _mm512_alignr_epi8(x, tmp2, 16 - n);
+    } else if (n < 32) {
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)); // tmp1=0|0|c|d
+        tmp2 = _mm512_shuffle_i64x2(tmp1, x, _MM_SHUFFLE(2,1,0,2)); // tmp2=b|c|d|0
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)); // tmp1=c|d|0|0
+        return _mm512_alignr_epi8(tmp2, tmp1, 32 - n);
+    } else if (n < 48) {
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0));    // tmp1=c|d|0|0
+        tmp2 = _mm512_shuffle_i64x2(SIMDZero, tmp1, _MM_SHUFFLE(2,0,0,0)); // tmp2=d|0|0|0
+        return _mm512_alignr_epi8(tmp1, tmp2, 48 - n);
+    } else {
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0));    // tmp1=c|d|0|0
+        tmp2 = _mm512_shuffle_i64x2(SIMDZero, tmp1, _MM_SHUFFLE(2,0,0,0)); // tmp2=d|0|0|0
+        return _mm512_bslli_epi128(tmp2, n - 48);
+    }
+}*/
+#define SIMDShiftRight(x,n) /* shift x right by n bytes across the four 128-bit lanes; whole expansion is parenthesized */ \
+    ((n) < 16 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2( _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)), x, _MM_SHUFFLE(0,3,2,1)), x, (n)) : \
+    ((n) < 32 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), _mm512_shuffle_i64x2(_mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)), x, _MM_SHUFFLE(0,3,2,1)), ((n)-16)) : \
+    ((n) < 48 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), SIMDZero, _MM_SHUFFLE(0,0,2,1)), _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), ((n)-32)) : \
+    _mm512_bsrli_epi128(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), SIMDZero, _MM_SHUFFLE(0,0,2,1)), ((n)-48)))))
+/*
+static inline SIMDi SIMDShiftRight(SIMDi x, int n) { // x=a|b|c|d
+    SIMDi tmp1, tmp2;
+    if (n < 16) {
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)); // tmp1=a|b|0|0
+        tmp2 = _mm512_shuffle_i64x2(tmp1, x, _MM_SHUFFLE(0,3,2,1)); // tmp2=0|a|b|c
+        return _mm512_alignr_epi8(tmp2, x, n);
+    } else if (n < 32) {
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)); // tmp1=a|b|0|0
+        tmp2 = _mm512_shuffle_i64x2(tmp1, x, _MM_SHUFFLE(0,3,2,1)); // tmp2=0|a|b|c
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)); // tmp1=0|0|a|b
+        return _mm512_alignr_epi8(tmp1, tmp2, n-16);
+    } else if (n < 48) {
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2));    // tmp1=0|0|a|b
+        tmp2 = _mm512_shuffle_i64x2(tmp1, SIMDZero, _MM_SHUFFLE(0,0,2,1)); // tmp2=0|0|0|a
+        return _mm512_alignr_epi8(tmp2, tmp1, n-32);
+    } else {
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2));    // tmp1=0|0|a|b
+        tmp2 = _mm512_shuffle_i64x2(tmp1, SIMDZero, _MM_SHUFFLE(0,0,2,1)); // tmp2=0|0|0|a
+        return _mm512_bsrli_epi128(tmp2, n - 48);
+    }
+}*/
+#define SIMDShiftLeftOnei16(x,y) _mm512_slli_epi16(x,y)
+#define SIMDShiftLeftOnei32(x,y) _mm512_slli_epi32(x,y)
+#define SIMDShiftLeftOnei64(x,y) _mm512_slli_epi64(x,y)
+#define SIMDShiftRightOnei16(x,y) _mm512_srli_epi16(x,y)
+#define SIMDShiftRightOnei32(x,y) _mm512_srli_epi32(x,y)
+#define SIMDShiftRightOnei64(x,y) _mm512_srli_epi64(x,y)
+#define SIMDEqualM(x,y) _mm512_cmpeq_ps_mask(x,y)
+#define SIMDEquali8M(x,y) _mm512_cmpeq_epi8_mask(x,y)
+#define SIMDEquali16M(x,y) _mm512_cmpeq_epi16_mask(x,y)
+#define SIMDEquali32M(x,y) _mm512_cmpeq_epi32_mask(x,y)
+#define SIMDEquali64M(x,y) _mm512_cmpeq_epi64_mask(x,y)
+#define SIMDNotEqualM(x,y) _mm512_cmpneq_ps_mask(x,y)
+#define SIMDNotEquali8M(x,y) _mm512_cmpneq_epi8_mask(x,y)
+#define SIMDNotEquali16M(x,y) _mm512_cmpneq_epi16_mask(x,y)
+#define SIMDNotEquali32M(x,y) _mm512_cmpneq_epi32_mask(x,y)
+#define SIMDNotEquali64M(x,y) _mm512_cmpneq_epi64_mask(x,y)
+#define SIMDGreaterThani8M(x,y) _mm512_cmpgt_epi8_mask(x,y)
+#define SIMDGreaterThani16M(x,y) _mm512_cmpgt_epi16_mask(x,y)
+#define SIMDGreaterThani32M(x,y) _mm512_cmpgt_epi32_mask(x,y)
+#define SIMDGreaterThani64M(x,y) _mm512_cmpgt_epi64_mask(x,y)
+#define SIMDGreaterThanOrEquali8M(x,y) _mm512_cmpge_epi8_mask(x,y)
+#define SIMDGreaterThanOrEquali16M(x,y) _mm512_cmpge_epi16_mask(x,y)
+#define SIMDGreaterThanOrEquali32M(x,y) _mm512_cmpge_epi32_mask(x,y)
+#define SIMDGreaterThanOrEquali64M(x,y) _mm512_cmpge_epi64_mask(x,y)
+#define SIMDLessThanM(x,y) _mm512_cmplt_ps_mask(x,y)
+#define SIMDLessThani8M(x,y) _mm512_cmplt_epi8_mask(x,y)
+#define SIMDLessThani16M(x,y) _mm512_cmplt_epi16_mask(x,y)
+#define SIMDLessThani32M(x,y) _mm512_cmplt_epi32_mask(x,y)
+#define SIMDLessThani64M(x,y) _mm512_cmplt_epi64_mask(x,y)
+#define SIMDLessThanOrEqualM(x,y) _mm512_cmple_ps_mask(x,y)
+#define SIMDLessThanOrEquali8M(x,y) _mm512_cmple_epi8_mask(x,y)
+#define SIMDLessThanOrEquali16M(x,y) _mm512_cmple_epi16_mask(x,y)
+#define SIMDLessThanOrEquali32M(x,y) _mm512_cmple_epi32_mask(x,y)
+#define SIMDLessThanOrEquali64M(x,y) _mm512_cmple_epi64_mask(x,y)
+#define SIMDMax(x,y) _mm512_max_ps(x,y)
+#define SIMDMaxi8(x,y) _mm512_max_epi8(x,y)
+#define SIMDMaxi16(x,y) _mm512_max_epi16(x,y)
+#define SIMDMaxi32(x,y) _mm512_max_epi32(x,y)
+#define SIMDMaxi64(x,y) _mm512_max_epi64(x,y)
+#define SIMDMin(x,y) _mm512_min_ps(x,y)
+#define SIMDMini8(x,y) _mm512_min_epi8(x,y)
+#define SIMDMini16(x,y) _mm512_min_epi16(x,y)
+#define SIMDMini32(x,y) _mm512_min_epi32(x,y)
+#define SIMDMini64(x,y) _mm512_min_epi64(x,y)
+
+#define SIMDBlend(x,y,z) _mm512_mask_blend_ps(z, x, y)
+#define SIMDBlendi8(x,y,z) _mm512_mask_blend_epi8(z, x, y)
+#define SIMDBlendi16(x,y,z) _mm512_mask_blend_epi16(z, x, y)
+#define SIMDBlendi32(x,y,z) _mm512_mask_blend_epi32(z, x, y)
+#define SIMDBlendi64(x,y,z) _mm512_mask_blend_epi64(z, x, y)
+
+// with AVX512BW
+#define Maski8 __mmask64
+#define Maski16 __mmask32
+#define Maski32 __mmask16
+#define Maski64 __mmask8
+/* x = a == b ? c : d */ 
+#define SIMDSetIfEquali8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDEquali8M(a,b)); } 
+#define SIMDSetIfEquali16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDEquali16M(a,b)); } 
+#define SIMDSetIfEquali32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDEquali32M(a,b)); } 
+#define SIMDSetIfEquali64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDEquali64M(a,b)); } 
+/* x = a > b ? c : d  (per lane; the blend width must match the compare width) */
+#define SIMDSetIfGreateri8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDGreaterThani8M(a,b)); } 
+#define SIMDSetIfGreateri16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16M(a,b)); } 
+#define SIMDSetIfGreateri32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32M(a,b)); } 
+#define SIMDSetIfGreateri64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64M(a,b)); } 
+/* x = a < b ? c : d  (per lane; implemented as a greater-than compare with swapped operands) */
+#define SIMDSetIfLessi8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDGreaterThani8M(b,a)); } 
+#define SIMDSetIfLessi16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16M(b,a)); } 
+#define SIMDSetIfLessi32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32M(b,a)); } 
+#define SIMDSetIfLessi64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64M(b,a)); } 
+
+/* x = a > b ? c : d, y = a > b ? a : b */
+#define SIMDGetIfGreateri8(x,y,a,b,c,d)  { Maski8  cmp = SIMDGreaterThani8M(a,b);  x = SIMDBlendi8(d, c, cmp);  y = SIMDBlendi8(b, a, cmp); } 
+#define SIMDGetIfGreateri16(x,y,a,b,c,d) { Maski16 cmp = SIMDGreaterThani16M(a,b); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfGreateri32(x,y,a,b,c,d) { Maski32 cmp = SIMDGreaterThani32M(a,b); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfGreateri64(x,y,a,b,c,d) { Maski64 cmp = SIMDGreaterThani64M(a,b); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+/* x = a < b ? c : d, y = a < b ? a : b */
+#define SIMDGetIfLessi8(x,y,a,b,c,d)  { Maski8  cmp = SIMDGreaterThani8M(b,a);  x = SIMDBlendi8(d, c, cmp);  y = SIMDBlendi8(b, a, cmp); } 
+#define SIMDGetIfLessi16(x,y,a,b,c,d) { Maski16 cmp = SIMDGreaterThani16M(b,a); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfLessi32(x,y,a,b,c,d) { Maski32 cmp = SIMDGreaterThani32M(b,a); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfLessi64(x,y,a,b,c,d) { Maski64 cmp = SIMDGreaterThani64M(b,a); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+
+// end of AVX512BW
+#else
+#ifdef __AVX512F__
+
+// start of AVX512F
+
+// XXX AVX512F has no  following instructions (AVX512BW HAS), so AVX512F is not working for 8/16 bits tasks
+// addi8/16, subi8/16, alignri8, bslli_epi128, bslrli_epi128,
+// comeqi8/16, cmpneqi8/16, cmpgti8/16, cmpgei8/16, cmplti8/16, cmplei8
+// maxi8/16, blendi8/i16, slli_epi16,srli_epi16 
+typedef __m512 SIMDf;
+typedef __m512i SIMDi;
+
+#define SIMDStore(x,y) _mm512_store_ps(x,y)
+#define SIMDStorei(x,y) _mm512_store_si512(x,y)
+#define SIMDLoad(x) _mm512_load_ps(x)
+#define SIMDLoadi(x) _mm512_load_si512(x)
+#define SIMDZero _mm512_setzero_si512()
+#define SIMDSetZero() _mm512_setzero_ps()
+#define SIMDSetZeroi() _mm512_setzero_si512()
+#define SIMDSetOne(x) _mm512_set1_ps(x)
+#define SIMDSetOnei8(x) _mm512_set1_epi8(x)
+#define SIMDSetOnei16(x) _mm512_set1_epi16(x)
+#define SIMDSetOnei32(x) _mm512_set1_epi32(x)
+#define SIMDSetOnei64(x) _mm512_set1_epi64(x)
+#define SIMDAdd(x,y) _mm512_add_ps(x,y)
+//#define SIMDAddi8(x,y) _mm512_add_epi8(x,y)
+//#define SIMDAddi16(x,y) _mm512_add_epi16(x,y)
+#define SIMDAddi32(x,y) _mm512_add_epi32(x,y)
+#define SIMDAddi64(x,y) _mm512_add_epi64(x,y)
+#define SIMDSub(x,y) _mm512_sub_ps(x,y)
+//#define SIMDSubi8(x,y) _mm512_sub_epi8(x,y)
+//#define SIMDSubi16(x,y) _mm512_sub_epi16(x,y)
+#define SIMDSubi32(x,y) _mm512_sub_epi32(x,y)
+#define SIMDSubi64(x,y) _mm512_sub_epi64(x,y)
+#define SIMDMul(x,y) _mm512_mul_ps(x,y)
+#define SIMDMuli32(x,y) _mm512_mul_epi32(x,y)
+#define SIMDAnd(x,y) _mm512_and_ps(x,y)
+#define SIMDAndi(x,y) _mm512_and_si512(x,y)
+#define SIMDAndNot(x,y) _mm512_andnot_ps(x,y)
+#define SIMDAndNoti(x,y) _mm512_andnot_si512(x,y)
+#define SIMDOr(x,y) _mm512_or_ps(x,y)
+#define SIMDOri(x,y) _mm512_or_si512(x,y)
+/*#define SIMDShiftLeft(x,n) \
+    (n) < 16 ? \
+    _mm512_alignr_epi8(x, _mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)), x, _MM_SHUFFLE(2,1,0,2)), (16-(n))) : \
+    ((n) < 32 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)), x, _MM_SHUFFLE(2,1,0,2)), _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), (32-(n))) : \
+    ((n) < 48 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _mm512_shuffle_i64x2(SIMDZero, _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _MM_SHUFFLE(2,0,0,0)), (48-(n))) : \
+    _mm512_bslli_epi128(_mm512_shuffle_i64x2(SIMDZero,  _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _MM_SHUFFLE(2,0,0,0)), ((n)-48))))
+#define SIMDShiftRight(x,n) \
+    (n) < 16 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2( _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)), x, _MM_SHUFFLE(0,3,2,1)), x, (n)) : \
+    ((n) < 32 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), _mm512_shuffle_i64x2(_mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)), x, _MM_SHUFFLE(0,3,2,1)), ((n)-16)) : \
+    ((n) < 48 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), SIMDZero, _MM_SHUFFLE(0,0,2,1)), _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), ((n)-32)) : \
+    _mm512_bsrli_epi128(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), SIMDZero, _MM_SHUFFLE(0,0,2,1)), ((n)-48))))*/
+//#define SIMDShiftLeftOnei16(x,y) _mm512_slli_epi16(x,y)
+#define SIMDShiftLeftOnei32(x,y) _mm512_slli_epi32(x,y)
+#define SIMDShiftLeftOnei64(x,y) _mm512_slli_epi64(x,y)
+//#define SIMDShiftRightOnei16(x,y) _mm512_srli_epi16(x,y)
+#define SIMDShiftRightOnei32(x,y) _mm512_srli_epi32(x,y)
+#define SIMDShiftRightOnei64(x,y) _mm512_srli_epi64(x,y)
+#define SIMDEqualM(x,y) _mm512_cmpeq_ps_mask(x,y)
+//#define SIMDEquali8M(x,y) _mm512_cmpeq_epi8_mask(x,y)
+//#define SIMDEquali16M(x,y) _mm512_cmpeq_epi16_mask(x,y)
+#define SIMDEquali32M(x,y) _mm512_cmpeq_epi32_mask(x,y)
+#define SIMDEquali64M(x,y) _mm512_cmpeq_epi64_mask(x,y)
+#define SIMDNotEqualM(x,y) _mm512_cmpneq_ps_mask(x,y)
+//#define SIMDNotEquali8M(x,y) _mm512_cmpneq_epi8_mask(x,y)
+//#define SIMDNotEquali16M(x,y) _mm512_cmpneq_epi16_mask(x,y)
+#define SIMDNotEquali32M(x,y) _mm512_cmpneq_epi32_mask(x,y)
+#define SIMDNotEquali64M(x,y) _mm512_cmpneq_epi64_mask(x,y)
+//#define SIMDGreaterThani8M(x,y) _mm512_cmpgt_epi8_mask(x,y)
+//#define SIMDGreaterThani16M(x,y) _mm512_cmpgt_epi16_mask(x,y)
+#define SIMDGreaterThani32M(x,y) _mm512_cmpgt_epi32_mask(x,y)
+#define SIMDGreaterThani64M(x,y) _mm512_cmpgt_epi64_mask(x,y)
+//#define SIMDGreaterThanOrEquali8M(x,y) _mm512_cmpge_epi8_mask(x,y)
+//#define SIMDGreaterThanOrEquali16M(x,y) _mm512_cmpge_epi16_mask(x,y)
+#define SIMDGreaterThanOrEquali32M(x,y) _mm512_cmpge_epi32_mask(x,y)
+#define SIMDGreaterThanOrEquali64M(x,y) _mm512_cmpge_epi64_mask(x,y)
+#define SIMDLessThanM(x,y) _mm512_cmplt_ps_mask(x,y)
+//#define SIMDLessThani8M(x,y) _mm512_cmplt_epi8_mask(x,y)
+//#define SIMDLessThani16M(x,y) _mm512_cmplt_epi16_mask(x,y)
+#define SIMDLessThani32M(x,y) _mm512_cmplt_epi32_mask(x,y)
+#define SIMDLessThani64M(x,y) _mm512_cmplt_epi64_mask(x,y)
+#define SIMDLessThanOrEqualM(x,y) _mm512_cmple_ps_mask(x,y)
+//#define SIMDLessThanOrEquali8M(x,y) _mm512_cmple_epi8_mask(x,y)
+//#define SIMDLessThanOrEquali16M(x,y) _mm512_cmple_epi16_mask(x,y)
+#define SIMDLessThanOrEquali32M(x,y) _mm512_cmple_epi32_mask(x,y)
+#define SIMDLessThanOrEquali64M(x,y) _mm512_cmple_epi64_mask(x,y)
+#define SIMDMax(x,y) _mm512_max_ps(x,y)
+//#define SIMDMaxi8(x,y) _mm512_max_epi8(x,y)
+//#define SIMDMaxi16(x,y) _mm512_max_epi16(x,y)
+#define SIMDMaxi32(x,y) _mm512_max_epi32(x,y)
+#define SIMDMaxi64(x,y) _mm512_max_epi64(x,y)
+#define SIMDMin(x,y) _mm512_min_ps(x,y)
+//#define SIMDMini8(x,y) _mm512_min_epi8(x,y)
+//#define SIMDMini16(x,y) _mm512_min_epi16(x,y)
+#define SIMDMini32(x,y) _mm512_min_epi32(x,y)
+#define SIMDMini64(x,y) _mm512_min_epi64(x,y)
+
+#define SIMDBlend(x,y,z) _mm512_mask_blend_ps(z, x, y)
+//#define SIMDBlendi8(x,y,z) _mm512_mask_blend_epi8(z, x, y)
+//#define SIMDBlendi16(x,y,z) _mm512_mask_blend_epi16(z, x, y)
+#define SIMDBlendi32(x,y,z) _mm512_mask_blend_epi32(z, x, y)
+#define SIMDBlendi64(x,y,z) _mm512_mask_blend_epi64(z, x, y)
+
+// with AVX512F
+//#define Maski8 __mmask64
+//#define Maski16 __mmask32
+#define Maski32 __mmask16
+#define Maski64 __mmask8
+/* x = a == b ? c : d */ 
+//#define SIMDSetIfEquali8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDEquali8M(a,b)); } 
+//#define SIMDSetIfEquali16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDEquali16M(a,b)); } 
+#define SIMDSetIfEquali32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDEquali32M(a,b)); } 
+#define SIMDSetIfEquali64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDEquali64M(a,b)); } 
+/* x = a > b ? c : d */
+//#define SIMDSetIfGreateri8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDGreaterThani8M(a,b)); } 
+//#define SIMDSetIfGreateri16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16M(a,b)); } 
+#define SIMDSetIfGreateri32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32M(a,b)); } 
+#define SIMDSetIfGreateri64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64M(a,b)); } 
+/* x = a < b ? c : d (per lane; a < b is evaluated as b > a with the mask compare of the matching lane width) */
+//#define SIMDSetIfLessi8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDGreaterThani8M(b,a)); } 
+//#define SIMDSetIfLessi16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16M(b,a)); } 
+#define SIMDSetIfLessi32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32M(b,a)); } 
+#define SIMDSetIfLessi64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64M(b,a)); } 
+
+/* x = a > b ? c : d, y = a > b ? a : b */
+//#define SIMDGetIfGreateri8(x,y,a,b,c,d)  { Maski8  cmp = SIMDGreaterThani8M(a,b);  x = SIMDBlendi8(d, c, cmp);  y = SIMDBlendi8(b, a, cmp); } 
+//#define SIMDGetIfGreateri16(x,y,a,b,c,d) { Maski16 cmp = SIMDGreaterThani16M(a,b); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfGreateri32(x,y,a,b,c,d) { Maski32 cmp = SIMDGreaterThani32M(a,b); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfGreateri64(x,y,a,b,c,d) { Maski64 cmp = SIMDGreaterThani64M(a,b); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+/* x = a < b ? c : d, y = a < b ? a : b */
+//#define SIMDGetIfLessi8(x,y,a,b,c,d)  { Maski8  cmp = SIMDGreaterThani8M(b,a);  x = SIMDBlendi8(d, c, cmp);  y = SIMDBlendi8(b, a, cmp); } 
+//#define SIMDGetIfLessi16(x,y,a,b,c,d) { Maski16 cmp = SIMDGreaterThani16M(b,a); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfLessi32(x,y,a,b,c,d) { Maski32 cmp = SIMDGreaterThani32M(b,a); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfLessi64(x,y,a,b,c,d) { Maski64 cmp = SIMDGreaterThani64M(b,a); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+
+// end of AVX512F
+#else  // AVX2 SSE4.1 SSE2
+#ifdef __AVX2__
+
+// start of AVX2
+// m256 will be our base type
+typedef __m256 SIMDf;  //for floats
+typedef __m256i SIMDi; //for integers
+
+//intrinsic functions
+#define SIMDStore(x,y) _mm256_store_ps(x,y)
+#define SIMDLoad(x) _mm256_load_ps(x)
+#define SIMDStorei(x,y) _mm256_store_si256(x,y)
+#define SIMDLoadi(x) _mm256_load_si256(x)
+#define SIMDSet(x,y,z,w,a,b,c,d) _mm256_set_ps(x,y,z,w,a,b,c,d)
+#define SIMDSeti8(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32) _mm256_set_epi8(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32)
+#define SIMDSeti16(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16) _mm256_set_epi16(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16)
+#define SIMDSeti32(x1,x2,x3,x4,x5,x6,x7,x8) _mm256_set_epi32(x1,x2,x3,x4,x5,x6,x7,x8)
+#define SIMDSeti64(x1,x2,x3,x4) _mm256_set_epi64x(x1,x2,x3,x4)
+#define SIMDSeti128(x,y) _mm256_set_m128(x,y) // x = high 128 bits, y = low 128 bits; NOTE(review): _mm256_set_m128 takes __m128 (float) halves -- confirm whether _mm256_set_m128i was intended
+#define SIMDSetZero() _mm256_setzero_ps()
+#define SIMDSetZeroi() _mm256_setzero_si256()
+#define SIMDSetOne(x) _mm256_set1_ps(x)
+#define SIMDSetOnei8(x) _mm256_set1_epi8(x)
+#define SIMDSetOnei16(x) _mm256_set1_epi16(x)
+#define SIMDSetOnei32(x) _mm256_set1_epi32(x)
+#define SIMDSetOnei64(x) _mm256_set1_epi64x(x)
+#define SIMDAdd(x,y) _mm256_add_ps(x,y)
+#define SIMDAddi8(x,y) _mm256_add_epi8(x,y)
+#define SIMDAddi16(x,y) _mm256_add_epi16(x,y)
+#define SIMDAddi32(x,y) _mm256_add_epi32(x,y)
+#define SIMDAddi64(x,y) _mm256_add_epi64(x,y)
+#define SIMDSub(x,y) _mm256_sub_ps(x,y)
+#define SIMDSubi8(x,y) _mm256_sub_epi8(x,y)
+#define SIMDSubi16(x,y) _mm256_sub_epi16(x,y)
+#define SIMDSubi32(x,y) _mm256_sub_epi32(x,y)
+#define SIMDSubi64(x,y) _mm256_sub_epi64(x,y)
+#define SIMDMul(x,y) _mm256_mul_ps(x,y)
+#define SIMDMuli(x,y) _mm256_mul_epi32(x,y)
+#define SIMDAnd(x,y) _mm256_and_ps(x,y)
+#define SIMDAndi(x,y) _mm256_and_si256(x,y)
+#define SIMDAndNot(x,y) _mm256_andnot_ps(x,y)
+#define SIMDAndNoti(x,y) _mm256_andnot_si256(x,y)
+#define SIMDOr(x,y) _mm256_or_ps(x,y)
+#define SIMDOri(x,y) _mm256_or_si256(x,y)
+#define SIMDShiftLeft(a, n) ((n) < 16 ? /* shift the whole 256-bit value left by n bytes, carrying across the two 128-bit lanes */ \
+    _mm256_alignr_epi8(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0)), (16-(n))) : \
+    _mm256_slli_si256(_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0)), ((n)-16)))
+
+#define SIMDShiftRight(a, n) ((n) < 16 ? /* shift the whole 256-bit value right by n bytes, carrying across the two 128-bit lanes */ \
+    _mm256_alignr_epi8(_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, (n)) : \
+    _mm256_srli_si256(_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), ((n)-16)))
+#define SIMDShiftLeftOnei16(x,y) _mm256_slli_epi16(x,y)
+#define SIMDShiftLeftOnei32(x,y) _mm256_slli_epi32(x,y)
+#define SIMDShiftLeftOnei64(x,y) _mm256_slli_epi64(x,y)
+#define SIMDShiftRightOnei16(x,y) _mm256_srli_epi16(x,y)
+#define SIMDShiftRightOnei32(x,y) _mm256_srli_epi32(x,y)
+#define SIMDShiftRightOnei64(x,y) _mm256_srli_epi64(x,y)
+#define SIMDEqual(x,y)  _mm256_cmp_ps(x,y,_CMP_EQ_OQ) 
+#define SIMDEquali16(x,y) _mm256_cmpeq_epi16(x,y)
+#define SIMDEquali8(x,y) _mm256_cmpeq_epi8(x,y)
+#define SIMDEquali32(x,y) _mm256_cmpeq_epi32(x,y)
+#define SIMDEquali64(x,y) _mm256_cmpeq_epi64(x,y)
+#define SIMDGreaterThan(x,y) _mm256_cmp_ps(x,y,_CMP_GT_OQ)
+#define SIMDGreaterThani16(x,y) _mm256_cmpgt_epi16(x,y)
+#define SIMDGreaterThani8(x,y) _mm256_cmpgt_epi8(x,y)
+#define SIMDGreaterThani32(x,y) _mm256_cmpgt_epi32(x,y)
+#define SIMDGreaterThani64(x,y) _mm256_cmpgt_epi64(x,y) 
+#define SIMDFloor(x) _mm256_floor_ps(x)
+#define SIMDMax(x,y) _mm256_max_ps(x,y)
+#define SIMDMaxi8(x,y) _mm256_max_epi8(x,y)
+#define SIMDMaxi16(x,y) _mm256_max_epi16(x,y)
+#define SIMDMaxi32(x,y) _mm256_max_epi32(x,y)
+#define SIMDMaxi64(x,y) _mm256_max_epi64(x,y) // NOTE(review): _mm256_max_epi64 is an AVX512F+AVX512VL intrinsic, not AVX2 -- confirm target support before using this macro
+#define SIMDMin(x,y) _mm256_min_ps(x,y)
+#define SIMDMini8(x,y) _mm256_min_epi8(x,y)
+#define SIMDMini16(x,y) _mm256_min_epi16(x,y)
+#define SIMDMini32(x,y) _mm256_min_epi32(x,y)
+
+#define SIMDBlendV(x,y,z) _mm256_blendv_ps(x,y,z)
+#define SIMDBlendVi8(x,y,z) _mm256_blendv_epi8(x,y,z)
+
+// end of AVX2 only
+ 
+#else // SSE4.1 SSE2
+
+// start of SSE4.1 and SSE2
+// m128 will be our base type
+typedef __m128 SIMDf;   //for floats
+typedef __m128i SIMDi; //for integers
+
+#define SIMDStore(x,y) _mm_store_ps(x,y)
+#define SIMDLoad(x) _mm_load_ps(x)
+#define SIMDStorei(x,y) _mm_store_si128(x,y)
+#define SIMDLoadi(x) _mm_load_si128(x)
+#define SIMDSeti8(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16) _mm_set_epi8(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16)
+#define SIMDSeti16(x1,x2,x3,x4,x5,x6,x7,x8) _mm_set_epi16(x1,x2,x3,x4,x5,x6,x7,x8)
+#define SIMDSeti32(x1,x2,x3,x4) _mm_set_epi32(x1,x2,x3,x4)
+#define SIMDSeti64(x,y) _mm_set_epi64x(x,y) // integer arguments, x = high 64 bits, y = low 64 bits (the "x" variant, like _mm256_set_epi64x in the AVX2 path)
+#define SIMDSetOne(x) _mm_set1_ps(x)
+#define SIMDSetZero() _mm_setzero_ps()
+#define SIMDSetOnei8(x) _mm_set1_epi8(x)
+#define SIMDSetOnei16(x) _mm_set1_epi16(x)
+#define SIMDSetOnei32(x) _mm_set1_epi32(x)
+#define SIMDSetOnei64(x) _mm_set1_epi64x(x) // "x" variant broadcasts a 64-bit integer; _mm_set1_epi64 would require an MMX __m64 argument
+#define SIMDSetZeroi() _mm_setzero_si128()
+#define SIMDAdd(x,y) _mm_add_ps(x,y)
+#define SIMDAddi8(x,y) _mm_add_epi8(x,y)
+#define SIMDAddi16(x,y) _mm_add_epi16(x,y)
+#define SIMDAddi32(x,y) _mm_add_epi32(x,y)
+#define SIMDAddi64(x,y) _mm_add_epi64(x,y)
+#define SIMDSub(x,y) _mm_sub_ps(x,y)
+#define SIMDSubi8(x,y) _mm_sub_epi8(x,y)
+#define SIMDSubi16(x,y) _mm_sub_epi16(x,y)
+#define SIMDSubi32(x,y) _mm_sub_epi32(x,y)
+#define SIMDSubi64(x,y) _mm_sub_epi64(x,y)
+#define SIMDMul(x,y) _mm_mul_ps(x,y)
+#define SIMDMuli(x,y) _mm_mul_epi32(x,y)
+#define SIMDAnd(x,y) _mm_and_ps(x,y)
+#define SIMDAndi(x,y) _mm_and_si128(x,y)
+#define SIMDAndNot(x,y) _mm_andnot_ps(x,y)
+#define SIMDAndNoti(x,y) _mm_andnot_si128(x,y)
+#define SIMDOr(x,y) _mm_or_ps(x,y)
+#define SIMDOri(x,y) _mm_or_si128(x,y)
+#define SIMDShiftLeft(x,y) _mm_slli_si128(x,y) // shift whole x left by y bytes (not bits); y must be a compile-time constant
+#define SIMDShiftRight(x,y) _mm_srli_si128(x,y) // shift whole x right by y bytes (not bits); y must be a compile-time constant
+#define SIMDShiftLeftOnei16(x,y) _mm_slli_epi16(x,y)
+#define SIMDShiftLeftOnei32(x,y) _mm_slli_epi32(x,y)
+#define SIMDShiftLeftOnei64(x,y) _mm_slli_epi64(x,y)
+#define SIMDShiftRightOnei16(x,y) _mm_srli_epi16(x,y)
+#define SIMDShiftRightOnei32(x,y) _mm_srli_epi32(x,y)
+#define SIMDShiftRightOnei64(x,y) _mm_srli_epi64(x,y)
+#define SIMDEqual(x,y)  _mm_cmpeq_ps(x,y)
+#define SIMDEquali8(x,y) _mm_cmpeq_epi8(x,y)
+#define SIMDEquali16(x,y) _mm_cmpeq_epi16(x,y)
+#define SIMDEquali32(x,y) _mm_cmpeq_epi32(x,y)
+#define SIMDGreaterThan(x,y) _mm_cmpgt_ps(x,y)
+#define SIMDGreaterThani8(x,y) _mm_cmpgt_epi8(x,y)
+#define SIMDGreaterThani16(x,y) _mm_cmpgt_epi16(x,y)
+#define SIMDGreaterThani32(x,y) _mm_cmpgt_epi32(x,y)
+#define SIMDLessThan(x,y) _mm_cmplt_ps(x,y)
+#define SIMDLessThani8(x,y) _mm_cmplt_epi8(x,y) 
+#define SIMDLessThani16(x,y) _mm_cmplt_epi16(x,y) 
+#define SIMDLessThani32(x,y) _mm_cmplt_epi32(x,y) 
+#define SIMDMax(x,y) _mm_max_ps(x,y)
+#define SIMDMaxi16(x,y) _mm_max_epi16(x,y)
+#define SIMDMin(x,y) _mm_min_ps(x,y)
+#define SIMDMini16(x,y) _mm_min_epi16(x,y)
+
+#define Maski16 __mmask8
+#define Maski32 __mmask8
+
+#ifdef __SSE4_1__
+
+// start of SSE4.1 only
+#define SIMDBlendV(x,y,z) _mm_blendv_ps(x,y,z)	    // z is __mask
+#define SIMDBlendVi8(x,y,z) _mm_blendv_epi8(x,y,z)	
+#define SIMDEquali64(x,y) _mm_cmpeq_epi64(x,y)
+#define SIMDFloor(x) _mm_floor_ps(x)
+#define SIMDMaxi8(x,y) _mm_max_epi8(x,y)
+#define SIMDMini8(x,y) _mm_min_epi8(x,y)
+#define SIMDMaxi32(x,y) _mm_max_epi32(x,y)
+#define SIMDMini32(x,y) _mm_min_epi32(x,y)
+// end of SSE4.1 only
+
+#else  // SSE2
+
+// start of SSE2 only
+#define SIMDBlendV(x,y,z) SIMDOr(SIMDAndNot(z,x), SIMDAnd(z,y))   // SSE2 fallback (no SSE4.1 blendv): bitwise select, (z & y) | (~z & x)
+#define SIMDBlendVi8(x,y,z) SIMDOri(SIMDAndNoti(z,x), SIMDAndi(z,y))    // SSE2 fallback (no SSE4.1 blendv): bitwise select, (z & y) | (~z & x)
+#define SIMDMaxi8(x,y) SIMDBlendVi8(y, x, SIMDGreaterThani8(x,y))
+#define SIMDMini8(x,y) SIMDBlendVi8(x, y, SIMDGreaterThani8(x,y))
+#define SIMDMaxi32(x,y) SIMDBlendi32(y, x, SIMDGreaterThani32(x,y))
+#define SIMDMini32(x,y) SIMDBlendi32(x, y, SIMDGreaterThani32(x,y))
+// end of SSE2 only
+// end of SSE4.1 and SSE2
+
+#endif // SSE4.1
+
+#endif // AVX2
+
+// start of no AVX512F (AVX2/SSE4.1/SSE2)
+#define SIMDBlendi16(x,y,z) SIMDOri(SIMDAndNoti(z,x), SIMDAndi(z,y))
+#define SIMDBlendi32(x,y,z) SIMDOri(SIMDAndNoti(z,x), SIMDAndi(z,y))
+#define SIMDBlendi64(x,y,z) SIMDOri(SIMDAndNoti(z,x), SIMDAndi(z,y))
+
+/* x = a == b ? c : d */ 
+#define SIMDSetIfEquali8(x,a,b,c,d)  { x = SIMDBlendVi8(d, c, SIMDEquali8(a,b)); } 
+#define SIMDSetIfEquali16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDEquali16(a,b)); } 
+#define SIMDSetIfEquali32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDEquali32(a,b)); } 
+#define SIMDSetIfEquali64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDEquali64(a,b)); } 
+/* x = a > b ? c : d */
+#define SIMDSetIfGreateri8(x,a,b,c,d)  { x = SIMDBlendVi8(d, c, SIMDGreaterThani8(a,b)); } 
+#define SIMDSetIfGreateri16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16(a,b)); } 
+#define SIMDSetIfGreateri32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32(a,b)); } 
+#define SIMDSetIfGreateri64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64(a,b)); } 
+/* x = a < b ? c : d (per lane; a < b is evaluated as b > a) */
+#define SIMDSetIfLessi8(x,a,b,c,d)  { x = SIMDBlendVi8(d, c, SIMDGreaterThani8(b,a)); } 
+#define SIMDSetIfLessi16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16(b,a)); } 
+#define SIMDSetIfLessi32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32(b,a)); } 
+#define SIMDSetIfLessi64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64(b,a)); } 
+
+/* x = a > b ? c : d, y = a > b ? a : b */
+#define SIMDGetIfGreateri8(x,y,a,b,c,d)  { SIMDi cmp = SIMDGreaterThani8(a,b);  x = SIMDBlendVi8(d, c, cmp); y = SIMDBlendVi8(b, a, cmp); } 
+#define SIMDGetIfGreateri16(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani16(a,b); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfGreateri32(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani32(a,b); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfGreateri64(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani64(a,b); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+/* x = a < b ? c : d, y = a < b ? a : b */
+#define SIMDGetIfLessi8(x,y,a,b,c,d)  { SIMDi cmp = SIMDGreaterThani8(b,a);  x = SIMDBlendVi8(d, c, cmp); y = SIMDBlendVi8(b, a, cmp); } 
+#define SIMDGetIfLessi16(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani16(b,a); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfLessi32(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani32(b,a); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfLessi64(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani64(b,a); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+// end of no AVX512F (AVX2/SSE4.1/SSE2)
+
+#endif // AVX512F
+#endif // AVX512BW
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// int simd_check(void);
+
+/*
+static void *SIMDMalloc(size_t size, size_t align) {
+    void *ret = (void*)_mm_malloc(size, align);
+    if (ret == NULL) {
+        fprintf(stderr, "[%s] mm_Malloc fail!\nSize: %ld\n", __func__, size);
+        exit(1);
+    }
+    else return ret;
+}*/
+
+// Allocate `size` bytes aligned to `align` with posix_memalign; on failure print the error name to stderr and exit(1).
+static void *SIMDMalloc(size_t size, size_t align) {
+    void *ret; int res;
+    res = posix_memalign(&ret, align, size);
+    if (res != 0) {
+        char error[10]; // large enough for the longest name below ("Unknown" + NUL)
+        if (res == EINVAL) strcpy(error, "EINVAL");
+        else if (res == ENOMEM)
+            strcpy(error, "ENOMEM");
+        else strcpy(error, "Unknown");
+        fprintf(stderr, "[%s] posix_memalign fail!\nSize: %zu, Error: %s\n", __func__, size, error);
+        exit(1);
+    }
+    else return ret;
+}
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif // SIMD_INSTRUCTION_H
diff --git a/pog.png b/pog.png
new file mode 100644
index 0000000..2d29d2a
Binary files /dev/null and b/pog.png differ
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..972d076
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,100 @@
+# pyabpoa: abPOA Python interface
+## Introduction
+pyabpoa provides an easy-to-use interface to [abPOA](https://github.com/yangao07/abPOA). It contains all the APIs needed to perform multiple sequence alignment (MSA) on a set of sequences and to call consensus sequences from the final alignment graph.
+
+## Installation
+
+### Install pyabpoa with pip
+
+pyabpoa can be installed with pip:
+
+```
+pip install pyabpoa
+```
+
+### Install pyabpoa from source
+Alternatively, you can install pyabpoa from source (cython is required):
+```
+git clone --recursive https://github.com/yangao07/abPOA.git
+cd abPOA
+make install_py
+```
+
+## Examples
+The following code illustrates how to use pyabpoa.
+```
+import pyabpoa as pa
+a = pa.msa_aligner()
+seqs=[
+'CCGAAGA',
+'CCGAACTCGA',
+'CCCGGAAGA',
+'CCGAAGA'
+]
+res=a.msa(seqs, out_cons=True, out_msa=True, out_pog='pog.png', incr_fn='') # perform multiple sequence alignment 
+                                                                # generate a figure of alignment graph to pog.png
+
+for seq in res.cons_seq:
+    print(seq)  # print consensus sequence
+
+res.print_msa() # print row-column multiple sequence alignment in PIR format
+```
+You can also try the example script provided in the source folder:
+```
+python ./python/example.py
+```
+
+
+## APIs
+
+### Class pyabpoa.msa_aligner
+```
+pyabpoa.msa_aligner(aln_mode='g', ...)
+```
+This constructs a pyabpoa multiple sequence alignment handler. It accepts the following arguments:
+
+* **aln_mode**: alignment mode. 'g': global, 'l': local, 'e': extension; default: **'g'**
+* **is_aa**: input is amino acid sequence; default: **False**
+* **match**: match score; default: **2**
+* **mismatch**: mismatch penalty; default: **4**
+* **score_matrix**: scoring matrix file, **match** and **mismatch** are not used when **score_matrix** is used; default: **''**
+* **gap_open1**: first gap opening penalty; default: **4**
+* **gap_ext1**: first gap extension penalty; default: **2**
+* **gap_open2**: second gap opening penalty; default: **24**
+* **gap_ext2**: second gap extension penalty; default: **1**
+* **extra_b**: first adaptive banding parameter; set as < 0 to disable adaptive banded DP; default: **10**
+* **extra_f**: second adaptive banding parameter; the number of extra bases added on both sides of the band is *b+f\*L*, where *L* is the length of the aligned sequence; default: **0.01**
+
+The `msa_aligner` handler provides one method, which performs multiple sequence alignment and takes the following arguments:
+```
+pyabpoa.msa_aligner.msa(seqs, out_cons, out_msa, out_pog='', incr_fn='')
+```
+
+* **seqs**: a list variable containing a set of input sequences; **positional**
+* **out_cons**: a bool variable to ask pyabpoa to generate consensus sequence; **positional**
+* **out_msa**: a bool variable to ask pyabpoa to generate RC-MSA; **positional**
+* **max_n_cons**: maximum number of consensus sequence to generate; default: **1**
+* **min_freq**: minimum frequency of each consensus to output (effective when **max_n_cons** > 1); default: **0.3**
+* **out_pog**: name of a file (`.png` or `.pdf`) to store the plot of the final alignment graph; default: **''**
+* **incr_fn**: name of an existing graph (GFA) or MSA (FASTA) file, incrementally align sequence to this graph/MSA; default: **''**
+
+### Class pyabpoa.msa_result
+```
+pyabpoa.msa_result(seq_n, cons_n, cons_len, ...)
+```
+This class describes the information of the generated consensus sequence and the RC-MSA. The returned result of `pyabpoa.msa_aligner.msa()` is an object of this class that has the following properties:
+
+* **n_seq**: number of input aligned sequences
+* **n_cons**: number of generated consensus sequences (generally 1, could be 2 or more if **max_n_cons** is set as > 1)
+* **clu_n_seq**: an array of sequence cluster size
+* **cons_len**: an array of consensus sequence length(s)
+* **cons_seq**: an array of consensus sequence(s)
+* **cons_cov**: an array of consensus sequence coverage for each base
+* **msa_len**: size of each row in the RC-MSA
+* **msa_seq**: an array containing `n_seq`+`n_cons` strings that represent the RC-MSA, each consisting of one input sequence and several `-` indicating the alignment gaps.
+
+`pyabpoa.msa_result()` provides a `print_msa` method, which prints the RC-MSA to the screen.
+
+```
+pyabpoa.msa_result().print_msa()
+```
diff --git a/python/cabpoa.pxd b/python/cabpoa.pxd
new file mode 100644
index 0000000..f5eb558
--- /dev/null
+++ b/python/cabpoa.pxd
@@ -0,0 +1,185 @@
+from libc.stdint cimport int8_t, uint8_t, int32_t, int64_t, uint32_t, uint64_t
+from libc.stdio cimport FILE
+
+cdef extern from "simd_instruction.h":
+    int simd_check()
+
+cdef extern from "abpoa.h":
+    cdef int ABPOA_GLOBAL_MODE "ABPOA_GLOBAL_MODE"
+    cdef int ABPOA_LOCAL_MODE "ABPOA_LOCAL_MODE"
+    cdef int ABPOA_EXTEND_MODE "ABPOA_EXTEND_MODE"
+
+    # gap mode
+    cdef int ABPOA_LINEAR_GAP "ABPOA_LINEAR_GAP"
+    cdef int ABPOA_AFFINE_GAP "ABPOA_AFFINE_GAP"
+    cdef int ABPOA_CONVEX_GAP "ABPOA_CONVEX_GAP"
+
+    cdef int ABPOA_EXTRA_B "ABPOA_EXTRA_B"
+    cdef float ABPOA_EXTRA_F "ABPOA_EXTRA_F"
+
+    cdef char *ABPOA_CIGAR_STR "ABPOA_CIGAR_STR"
+    cdef int ABPOA_CMATCH "ABPOA_CMATCH"
+    cdef int ABPOA_CINS "ABPOA_CINS"
+    cdef int ABPOA_CDEL "ABPOA_CDEL"      
+    cdef int ABPOA_CDIFF "ABPOA_CDIFF"     
+    cdef int ABPOA_CSOFT_CLIP "ABPOA_CSOFT_CLIP"
+    cdef int ABPOA_CHARD_CLIP "ABPOA_CHARD_CLIP"
+
+    cdef int ABPOA_SRC_NODE_ID "ABPOA_SRC_NODE_ID"
+    cdef int ABPOA_SINK_NODE_ID "ABPOA_SINK_NODE_ID"
+
+    cdef int ABPOA_OUT_CONS "ABPOA_OUT_CONS"
+    cdef int ABPOA_OUT_MSA "ABPOA_OUT_MSA"
+    cdef int ABPOA_OUT_CONS_MSA "ABPOA_OUT_CONS_MSA"
+    cdef int ABPOA_OUT_GFA "ABPOA_OUT_GFA"
+    cdef int ABPOA_OUT_CONS_GFA "ABPOA_OUT_CONS_GFA"
+
+    cdef int ABPOA_HB "ABPOA_HB"
+    cdef int ABPOA_HC "ABPOA_HC"
+    cdef int ABPOA_MF "ABPOA_MF"
+
+    ctypedef struct abpoa_res_t:
+        int n_cigar, m_cigar
+        uint64_t *graph_cigar
+        int node_s, node_e, query_s, query_e # for local and  extension mode
+        int n_aln_bases, n_matched_bases
+        uint32_t best_score
+
+
+    ctypedef struct abpoa_para_t:
+        int m
+        int *mat # score matrix
+        char *mat_fn
+        int use_score_matrix
+        int match, max_mat, mismatch, min_mis, gap_open1, gap_open2, gap_ext1, gap_ext2
+        int inf_min
+        int k, w, min_w
+        int wb # 1st part of extra band width
+        float wf # 2nd part of extra band width. w=wb+wf*L (L is sequence length)
+        int zdrop, end_bonus # from minimap2
+        # int simd_flag # available SIMD instruction
+        # alignment mode
+        uint8_t ret_cigar, rev_cigar, out_msa, out_cons, out_gfa, out_fq, use_read_ids, amb_strand # mode: 0: global, 1: local, 2: extend
+        uint8_t use_qv, disable_seeding, progressive_poa
+        char *incr_fn
+        char *out_pog
+        int align_mode, gap_mode, max_n_cons
+        double min_freq # for diploid data
+        int verbose
+
+
+    ctypedef struct abpoa_node_t:
+        int node_id
+        int in_edge_n, in_edge_m
+        int *in_id
+        int out_edge_n, out_edge_m
+        int *out_id
+        int *out_weight
+        int *read_weight
+        int n_read, m_read
+        uint64_t **read_ids
+        int read_ids_n # for diploid
+        int aligned_node_n, aligned_node_m
+        int *aligned_node_id # mismatch; aligned node will have same rank
+        uint8_t base # 0~m
+
+    ctypedef struct abpoa_graph_t:
+        abpoa_node_t *node
+        int node_n, node_m, index_rank_m
+        int *index_to_node_id
+        int *node_id_to_index 
+        int *node_id_to_max_pos_left
+        int *node_id_to_max_pos_right
+        int *node_id_to_max_remain
+        int *node_id_to_msa_rank
+        uint8_t is_topological_sorted, is_called_cons, is_set_msa_rank
+
+    ctypedef struct abpoa_cons_t:  # declared in "abpoa.h" (see the enclosing cdef extern block)
+        int n_cons, n_seq, msa_len
+        int *clu_n_seq
+        int **clu_read_ids
+        int *cons_len
+        int **cons_node_ids
+        uint8_t **cons_base
+        uint8_t **msa_base
+        int **cons_cov
+        int **cons_phred_score
+
+    ctypedef struct abpoa_str_t:
+        int l, m
+        char *s
+
+    ctypedef struct abpoa_seq_t:
+        int n_seq, m_seq
+        abpoa_str_t *seq
+        abpoa_str_t *name
+        abpoa_str_t *comment
+        abpoa_str_t *qual
+        uint8_t *is_rc
+
+    ctypedef struct abpoa_simd_matrix_t:
+        pass
+    
+    ctypedef struct abpoa_t:
+        abpoa_graph_t *abg
+        abpoa_seq_t *abs
+        abpoa_simd_matrix_t *abm
+        abpoa_cons_t *abc
+
+    # init for abpoa parameters
+    abpoa_para_t *abpoa_init_para()
+    void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mtx_fn)
+    void abpoa_post_set_para(abpoa_para_t *abpt)
+    void abpoa_free_para(abpoa_para_t *abpt)
+
+
+    # init for alignment
+    abpoa_t *abpoa_init()
+    void abpoa_free(abpoa_t *ab)
+
+    # do msa for a set of input sequences
+    int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, int ** qual_weights, FILE *out_fp)
+    int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp)
+
+    # clean alignment graph
+    void abpoa_reset(abpoa_t *ab, abpoa_para_t *abpt, int qlen)
+
+    # restore graph from GFA/MSA file
+    abpoa_t *abpoa_restore_graph(abpoa_t *ab, abpoa_para_t *abpt)
+
+    # align a sequence to a graph
+    int abpoa_align_sequence_to_graph(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int qlen, abpoa_res_t *res)
+
+    # align to sub-graph
+    void abpoa_subgraph_nodes(abpoa_t *ab, abpoa_para_t *abpt, int inc_beg, int inc_end, int *exc_beg, int *exc_end)
+    int abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int qlen, abpoa_res_t *res)
+
+    # add an alignment to a graph
+    int abpoa_add_graph_node(abpoa_graph_t *abg, uint8_t base)
+    void abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_edge, int w, uint8_t add_read_id, uint8_t add_read_weight, int read_id, int read_ids_n, int tot_read_n)
+    int abpoa_add_graph_alignment(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int *weight, int qlen, int *qpos_to_node_id, abpoa_res_t res, int read_id, int tot_read_n, int inc_both_ends)
+    int abpoa_add_subgraph_alignment(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int *weight, int qlen, int *qpos_to_node_id, abpoa_res_t res, int read_id, int tot_read_n, int inc_both_ends)
+
+    void abpoa_BFS_set_node_index(abpoa_graph_t *abg, int src_id, int sink_id)
+    void abpoa_BFS_set_node_remain(abpoa_graph_t *abg, int src_id, int sink_id)
+    void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt)
+
+    # generate consensus sequence from graph
+    # para:
+    #   out_fp: consensus sequence output in FASTA format, set as NULL to disable
+    void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt)
+    void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp)
+
+
+    # generate column multiple sequence alignment from graph
+    void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt)
+    void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp)
+
+    # generate full graph in GFA format
+    void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp)
+
+    # output to out_fp
+    void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp)
+
+    # generate DOT graph plot 
+    void abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt)
diff --git a/python/example.py b/python/example.py
new file mode 100644
index 0000000..e26ebbb
--- /dev/null
+++ b/python/example.py
@@ -0,0 +1,90 @@
+import pyabpoa as pa
+
+#@parameters of msa_aligner:
+#   aln_mode='g' # g: global, l: local, e: extension
+#   is_aa=False # set as True if input is amino acid sequence
+#   score_matrix='' # file of score matrix, e.g. HOXD70.mtx/BLOSUM62.mtx
+#   match=2
+#   mismatch=4
+#   gap_open1=4
+#   gap_open2=24
+#   gap_ext1=2
+#   gap_ext2=1
+#   extra_b = 10 # 1st part of extra band, -1 to disable banded DP
+#   extra_f = 0.01  # 2nd part of extra band. w = extra_b+extra_f*L (L is sequence length)
+#   max_n_cons=1 # to output at most N cons, set max_n_cons as N
+#   min_freq=0.25 # minimum frequency of each consensus to output for diploid data
+
+# construct msa aligner
+a = pa.msa_aligner()
+
+print("==== First example: 2 consensus sequences ====\n")
+# for multiple consensus
+seqs=[
+ 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT',
+ 'CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT',
+ 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT',
+ 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT',
+ 'CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT',
+ 'CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT',
+ 'CGATCGATCGATAAAAAAAAAAAAAAAAAAACGATGCATGCATCGATGCATCGATCGATGCATGCAT',
+ 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT',
+ 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT',
+ 'CGATCGATCGATCGATGCATGCATCGATGCATCGATCGATGCATGCAT'
+ ]
+
+#@parameters of msa
+#seqs: multiple sequences
+out_cons=True # generate consensus sequence, set as False to disable
+out_msa=True # generate row-column multiple sequence alignment, set as False to disable
+out_pog="example1.png" # generate plot of alignment graph, set None to disable
+max_n_cons = 2
+
+# multiple sequence alignment for 'seqs'
+res=a.msa(seqs, out_cons=out_cons, out_msa=out_msa, max_n_cons=max_n_cons, out_pog=out_pog)
+
+# output result
+if out_cons:
+    for i in range(res.n_cons):
+        print(">Consensus_sequence_{}".format(i+1))
+        print(res.cons_seq[i])
+if out_msa:
+    res.print_msa()
+
+
+print("\n\n==== Second example: 1 consensus sequence ====\n")
+seqs=[
+'CGTCAATCTATCGAAGCATACGCGGGCAGAGCCGAAGACCTCGGCAATCCA',
+'CCACGTCAATCTATCGAAGCATACGCGGCAGCCGAACTCGACCTCGGCAATCAC',
+'CGTCAATCTATCGAAGCATACGCGGCAGAGCCCGGAAGACCTCGGCAATCAC',
+'CGTCAATGCTAGTCGAAGCAGCTGCGGCAGAGCCGAAGACCTCGGCAATCAC',
+'CGTCAATCTATCGAAGCATTCTACGCGGCAGAGCCGACCTCGGCAATCAC',
+'CGTCAATCTAGAAGCATACGCGGCAAGAGCCGAAGACCTCGGCCAATCAC',
+'CGTCAATCTATCGGTAAAGCATACGCTCTGTAGCCGAAGACCTCGGCAATCAC',
+'CGTCAATCTATCTTCAAGCATACGCGGCAGAGCCGAAGACCTCGGCAATC',
+'CGTCAATGGATCGAGTACGCGGCAGAGCCGAAGACCTCGGCAATCAC',
+'CGTCAATCTAATCGAAGCATACGCGGCAGAGCCGTCTACCTCGGCAATCACGT'
+]
+
+#@parameters of msa
+#seqs: multiple sequences
+out_cons=True # generate consensus sequence, set as False to disable
+out_msa=True # generate row-column multiple sequence alignment, set as False to disable
+out_pog="example2.png" # generate plot of alignment graph, set None to disable
+max_n_cons = 1
+
+# multiple sequence alignment for 'seqs'
+res=a.msa(seqs, out_cons=out_cons, out_msa=out_msa, max_n_cons=max_n_cons, out_pog=out_pog)
+
+# output result
+if out_cons:
+    for i in range(res.n_cons):
+        print(">Consensus_sequence_{}".format(i+1))
+        print(res.cons_seq[i])
+if out_msa:
+    for i in range(res.n_seq):
+        print(">Seq_{}".format(i+1))
+        print(res.msa_seq[i])
+    for i in range(res.n_cons):
+        print(">Consensus_sequence_{}".format(i+1))
+        print(res.msa_seq[res.n_seq+i])
diff --git a/python/pyabpoa.pyx b/python/pyabpoa.pyx
new file mode 100644
index 0000000..d344e3f
--- /dev/null
+++ b/python/pyabpoa.pyx
@@ -0,0 +1,226 @@
+import re, sys, os
+from libc.stdlib cimport malloc, free
+from libc.stdint cimport uint8_t
+from collections import defaultdict as dd
+cimport cython
+from cabpoa cimport *
+
+
+cdef class msa_result:  # result holder built by msa_aligner.msa(); every field is exposed through a read-only property below
+    cdef int n_seq  # number of sequences in the alignment; rows [0, n_seq) of msa_seq are sequences
+    cdef int n_cons  # number of consensus sequences
+    cdef clu_n_seq, clu_read_ids, cons_len, cons_seq, cons_cov  # per-consensus lists, e.g. cons_len:[int], cons_seq:['']
+    cdef int msa_len  # MSA length as reported by the C library (abc.msa_len)
+    cdef msa_seq  # list of str rows: msa_seq:['']; sequence rows first, then the consensus rows
+
+    def __cinit__(self, n_seq, n_cons, clu_n_seq, clu_read_ids, cons_len, cons_seq, cons_cov, msa_len, msa_seq):
+        self.n_seq = n_seq
+        self.n_cons = n_cons
+        self.clu_n_seq = clu_n_seq
+        self.clu_read_ids = clu_read_ids
+        self.cons_len = cons_len
+        self.cons_seq = cons_seq
+        self.cons_cov = cons_cov
+        self.msa_len = msa_len
+        self.msa_seq = msa_seq
+
+    @property
+    def n_seq(self): return self.n_seq
+
+    @property
+    def n_cons(self): return self.n_cons
+
+    @property
+    def clu_n_seq(self): return self.clu_n_seq
+
+    @property
+    def clu_read_ids(self): return self.clu_read_ids
+
+    @property
+    def cons_len(self): return self.cons_len
+
+    @property
+    def cons_seq(self): return self.cons_seq
+
+    @property
+    def cons_cov(self): return self.cons_cov
+
+    @property
+    def msa_len(self): return self.msa_len
+
+    @property
+    def msa_seq(self): return self.msa_seq
+
+    def print_msa(self):  # print every MSA row to stdout, each preceded by a '>' header line
+        if not self.msa_seq: return  # nothing to print when no MSA rows were stored
+        for i, s in enumerate(self.msa_seq):
+            if i < self.n_seq:  # sequence row
+                print('>Seq_{}'.format(i+1))
+            else:  # consensus row
+                if self.n_cons > 1:  # several consensus: header carries the 1-based index and that cluster's read ids
+                    cons_id = '_{} {}'.format(i-self.n_seq+1, ','.join(list(map(str, self.clu_read_ids[i-self.n_seq]))))
+                else:
+                    cons_id = ''
+                print('>Consensus_sequence{}'.format(cons_id))
+            print(s)
+        return
+
+
+def set_seq_int_dict(m):
+    # Build the (residue -> code, code -> residue) lookup tables for an alphabet of size m.
+    if m == 5:  # nucleotides: ACGTN ==> 01234, U shares code 3 with T
+        alphabet, codes = 'ACGUTN', [0, 1, 2, 3, 3, 4]
+    elif m == 27:  # amino acids: ACGTN ==> 01234, BDEFH... ==> 56789..., '*' ==> 26
+        alphabet = 'ACGTNBDEFHIJKLMOPQRSUVWXYZ*'
+        codes = list(range(len(alphabet)))
+    else:
+        raise Exception('Unexpected m: {}'.format(m))
+
+    # Unknown residues encode as m-1; unknown codes decode as '-'.
+    seq2int_dict, int2seq_dict = dd(lambda: m-1), dd(lambda: '-')
+    for code, residue in zip(codes, alphabet):
+        seq2int_dict[residue] = code
+        seq2int_dict[residue.lower()] = code
+        int2seq_dict[code] = residue  # later entries win, so code 3 decodes to 'T' for m == 5
+    return seq2int_dict, int2seq_dict
+
+
+cdef class msa_aligner:  # Python-facing aligner: holds one abpoa_t handle and one abpoa_para_t parameter struct
+    cdef abpoa_t *ab  # created by abpoa_init() in __cinit__, released by abpoa_free() in __dealloc__
+    cdef abpoa_para_t abpt  # filled field by field in __cinit__ and msa()
+    cdef seq2int_dict, int2seq_dict  # lookup tables returned by set_seq_int_dict(abpt.m)
+
+    def __cinit__(self, aln_mode='g', is_aa=False, match=2, mismatch=4, score_matrix=b'', gap_open1=4, gap_open2=24, gap_ext1=2, gap_ext2=1,
+            extra_b=10, extra_f=0.01):
+        self.ab = abpoa_init()
+
+        if aln_mode == 'g':  # 'g' global, 'l' local, 'e' extension; any other value raises below
+            self.abpt.align_mode = ABPOA_GLOBAL_MODE
+        elif aln_mode == 'l':
+            self.abpt.align_mode = ABPOA_LOCAL_MODE
+        elif aln_mode == 'e':
+            self.abpt.align_mode = ABPOA_EXTEND_MODE
+        else:
+            raise Exception('Unknown align mode: {}'.format(aln_mode))
+        if is_aa:  # amino-acid alphabet: 27 symbols, 27*27 score matrix
+            self.abpt.m = 27
+            self.abpt.mat = <int*>malloc(27 * 27 * cython.sizeof(int))
+        else:  # nucleotide alphabet: 5 symbols, 5*5 score matrix
+            self.abpt.m = 5
+            self.abpt.mat = <int*>malloc(25 * cython.sizeof(int))
+        self.abpt.match = match
+        self.abpt.mismatch = mismatch
+
+        if score_matrix:  # optional score-matrix file; accepted as str or bytes
+            if isinstance(score_matrix, str):  # the C call below needs bytes
+                score_matrix = bytes(score_matrix, 'utf-8')
+            if os.path.exists(score_matrix.decode('utf-8')):
+                abpoa_set_mat_from_file(&self.abpt, score_matrix)
+            else:
+                raise Exception('Matrix file not exist: {}'.format(score_matrix.decode('utf-8')))
+
+        self.abpt.gap_open1 = gap_open1
+        self.abpt.gap_open2 = gap_open2
+        self.abpt.gap_ext1 = gap_ext1
+        self.abpt.gap_ext2 = gap_ext2
+        self.abpt.ret_cigar = 1
+
+        self.abpt.wb = extra_b
+        self.abpt.wf = extra_f  # wb/wf are the two extra-band parameters (extra_b, extra_f)
+        self.abpt.use_qv = 0
+        self.abpt.end_bonus = -1 # disable end_bonus/zdrop
+        self.abpt.zdrop = -1
+        self.abpt.disable_seeding = 1
+        self.abpt.progressive_poa = 0
+
+        self.seq2int_dict, self.int2seq_dict = set_seq_int_dict(self.abpt.m)
+
+    def __dealloc__(self):  # release the score matrix malloc'ed in __cinit__ and the abpoa_t handle
+        free(self.abpt.mat)
+        abpoa_free(self.ab)
+
+    def __bool__(self):  # truthy while the abpoa_t handle is non-NULL
+        return self.ab != NULL
+
+
+    def msa(self, seqs, out_cons, out_msa, max_n_cons=1, min_freq=0.25, out_pog=b'', incr_fn=b''):  # align seqs in order and return an msa_result
+        cdef int seq_n = len(seqs)
+        cdef int exist_n = 0  # sequences already in the graph restored from incr_fn
+        cdef int tot_n = seq_n
+        cdef uint8_t *bseq  # integer-encoded copy of the current sequence
+        cdef abpoa_res_t res
+        cdef abpoa_cons_t abc
+
+        if out_cons: self.abpt.out_cons = 1
+        else: self.abpt.out_cons = 0
+        if out_msa: self.abpt.out_msa = 1
+        else: self.abpt.out_msa = 0
+        self.abpt.max_n_cons = max_n_cons
+        self.abpt.min_freq = min_freq
+        if out_pog:  # file name for the graph plot; accepted as str or bytes
+            if isinstance(out_pog, str): out_pog = bytes(out_pog, 'utf-8')
+            self.abpt.out_pog = out_pog  # NOTE(review): stores a pointer into the bytes object; presumably only valid during this call
+        else: self.abpt.out_pog = NULL
+
+        abpoa_post_set_para(&self.abpt)
+        abpoa_reset(self.ab, &self.abpt, len(seqs[0]))  # seqs[0] raises IndexError when seqs is empty
+        if incr_fn:  # incremental mode: restore an existing graph first, new read ids start after its sequences
+            if isinstance(incr_fn, str):
+                incr_fn = bytes(incr_fn, 'utf-8')
+            self.abpt.incr_fn = incr_fn
+            abpoa_restore_graph(self.ab, &self.abpt)
+            exist_n = self.ab[0].abs[0].n_seq
+            tot_n += exist_n
+        else:
+            self.abpt.incr_fn = NULL
+
+        self.ab[0].abs[0].n_seq += seq_n
+
+        for read_i, seq in enumerate(seqs):
+            seq_l = len(seq)
+            bseq = <uint8_t*>malloc(seq_l * cython.sizeof(uint8_t))
+            for i in range(seq_l):
+                bseq[i] = self.seq2int_dict[seq[i]]  # characters missing from the table encode as m-1
+            res.n_cigar = 0
+            abpoa_align_sequence_to_graph(self.ab, &self.abpt, bseq, seq_l, &res)
+
+            abpoa_add_graph_alignment(self.ab, &self.abpt, bseq, NULL, seq_l, NULL, res, exist_n+read_i, tot_n, 1)
+            free(bseq)
+            if res.n_cigar: free(res.graph_cigar)  # the cigar is only freed when the aligner reported one
+
+        if self.abpt.out_msa:  # NOTE(review): the MSA path presumably also fills the consensus fields read below - confirm
+            abpoa_generate_rc_msa(self.ab, &self.abpt)
+        elif self.abpt.out_cons:
+            abpoa_generate_consensus(self.ab, &self.abpt)
+        abc = self.ab[0].abc[0]  # struct copy of the result block
+
+        n_cons, clu_n_seq, clu_read_ids, cons_len, cons_seq, cons_cov, msa_len, msa_seq = 0, [], [], [], [], [], 0, []
+        n_cons = abc.n_cons
+        for i in range(n_cons):  # copy each consensus out of the C arrays into Python lists
+            clu_n_seq.append(abc.clu_n_seq[i])
+            cons_len.append(abc.cons_len[i])
+            clu_read_ids1, cons_seq1, cons_cov1 = [], '', []
+            for j in range(abc.clu_n_seq[i]):
+                clu_read_ids1.append(abc.clu_read_ids[i][j])
+            clu_read_ids.append(clu_read_ids1)
+            for j in range(abc.cons_len[i]):
+                c = abc.cons_base[i][j]
+                if isinstance(c, bytes): c = ord(c)
+                cons_seq1 += self.int2seq_dict[c]  # codes missing from the table decode as '-'
+                cons_cov1.append(abc.cons_cov[i][j])
+            cons_seq.append(cons_seq1)
+            cons_cov.append(cons_cov1)
+
+        msa_len = abc.msa_len
+        if msa_len > 0:
+            for i in range(abc.n_seq + n_cons):  # sequence rows first, then one row per consensus
+                msa_seq1 = ''
+                for c in abc.msa_base[i][:msa_len]:
+                    if isinstance(c, bytes): c = ord(c)
+                    msa_seq1 += self.int2seq_dict[c]
+                msa_seq.append(msa_seq1)
+
+        if self.abpt.out_pog:  # dump the graph plot only when a file name was given
+            abpoa_dump_pog(self.ab, &self.abpt)
+        return msa_result(tot_n, n_cons, clu_n_seq, clu_read_ids, cons_len, cons_seq, cons_cov, msa_len, msa_seq)
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a082250
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,70 @@
+import os, platform, sys
+
+try:
+    from setuptools import setup, Extension
+except ImportError:
+    from distutils.core import setup
+    from distutils.extension import Extension
+
+cmdclass = {}
+
+try:  # Cython is mandatory: the extension is compiled from python/pyabpoa.pyx
+    from Cython.Build import build_ext
+except ImportError: # without Cython
+    #module_src = 'python/pyabpoa.c'
+    sys.stderr.write('Error: \'cython\' is required to install pyabpoa\n')
+    sys.exit(1)  # non-zero status so the caller sees that the build did not happen
+else: # with Cython
+    module_src = 'python/pyabpoa.pyx'
+    cmdclass['build_ext'] = build_ext
+
+
+simde = '-DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES'
+sys.path.append('python')
+
+if platform.machine() in ["aarch64", "arm64"]:  # pick the compiler SIMD flags from the machine type
+    simd_flag = '-march=armv8-a+simd -D__AVX2__'
+elif platform.machine() in ["aarch32"]:
+    simd_flag = '-march=armv8-a+simd -mfpu=auto -D__AVX2__'  # the option is spelled -mfpu ('-mfput' is not a valid flag)
+else:
+    simd_flag='-march=native'  # default; the SSE4 / SSE2 / AVX2 environment variables override it, checked in that order
+    if os.getenv('SSE4', False):
+        simd_flag='-msse4.1'
+    elif os.getenv('SSE2', False):
+        simd_flag='-msse2'
+    elif os.getenv('AVX2', False):
+        simd_flag='-mavx2'
+    #elif os.getenv('AVX512F', False):
+    #    simd_flag='-mavx512f'
+    #elif os.getenv('AVX512BW', False):
+    #    simd_flag='-mavx512bw'
+
+src_dir='src/'
+inc_dir='include/'
+
+src=[module_src, src_dir+'abpoa_align.c', src_dir+'abpoa_graph.c', src_dir+'abpoa_output.c', src_dir+'abpoa_plot.c', src_dir+'abpoa_seed.c', src_dir+'abpoa_seq.c', src_dir+'kalloc.c', src_dir+'kstring.c', src_dir+'simd_abpoa_align.c', src_dir+'simd_check.c', src_dir+'utils.c']
+
+long_description = open('python/README.md').read()
+
+setup(
+    # Information
+    name = "pyabpoa",
+    description = "pyabpoa: SIMD-based partial order alignment using adaptive band",
+    long_description = long_description,  # text of python/README.md, read above
+    long_description_content_type="text/markdown",
+    version = "1.4.1",
+    url = "https://github.com/yangao07/abPOA",
+    author = "Yan Gao",
+    author_email = "gaoy1@chop.edu",
+    license = "MIT",
+    keywords = "multiple-sequence-alignment  partial-order-graph-alignment",
+    # Build instructions: one extension module built from the Cython source plus the C sources listed in src
+    ext_modules = [Extension("pyabpoa",
+                    sources=src,
+                    include_dirs=[inc_dir],
+                    depends=[src_dir+'abpoa.h', src_dir+'abpoa_align.h', src_dir+'abpoa_graph.h', src_dir+'abpoa_output.h', src_dir+'abpoa_seed.h', src_dir+'abpoa_seq.h', src_dir+'kalloc.h', src_dir+'khash.h', src_dir+'kdq.h', src_dir+'kseq.h', src_dir+'ksort.h', src_dir+'kstring.h', src_dir+'kvec.h', src_dir+'simd_abpoa_align.h', src_dir+'simd_instruction.h', src_dir+'utils.h', 'python/cabpoa.pxd'],
+                    libraries = ['z', 'm', 'pthread'],
+                    extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde, simd_flag])],  # simde and simd_flag are the flag strings assembled above
+    install_requires=['cython'],
+    cmdclass = cmdclass  # carries Cython's build_ext
+)
diff --git a/src/abpoa.c b/src/abpoa.c
new file mode 100644
index 0000000..e9eb1d4
--- /dev/null
+++ b/src/abpoa.c
@@ -0,0 +1,220 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include "abpoa.h"
+#include "abpoa_graph.h"
+#include "abpoa_align.h"
+#include "abpoa_seq.h"
+#include "utils.h"
+
+char NAME[20] = "abPOA";
+char PROG[20] = "abpoa";
+#define _ba BOLD UNDERLINE "a" NONE
+#define _bb BOLD UNDERLINE "b" NONE
+#define _bP BOLD UNDERLINE "P" NONE
+#define _bO BOLD UNDERLINE "O" NONE
+#define _bA BOLD UNDERLINE "A" NONE
+char DESCRIPTION[100] = _ba "daptive " _bb "anded " _bP "artial " _bO "rder " _bA "lignment";
+char VERSION[20] = "1.4.1";
+char CONTACT[30] = "gaoy1@chop.edu";
+
+const struct option abpoa_long_opt [] = { // getopt_long table: { long name, has_arg, flag, equivalent short option }
+    { "align-mode", 1, NULL, 'm' },
+
+    { "match", 1, NULL, 'M' },
+    { "mismatch", 1, NULL, 'X' },
+    { "matrix", 1, NULL, 't' },
+    { "gap-open", 1, NULL, 'O' },
+    { "gap-ext", 1, NULL, 'E' },
+
+    { "extra-b", 1, NULL, 'b' },
+    { "extra-f", 1, NULL, 'f' },
+    { "zdrop", 1, NULL, 'z' },
+    { "bonus", 1, NULL, 'e' }, { "bouns", 1, NULL, 'e' }, // "bouns" is the old misspelling, kept as an alias
+
+    { "seeding", 0, NULL, 'S'},
+    { "k-mer", 1, NULL, 'k' },
+    { "window", 1, NULL, 'w' },
+    { "min-poa-win", 1, NULL, 'n' },
+    { "progressive", 0, NULL, 'p'},
+
+    { "use-qual-weight", 0, NULL, 'Q'},
+    { "amino-acid", 0, NULL, 'c'},
+    { "in-list", 0, NULL, 'l' },
+    { "increment", 1, NULL, 'i' },
+
+
+    { "amb-strand", 0, NULL, 's' },
+    { "output", 1, NULL, 'o' },
+    { "result", 1, NULL, 'r' },
+    { "out-pog", 1, NULL, 'g' },
+    { "max-num-cons", 1, NULL, 'd', },
+    { "min-freq", 1, NULL, 'q', },
+
+    { "help", 0, NULL, 'h' },
+    { "version", 0, NULL, 'v' },
+
+    { 0, 0, 0, 0} // terminator required by getopt_long
+};
+
+int abpoa_usage(void) // print the command-line help through err_printf; always returns 1
+{
+    err_printf("\n");
+    err_printf("%s: %s \n\n", PROG, DESCRIPTION);
+    err_printf("Version: %s\t", VERSION);
+    err_printf("Contact: %s\n\n", CONTACT);
+    err_printf("Usage: %s [options] <in.fa/fq> > cons.fa/msa.out/abpoa.gfa\n\n", PROG);
+    err_printf("Options:\n");
+    err_printf("  Alignment:\n");
+    err_printf("    -m --align-mode INT     alignment mode [%d]\n", ABPOA_GLOBAL_MODE); // long names below match abpoa_long_opt
+    err_printf("                              %d: global, %d: local, %d: extension\n", ABPOA_GLOBAL_MODE, ABPOA_LOCAL_MODE, ABPOA_EXTEND_MODE);
+    err_printf("    -M --match      INT     match score [%d]\n", ABPOA_MATCH);
+    err_printf("    -X --mismatch   INT     mismatch penalty [%d]\n", ABPOA_MISMATCH);
+    err_printf("    -t --matrix    FILE     scoring matrix file, \'-M\' and \'-X\' are not used when \'-t\' is used [Null]\n");
+    err_printf("                            e.g., \'HOXD70.mtx, BLOSUM62.mtx\'\n");
+    err_printf("    -O --gap-open INT(,INT) gap opening penalty (O1,O2) [%d,%d]\n", ABPOA_GAP_OPEN1, ABPOA_GAP_OPEN2);
+    err_printf("    -E --gap-ext  INT(,INT) gap extension penalty (E1,E2) [%d,%d]\n", ABPOA_GAP_EXT1, ABPOA_GAP_EXT2);
+    err_printf("                            %s provides three gap penalty modes, cost of a g-long gap:\n", NAME);
+    err_printf("                            - convex (default): min{O1+g*E1, O2+g*E2}\n");
+    err_printf("                            - affine (set O2 as 0): O1+g*E1\n");
+    err_printf("                            - linear (set O1 as 0): g*E1\n");
+    err_printf("    -s --amb-strand         ambiguous strand mode [False]\n");
+    err_printf("                            for each input sequence, try the reverse complement if the current\n");
+    err_printf("                            alignment score is too low, and pick the strand with a higher score\n");
+    err_printf("  Adaptive banded DP:\n");
+    err_printf("    -b --extra-b    INT     first adaptive banding parameter [%d]\n", ABPOA_EXTRA_B);
+    err_printf("                            set b as < 0 to disable adaptive banded DP\n");
+    err_printf("    -f --extra-f  FLOAT     second adaptive banding parameter [%.2f]\n", ABPOA_EXTRA_F);
+    err_printf("                            the number of extra bases added on both sites of the band is\n");
+    err_printf("                            b+f*L, where L is the length of the aligned sequence\n");
+    // err_printf("    -z --zdrop    INT       Z-drop score in extension alignment [-1]\n");
+    // err_printf("                            set as <= 0 to disable Z-drop extension\n");
+    // err_printf("    -e --bonus    INT       end bonus score in extension alignment [-1]\n");
+    // err_printf("                            set as <= 0 to disable end bonus\n");
+    err_printf("  Minimizer-based seeding and partition (only effective in global alignment mode):\n");
+    err_printf("    -S --seeding            enable minimizer-based seeding and anchoring [False]\n");
+    err_printf("    -k --k-mer       INT    minimizer k-mer size [%d]\n", ABPOA_MMK);
+    err_printf("    -w --window      INT    minimizer window size [%d]\n", ABPOA_MMW);
+    err_printf("    -n --min-poa-win INT    min. size of window to perform POA [%d]\n", ABPOA_MIN_POA_WIN);
+    err_printf("    -p --progressive        build guide tree and perform progressive partial order alignment [False]\n");
+    // err_printf("    -n --par-size           minimal partition size [%d]\n", ABPOA_W);
+
+    err_printf("  Input/Output:\n");
+    err_printf("    -Q --use-qual-weight    take base quality score from FASTQ input file as graph edge weight [False]\n");
+    err_printf("    -c --amino-acid         input sequences are amino acid (default is nucleotide) [False]\n");
+    err_printf("    -l --in-list            input file is a list of sequence file names [False]\n");
+    err_printf("                            each line is one sequence file containing a set of sequences\n");
+    err_printf("                            which will be aligned by abPOA to generate a consensus sequence\n");
+    err_printf("    -i --increment  FILE    incrementally align sequences to an existing graph/MSA [Null]\n");
+    err_printf("                            graph could be in GFA or MSA format generated by abPOA\n");
+    err_printf("    -o --output     FILE    output to FILE [stdout]\n");
+    err_printf("    -r --result      INT    output result mode [%d]\n", ABPOA_OUT_CONS);
+    err_printf("                            - %d: consensus in FASTA format\n", ABPOA_OUT_CONS);
+    err_printf("                            - %d: MSA in PIR format\n", ABPOA_OUT_MSA);
+    err_printf("                            - %d: both 0 & 1\n", ABPOA_OUT_CONS_MSA);
+    err_printf("                            - %d: graph in GFA format\n", ABPOA_OUT_GFA);
+    err_printf("                            - %d: graph with consensus path in GFA format\n", ABPOA_OUT_CONS_GFA);
+    err_printf("                            - %d: consensus in FASTQ format\n", ABPOA_OUT_CONS_FQ);
+    err_printf("    -d --max-num-cons INT   max. number of consensus sequence to generate [1]\n");
+    err_printf("    -q --min-freq  FLOAT    min. frequency of each consensus sequence (only effective when -d/--max-num-cons > 1) [%.2f]\n", MULTIP_MIN_FREQ);
+    err_printf("    -g --out-pog    FILE    dump final alignment graph to FILE (.pdf/.png) [Null]\n\n");
+
+    err_printf("    -h --help               print this help usage information\n");
+    err_printf("    -v --version            show version number\n");
+
+
+    err_printf("\n");
+    return 1;
+}
+
+int abpoa_main(char *file_fn, int is_list, abpoa_para_t *abpt){ // run abpoa_msa1 on file_fn, or on every file it lists when is_list is set; returns 0
+    double realtime0 = realtime();
+    // TODO abpoa_init for each input file ???
+    abpoa_t *ab = abpoa_init();
+    if (is_list) { // input file list
+        FILE *list_fp = fopen(file_fn, "r"); char read_fn[1024];
+        if (list_fp == NULL) err_fatal(__func__, "Failed to open the list file %s", file_fn);
+        while (fgets(read_fn, sizeof(read_fn), list_fp)) {
+            read_fn[strcspn(read_fn, "\r\n")] = '\0'; // cut at the line ending only if there is one (the last line may have none)
+            if (read_fn[0] != '\0') abpoa_msa1(ab, abpt, read_fn, stdout); // blank lines name no file
+        }
+        fclose(list_fp);
+    } else // input file
+        abpoa_msa1(ab, abpt, file_fn, stdout);
+    abpoa_free(ab);
+    err_func_printf(__func__, "Real time: %.3f sec; CPU: %.3f sec; Peak RSS: %.3f GB.", realtime() - realtime0, cputime(), peakrss() / 1024.0 / 1024.0 / 1024.0);
+    return 0;
+}
+
+int main(int argc, char **argv) { // parse the options into an abpoa_para_t, then run abpoa_main on the single positional input file
+    int c, m, in_list=0; char *s; abpoa_para_t *abpt = abpoa_init_para();
+    while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:QSk:w:n:i:clpso:r:g:d:q:hvV:", abpoa_long_opt, NULL)) >= 0) {
+        switch(c)
+        {
+            case 'm': m = atoi(optarg);
+                      if (m != ABPOA_GLOBAL_MODE && m != ABPOA_EXTEND_MODE && m != ABPOA_LOCAL_MODE) {
+                          err_printf("Unknown alignment mode: %d.\n", m); return abpoa_usage();
+                      } abpt->align_mode=m; break;
+            case 'M': abpt->match = atoi(optarg); break;
+            case 'X': abpt->mismatch = atoi(optarg); break;
+            case 't': abpt->use_score_matrix = 1; abpt->mat_fn = strdup(optarg); break;
+            case 'O': abpt->gap_open1 = strtol(optarg, &s, 10); if (*s == ',') abpt->gap_open2 = strtol(s+1, &s, 10); break;
+            case 'E': abpt->gap_ext1 = strtol(optarg, &s, 10); if (*s == ',') abpt->gap_ext2 = strtol(s+1, &s, 10); break;
+
+            case 'b': abpt->wb = atoi(optarg); break;
+            case 'f': abpt->wf = atof(optarg); break;
+            case 'z': abpt->zdrop = atoi(optarg); break;
+            case 'e': abpt->end_bonus= atoi(optarg); break;
+
+            case 'Q': abpt->use_qv = 1; break;
+            case 'S': abpt->disable_seeding = 0; break;
+            case 'k': abpt->k = atoi(optarg); break;
+            case 'w': abpt->w = atoi(optarg); break;
+            case 'n': abpt->min_w = atoi(optarg); break;
+
+            case 'c': abpt->m = 27; abpt->mat = (int*)_err_realloc(abpt->mat, abpt->m * abpt->m * sizeof(int)); break;
+            case 'i': abpt->incr_fn = strdup(optarg); break;
+            case 'l': in_list = 1; break;
+            case 'p': abpt->progressive_poa = 1; break;
+            case 's': abpt->amb_strand = 1; break;
+            case 'o': if (strcmp(optarg, "-") != 0) {
+                          if (freopen(optarg, "wb", stdout) == NULL)
+                              err_fatal(__func__, "Failed to open the output file %s", optarg);
+                      } break;
+            case 'r': if (atoi(optarg) == ABPOA_OUT_CONS) abpt->out_cons = 1, abpt->out_msa = 0;
+                      else if (atoi(optarg) == ABPOA_OUT_MSA) abpt->out_cons = 0, abpt->out_msa = 1;
+                      else if (atoi(optarg) == ABPOA_OUT_CONS_MSA) abpt->out_cons = abpt->out_msa = 1;
+                      else if (atoi(optarg) == ABPOA_OUT_GFA) abpt->out_cons = 0, abpt->out_gfa = 1;
+                      else if (atoi(optarg) == ABPOA_OUT_CONS_GFA) abpt->out_cons = 1, abpt->out_gfa = 1;
+                      else if (atoi(optarg) == ABPOA_OUT_CONS_FQ) abpt->out_cons = 1, abpt->out_fq = 1;
+                      else err_printf("Error: unknown output result mode: %s.\n", optarg);
+                      break;
+            case 'g': abpt->out_pog= strdup(optarg); break;
+
+            case 'd': abpt->max_n_cons = atoi(optarg); break;
+            case 'q': abpt->min_freq = atof(optarg); break;
+
+            case 'h': return abpoa_usage();
+            case 'V': abpt->verbose = atoi(optarg); break;
+            case 'v': printf("%s\n", VERSION); goto End; break;
+            default: // optarg is not set for a rejected option, so report optopt (short) or the argv word (long) instead
+                      if (optopt) err_printf("Error: unknown option: -%c.\n", optopt); else err_printf("Error: unknown option: %s.\n", argv[optind-1]);
+                      return abpoa_usage();
+                      break;
+        }
+    }
+
+    if (argc - optind != 1) return abpoa_usage(); // exactly one input file is expected
+
+    abpoa_post_set_para(abpt);
+    fprintf(stderr, "[%s] CMD: ", __func__);
+    for (c = 0; c < argc; ++c)
+        fprintf(stderr, " %s", argv[c]);
+    fprintf(stderr, "\n");
+    abpoa_main(argv[optind], in_list, abpt);
+
+End:
+    abpoa_free_para(abpt);
+    return 0;
+}
diff --git a/src/abpoa.h b/src/abpoa.h
new file mode 100644
index 0000000..87bbdcf
--- /dev/null
+++ b/src/abpoa.h
@@ -0,0 +1,223 @@
+#ifndef ABPOA_H
+#define ABPOA_H
+
+#include <stdint.h>
+#include "simd_instruction.h"
+
+#define ABPOA_GLOBAL_MODE 0
+#define ABPOA_LOCAL_MODE  1
+#define ABPOA_EXTEND_MODE 2
+//#define ABPOA_SEMI_MODE 3
+
+// gap mode
+#define ABPOA_LINEAR_GAP 0
+#define ABPOA_AFFINE_GAP 1
+#define ABPOA_CONVEX_GAP 2
+
+#define ABPOA_EXTRA_B 10
+#define ABPOA_EXTRA_F 0.01
+
+#define ABPOA_CIGAR_STR "MIDXSH"
+#define ABPOA_CMATCH     0
+#define ABPOA_CINS       1
+#define ABPOA_CDEL       2
+#define ABPOA_CDIFF      3
+#define ABPOA_CSOFT_CLIP 4
+#define ABPOA_CHARD_CLIP 5
+
+#define ABPOA_SRC_NODE_ID  0
+#define ABPOA_SINK_NODE_ID 1
+
+#define ABPOA_OUT_CONS     0
+#define ABPOA_OUT_MSA      1
+#define ABPOA_OUT_CONS_MSA 2
+#define ABPOA_OUT_GFA      3
+#define ABPOA_OUT_CONS_GFA 4
+#define ABPOA_OUT_CONS_FQ  5
+
+#define ABPOA_HB 0
+#define ABPOA_HC 1
+
+// NOTE: upper boundary of in_edge_n is pow(2,30)
+// for MATCH/MISMATCH: node_id << 34  | query_id << 4 | op
+// for INSERTION:      query_id << 34 | op_len << 4   | op
+// for DELETION:       node_id << 34  | op_len << 4   | op // op_len is always equal to 1
+// for CLIP            query_id << 34 | op_len << 4   | op 
+#define abpoa_cigar_t uint64_t 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct {
+    int n_cigar, m_cigar; abpoa_cigar_t *graph_cigar;
+    int node_s, node_e, query_s, query_e; // for local and  extension mode
+    int n_aln_bases, n_matched_bases;
+    int32_t best_score; 
+    // uint8_t is_rc:1; // is_rc: best_score is from the reverse complement
+                        // now is_rc is determined based on minimizer-based seeding and chaining
+} abpoa_res_t;
+
+typedef struct {
+    int m; int *mat; char *mat_fn; // score matrix
+    int use_score_matrix; // set _mat_ based on score matrix file, then _match_/_mismatch_ is not used.
+    int match, max_mat, mismatch, min_mis, gap_open1, gap_open2, gap_ext1, gap_ext2; int inf_min;
+    // minimizer seeding parameter
+    int k, w, min_w;
+    int wb; float wf; // extra band width
+    int zdrop, end_bonus; // from minimap2
+    // int simd_flag; // available SIMD instruction
+    // alignment mode
+    uint8_t ret_cigar:1, rev_cigar:1, out_msa:1, out_cons:1, out_gfa:1, out_fq:1, use_read_ids:1, amb_strand:1;
+    uint8_t use_qv:1, disable_seeding:1, progressive_poa:1;
+    char *incr_fn, *out_pog;
+    int align_mode, gap_mode, max_n_cons;
+    double min_freq; // for multiploid data
+    int verbose; // to control output msg
+
+    // char LogTable65536[65536];
+    // char bit_table16[65536];
+} abpoa_para_t;
+
+typedef struct {
+    int node_id;
+    int in_edge_n, in_edge_m, *in_id;
+    int out_edge_n, out_edge_m, *out_id; int *out_weight;
+    int *read_weight, n_read, m_read; // weight of each read, valid when use_qv=1
+    uint64_t **read_ids; int read_ids_n; // for each edge
+
+    int aligned_node_n, aligned_node_m, *aligned_node_id; // mismatch; aligned node will have same rank
+    // int heaviest_weight, heaviest_out_id; // for consensus
+    uint8_t base; // 0~m
+    // ID, pos ???
+} abpoa_node_t;
+
+typedef struct {
+    abpoa_node_t *node; int node_n, node_m, index_rank_m; 
+    int *index_to_node_id;
+    int *node_id_to_index, *node_id_to_max_pos_left, *node_id_to_max_pos_right, *node_id_to_max_remain, *node_id_to_msa_rank;
+    uint8_t is_topological_sorted:1, is_called_cons:1, is_set_msa_rank:1;
+} abpoa_graph_t;
+
+typedef struct {
+    int n_cons, n_seq, msa_len; // # cons, # of total seq, length of row-column MSA (including gaps)
+    int *clu_n_seq;      // # of reads in each read cluster/group, size: n_cons
+    int **clu_read_ids; // read ids for each cluster/group, size: n_cons * clu_n_seq[i]
+    int *cons_len;       // length of each consensus sequence, size: n_cons
+    int **cons_node_ids; // node id of each consensus, size: n_cons * cons_len[i]
+    uint8_t **cons_base; // sequence base of each consensus, size: n_cons * cons_len[i]
+    uint8_t **msa_base;  // sequence base of RC-MSA, size: (n_seq + n_cons) * msa_len
+    int **cons_cov;      // coverage of each consensus base, size: n_cons * cons_len[i]
+    int **cons_phred_score; // phred score for each consensus base, size: n_cons * cons_len[i]
+} abpoa_cons_t;
+
+typedef struct {
+    int l, m; char *s;
+} abpoa_str_t;
+
+typedef struct {
+    int n_seq, m_seq;
+    abpoa_str_t *seq, *name, *comment, *qual;
+    uint8_t *is_rc;
+} abpoa_seq_t;
+
+typedef struct {
+    SIMDi *s_mem; uint64_t s_msize; // qp, DP_HE, dp_f OR qp, DP_H, dp_f : based on (qlen, num_of_value, m, node_n)
+    int *dp_beg, *dp_end, *dp_beg_sn, *dp_end_sn, rang_m; // if band : based on (node_m)
+} abpoa_simd_matrix_t;
+
+typedef struct {
+    abpoa_graph_t *abg;
+    abpoa_seq_t *abs;
+    abpoa_simd_matrix_t *abm;
+    abpoa_cons_t *abc;
+} abpoa_t;
+
+// init for abpoa parameters
+abpoa_para_t *abpoa_init_para(void);
+void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mat_fn);
+void abpoa_post_set_para(abpoa_para_t *abpt);
+void abpoa_free_para(abpoa_para_t *abpt);
+
+// init for alignment
+abpoa_t *abpoa_init(void);
+void abpoa_free(abpoa_t *ab);
+
+// perform msa
+int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seqs, char **seq_names, int *seq_lens, uint8_t **seqs, int **qual_weights, FILE *out_fp);
+
+int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp);
+
+// clean alignment graph
+void abpoa_reset(abpoa_t *ab, abpoa_para_t *abpt, int qlen);
+
+// restore graph from GFA/FASTA file
+abpoa_t *abpoa_restore_graph(abpoa_t *ab, abpoa_para_t *abpt);
+
+// for development:
+// align a sequence to a graph
+int abpoa_align_sequence_to_graph(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int qlen, abpoa_res_t *res);
+// align a sequence to a graph between beg_node_id and end_node_id (both are excluded)
+void abpoa_subgraph_nodes(abpoa_t *ab, abpoa_para_t *abpt, int inc_beg, int inc_end, int *exc_beg, int *exc_end);
+int abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int qlen, abpoa_res_t *res);
+
+// add a node to a graph
+// para:
+//   base: 0123 for ACGT
+int abpoa_add_graph_node(abpoa_graph_t *abg, uint8_t base);
+
+// add an edge to a graph
+// para:
+//   from_id/to_id: ids of from and to nodes
+//   check_edge: set as 1 if this edge may already exist and only its weight needs to be updated, set as 0 if the edge is new
+//   add_read_id: set as 1 if read_id is used (to use row-column algorithm/generate MSA result/multiple consensus)
+//   read_id: id of sequence
+//   read_ids_n: size of read_id array, each one is 64-bit (1+(tot_read_n-1)/64)
+int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_edge, int w, uint8_t add_read_id, uint8_t add_read_weight, int read_id, int read_ids_n, int tot_read_n);
+
+// add an alignment to a graph
+// para:
+//   query: 0123 for ACGT
+//   qlen: query length
+//   n_cigar/abpoa_cigar: from alignment result (abpoa_res_t)
+//   read_id: id of sequence
+//   tot_read_n: total number of sequence
+int abpoa_add_graph_alignment(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int *weight, int qlen, int *qpos_to_node_id, abpoa_res_t res, int read_id, int tot_read_n, int inc_both_ends);
+int abpoa_add_subgraph_alignment(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int *weight, int qlen, int *qpos_to_node_id, abpoa_res_t res, int read_id, int tot_read_n, int inc_both_ends);
+
+void abpoa_BFS_set_node_index(abpoa_graph_t *abg, int src_id, int sink_id);
+void abpoa_BFS_set_node_remain(abpoa_graph_t *abg, int src_id, int sink_id);
+
+// topological sorting of graph
+void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt);
+
+// generate consensus sequence from graph
+// para:
+//   out_fp: consensus sequence output in FASTA format, set as NULL to disable
+//   NOTE: the functions declared below take no cons_seq/cons_l/cons_n arguments;
+//     consensus results are read back from ab->abc (abpoa_cons_t), e.g.:
+//     cons_base: bases of each consensus sequence
+//     cons_len: length of each consensus sequence
+//     n_cons: number of consensus sequences
+void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt);
+void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp);
+
+// generate column multiple sequence alignment from graph
+void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt);
+void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp);
+
+// generate graph in GFA format to _out_fp_
+void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp);
+
+// output cons/msa
+void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp);
+
+// generate DOT graph plot and dump graph into PDF/PNG format file
+void abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/abpoa_align.c b/src/abpoa_align.c
new file mode 100644
index 0000000..00bf94e
--- /dev/null
+++ b/src/abpoa_align.c
@@ -0,0 +1,503 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "abpoa.h"
+#include "simd_abpoa_align.h"
+#include "abpoa_align.h"
+#include "abpoa_seq.h"
+#include "abpoa_output.h"
+#include "utils.h"
+#include "abpoa_seed.h"
+
+void gen_simple_mat(abpoa_para_t *abpt) { // fill abpt->mat from match/mismatch; the last row and column are all 0
+    const int n_res = abpt->m, last = n_res - 1;
+    const int match_sc = abpt->match < 0 ? -abpt->match : abpt->match;        // kept non-negative
+    const int mis_sc = abpt->mismatch > 0 ? -abpt->mismatch : abpt->mismatch; // kept non-positive
+    int *mat = abpt->mat, row, col;
+    for (row = 0; row < n_res; ++row) {
+        for (col = 0; col < n_res; ++col) {
+            if (row == last || col == last) mat[row * n_res + col] = 0;
+            else mat[row * n_res + col] = (row == col) ? match_sc : mis_sc;
+        }
+    }
+    abpt->max_mat = match_sc;
+    abpt->min_mis = -mis_sc;
+}
+
+extern char ab_nt4_table[256];
+extern char ab_nt256_table[256];
+extern char ab_aa26_table[256];
+extern char ab_aa256_table[256];
+extern char ab_char26_table[256];
+extern char ab_char256_table[256];
+
+void parse_mat_first_line(char *l, int *order) { // store the residue code of every non-blank character of l in order[]; the caller must size order for that
+    int i, n;
+    for (i = n = 0; l[i]; ++i) {
+        if (isspace((unsigned char)l[i])) continue; // <ctype.h> classifiers require an unsigned char value
+        order[n++] = ab_char26_table[(unsigned char)l[i]]; // a negative char must not index in front of the table
+    }
+}
+
+void parse_mat_score_line(char *l, int *order, int m, int *mat) { // parse one "<residue> <score> <score> ..." row of an m x m matrix into mat
+    int n, is_base=1, _i=-1; long s; char *str = l, *pEnd=NULL;
+    for (n = 0; *str; ++str) {
+        unsigned char c = (unsigned char)*str; // <ctype.h> classifiers and the table index need an unsigned value
+        if (!isalpha(c) && !isdigit(c) && c != '+' && c != '-') continue;
+        if (is_base) { // first token: residue letter that selects the matrix row
+            _i = ab_char26_table[c];
+            if (_i >= m) err_fatal(__func__, "Unknown base: \"%c\" (%d).\n", *str, _i);
+            is_base = 0;
+        } else { // later tokens: scores, written to the columns listed in order[]
+            if (n == m) err_fatal_simple("Too many scores in matrix.\n");
+            s = strtol(str, &pEnd, 10);
+            str = *pEnd ? pEnd : pEnd - 1; // the loop's ++str must land on the NUL, not past it, when the number ends the line
+            mat[_i *m + order[n]] = s;
+            n++;
+        }
+    }
+}
+
+void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mat_fn) { // fill abpt->mat, abpt->max_mat and abpt->min_mis from the text file mat_fn
+    char *l = (char*)_err_malloc(1024 * sizeof(char)); FILE *fp;
+    if ((fp = fopen(mat_fn, "r")) == NULL) err_fatal(__func__, "Unable to open scoring matrix file: \"%s\"\n", mat_fn);
+    int first_line = 1, order_m = abpt->m > 1024 ? abpt->m : 1024;
+    int *order = (int*)_err_malloc(order_m * sizeof(int)); // the header parser writes one entry per non-blank char of a line, which can exceed abpt->m
+    while (fgets(l, 1024, fp) != NULL) {
+        if (l[0] == '#') continue; // lines starting with '#' are skipped
+        if (first_line) {
+            first_line = 0;
+            // header line: column order of the residues
+            parse_mat_first_line(l, order);
+        } else {
+            // matrix row: residue letter followed by its scores
+            parse_mat_score_line(l, order, abpt->m, abpt->mat);
+        }
+    }
+    int i; abpt->min_mis = 0, abpt->max_mat = 0;
+    for (i = 0; i < abpt->m * abpt->m; ++i) { // largest score and largest penalty magnitude over the whole matrix
+        if (abpt->mat[i] > abpt->max_mat)
+            abpt->max_mat = abpt->mat[i];
+        if (-abpt->mat[i] > abpt->min_mis)
+            abpt->min_mis = -abpt->mat[i];
+    }
+    free(l); free(order); fclose(fp);
+}
+
+void abpoa_set_gap_mode(abpoa_para_t *abpt) { // derive abpt->gap_mode from the two gap-open penalties
+    // linear: first open penalty is 0; affine: first is positive and second is 0; convex: everything else
+    abpt->gap_mode = abpt->gap_open1 == 0 ? ABPOA_LINEAR_GAP
+                   : (abpt->gap_open1 > 0 && abpt->gap_open2 == 0) ? ABPOA_AFFINE_GAP : ABPOA_CONVEX_GAP;
+}
+
+abpoa_para_t *abpoa_init_para(void) { // allocate a parameter set filled with the defaults below; release it with abpoa_free_para()
+    abpoa_para_t *abpt = (abpoa_para_t*)_err_malloc(sizeof(abpoa_para_t));
+    abpt->align_mode = ABPOA_GLOBAL_MODE;
+    abpt->gap_mode = ABPOA_CONVEX_GAP;
+    abpt->zdrop = -1;     // disable zdrop
+    abpt->end_bonus = -1; // disable end bonus
+    abpt->wb = ABPOA_EXTRA_B; // extra bandwidth
+    abpt->wf = ABPOA_EXTRA_F; // extra bandwidth
+
+    abpt->amb_strand = 0; // ambiguous strand
+    abpt->ret_cigar = 1;  // return cigar
+    abpt->rev_cigar = 0;  // reverse cigar
+    abpt->out_cons = 1;   // output consensus sequence in fasta
+    abpt->out_fq = 0;     // output consensus sequence in fastq
+    abpt->out_gfa = 0;    // output graph in GFA format
+    abpt->out_msa = 0;    // output msa
+    abpt->max_n_cons = 1; // max. number of generated consensus sequences
+    abpt->min_freq = MULTIP_MIN_FREQ; // presumably the min. frequency used for multiple consensus calling -- TODO confirm
+    abpt->use_read_ids = 0; // switched on by abpoa_post_set_para() for msa/gfa/multi-consensus output
+    abpt->incr_fn = NULL; // incrementally align seq to an existing graph
+    abpt->out_pog = NULL; // dump partial order graph to file
+
+    // number of residue types
+    abpt->m = 5; // nucleotide
+    abpt->mat = (int*)_err_malloc(abpt->m * abpt->m * sizeof(int)); // m x m, left unfilled until abpoa_post_set_para()
+
+    // score matrix
+    abpt->use_score_matrix = 0; // 0: build the matrix from match/mismatch instead of reading mat_fn
+    abpt->mat_fn = NULL;
+    abpt->match = ABPOA_MATCH;
+    abpt->mismatch = ABPOA_MISMATCH;
+    abpt->gap_open1 = ABPOA_GAP_OPEN1;
+    abpt->gap_open2 = ABPOA_GAP_OPEN2;
+    abpt->gap_ext1 = ABPOA_GAP_EXT1;
+    abpt->gap_ext2 = ABPOA_GAP_EXT2;
+
+    abpt->use_qv = 0; // 1: take per-base weights from quality values
+    abpt->disable_seeding = 1; // no seeding by default
+    abpt->k = ABPOA_MMK;
+    abpt->w = ABPOA_MMW;
+    abpt->min_w = ABPOA_MIN_POA_WIN;
+    abpt->progressive_poa = 0; // progressive partial order alignment
+
+    abpt->verbose = 0;
+
+    // abpt->simd_flag = simd_check();
+
+    return abpt;
+}
+
+void abpoa_post_set_para(abpoa_para_t *abpt) {
+    // Finalise the derived settings once the caller has filled in the user-facing fields.
+    const char *src26, *src256; // encoding tables to install, picked by alphabet size below
+    int i;
+    abpoa_set_gap_mode(abpt);
+    // msa, gfa and multi-consensus output switch on read ids and initialise the helper tables
+    if (abpt->out_msa || abpt->out_gfa || abpt->max_n_cons > 1) {
+        abpt->use_read_ids = 1;
+        set_65536_table();
+        if (abpt->max_n_cons > 1) set_bit_table16();
+    }
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) abpt->wb = -1;
+    if (abpt->m <= 5) { // nucleotide alphabet
+        src26 = ab_nt4_table; src256 = ab_nt256_table;
+    } else { // for aa sequence
+        src26 = ab_aa26_table; src256 = ab_aa256_table;
+        if (abpt->k > 11) abpt->k = 7, abpt->w = 4; // replace an oversized k by the pair k=7, w=4
+    }
+    for (i = 0; i < 256; ++i) {
+        ab_char26_table[i] = src26[i];
+        ab_char256_table[i] = src256[i];
+    }
+    // scoring matrix: generated from match/mismatch unless a matrix file was requested
+    if (abpt->use_score_matrix) abpoa_set_mat_from_file(abpt, abpt->mat_fn);
+    else gen_simple_mat(abpt);
+}
+
+void abpoa_free_para(abpoa_para_t *abpt) { // release the parameter set and the buffers it owns; free(NULL) is a no-op, so unset members need no guard
+    free(abpt->mat);
+    free(abpt->mat_fn);
+    free(abpt->out_pog);
+    free(abpt->incr_fn);
+
+    free(abpt);
+}
+
+int abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int exc_beg_node_id, int exc_end_node_id, uint8_t *query, int qlen, abpoa_res_t *res) {
+    if (ab->abg->node_n <= 2) return -1; // graph holds at most two nodes: nothing is aligned, res is left untouched
+    if (!ab->abg->is_topological_sorted) abpoa_topological_sort(ab->abg, abpt); // sort lazily, only when not sorted yet
+    simd_abpoa_align_sequence_to_subgraph(ab, abpt, exc_beg_node_id, exc_end_node_id, query, qlen, res);
+    return 0; // alignment was run; its outcome is in *res
+}
+
+int abpoa_align_sequence_to_graph(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int qlen, abpoa_res_t *res) {
+    if (ab->abg->node_n <= 2) return -1; // graph holds at most two nodes: nothing is aligned, res is left untouched
+    if (!ab->abg->is_topological_sorted) abpoa_topological_sort(ab->abg, abpt); // sort lazily, only when not sorted yet
+    simd_abpoa_align_sequence_to_graph(ab, abpt, query, qlen, res);
+    return 0; // alignment was run; its outcome is in *res
+}
+
+int abpoa_anchor_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weights, int *seq_lens, ab_u64_v par_anchors, int *par_c, int *tpos_to_node_id, int *qpos_to_node_id, int *read_id_map, int exist_n_seq, int n_seq) {
+    // Add the sequences in read_id_map order; each one is aligned piecewise (between anchors, then over each anchor) and added once as a whole. Always returns 0.
+    abpoa_res_t res; int read_id, last_read_id = -1, m_c = 0, k = abpt->k, qlen;
+    abpoa_seq_t *abs = ab->abs;
+    int *tmp;
+    int i, _i, ai, j, tot_n_seq = exist_n_seq + n_seq;
+    uint8_t *qseq; int *weight; abpoa_res_t whole_res;
+    // anchor layout (per the shifts/masks below): bit 63 = strand flag, bits 32-62 = target position, low 32 bits = query position
+    for (_i = 0; _i < n_seq; ++_i) {
+        i = read_id_map[_i]; read_id = exist_n_seq + i; qlen = seq_lens[i]; whole_res.n_cigar = 0, whole_res.m_cigar = 0, whole_res.graph_cigar = 0;
+#ifdef __DEBUG__
+        fprintf(stderr, "seq: # %d\n", i);
+#endif
+        // anchors of the _i-th sequence are par_anchors.a[par_c[_i-1] .. par_c[_i]), starting at 0 for the first one
+        if (_i == 0) ai = 0; else ai = par_c[_i-1];
+
+        int beg_id = ABPOA_SRC_NODE_ID, beg_qpos = 0, end_id=-1, end_tpos=-1, end_qpos=-1;
+        if (ai < par_c[_i]) {
+            abs->is_rc[read_id] = (abs->is_rc[last_read_id] ^ (par_anchors.a[ai] >> 63)); // NOTE(review): last_read_id is -1 on the first pass; presumably the first sequence never has anchors -- confirm
+            // construct rc qseq
+            if (abs->is_rc[read_id]) {
+                qseq = (uint8_t*)_err_malloc(qlen * sizeof(uint8_t));
+                weight = (int*)_err_malloc(qlen * sizeof(int));
+                for (j = 0; j < qlen; ++j) {
+                    if (seqs[i][qlen-j-1] < 4) qseq[j] = 3 - seqs[i][qlen-j-1];
+                    else qseq[j] = 4;
+                    weight[j] = weights[i][qlen-j-1];
+                }
+                if (abs->is_rc[last_read_id]) { // reset tpos/qpos in par_anchors
+                    int last_qlen = seq_lens[read_id_map[_i-1]];
+                    for (j = ai; j < par_c[_i]; ++j) {
+                        end_tpos = ((par_anchors.a[j] >> 32) & 0x7fffffff); end_qpos = (uint32_t)par_anchors.a[j];
+                        par_anchors.a[j] = (par_anchors.a[j] >> 63) << 63 | (uint64_t)(last_qlen-end_tpos+k) << 32 | (qlen-end_qpos+k);
+                    }
+                    for (j = 0; j < (par_c[_i]-ai)/2; ++j) { // reverse the order of this sequence's anchors
+                        uint64_t tmp = par_anchors.a[ai+j];
+                        par_anchors.a[ai+j] = par_anchors.a[par_c[_i]-1-j];
+                        par_anchors.a[par_c[_i]-1-j] = tmp;
+                    }
+                }
+            } else { // forward strand: use the caller's buffers directly
+                qseq = seqs[i];
+                weight = weights[i];
+                if (abs->is_rc[last_read_id]) { // reset tpos/qpos in par_anchors
+                    int last_qlen = seq_lens[read_id_map[_i-1]];
+                    for (j = ai; j < par_c[_i]; ++j) {
+                        end_tpos = ((par_anchors.a[j] >> 32) & 0x7fffffff); end_qpos = (uint32_t)par_anchors.a[j];
+                        par_anchors.a[j] = (par_anchors.a[j] >> 63) << 63 | (uint64_t)(last_qlen-end_tpos+k) << 32 | (qlen-end_qpos+k);
+                    }
+                    for (j = 0; j < (par_c[_i]-ai)/2; ++j) { // reverse the order of this sequence's anchors
+                        uint64_t tmp = par_anchors.a[ai+j];
+                        par_anchors.a[ai+j] = par_anchors.a[par_c[_i]-1-j];
+                        par_anchors.a[par_c[_i]-1-j] = tmp;
+                    }
+                }
+            }
+        } else { // no anchors for this sequence: forward strand, a single alignment from source to sink below
+            abs->is_rc[read_id] = 0, qseq = seqs[i]; weight = weights[i];
+        }
+
+        for (; ai < par_c[_i]; ++ai) {
+            end_tpos = ((par_anchors.a[ai] >> 32) & 0x7fffffff) - k + 1; end_id = tpos_to_node_id[end_tpos];
+            end_qpos = (uint32_t)par_anchors.a[ai] - k + 1;
+
+#ifdef __DEBUG__
+            fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos);
+#endif
+
+            res.graph_cigar = 0; res.n_cigar = 0;
+            abpoa_align_sequence_to_subgraph(ab, abpt, beg_id, end_id, qseq+beg_qpos, end_qpos-beg_qpos, &res);
+            abpoa_push_whole_cigar(&whole_res.n_cigar, &whole_res.m_cigar, &whole_res.graph_cigar, res.n_cigar, res.graph_cigar);
+            if (res.n_cigar) free(res.graph_cigar);
+            // abpoa_add_subgraph_alignment(ab, abpt, beg_id, end_id, qseq+beg_qpos, end_qpos-beg_qpos, qpos_to_node_id+beg_qpos, res, read_id, tot_n_seq, 1);
+
+            // add alignment for anchors
+            res.graph_cigar = (abpoa_cigar_t*)_err_malloc((k) * sizeof(abpoa_cigar_t)); res.n_cigar = 0; m_c = k;
+            for (j = 0; j < k; ++j)
+                res.graph_cigar = abpoa_push_cigar(&(res.n_cigar), &m_c, res.graph_cigar, ABPOA_CMATCH, 1, tpos_to_node_id[end_tpos+j], j);
+            // for (j = 0; j < k; ++j) qpos_to_node_id[end_qpos+j] = tpos_to_node_id[end_tpos+j];
+            // abpoa_add_subgraph_alignment(ab, abpt, end_id, tpos_to_node_id[end_tpos+k-1], qseq+end_qpos, k, NULL, res, read_id, tot_n_seq, 1);
+            abpoa_push_whole_cigar(&whole_res.n_cigar, &whole_res.m_cigar, &whole_res.graph_cigar, res.n_cigar, res.graph_cigar);
+            if (res.n_cigar) free(res.graph_cigar);
+
+            // for next anchor
+            beg_id = tpos_to_node_id[end_tpos+k-1]; beg_qpos = end_qpos+k;
+        }
+        end_id = ABPOA_SINK_NODE_ID; end_qpos = seq_lens[i]; // final piece: from the last anchor (or the source) to the sink
+
+#ifdef __DEBUG__
+            fprintf(stderr, "\tanchor: t: %d (id: %d), q: %d\n", end_tpos, end_id, end_qpos);
+#endif
+        res.graph_cigar = 0; res.n_cigar = 0;
+        abpoa_align_sequence_to_subgraph(ab, abpt, beg_id, end_id, qseq+beg_qpos, end_qpos-beg_qpos, &res);
+        abpoa_push_whole_cigar(&whole_res.n_cigar, &whole_res.m_cigar, &whole_res.graph_cigar, res.n_cigar, res.graph_cigar);
+        if (res.n_cigar) free(res.graph_cigar);
+
+        abpoa_add_subgraph_alignment(ab, abpt, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, qseq, weight, qlen, qpos_to_node_id, whole_res, read_id, tot_n_seq, 1);
+        if (abs->is_rc[read_id]) { // qseq/weight are private reverse-complement copies only in this case
+            free(qseq); free(weight);
+        }
+        if (whole_res.n_cigar) free(whole_res.graph_cigar);
+
+        tmp = qpos_to_node_id; qpos_to_node_id = tpos_to_node_id; tpos_to_node_id = tmp; // this read's query map serves as the target map of the next read
+        last_read_id = read_id;
+    }
+    // err_func_format_printf(__func__, "Performing POA between anchors done.");
+    return 0;
+}
+
+// plain partial order alignment: sequences are added one by one in input order,
+// without seeding-based anchors or a progressive guide tree; always returns 0
+int abpoa_poa(abpoa_t *ab, abpoa_para_t *abpt, uint8_t **seqs, int **weights, int *seq_lens, int exist_n_seq, int n_seq) {
+    abpoa_seq_t *abs = ab->abs;
+    abpoa_res_t res, rc_res;
+    uint8_t *qseq, *rc_qseq; int *weight, *rc_weight;
+    int i, j, read_id, qlen, tot_n_seq = exist_n_seq + n_seq;
+    for (i = 0; i < n_seq; ++i) {
+        read_id = exist_n_seq + i;
+        qseq = seqs[i]; weight = weights[i]; qlen = seq_lens[i];
+#ifdef __DEBUG__
+        fprintf(stderr, "seq: # %d\n", i);
+#endif
+        res.n_cigar = 0; res.graph_cigar = 0;
+        // a negative return means no alignment was run; the empty result is then added as is
+        if (abpoa_align_sequence_to_graph(ab, abpt, qseq, qlen, &res) >= 0 && abpt->amb_strand
+                && (res.best_score < MIN_OF_TWO(qlen, ab->abg->node_n-2) * abpt->max_mat * .3333)) { // TODO .3333
+            // low forward score with ambiguous strands allowed: also try the reverse complement
+            // (codes below 4 become 3-code, everything else becomes 4; weights are reversed alongside)
+            rc_qseq = (uint8_t*)_err_malloc(sizeof(uint8_t) * qlen);
+            rc_weight = (int*)_err_malloc(sizeof(int) * qlen);
+            for (j = 0; j < qlen; ++j) {
+                const int src = qlen - j - 1;
+                rc_qseq[j] = qseq[src] < 4 ? 3 - qseq[src] : 4;
+                rc_weight[j] = weight[src];
+            }
+            rc_res.n_cigar = 0, rc_res.graph_cigar = 0;
+            simd_abpoa_align_sequence_to_graph(ab, abpt, rc_qseq, qlen, &rc_res);
+            if (!(rc_res.best_score > res.best_score)) { // forward strand wins ties
+                free(rc_qseq); free(rc_weight);
+            } else { // keep the reverse-complement copies alive until the alignment has been added
+                abpoa_res_copy(&res, &rc_res);
+                qseq = rc_qseq; weight = rc_weight;
+                abs->is_rc[read_id] = 1;
+            }
+            if (rc_res.n_cigar) free(rc_res.graph_cigar);
+        }
+        abpoa_add_graph_alignment(ab, abpt, qseq, weight, qlen, NULL, res, read_id, tot_n_seq, 1);
+        // the is_rc flag doubles as the marker that qseq/weight are the private copies made above
+        if (abs->is_rc[read_id]) { free(qseq); free(weight); }
+        if (res.n_cigar) free(res.graph_cigar);
+    }
+    return 0;
+}
+
+void abpoa_output(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {
+    if (!abpt->out_gfa) {
+        // generate first: rc-msa and/or consensus
+        if (abpt->out_msa) abpoa_generate_rc_msa(ab, abpt);
+        if (abpt->out_cons) {
+            abpoa_generate_consensus(ab, abpt);
+            if (ab->abg->is_called_cons == 0) err_printf("Warning: no consensus sequence generated.\n");
+        }
+        // then print: the rc-msa writer takes precedence over the stand-alone consensus writer
+        if (abpt->out_msa) abpoa_output_rc_msa(ab, abpt, out_fp);
+        else if (abpt->out_cons) abpoa_output_fx_consensus(ab, abpt, out_fp);
+    } else {
+        abpoa_generate_gfa(ab, abpt, out_fp); // GFA output replaces msa/consensus output
+    }
+    // plot partial-order graph using dot, when an output file name was given
+    if (abpt->out_pog) abpoa_dump_pog(ab, abpt);
+}
+
+// do msa for a set of input sequences; always returns 0
+// @function:
+//    generate consensus sequence
+//    generate rc-msa (row column multiple sequence alignment)
+// @para:
+//    ab/abpt: abpoa related variable and parameter
+//    n_seq: number of input sequences; seq_names: their names, or NULL to leave the names empty
+//    seq_lens: array of input sequence lengths, size: n_seq; out_fp: stream handed to abpoa_output()
+//    seqs: array of input sequences, 0123 for ACGT, size: n_seq * seq_lens[]; qual_weights: optional per-base weights, used only if abpt->use_qv is set (else each base weighs 1)
+int abpoa_msa(abpoa_t *ab, abpoa_para_t *abpt, int n_seq, char **seq_names, int *seq_lens, uint8_t **seqs, int **qual_weights, FILE *out_fp) {
+    if ((!abpt->out_msa && !abpt->out_cons && !abpt->out_gfa) || n_seq <= 0) return 0; // no output requested or no input: nothing to do
+    abpoa_reset(ab, abpt, 1024);
+    if (abpt->incr_fn) abpoa_restore_graph(ab, abpt); // restore existing graph
+    abpoa_seq_t *abs = ab->abs; int i, exist_n_seq = abs->n_seq;
+
+    // make room in ab->abs for the new sequences and set their names
+    abs->n_seq += n_seq; abpoa_realloc_seq(abs);
+
+    if (seq_names) {
+        for (i = 0; i < n_seq; ++i) {
+            abpoa_cpy_str(abs->name+exist_n_seq+i, seq_names[i], strlen(seq_names[i]));
+        }
+    } else {
+        for (i = 0; i < n_seq; ++i) {
+            abs->name[exist_n_seq+i].l = 0; abs->name[exist_n_seq+i].m = 0;
+        }
+    }
+
+    // longest new sequence: sizes the position-to-node maps of the anchored path below
+    int max_len = 0;
+    for (i = 0; i < n_seq; ++i) {
+        if (seq_lens[i] > max_len) max_len = seq_lens[i];
+    }
+
+    int j, **weights = (int**)_err_malloc(n_seq * sizeof(int*)); // private per-base weights, freed before returning
+    for (i = 0; i < n_seq; ++i) {
+        weights[i] = (int*)_err_malloc(seq_lens[i] * sizeof(int));
+        if (abpt->use_qv && qual_weights != NULL && qual_weights[i] != NULL) {
+            for (j = 0; j < seq_lens[i]; ++j) weights[i][j] = (int)qual_weights[i][j];
+        } else {
+            for (j = 0; j < seq_lens[i]; ++j) weights[i][j] = 1;
+        }
+    }
+
+    if ((abpt->disable_seeding && abpt->progressive_poa==0) || abpt->align_mode != ABPOA_GLOBAL_MODE) { // plain POA in input order
+        abpoa_poa(ab, abpt, seqs, weights, seq_lens, exist_n_seq, n_seq);
+    } else {
+        // sequence pos to node id
+        int *tpos_to_node_id = (int*)_err_calloc(max_len, sizeof(int)), *qpos_to_node_id = (int*)_err_calloc(max_len, sizeof(int));
+        // seeding, build guide tree, and partition into small windows
+        int *read_id_map = (int*)_err_malloc(sizeof(int) * n_seq); // guide tree order -> input order
+        ab_u64_v par_anchors = {0, 0, 0}; int *par_c = (int*)_err_calloc(n_seq, sizeof(int));
+
+        abpoa_build_guide_tree_partition(seqs, seq_lens, n_seq, abpt, read_id_map, &par_anchors, par_c);
+        if (abpt->incr_fn) { // collect anchors between last one path and first seq
+            // anchors
+            // new_par_anchors
+            // push anchors
+            // free(par_anchors.a);
+            // par_anchors = new_par_anchors;
+            // collect tpos_to_node_id for last one path
+        }
+
+        // perform partial order alignment
+        abpoa_anchor_poa(ab, abpt, seqs, weights, seq_lens, par_anchors, par_c, tpos_to_node_id, qpos_to_node_id, read_id_map, exist_n_seq, n_seq);
+        free(read_id_map); free(tpos_to_node_id); free(qpos_to_node_id); free(par_c);
+        if (par_anchors.m > 0) free(par_anchors.a);
+    }
+
+    // output
+    abpoa_output(ab, abpt, out_fp);
+    for (i = 0; i < n_seq; ++i) free(weights[i]); free(weights); // only the first free() belongs to the loop body
+    return 0;
+}
+
+int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp) { // msa of the sequences read from file read_fn, written to out_fp; always returns 0
+    if (!abpt->out_msa && !abpt->out_cons && !abpt->out_gfa) return 0; // no output requested: nothing to do
+    abpoa_reset(ab, abpt, 1024);
+    if (abpt->incr_fn) abpoa_restore_graph(ab, abpt); // restore existing graph
+    abpoa_seq_t *abs = ab->abs; int exist_n_seq = abs->n_seq;
+
+    // read seq from read_fn
+    gzFile readfp = xzopen(read_fn, "r"); kseq_t *ks = kseq_init(readfp);
+    int i, j, n_seq = abpoa_read_seq(abs, ks);
+
+    // longest stored sequence: sizes the position-to-node maps of the anchored path below
+    int max_len = 0;
+    for (i = 0; i < abs->n_seq; ++i) {
+        if (abs->seq[i].l > max_len) max_len = abs->seq[i].l;
+    }
+
+    // set seqs, seq_lens
+    extern char ab_char26_table[256];
+    uint8_t **seqs = (uint8_t**)_err_malloc(n_seq * sizeof(uint8_t*)); int *seq_lens = (int*)_err_malloc(n_seq * sizeof(int));
+    int **weights = (int**)_err_malloc(n_seq * sizeof(int*));
+    for (i = 0; i < n_seq; ++i) {
+        seq_lens[i] = abs->seq[exist_n_seq+i].l;
+        seqs[i] = (uint8_t*)_err_malloc(sizeof(uint8_t) * seq_lens[i]);
+        weights[i] = (int*)_err_malloc(sizeof(int) * seq_lens[i]);
+        // index with an unsigned value: a raw input byte >= 0x80 must not become a negative table index
+        for (j = 0; j < seq_lens[i]; ++j) seqs[i][j] = ab_char26_table[(unsigned char)abs->seq[exist_n_seq+i].s[j]];
+        if (abpt->use_qv && abs->qual[exist_n_seq+i].l > 0) { // weight = quality character minus 32
+            for (j = 0; j < seq_lens[i]; ++j) weights[i][j] = (int)abs->qual[exist_n_seq+i].s[j]-32;
+        } else {
+            for (j = 0; j < seq_lens[i]; ++j) weights[i][j] = 1;
+        }
+    }
+    if ((abpt->disable_seeding && abpt->progressive_poa==0) || abpt->align_mode != ABPOA_GLOBAL_MODE) { // plain POA in input order
+        abpoa_poa(ab, abpt, seqs, weights, seq_lens, exist_n_seq, n_seq);
+    } else {
+        // sequence pos to node id
+        int *tpos_to_node_id = (int*)_err_calloc(max_len, sizeof(int)), *qpos_to_node_id = (int*)_err_calloc(max_len, sizeof(int));
+        // seeding, build guide tree, and partition into small windows
+        int *read_id_map = (int*)_err_malloc(sizeof(int) * n_seq); // guide tree order -> input order
+        ab_u64_v par_anchors = {0, 0, 0}; int *par_c = (int*)_err_calloc(n_seq, sizeof(int));
+
+        abpoa_build_guide_tree_partition(seqs, seq_lens, n_seq, abpt, read_id_map, &par_anchors, par_c);
+        if (abpt->incr_fn) { // TODO collect anchors between last one path and first seq
+            // anchors
+            // new_par_anchors
+            // push anchors
+            // free(par_anchors.a);
+            // par_anchors = new_par_anchors;
+            // collect tpos_to_node_id for last one path
+            // set tpos_to_node_id
+            //
+        }
+        abpoa_anchor_poa(ab, abpt, seqs, weights, seq_lens, par_anchors, par_c, tpos_to_node_id, qpos_to_node_id, read_id_map, exist_n_seq, n_seq);
+        free(read_id_map); free(tpos_to_node_id); free(qpos_to_node_id); free(par_c);
+        if (par_anchors.m > 0) free(par_anchors.a);
+    }
+
+    // output
+    abpoa_output(ab, abpt, out_fp);
+
+    kseq_destroy(ks); gzclose(readfp);
+    for (i = 0; i < n_seq; ++i) {
+        free(seqs[i]); free(weights[i]);
+    } free(seqs); free(weights); free(seq_lens);
+    return 0;
+}
diff --git a/src/abpoa_align.h b/src/abpoa_align.h
new file mode 100644
index 0000000..25cf462
--- /dev/null
+++ b/src/abpoa_align.h
@@ -0,0 +1,129 @@
+#ifndef ABPOA_ALIGN_H
+#define ABPOA_ALIGN_H
+
+#include "abpoa.h"
+#include "abpoa_graph.h"
+
+#define CHUNK_READ_N 1024
+
+#define ABPOA_MATCH  2
+#define ABPOA_MISMATCH  4
+#define ABPOA_GAP_OPEN1  4
+#define ABPOA_GAP_OPEN2  24
+#define ABPOA_GAP_EXT1  2
+#define ABPOA_GAP_EXT2  1
+
+#define ABPOA_MMK 19
+#define ABPOA_MMW 10
+#define ABPOA_MIN_POA_WIN 500
+
+#define ABPOA_M_OP   0x1
+#define ABPOA_E1_OP  0x2
+#define ABPOA_E2_OP  0x4
+#define ABPOA_E_OP   0x6
+#define ABPOA_F1_OP  0x8 
+#define ABPOA_F2_OP  0x10
+#define ABPOA_F_OP   0x18
+#define ABPOA_ALL_OP 0x1f
+
+#define MULTIP_MIN_FREQ    0.25
+
+// start and end of each band:
+//   range: (min_of_two(max_left, qlen-remain), max_of_two(max_right, qlen-remain))
+//   with extra band width: (range_min-w, range_max+w)
+#define GET_AD_DP_BEGIN(graph, w, id, end_id, qlen) MAX_OF_TWO(0,    MIN_OF_TWO(abpoa_graph_node_id_to_max_pos_left(graph, id),  qlen-(abpoa_graph_node_id_to_max_remain(graph,id)-abpoa_graph_node_id_to_max_remain(graph,end_id)-1)) - w)
+#define GET_AD_DP_END(graph, w, id, end_id, qlen)   MIN_OF_TWO(qlen, MAX_OF_TWO(abpoa_graph_node_id_to_max_pos_right(graph, id), qlen-(abpoa_graph_node_id_to_max_remain(graph,id)-abpoa_graph_node_id_to_max_remain(graph,end_id)-1)) + w)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline void abpoa_res_copy(abpoa_res_t *dest, abpoa_res_t *src) { // deep-copy src into dest, releasing the cigar dest held before
+    int i;
+    if (dest->n_cigar) free(dest->graph_cigar);
+    dest->n_cigar = src->n_cigar;
+    // graph_cigar is only ever freed when n_cigar is non-zero (see the check above), so an empty source must not get a buffer
+    dest->graph_cigar = src->n_cigar > 0 ? (abpoa_cigar_t*)_err_malloc(src->n_cigar * sizeof(abpoa_cigar_t)) : 0;
+    for (i = 0; i < src->n_cigar; ++i) dest->graph_cigar[i] = src->graph_cigar[i];
+    dest->node_s = src->node_s, dest->node_e = src->node_e;
+    dest->query_s = src->query_s, dest->query_e = src->query_e;
+    dest->n_aln_bases = src->n_aln_bases, dest->n_matched_bases = src->n_matched_bases;
+    dest->best_score = src->best_score;
+}
+
+static inline abpoa_cigar_t *abpoa_push_cigar(int *n_cigar, int *m_cigar, abpoa_cigar_t *cigar, int op, int len, int32_t node_id, int32_t query_id) { // append one operation; returns the (possibly reallocated) array
+    abpoa_cigar_t l = len;
+    // start a new entry unless op is an insertion/clip that repeats the operation of the previous entry
+    if (*n_cigar == 0 || (op != ABPOA_CINS && op != ABPOA_CSOFT_CLIP && op != ABPOA_CHARD_CLIP) || op != (cigar[(*n_cigar)-1] & 0xf)) {
+        if (*n_cigar == *m_cigar) { // full: double the capacity (4 entries to start with)
+            *m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
+            cigar = (abpoa_cigar_t*)_err_realloc(cigar, (*m_cigar) * sizeof(abpoa_cigar_t));
+        }
+        abpoa_cigar_t n_id = node_id, q_id = query_id;
+        if (op == ABPOA_CMATCH || op == ABPOA_CDIFF) // node id | query id | op
+            cigar[(*n_cigar)++] = n_id << 34 | q_id << 4 | op;
+        else if (op == ABPOA_CINS || op == ABPOA_CSOFT_CLIP || op == ABPOA_CHARD_CLIP) // query id | length | op
+            cigar[(*n_cigar)++] = q_id << 34 | l << 4 | op;
+        else if (op == ABPOA_CDEL) // node id | length | op
+            cigar[(*n_cigar)++] = n_id << 34 | l << 4 | op;
+        else
+            err_fatal(__func__, "Unknown cigar operation: %d\n", op); // op is an int: %d, not %s
+    } else cigar[(*n_cigar)-1] += l << 4; // same insertion/clip as before: extend the length of the previous entry
+    return cigar;
+}
+
+static inline int abpoa_push_whole_cigar(int *dest_n_cigar, int *dest_m_cigar, abpoa_cigar_t **dest_cigar, int src_n_cigar, abpoa_cigar_t *src_cigar) { // append all of src_cigar to *dest_cigar; returns 0
+    const int old_n = *dest_n_cigar, new_n = old_n + src_n_cigar;
+    int k;
+    *dest_n_cigar = new_n;
+    if (new_n > *dest_m_cigar) { // grow to the larger of "doubled capacity" and "exactly enough"
+        *dest_m_cigar = MAX_OF_TWO((*dest_m_cigar) << 1, new_n);
+        *dest_cigar = (abpoa_cigar_t*)_err_realloc(*dest_cigar, *dest_m_cigar * sizeof(abpoa_cigar_t));
+    }
+    for (k = 0; k < src_n_cigar; ++k)
+        (*dest_cigar)[old_n + k] = src_cigar[k];
+    return 0;
+}
+
+static inline abpoa_cigar_t *abpoa_reverse_cigar(int n_cigar, abpoa_cigar_t *cigar) { // reverse the n_cigar entries in place; returns cigar
+    int lo, hi;
+    for (lo = 0, hi = n_cigar - 1; lo < hi; ++lo, --hi) { // swap the ends and walk inwards
+        const abpoa_cigar_t held = cigar[lo];
+        cigar[lo] = cigar[hi];
+        cigar[hi] = held;
+    }
+    return cigar;
+}
+
+static inline void abpoa_print_cigar(int n_cigar, abpoa_cigar_t *cigar, abpoa_graph_t *graph) { // print every cigar entry, then per-operation totals, to stdout
+    int i, node_id, query_id, index_i; int op, len;
+    int n[6] = {0, 0, 0, 0, 0, 0}; // totals indexed by operation code; assumes the six codes are 0..5 -- TODO confirm
+    for (i = 0; i < n_cigar; ++i) {
+        op = cigar[i] & 0xf; node_id = (int)(cigar[i] >> 34); // low 4 bits: operation; bits 34 and up: node id (query id for insertions/clips)
+        len = query_id = (int)(cigar[i] >> 4) & 0x3fffffff; // 30-bit middle field: query id for match/diff, length otherwise
+        if (op == ABPOA_CMATCH || op == ABPOA_CDIFF) {
+            index_i = abpoa_graph_node_id_to_index(graph, node_id);
+            printf("1%c:%d,%d\t", ABPOA_CIGAR_STR[op], index_i, query_id);
+            n[op] += 1;
+        } else if (op == ABPOA_CDEL) {
+            index_i = abpoa_graph_node_id_to_index(graph, node_id);
+            printf("%d%c:%d\t", len, ABPOA_CIGAR_STR[op], index_i);
+            n[op] += len;
+        } else if (op == ABPOA_CINS || op == ABPOA_CSOFT_CLIP || op == ABPOA_CHARD_CLIP) {
+            query_id = node_id; // for these operations the high field holds the query id
+            printf("%d%c:%d\t", len, ABPOA_CIGAR_STR[op], query_id);
+            n[op] += len;
+        } else {
+            err_fatal(__func__, "Unknown cigar operation: %d\n", op); // op is an int: %d, not %s
+        }
+    } printf("\n");
+    for (i = 0; i < 6; ++i)
+        printf("%d%c ", n[i], ABPOA_CIGAR_STR[i]);
+    printf("\n");
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/abpoa_graph.c b/src/abpoa_graph.c
new file mode 100644
index 0000000..1fe83c4
--- /dev/null
+++ b/src/abpoa_graph.c
@@ -0,0 +1,743 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "abpoa_align.h"
+#include "abpoa_seq.h"
+#include "simd_abpoa_align.h"
+#include "kdq.h"
+
+KDQ_INIT(int) // instantiate the kdq.h queue for int; the kdq_*_int calls below come from this
+#define kdq_int_t kdq_t(int)
+
+abpoa_node_t *abpoa_init_node(int n) {
+    // array of n node records obtained through _err_calloc(count, element size)
+    return (abpoa_node_t*)_err_calloc(n, sizeof(abpoa_node_t));
+}
+
+void abpoa_set_graph_node(abpoa_graph_t *abg, int node_i) { // put node slot node_i into its empty state (counts and capacities zeroed)
+    abpoa_node_t *nd = abg->node + node_i;
+    nd->node_id = node_i;
+    nd->in_edge_n = 0; nd->in_edge_m = 0; nd->out_edge_n = 0; nd->out_edge_m = 0;
+    nd->aligned_node_n = 0; nd->aligned_node_m = 0;
+    nd->n_read = 0; nd->m_read = 0; nd->read_weight = NULL;
+    nd->read_ids_n = 0;
+}
+
+void abpoa_free_node(abpoa_node_t *node, int n) { // free the buffers owned by node[0..n), then the array itself
+    int i, j;
+    for (i = 0; i < n; ++i) {
+        abpoa_node_t *nd = node + i;
+        if (nd->in_edge_m > 0) free(nd->in_id);
+        if (nd->out_edge_m > 0) {
+            free(nd->out_id);
+            free(nd->out_weight);
+            if (nd->read_ids_n > 0) { // one bitset per out-edge slot, counted by capacity (out_edge_m)
+                for (j = 0; j < nd->out_edge_m; ++j) free(nd->read_ids[j]);
+                free(nd->read_ids);
+            }
+        }
+        if (nd->m_read > 0) free(nd->read_weight);
+        if (nd->aligned_node_m > 0) free(nd->aligned_node_id);
+    }
+    free(node);
+}
+
+// Make room for one more edge on node `id`. io == 0: in-edge list; otherwise: out-edge list (ids, weights, read-id bitsets). Returns abg.
+abpoa_graph_t *abpoa_realloc_graph_edge(abpoa_graph_t *abg, int io, int id, int use_read_ids) {
+    if (io == 0) {
+        _uni_realloc(abg->node[id].in_id, abg->node[id].in_edge_n, abg->node[id].in_edge_m, int); // macro defined elsewhere; presumably grows in_id when in_edge_n reaches in_edge_m
+    } else {
+        int edge_m = abg->node[id].out_edge_m; // capacity before any growth; new bitsets start from here
+        if (edge_m <= 0) { // first allocation of the out-edge arrays
+            abg->node[id].out_edge_m = MAX_OF_TWO(abg->node[id].out_edge_n, 1);
+            abg->node[id].out_id = (int*)_err_malloc(abg->node[id].out_edge_m * sizeof(int));
+            abg->node[id].out_weight = (int*)_err_malloc(abg->node[id].out_edge_m * sizeof(int));
+            if (use_read_ids || abg->node[id].read_ids_n > 0) {
+                abg->node[id].read_ids = (uint64_t**)_err_malloc(abg->node[id].out_edge_m * sizeof(uint64_t*));
+                if (abg->node[id].read_ids_n > 0) { // bitset width already known: allocate one zeroed bitset per slot
+                    int i;
+                    for (i = 0; i < abg->node[id].out_edge_m; ++i) {
+                        abg->node[id].read_ids[i] = (uint64_t*)_err_calloc(abg->node[id].read_ids_n, sizeof(uint64_t));
+                    }
+                }
+            }
+        } else if (abg->node[id].out_edge_n >= edge_m) { // arrays full: grow to out_edge_n+1 passed through kroundup32
+            abg->node[id].out_edge_m = abg->node[id].out_edge_n+1; kroundup32(abg->node[id].out_edge_m);
+            abg->node[id].out_id = (int*)_err_realloc(abg->node[id].out_id, abg->node[id].out_edge_m * sizeof(int));
+            abg->node[id].out_weight = (int*)_err_realloc(abg->node[id].out_weight, abg->node[id].out_edge_m * sizeof(int));
+            if (use_read_ids || abg->node[id].read_ids_n > 0) {
+                abg->node[id].read_ids = (uint64_t**)_err_realloc(abg->node[id].read_ids, abg->node[id].out_edge_m * sizeof(uint64_t*));
+                if (abg->node[id].read_ids_n > 0) {
+                    int i;
+                    for (i = edge_m; i < abg->node[id].out_edge_m; ++i) { // only the newly added slots get a fresh bitset
+                        abg->node[id].read_ids[i] = (uint64_t*)_err_calloc(abg->node[id].read_ids_n, sizeof(uint64_t));
+                    }
+                }
+            }
+        }
+    }
+    return abg;
+}
+
+abpoa_graph_t *abpoa_realloc_graph_node(abpoa_graph_t *abg) { // make sure abg->node has a free slot at index node_n; returns abg
+    if (abg->node_m <= 0) { // no node array yet: start with capacity 1
+        abg->node_m = 1;
+        abg->node = (abpoa_node_t*)_err_calloc(1, sizeof(abpoa_node_t));
+    }
+    if (abg->node_n == abg->node_m) { // array is full: double the capacity
+        int i;
+        abg->node_m <<= 1;
+        abg->node = (abpoa_node_t*)_err_realloc(abg->node, abg->node_m * sizeof(abpoa_node_t));
+        for (i = abg->node_m >> 1; i < abg->node_m; ++i) { // initialize only the newly added upper half
+            abpoa_set_graph_node(abg, i);
+        }
+    }
+    return abg;
+}
+
+abpoa_graph_t *abpoa_init_graph(void) { // new graph holding only nodes 0 and 1 (presumably the source/sink pair - confirm against ABPOA_SRC/SINK_NODE_ID)
+    abpoa_graph_t *abg = (abpoa_graph_t*)_err_malloc(sizeof(abpoa_graph_t));
+    abg->node_n = 2, abg->node_m = 2, abg->index_rank_m = 0;
+    abg->node = abpoa_init_node(2);
+    abg->node[0].node_id = 0; abg->node[1].node_id = 1;
+    abg->node[0].read_ids_n = 0; abg->node[1].read_ids_n = 0;
+    abg->is_topological_sorted = abg->is_called_cons = abg->is_set_msa_rank = 0; // abg is malloc'd: is_set_msa_rank must be cleared too, abpoa_set_msa_rank() reads it
+    abg->node_id_to_index = NULL; abg->index_to_node_id = NULL; abg->node_id_to_msa_rank = NULL; // lookup arrays are sized lazily via realloc
+    abg->node_id_to_max_pos_left = NULL; abg->node_id_to_max_pos_right = NULL; abg->node_id_to_max_remain = NULL;
+    return abg;
+}
+
+void abpoa_free_graph(abpoa_graph_t *abg) { // free the node array, the per-node lookup arrays and abg itself
+    if (abg->node_m > 0) abpoa_free_node(abg->node, abg->node_m); // walks the full capacity node_m, not just node_n
+
+    if (abg->node_n > 0) {
+        free(abg->index_to_node_id);
+        free(abg->node_id_to_index);
+        if (abg->node_id_to_msa_rank) free(abg->node_id_to_msa_rank);
+
+        if (abg->node_id_to_max_pos_left) free(abg->node_id_to_max_pos_left);
+        if (abg->node_id_to_max_pos_right) free(abg->node_id_to_max_pos_right);
+        if (abg->node_id_to_max_remain) free(abg->node_id_to_max_remain);
+    }
+    free(abg);
+}
+
+abpoa_cons_t *abpoa_init_cons(void) { // allocate an empty consensus/MSA result: counters zero, every buffer NULL
+    abpoa_cons_t *abc = (abpoa_cons_t*)_err_malloc(sizeof(abpoa_cons_t));
+    abc->n_cons = 0; abc->n_seq = 0; abc->msa_len = 0; // n_seq used to stay indeterminate (malloc'd struct); zero it with the other counters
+    abc->clu_n_seq = NULL;
+    abc->cons_len = NULL;
+    abc->cons_node_ids = NULL;
+    abc->cons_base = NULL;
+    abc->msa_base = NULL;
+    abc->cons_cov = NULL;
+    abc->clu_read_ids = NULL;
+    abc->cons_phred_score = NULL;
+    return abc;
+}
+
+void abpoa_free_cons(abpoa_cons_t *abc) { // free every consensus/MSA buffer that is set, then abc itself
+    int i;
+    if (abc->n_cons > 0) { // per-consensus buffers: each 2-D one holds n_cons rows
+        if (abc->clu_n_seq != NULL) free(abc->clu_n_seq);
+        if (abc->cons_len != NULL) free(abc->cons_len);
+        if (abc->cons_node_ids != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->cons_node_ids[i]); free(abc->cons_node_ids); // the second free runs once, after the loop
+        }
+        if (abc->cons_base != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->cons_base[i]); free(abc->cons_base);
+        }
+        if (abc->cons_cov != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->cons_cov[i]); free(abc->cons_cov);
+        }
+        if (abc->clu_read_ids != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->clu_read_ids[i]); free(abc->clu_read_ids);
+        }
+        if (abc->cons_phred_score != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->cons_phred_score[i]); free(abc->cons_phred_score);
+        }
+    }
+    if (abc->msa_len > 0) {
+        if (abc->msa_base != NULL) {
+            for (i = 0; i < abc->n_seq+abc->n_cons; ++i) free(abc->msa_base[i]); // n_seq + n_cons rows are released here
+            free(abc->msa_base);
+        }
+    }
+    free(abc);
+}
+
+abpoa_t *abpoa_init(void) { // allocate an abpoa_t together with its four sub-objects
+    abpoa_t *ab = (abpoa_t*)_err_malloc(sizeof(abpoa_t));
+    ab->abg = abpoa_init_graph(); // graph
+    ab->abs = abpoa_init_seq(); // defined outside this file
+    ab->abm = abpoa_init_simd_matrix(); // defined outside this file
+    ab->abc = abpoa_init_cons(); // consensus/MSA results
+    return ab;
+}
+
+void abpoa_free(abpoa_t *ab) { // release everything abpoa_init() created, then ab itself
+    abpoa_free_graph(ab->abg);
+    abpoa_free_seq(ab->abs);
+    abpoa_free_simd_matrix(ab->abm);
+    abpoa_free_cons(ab->abc);
+    free(ab);
+}
+
+void abpoa_BFS_set_node_index(abpoa_graph_t *abg, int src_id, int sink_id) { // number nodes from src_id to sink_id; fills index_to_node_id and node_id_to_index
+    int *id, cur_id, out_id, aligned_id;
+    int index = 0, q_size, new_q_size; // q_size: nodes left in the current BFS level; new_q_size: nodes queued for the next one
+
+    int *in_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); // working copy: remaining unvisited in-edges per node
+    int i, j;
+    for (i = 0; i < abg->node_n; ++i) in_degree[i] = abg->node[i].in_edge_n;
+
+    kdq_int_t *q = kdq_init_int();
+
+    // Breadth-First-Search
+    kdq_push_int(q, src_id); q_size = 1; new_q_size = 0; // node[q.id].in_degree equals 0
+    while (q_size > 0) {
+        if ((id = kdq_shift_int(q)) == 0) err_fatal_simple("Error in queue.");
+        cur_id = *id;
+        abg->index_to_node_id[index] = cur_id; // both mappings are written in dequeue order
+        abg->node_id_to_index[cur_id] = index++;
+
+        if (cur_id == sink_id) { // success exit: stop as soon as the sink has been numbered
+            kdq_destroy_int(q); free(in_degree);
+            return;
+        }
+        for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) {
+            out_id = abg->node[cur_id].out_id[i];
+            if (--in_degree[out_id] == 0) { // last unvisited in-edge of out_id was just consumed
+                for (j = 0; j < abg->node[out_id].aligned_node_n; ++j) { // hold out_id back until every node aligned to it is ready too
+                    aligned_id = abg->node[out_id].aligned_node_id[j];
+                    if (in_degree[aligned_id] != 0) goto next_out_node;
+                }
+                kdq_push_int(q, out_id);
+                ++new_q_size;
+                for (j = 0; j < abg->node[out_id].aligned_node_n; ++j) { // enqueue the aligned nodes right after out_id
+                    aligned_id = abg->node[out_id].aligned_node_id[j];
+                    kdq_push_int(q, aligned_id);
+                    ++new_q_size;
+                }
+            }
+next_out_node:;
+        }
+        if (--q_size == 0) { // current level exhausted: continue with the nodes queued meanwhile
+            q_size = new_q_size;
+            new_q_size = 0;
+        }
+    }
+    err_fatal_simple("Failed to set node index."); // queue drained before sink_id was reached
+}
+
+void abpoa_BFS_set_node_remain(abpoa_graph_t *abg, int src_id, int sink_id) { // walk back from sink_id to src_id filling node_id_to_max_remain
+    int *id, cur_id, i, out_id, in_id;
+
+    int *out_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); // working copy: remaining unvisited out-edges per node
+    for (i = 0; i < abg->node_n; ++i) {
+        out_degree[i] = abg->node[i].out_edge_n;
+        abg->node_id_to_max_remain[i] = 0;
+    }
+
+    kdq_int_t *q = kdq_init_int();
+
+    // Breadth-First-Search
+    kdq_push_int(q, sink_id); // start from the sink; predecessors are enqueued once their out_degree reaches 0
+    abg->node_id_to_max_remain[sink_id] = -1; // XXX not 0
+    while ((id = kdq_shift_int(q)) != 0) {
+        cur_id = *id;
+
+        // all out_id of cur_id have been visited
+        // pick the out-edge with the largest weight (first one wins on ties)
+        if (cur_id != sink_id) {
+            int max_w=-1, max_id=-1;
+            for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) {
+                out_id = abg->node[cur_id].out_id[i];
+                if (abg->node[cur_id].out_weight[i] > max_w) {
+                    max_w = abg->node[cur_id].out_weight[i];
+                    max_id = out_id;
+                }
+            }
+            abg->node_id_to_max_remain[cur_id] = abg->node_id_to_max_remain[max_id] + 1; // one more than the heaviest successor's value
+            // fprintf(stderr, "%d -> %d\n", abg->node_id_to_index[cur_id], abg->node_id_to_max_remain[cur_id]);
+        }
+        if (cur_id == src_id) { // success exit: the source has received its value
+            kdq_destroy_int(q); free(out_degree);
+            return;
+        }
+        for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) {
+            in_id = abg->node[cur_id].in_id[i];
+            if (--out_degree[in_id] == 0) kdq_push_int(q, in_id);
+        }
+    }
+    err_fatal_simple("Failed to set node remain."); // queue drained before src_id was reached
+}
+
+// Index the graph from source to sink: fills index_to_node_id and node_id_to_index.
+// Grows the per-node lookup arrays first when node_n exceeds index_rank_m. With abpt->wb >= 0 it also
+// resets max_pos_left/right and recomputes max_remain; with wb < 0 and zdrop > 0 only max_remain.
+void abpoa_topological_sort(abpoa_graph_t *abg, abpoa_para_t *abpt) {
+    if (abg->node_n <= 0) {
+        err_func_format_printf(__func__, "Empty graph.\n");
+        return;
+    }
+    int node_n = abg->node_n;
+    if (node_n > abg->index_rank_m) { // lookup arrays too small for the current node count
+        abg->index_rank_m = node_n; kroundup32(abg->index_rank_m);
+        // fprintf(stderr, "node_n: %d, index_rank_m: %d\n", node_n, abg->index_rank_m);
+        abg->index_to_node_id = (int*)_err_realloc(abg->index_to_node_id, abg->index_rank_m * sizeof(int));
+        abg->node_id_to_index = (int*)_err_realloc(abg->node_id_to_index, abg->index_rank_m * sizeof(int));
+        if (abpt->out_msa || abpt->max_n_cons > 1) 
+            abg->node_id_to_msa_rank = (int*)_err_realloc(abg->node_id_to_msa_rank, abg->index_rank_m * sizeof(int)); // only sized here, not filled
+        if (abpt->wb >= 0) {
+            abg->node_id_to_max_pos_left = (int*)_err_realloc(abg->node_id_to_max_pos_left, abg->index_rank_m * sizeof(int));
+            abg->node_id_to_max_pos_right = (int*)_err_realloc(abg->node_id_to_max_pos_right, abg->index_rank_m * sizeof(int));
+            abg->node_id_to_max_remain = (int*)_err_realloc(abg->node_id_to_max_remain, abg->index_rank_m * sizeof(int));
+        } else if (abpt->zdrop > 0) {
+            abg->node_id_to_max_remain = (int*)_err_realloc(abg->node_id_to_max_remain, abg->index_rank_m * sizeof(int));
+        }
+    }
+    // start from ABPOA_SRC_NODE_ID to ABPOA_SINK_NODE_ID
+    abpoa_BFS_set_node_index(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID);
+    // init min/max rank
+    if (abpt->wb >= 0) {
+        int i;
+        for (i = 0; i < node_n; ++i) { // right starts at 0 and left at node_n
+            abg->node_id_to_max_pos_right[i] = 0;
+            abg->node_id_to_max_pos_left[i] = node_n;
+        }
+        abpoa_BFS_set_node_remain(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID);
+    } else if (abpt->zdrop > 0)
+        abpoa_BFS_set_node_remain(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID);
+    abg->is_topological_sorted = 1;
+}
+
+void abpoa_DFS_set_msa_rank(abpoa_graph_t *abg, int src_id, int sink_id, int *in_degree) { // fill node_id_to_msa_rank; in_degree is consumed (decremented) by the walk
+    // fprintf(stderr, "node_n: %d, m: %d\n", abg->node_n, abg->index_rank_m);
+    if (abg->node_n > abg->index_rank_m) { // NOTE(review): the array is grown only in this case; if it is still NULL while node_n <= index_rank_m the writes below hit NULL - confirm callers
+        int m = abg->node_n; kroundup32(m);
+        abg->node_id_to_msa_rank = (int*)_err_realloc(abg->node_id_to_msa_rank, m * sizeof(int)); // index_rank_m itself is left unchanged
+    }
+    int *id, cur_id, i, j, out_id, aligned_id;
+    int msa_rank = 0;
+    kdq_int_t *q = kdq_init_int();
+
+    // Depth-First-Search
+    kdq_push_int(q, src_id); // node[q.id].in_degree equals 0
+    abg->node_id_to_msa_rank[src_id] = -1; // -1 marks "queued, rank not assigned yet"
+    // printf("tot_node_n: %d, node_m: %d\n", abg->node_n, abg->node_m);
+
+    while((id = kdq_pop_int(q)) != 0) { // kdq_pop_int (not shift) is what the "Depth-First" label above refers to
+        cur_id = *id;
+        if (abg->node_id_to_msa_rank[cur_id] < 0) { // first node of its aligned group to be popped: the whole group shares one rank
+            abg->node_id_to_msa_rank[cur_id] = msa_rank;
+            for (i = 0; i < abg->node[cur_id].aligned_node_n; ++i) {
+                aligned_id = abg->node[cur_id].aligned_node_id[i];
+                abg->node_id_to_msa_rank[aligned_id] = msa_rank;
+            }
+            msa_rank++;
+        }
+
+        if (cur_id == sink_id) { // success exit: mark the ranks as valid
+            kdq_destroy_int(q);
+            abg->is_set_msa_rank = 1;
+            return;
+        }
+        for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) {
+            out_id = abg->node[cur_id].out_id[i];
+            if (--in_degree[out_id] == 0) {
+                for (j = 0; j < abg->node[out_id].aligned_node_n; ++j) { // hold out_id back until every node aligned to it is ready too
+                    aligned_id = abg->node[out_id].aligned_node_id[j];
+                    if (in_degree[aligned_id] != 0) goto next_out_node;
+                }
+                kdq_push_int(q, out_id);
+                abg->node_id_to_msa_rank[out_id] = -1;
+                for (j = 0; j < abg->node[out_id].aligned_node_n; ++j) {
+                    aligned_id = abg->node[out_id].aligned_node_id[j];
+                    kdq_push_int(q, aligned_id);
+                    // printf("aln_id: %d\n", aligned_id);
+                    abg->node_id_to_msa_rank[aligned_id] = -1;
+                }
+            }
+next_out_node:;
+        }
+    }
+    err_fatal_simple("Error in set_msa_rank.\n"); // queue drained before sink_id was reached
+}
+
+void abpoa_set_msa_rank(abpoa_graph_t *abg, int src_id, int sink_id) { // compute MSA ranks unless is_set_msa_rank says they are already valid
+    if (abg->is_set_msa_rank == 0) {
+        int i, *in_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); // scratch copy: the DFS decrements it
+        for (i = 0; i < abg->node_n; ++i) in_degree[i] = abg->node[i].in_edge_n;
+        abpoa_DFS_set_msa_rank(abg, src_id, sink_id, in_degree);
+        free(in_degree);
+    }
+}
+
+int abpoa_get_aligned_id(abpoa_graph_t *abg, int node_id, uint8_t base) { // id of the node aligned to node_id that carries `base`, or -1
+    const abpoa_node_t *anchor = abg->node + node_id;
+    int k;
+    for (k = 0; k < anchor->aligned_node_n; ++k) {
+        int cand = anchor->aligned_node_id[k];
+        if (abg->node[cand].base == base) return cand; // first match in list order wins
+    }
+    // none of the aligned nodes has this base
+    return -1;
+}
+
+void abpoa_add_graph_aligned_node1(abpoa_node_t *node, int aligned_id) { // append aligned_id to node's aligned-node list (one direction only)
+    _uni_realloc(node->aligned_node_id, node->aligned_node_n, node->aligned_node_m, int); // macro defined elsewhere; presumably grows the array when it is full
+    node->aligned_node_id[node->aligned_node_n++] = aligned_id;
+}
+
+void abpoa_add_graph_aligned_node(abpoa_graph_t *abg, int node_id, int aligned_id) { // add aligned_id to node_id's aligned group, linking both directions
+    int i; abpoa_node_t *node = abg->node;
+    for (i = 0; i < node[node_id].aligned_node_n; ++i) { // link aligned_id with every node already aligned to node_id
+        abpoa_add_graph_aligned_node1(node + node[node_id].aligned_node_id[i], aligned_id);
+        abpoa_add_graph_aligned_node1(node + aligned_id, node[node_id].aligned_node_id[i]);
+    }
+    abpoa_add_graph_aligned_node1(abg->node + node_id, aligned_id); // finally link node_id and aligned_id themselves
+    abpoa_add_graph_aligned_node1(abg->node + aligned_id, node_id);
+}
+
+void abpoa_set_read_id(uint64_t *read_ids, int read_id) { // set the bit for read_id in a bitset stored as 64-bit words
+    int word = read_id / 64; // which 64-bit word holds the bit
+    int bit = read_id & 0x3f; // position of the bit inside that word
+    read_ids[word] |= ((uint64_t)1 << bit);
+}
+
+int abpoa_add_graph_node(abpoa_graph_t *abg, uint8_t base) { // append a node carrying `base`; returns its id
+    int node_id = abg->node_n; // the new node takes the next free slot
+    abpoa_realloc_graph_node(abg); // grows abg->node when node_n has reached node_m
+    // add node
+    abg->node[node_id].base = base;
+    ++abg->node_n;
+    return node_id;
+}
+
+int abpoa_add_graph_edge(abpoa_graph_t *abg, int from_id, int to_id, int check_edge, int w, uint8_t add_read_id, uint8_t add_read_weight, int read_id, int read_ids_n, int tot_read_n) { // add weight w to edge from_id->to_id, creating the edge if needed
+    int ret = 1; // never modified below: the function always returns 1
+    if (from_id < 0 || from_id >= abg->node_n || to_id < 0 || to_id >= abg->node_n) err_fatal(__func__, "node_n: %d\tfrom_id: %d\tto_id: %d.", abg->node_n, from_id, to_id);
+    // fprintf(stderr, "weigth: %d\n", w);
+    int out_edge_n = abg->node[from_id].out_edge_n;
+    int edge_exist = 0;
+    int out_edge_i = -1; // slot of the edge inside from_id's out-edge arrays
+    if (check_edge) { // with check_edge == 0 the lookup is skipped and a new edge is always appended
+        int i;
+        for (i = 0; i < out_edge_n; ++i) {
+            if (abg->node[from_id].out_id[i] == to_id) { // edge exists
+                abg->node[from_id].out_weight[i] += w; // update weight on existing edge
+                // update label id
+                edge_exist = 1;
+                out_edge_i = i;
+                break;
+            }
+        }
+    }
+
+    // add edge
+    if (edge_exist == 0) {
+        /// in edge
+        abpoa_realloc_graph_edge(abg, 0, to_id, 0);
+        abg->node[to_id].in_id[abg->node[to_id].in_edge_n] = from_id;
+        ++abg->node[to_id].in_edge_n;
+        /// out edge
+        abpoa_realloc_graph_edge(abg, 1, from_id, add_read_id);
+        abg->node[from_id].out_id[out_edge_n] = to_id;
+        abg->node[from_id].out_weight[out_edge_n] = w; // initial weight for new edge
+        out_edge_i = out_edge_n;
+        ++abg->node[from_id].out_edge_n;
+    }
+    
+    // add read_id to out edge
+    if (add_read_id) {
+        if (out_edge_i < 0) err_fatal_simple("No edge found.");
+        if (read_ids_n <= 0) err_fatal(__func__, "Unexpected read_ids_n: %d.", read_ids_n);
+        int i, j;
+        abpoa_node_t *from_node = abg->node + from_id;
+        if (from_node->read_ids_n == 0) { // first read id on this node: allocate a zeroed bitset for every out-edge slot
+            for (i = 0; i < from_node->out_edge_m; ++i) {
+                from_node->read_ids[i] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t));
+            }
+            from_node->read_ids_n = read_ids_n;
+        } else if (from_node->read_ids_n < read_ids_n) {
+            // reallocate from_node->read_ids
+            for (i = 0; i < from_node->out_edge_m; ++i) {
+                from_node->read_ids[i] = (uint64_t*)_err_realloc(from_node->read_ids[i], read_ids_n * sizeof(uint64_t));
+                for (j = from_node->read_ids_n; j < read_ids_n; ++j) from_node->read_ids[i][j] = 0; // zero only the newly added words
+            }
+            from_node->read_ids_n = read_ids_n;
+        }
+        abpoa_set_read_id(from_node->read_ids[out_edge_i], read_id);
+    }
+    abg->node[from_id].n_read += 1; // incremented on every call, for new and existing edges alike
+    if (add_read_weight) {
+        if (tot_read_n > abg->node[from_id].m_read) { // grow the per-read weight array to tot_read_n entries, zero-filling the new ones
+            abg->node[from_id].read_weight = (int*)_err_realloc(abg->node[from_id].read_weight, tot_read_n * sizeof(int));
+            int i;
+            for (i = abg->node[from_id].m_read; i < tot_read_n; ++i) abg->node[from_id].read_weight[i] = 0;
+            abg->node[from_id].m_read = tot_read_n;
+        }
+        abg->node[from_id].read_weight[read_id] = w; // assumes read_id < m_read - not checked here
+    }
+    return ret;
+}
+
+void abpoa_add_graph_sequence(abpoa_graph_t *abg, uint8_t *seq, int *weight, int seq_l, int *qpos_to_node_id, int start, int end, uint8_t add_read_id, uint8_t add_read_weight, int read_id, int read_ids_n, int tot_read_n) { // add seq[start..end) as a new chain of nodes from source to sink
+    if (start >= seq_l || end <= start) err_fatal(__func__, "seq_l: %d\tstart: %d\tend: %d.", seq_l, start, end);
+    if (end > seq_l) end = seq_l; // clamp the exclusive end to the sequence length
+
+    int i, last_node_id, cur_node_id;
+    last_node_id = ABPOA_SRC_NODE_ID;
+    for (i = start; i < end; ++i) { // one new node per base, linked to the previous one with weight[i]
+        cur_node_id = abpoa_add_graph_node(abg, seq[i]);
+        if (qpos_to_node_id) qpos_to_node_id[i] = cur_node_id; // optional output: query position -> node id
+        abpoa_add_graph_edge(abg, last_node_id, cur_node_id, 0, weight[i], add_read_id, add_read_weight, read_id, read_ids_n, tot_read_n);
+        last_node_id = cur_node_id;
+    }
+
+    abpoa_add_graph_edge(abg, last_node_id, ABPOA_SINK_NODE_ID, 0, weight[seq_l-1], add_read_id, add_read_weight, read_id, read_ids_n, tot_read_n); // NOTE(review): uses weight[seq_l-1] even when end < seq_l - confirm intended
+    abg->is_called_cons = abg->is_set_msa_rank = abg->is_topological_sorted = 0; // graph changed: all cached results are stale
+    // abpoa_topological_sort(abg, abpt);
+}
+
+int is_full_upstream_subgraph(abpoa_graph_t *abg, int up_index, int down_index) { // 1 if no node at index (up_index, down_index] has a predecessor before up_index, else 0
+    int i, j, id, in_id;
+    for (i = up_index+1; i <= down_index; ++i) {
+        id = abg->index_to_node_id[i];
+        for (j = 0; j < abg->node[id].in_edge_n; ++j) {
+            in_id = abg->node[id].in_id[j];
+            if (abg->node_id_to_index[in_id] < up_index) return 0; // an in-edge enters from outside the window
+        }
+    }
+    return 1;
+}
+
+int abpoa_upstream_index(abpoa_graph_t *abg, int beg_index, int end_index) { // extend [beg_index, end_index] towards lower indices; returns the accepted lower index
+    int min_index, in_index, i, j, node_id, in_id;
+
+    while (1) {
+        min_index = beg_index;
+        for (i = beg_index; i <= end_index; ++i) { // smallest index among beg_index and all predecessors of the window's nodes
+            node_id = abg->index_to_node_id[i];
+            for (j = 0; j < abg->node[node_id].in_edge_n; ++j) {
+                in_id = abg->node[node_id].in_id[j];
+                in_index = abg->node_id_to_index[in_id];
+                min_index = MIN_OF_TWO(min_index, in_index);
+            }
+        }
+        if (is_full_upstream_subgraph(abg, min_index, beg_index)) { // nothing in (min_index, beg_index] is entered from before min_index
+            return min_index;
+        } else { // otherwise repeat with the window [min_index, old beg_index]
+            end_index = beg_index;
+            beg_index = min_index; 
+        }
+    }
+}
+
+int is_full_downstream_subgraph(abpoa_graph_t *abg, int up_index, int down_index) { // 1 if no node at index [up_index, down_index) has a successor after down_index, else 0
+    int i, j, id, out_id;
+    for (i = up_index; i < down_index; ++i) {
+        id = abg->index_to_node_id[i];
+        for (j = 0; j < abg->node[id].out_edge_n; ++j) {
+            out_id = abg->node[id].out_id[j];
+            if (abg->node_id_to_index[out_id] > down_index) return 0; // an out-edge leaves the window
+        }
+    }
+    return 1;
+}
+
+int abpoa_downstream_index(abpoa_graph_t *abg, int beg_index, int end_index) { // extend [beg_index, end_index] towards higher indices; returns the accepted upper index
+    int max_index, out_index, i, j, node_id, out_id;
+
+    while (1) {
+        max_index = end_index;
+        for (i = beg_index; i <= end_index; ++i) { // largest index among end_index and all successors of the window's nodes
+            node_id = abg->index_to_node_id[i];
+            for (j = 0; j < abg->node[node_id].out_edge_n; ++j) {
+                out_id = abg->node[node_id].out_id[j];
+                out_index = abg->node_id_to_index[out_id];
+                max_index = MAX_OF_TWO(max_index, out_index);
+            }
+        }
+        if (is_full_upstream_subgraph(abg, end_index, max_index)) { // NOTE(review): calls the *upstream* check although is_full_downstream_subgraph exists - confirm intended
+            return max_index;
+        } else { // otherwise repeat with the window [old end_index, max_index]
+            beg_index = end_index;
+            end_index = max_index;
+        }
+    }
+}
+
+//   exc_beg | inc_beg ... inc_end | exc_end      (writes the node ids bounding the subgraph around exc_beg0..exc_end0 to *exc_beg / *exc_end)
+void abpoa_subgraph_nodes(abpoa_t *ab, abpoa_para_t *abpt, int exc_beg0, int exc_end0, int *exc_beg, int *exc_end) {
+    abpoa_graph_t *abg = ab->abg;
+    if (ab->abg->is_topological_sorted == 0) abpoa_topological_sort(abg, abpt); // the index lookups below need an up-to-date ordering
+    int inc_beg_index = abg->node_id_to_index[exc_beg0], inc_end_index = abg->node_id_to_index[exc_end0];
+
+    int exc_beg_index = abpoa_upstream_index(abg, inc_beg_index, inc_end_index);
+    int exc_end_index = abpoa_downstream_index(abg, inc_beg_index, inc_end_index);
+
+    if (exc_beg_index < 0 || exc_end_index >= abg->node_n)
+        err_fatal_simple("Error in subgraph_nodes");
+    *exc_beg = abg->index_to_node_id[exc_beg_index]; // convert the indices back to node ids
+    *exc_end = abg->index_to_node_id[exc_end_index];
+}
+
+// fusion strategy:
+// 1. Match: merge to one node
+// 2. Mismatch: check if B is identical to A' aligned nodes, then merge to node; if not, add node
+// 3. Insertion: add node
+// 4. Deletion: nothing
+// 5. Clipping: add node
+// 6. For all first/last node, link to virtual start/end node
+
+// inc_both_ends: set as 1 to add weight for edge between beg_node_id/end_node_id and internal node
+int abpoa_add_subgraph_alignment(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *seq, int *_weight, int seq_l, int *qpos_to_node_id, abpoa_res_t res, int read_id, int tot_read_n, int inc_both_ends) { // thread seq into the graph along res.graph_cigar, between beg_node_id and end_node_id; returns 0
+    abpoa_graph_t *abg = ab->abg;
+    int n_cigar = res.n_cigar; abpoa_cigar_t *abpoa_cigar = res.graph_cigar;
+    int read_ids_n = 1 + ((tot_read_n-1) >> 6); // number of 64-bit words needed for tot_read_n read flags
+    uint8_t add_read_id = abpt->use_read_ids, add_read_weight = abpt->use_qv & (abpt->max_n_cons>1), add;
+    int i, *weight;
+    if (_weight == NULL) { // no weights supplied: use a temporary all-ones array, freed on every return path below
+        weight = (int*)_err_malloc(seq_l * sizeof(int));
+        for (i = 0; i < seq_l; ++i) weight[i] = 1;
+    } else weight = _weight;
+
+    if (abg->node_n == 2) { // empty graph
+        abpoa_add_graph_sequence(abg, seq, weight, seq_l, qpos_to_node_id, 0, seq_l, add_read_id, add_read_weight, read_id, read_ids_n, tot_read_n);
+        if (_weight == NULL) free(weight);
+        return 0;
+    } else {
+        if (abg->node_n < 2) {
+            err_fatal(__func__, "Graph node: %d.", abg->node_n);
+        } else if (n_cigar == 0) { // nothing to merge: leave the graph untouched
+            if (_weight == NULL) free(weight);
+            return 0;
+            //err_fatal(__func__, "Empty graph cigar.");
+        }
+    }
+    // normal graph, normal graph_cigar
+    int j; int op, len;
+    int node_id, query_id=-1, last_new = 0, last_id = beg_node_id, new_id, aligned_id;
+    // last_id: node the next edge starts from; last_new: 1 when last_id was just created (so no existing edge needs to be looked up)
+    // query_id: last consumed query position; add: 0 only for edges leaving beg_node_id while inc_both_ends == 0
+    for (i = 0; i < n_cigar; ++i) {
+        op = abpoa_cigar[i] & 0xf;
+        if (op == ABPOA_CMATCH) {
+            node_id = (abpoa_cigar[i] >> 34) & 0x3fffffff;
+            query_id++; // = (abpoa_cigar[i] >> 4) & 0x3fffffff;
+            if (abg->node[node_id].base != seq[query_id]) { // mismatch
+                // check if query base is identical to node_id's aligned node
+                if ((aligned_id = abpoa_get_aligned_id(abg, node_id, seq[query_id])) != -1) {
+                    if (last_id != beg_node_id || inc_both_ends) add = 1; else add = 0;
+                    abpoa_add_graph_edge(abg, last_id, aligned_id, 1-last_new, weight[query_id], add_read_id&add, add_read_weight, read_id, read_ids_n, tot_read_n);
+                    last_id = aligned_id; last_new = 0;
+                } else {
+                    new_id = abpoa_add_graph_node(abg, seq[query_id]);
+                    if (last_id != beg_node_id || inc_both_ends) add = 1; else add = 0;
+                    abpoa_add_graph_edge(abg, last_id, new_id, 0, weight[query_id], add_read_id&add, add_read_weight, read_id, read_ids_n, tot_read_n);
+                    last_id = new_id; last_new = 1;
+                    // add new_id to node_id's aligned node
+                    abpoa_add_graph_aligned_node(abg, node_id, new_id);
+                }
+            } else { // match
+                if (last_id != beg_node_id || inc_both_ends) add = 1; else add = 0;
+                abpoa_add_graph_edge(abg, last_id, node_id, 1-last_new, weight[query_id], add_read_id&add, add_read_weight, read_id, read_ids_n, tot_read_n);
+                last_id = node_id; last_new = 0;
+            }
+            if (qpos_to_node_id) qpos_to_node_id[query_id] = last_id;
+        } else if (op == ABPOA_CINS || op == ABPOA_CSOFT_CLIP || op == ABPOA_CHARD_CLIP) {
+            len = (abpoa_cigar[i] >> 4) & 0x3fffffff;
+            query_id+=len; // = (abpoa_cigar[i] >> 34) & 0x3fffffff;
+            for (j = len-1; j >= 0; --j) { // XXX use dynamic id, instead of static query_id
+                new_id = abpoa_add_graph_node(abg, seq[query_id-j]);
+                if (last_id != beg_node_id || inc_both_ends) add = 1; else add = 0;
+                abpoa_add_graph_edge(abg, last_id, new_id, 0, weight[query_id-j], add_read_id&add, add_read_weight, read_id, read_ids_n, tot_read_n);
+                last_id = new_id; last_new = 1;
+                if (qpos_to_node_id) qpos_to_node_id[query_id-j] = last_id;
+            }
+        } else if (op == ABPOA_CDEL) {
+            // nothing;
+            continue;
+        }
+    }
+    // if (inc_both_ends) add = 1; else add = 0; XXX end_node_id is always excluded when adding weight
+    // abpoa_add_graph_edge(abg, last_id, end_node_id, 1-last_new, w, add_read_id&add, read_id, read_ids_n);
+    abpoa_add_graph_edge(abg, last_id, end_node_id, 1-last_new, weight[seq_l-1], add_read_id, add_read_weight, read_id, read_ids_n, tot_read_n);
+    abg->is_called_cons = abg->is_set_msa_rank = abg->is_topological_sorted = 0; // graph changed: clear every cached-result flag, MSA ranks included (as abpoa_add_graph_sequence does)
+    // abpoa_topological_sort(abg, abpt);
+    if (_weight == NULL) free(weight);
+    return 0;
+}
+
+int abpoa_add_graph_alignment(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *seq, int *weight, int seq_l, int *qpos_to_node_id, abpoa_res_t res, int read_id, int tot_read_n, int inc_both_ends) { // whole-graph case: same as the subgraph version with source and sink as the end nodes
+    return abpoa_add_subgraph_alignment(ab, abpt, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, seq, weight, seq_l, qpos_to_node_id, res, read_id, tot_read_n, inc_both_ends);
+}
+
+// Reset ab for a new run while keeping allocated memory: graph back to 2 nodes, sequences and consensus results cleared.
+// * node: edge/aligned/read counters zeroed, array grown to hold at least qlen+2 nodes
+// * index_to_node_id/node_id_to_index/node_id_to_max_remain, max_pos_left/right: grown together with the node array
+void abpoa_reset(abpoa_t *ab, abpoa_para_t *abpt, int qlen) {
+    abpoa_graph_t *abg = ab->abg;
+    int i, j, k, node_m;
+    abg->is_topological_sorted = abg->is_called_cons = 0; // NOTE(review): is_set_msa_rank is not cleared here - confirm whether that is intended
+    for (i = 0; i < abg->node_n; ++i) {
+        for (j = 0; j < abg->node[i].out_edge_n; ++j) { // clear the read-id bitsets of the edges in use; the memory is kept
+            for (k = 0; k < abg->node[i].read_ids_n; ++k) abg->node[i].read_ids[j][k] = 0;
+        }
+        abg->node[i].in_edge_n = abg->node[i].out_edge_n = abg->node[i].aligned_node_n = 0; // counts go to 0, capacities (*_m) are untouched
+        abg->node[i].n_read = 0;
+            
+    }
+    abg->node_n = 2; // same node count as a freshly initialized graph
+    if (qlen+2 > abg->node_m) { // grow the node array and every lookup array to the same capacity
+        node_m = qlen+2; kroundup32(node_m);
+        abg->node = (abpoa_node_t*)_err_realloc(abg->node, node_m * sizeof(abpoa_node_t));
+        for (i = abg->node_m; i < node_m; ++i) 
+            abpoa_set_graph_node(abg, i);
+        abg->node_m = abg->index_rank_m = node_m;
+        abg->index_to_node_id = (int*)_err_realloc(abg->index_to_node_id, node_m * sizeof(int));
+        abg->node_id_to_index = (int*)_err_realloc(abg->node_id_to_index, node_m * sizeof(int));
+        if (abpt->out_msa || abpt->max_n_cons > 1) 
+            abg->node_id_to_msa_rank = (int*)_err_realloc(abg->node_id_to_msa_rank, node_m * sizeof(int));
+        if (abpt->wb >= 0) {
+            abg->node_id_to_max_pos_left = (int*)_err_realloc(abg->node_id_to_max_pos_left, node_m * sizeof(int));
+            abg->node_id_to_max_pos_right = (int*)_err_realloc(abg->node_id_to_max_pos_right, node_m * sizeof(int));
+            abg->node_id_to_max_remain = (int*)_err_realloc(abg->node_id_to_max_remain, node_m * sizeof(int));
+        } else if (abpt->zdrop > 0) {
+            abg->node_id_to_max_remain = (int*)_err_realloc(abg->node_id_to_max_remain, node_m * sizeof(int));
+        }
+    }
+    // fprintf(stderr, "qlen: %d, node_n: %d, node_m: %d\n", qlen, abg->node_n, abg->node_m);
+    // reset abs
+    ab->abs->n_seq = 0;
+    // reset cons: same frees as abpoa_free_cons(), but abc is kept; the freed pointers are not set to NULL, only the counters below are zeroed
+    abpoa_cons_t *abc = ab->abc;
+    if (abc->n_cons > 0) {
+        if (abc->clu_n_seq != NULL) free(abc->clu_n_seq);
+        if (abc->cons_len != NULL) free(abc->cons_len);
+        if (abc->cons_node_ids != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->cons_node_ids[i]); free(abc->cons_node_ids);
+        }
+        if (abc->cons_base != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->cons_base[i]); free(abc->cons_base);
+        }
+        if (abc->cons_cov != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->cons_cov[i]); free(abc->cons_cov);
+        }
+        if (abc->clu_read_ids != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->clu_read_ids[i]); free(abc->clu_read_ids);
+        }
+        if (abc->cons_phred_score != NULL) {
+            for (i = 0; i < abc->n_cons; ++i) free(abc->cons_phred_score[i]); free(abc->cons_phred_score);
+        }
+    }
+    if (abc->msa_len > 0) {
+        if (abc->msa_base != NULL) {
+            for (i = 0; i < abc->n_seq+abc->n_cons; ++i) free(abc->msa_base[i]);
+            free(abc->msa_base);
+        }
+    }
+    abc->n_seq = abc->n_cons = abc->msa_len = 0; // with zero counters the guards above skip the stale pointers next time
+}
diff --git a/src/abpoa_graph.h b/src/abpoa_graph.h
new file mode 100644
index 0000000..c565189
--- /dev/null
+++ b/src/abpoa_graph.h
@@ -0,0 +1,61 @@
+#ifndef ABPOA_GRAPH_H
+#define ABPOA_GRAPH_H
+
+#include <stdint.h>
+#include "abpoa.h"
+#include "utils.h"
+
+//#define CIGAR_STR "MIDNSHP=XB"
+//#define ABPOA_GRAPH_CIGAR_STR "=XIDNSH"
+//#define ABPOA_GRAPH_CEQUAL 0
+//#define ABPOA_GRAPH_CMISMATCH 1
+//#define ABPOA_GRAPH_CINS 2
+//#define ABPOA_GRAPH_CDEL 3
+//#define ABPOA_GRAPH_CREF_SKIP 4
+//#define ABPOA_GRAPH_CCLIP 5
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int abpoa_get_aligned_id(abpoa_graph_t *abg, int node_id, uint8_t base);
+void abpoa_add_graph_aligned_node(abpoa_graph_t *abg, int node_id, int aligned_id);
+void abpoa_set_msa_rank(abpoa_graph_t *abg, int src_id, int sink_id);
+abpoa_graph_t *abpoa_init_graph(void);
+void abpoa_free_graph(abpoa_graph_t *graph);
+
+static inline int abpoa_graph_node_id_to_index(abpoa_graph_t *graph, int node_id) { // range-checked read of node_id_to_index
+    if (!(node_id >= 0 && node_id < graph->node_n)) err_fatal(__func__, "Wrong node id: %d\n", node_id);
+    return graph->node_id_to_index[node_id];
+}
+
+static inline int abpoa_graph_node_id_to_max_pos_right(abpoa_graph_t *graph, int node_id) { // range-checked read of node_id_to_max_pos_right
+    if (!(node_id >= 0 && node_id < graph->node_n)) err_fatal(__func__, "Wrong node id: %d\n", node_id);
+    return graph->node_id_to_max_pos_right[node_id];
+}
+
+static inline int abpoa_graph_node_id_to_max_pos_left(abpoa_graph_t *graph, int node_id) { // range-checked read of node_id_to_max_pos_left
+    if (!(node_id >= 0 && node_id < graph->node_n)) err_fatal(__func__, "Wrong node id: %d\n", node_id);
+    return graph->node_id_to_max_pos_left[node_id];
+}
+
+static inline int abpoa_graph_node_id_to_max_remain(abpoa_graph_t *graph, int node_id) { // range-checked read of node_id_to_max_remain
+    if (!(node_id >= 0 && node_id < graph->node_n)) err_fatal(__func__, "Wrong node id: %d\n", node_id);
+    return graph->node_id_to_max_remain[node_id];
+}
+
+static inline int abpoa_graph_index_to_node_id(abpoa_graph_t *graph, int index_i) { // range-checked read of index_to_node_id
+    if (!(index_i >= 0 && index_i < graph->node_n)) err_fatal(__func__, "Wrong index: %d\n", index_i);
+    return graph->index_to_node_id[index_i];
+}
+
+static inline int abpoa_graph_node_id_to_msa_rank(abpoa_graph_t *graph, int node_id) { // range-checked read of node_id_to_msa_rank
+    if (!(node_id >= 0 && node_id < graph->node_n)) err_fatal(__func__, "Wrong node id: %d\n", node_id);
+    return graph->node_id_to_msa_rank[node_id];
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/abpoa_output.c b/src/abpoa_output.c
new file mode 100644
index 0000000..0941d02
--- /dev/null
+++ b/src/abpoa_output.c
@@ -0,0 +1,921 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "abpoa.h"
+#include "abpoa_graph.h"
+#include "utils.h"
+#include "abpoa_seq.h"
+#include "kdq.h"
+
+extern char ab_char256_table[256];
+char ab_LogTable65536[65536];
+char ab_bit_table16[65536];
+
+#define NAT_E 2.718281828459045
+static const char ab_LogTable256[256] = {
+#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
+    -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+    LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
+    LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
+};
+
+static inline int ilog2_32(uint32_t v) // floor(log2(v)) via byte-wise lookups in ab_LogTable256; yields -1 for v == 0
+{
+    const uint32_t hi16 = v >> 16, hi8 = hi16 ? hi16 >> 8 : v >> 8; // hi8: upper byte of the 16-bit half being examined
+    if (hi16 != 0) return hi8 != 0 ? 24 + ab_LogTable256[hi8] : 16 + ab_LogTable256[hi16];
+    return hi8 != 0 ? 8 + ab_LogTable256[hi8] : ab_LogTable256[v];
+}
+
+void set_65536_table(void) { // fills ab_LogTable65536[v] with ilog2_32(v) for every 16-bit value v
+    int v = 0;
+    while (v < 65536) {
+        ab_LogTable65536[v] = ilog2_32(v); ++v;
+    }
+}
+
+void set_bit_table16(void) { // fills ab_bit_table16[v] with the number of set bits in the 16-bit value v
+    int v; ab_bit_table16[0] = 0;
+    for (v = 1; v < 65536; ++v) ab_bit_table16[v] = ab_bit_table16[v >> 1] + (v & 1); // popcount(v) = popcount(v/2) + lowest bit
+}
+
+#define get_bit_cnt4(table, b) (table[(b)&0xffff] + table[(b)>>16&0xffff] + table[(b)>>32&0xffff] + table[(b)>>48&0xffff])
+
+static inline int ilog2_64(uint64_t v) { // floor(log2(v)) from 16-bit chunks; reads ab_LogTable65536, which set_65536_table() fills
+    const uint64_t hi32 = v >> 32, hi16 = hi32 ? hi32 >> 16 : v >> 16; // hi16: upper 16 bits of the 32-bit half being examined
+    if (hi32 != 0) return hi16 != 0 ? 48 + ab_LogTable65536[hi16] : 32 + ab_LogTable65536[hi32];
+    return hi16 != 0 ? 16 + ab_LogTable65536[hi16] : ab_LogTable65536[v];
+}
+
+KDQ_INIT(int)
+#define kdq_int_t kdq_t(int)
+
+static inline int get_read_cnt(uint64_t *read_ids, int read_ids_n) { // set bits summed over read_ids[0..read_ids_n)
+    int total = 0, word_i = 0;
+    while (word_i < read_ids_n) {
+        total += get_bit_cnt4(ab_bit_table16, read_ids[word_i]); ++word_i;
+    }
+    return total;
+}
+
+abpoa_cons_t *abpoa_allocate_rc_msa(abpoa_cons_t *abc, int msa_len, int n_seq, int n_cons) {
+    const int n_rows = n_seq + n_cons; int row; // one row per input sequence plus one per consensus
+    abc->msa_len = msa_len; abc->n_seq = n_seq;
+    uint8_t **rows = (uint8_t**)_err_malloc(n_rows * sizeof(uint8_t*));
+    for (row = 0; row < n_rows; ++row) {
+        rows[row] = (uint8_t*)_err_malloc(msa_len * sizeof(uint8_t)); // cell contents are not initialised here
+    }
+    abc->msa_base = rows; return abc;
+}
+
+void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {
+    if (out_fp == NULL) return;
+    abpoa_seq_t *abs = ab->abs; abpoa_cons_t *abc = ab->abc;
+    int seq_i, col, k;
+    if (abc->msa_len <= 0) return; // no MSA columns to print
+    for (seq_i = 0; seq_i < abs->n_seq; ++seq_i) {
+        // header: the sequence name (tagged when reverse-complemented), else a 1-based placeholder
+        if (abs->name[seq_i].l <= 0) {
+            fprintf(out_fp, ">Seq_%d\n", seq_i+1);
+        } else if (abs->is_rc[seq_i]) {
+            fprintf(out_fp, ">%s_reverse_complement\n", abs->name[seq_i].s);
+        } else fprintf(out_fp, ">%s\n", abs->name[seq_i].s);
+        for (col = 0; col < abc->msa_len; ++col) fprintf(out_fp, "%c", ab_char256_table[abc->msa_base[seq_i][col]]);
+        fprintf(out_fp, "\n");
+    }
+    if (!abpt->out_cons) return; // consensus rows are printed only on request
+    int cons_i;
+    for (cons_i = 0; cons_i < abc->n_cons; cons_i++) {
+        const uint8_t *cons_row = abc->msa_base[abc->n_seq+cons_i]; // consensus rows follow the n_seq read rows
+        fprintf(out_fp, ">Consensus_sequence");
+        if (abc->n_cons > 1) { // several consensuses: append the 1-based index and the cluster's read ids
+            fprintf(out_fp, "_%d ", cons_i+1);
+            for (k = 0; k < abc->clu_n_seq[cons_i]; ++k) {
+                if (k > 0) fprintf(out_fp, ",");
+                fprintf(out_fp, "%d", abc->clu_read_ids[cons_i][k]);
+            }
+        }
+        fprintf(out_fp, "\n");
+        for (col = 0; col < abc->msa_len; ++col) fprintf(out_fp, "%c", ab_char256_table[cons_row[col]]);
+        fprintf(out_fp, "\n");
+    }
+}
+
+void abpoa_set_msa_seq(abpoa_node_t node, int rank, uint8_t **msa_base) {
+    // Write node.base into column rank-1 of every read row whose bit is set on any out-edge of the node.
+    const uint8_t base = node.base;
+    const int col = rank - 1;
+    int word_i, edge_i;
+
+    for (word_i = 0; word_i < node.read_ids_n; ++word_i) {
+        const int first_read = word_i * 64; // read id represented by bit 0 of this word
+        for (edge_i = 0; edge_i < node.out_edge_n; ++edge_i) {
+            uint64_t pending = node.read_ids[edge_i][word_i];
+            while (pending != 0) {
+                const uint64_t lowest = pending & -pending; // isolate the lowest set bit
+                msa_base[first_read + ilog2_64(lowest)][col] = base;
+                pending ^= lowest;
+            }
+        }
+    }
+}
+
+// only generates the RC-MSA; output is done by a separate function
+void abpoa_generate_rc_msa(abpoa_t *ab, abpoa_para_t *abpt) { // fills ab->abc->msa_base; nothing is printed here
+    abpoa_graph_t *abg = ab->abg;
+    if (abg->node_n <= 2) return; // the node loop below starts at id 2, so there is nothing to place
+    abpoa_set_msa_rank(abg, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID);
+    if (abpt->out_cons) abpoa_generate_consensus(ab, abpt);
+
+    abpoa_seq_t *abs = ab->abs; abpoa_cons_t *abc = ab->abc;
+    int i, j, aligned_id, n_seq = abs->n_seq;
+    int msa_len = abg->node_id_to_msa_rank[ABPOA_SINK_NODE_ID]-1; // column count: the sink's rank minus one
+
+    abpoa_allocate_rc_msa(abc, msa_len, n_seq, abc->n_cons); // rows: n_seq reads, then abc->n_cons consensus rows
+    for (i = 0; i < n_seq; ++i) {
+        for (j = 0; j < abc->msa_len; ++j) 
+            abc->msa_base[i][j] = abpt->m; // initial fill for every read cell (presumably the gap code -- confirm)
+    }
+
+    int rank;
+    // if (out_fp && abpt->out_msa_header == 0) fprintf(out_fp, ">Multiple_sequence_alignment\n");
+    for (i = 2; i < abg->node_n; ++i) {
+        // get msa rank
+        rank = abpoa_graph_node_id_to_msa_rank(abg, i);
+        for (j = 0; j < abg->node[i].aligned_node_n; ++j) { // use the largest rank among the node and its aligned nodes
+            aligned_id = abg->node[i].aligned_node_id[j];
+            rank = MAX_OF_TWO(rank, abpoa_graph_node_id_to_msa_rank(abg, aligned_id));
+        }
+        // assign seq
+        abpoa_set_msa_seq(abg->node[i], rank, abc->msa_base);
+    }
+    if (abpt->out_cons) {
+        int cons_i, cur_id;
+        for (cons_i = 0; cons_i < abc->n_cons; cons_i++) {
+            for (i = 0; i < msa_len; ++i) abc->msa_base[n_seq+cons_i][i] = abpt->m; // same initial fill for the consensus row
+            for (i = 0; i < abc->cons_len[cons_i]; ++i) {
+                cur_id = abc->cons_node_ids[cons_i][i];
+                rank = abpoa_graph_node_id_to_msa_rank(abg, cur_id);
+                for (j = 0; j < abg->node[cur_id].aligned_node_n; ++j) { // same column rule as for the read rows
+                    aligned_id = abg->node[cur_id].aligned_node_id[j];
+                    rank = MAX_OF_TWO(rank, abpoa_graph_node_id_to_msa_rank(abg, aligned_id));
+                }
+                abc->msa_base[n_seq+cons_i][rank-1] = abc->cons_base[cons_i][i];
+            }
+        }
+    }
+}
+
+// generate & output gfa
+void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {
+    if (out_fp == NULL) return;
+    abpoa_seq_t *abs = ab->abs; abpoa_graph_t *abg = ab->abg;
+    if (abg->node_n <= 2) return; // the node loops below start at id 2, so there is nothing to write
+
+    // traverse graph
+    int *in_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); // in-edges not yet consumed, per node
+    int n_seq = abs->n_seq;
+    int **read_paths = (int**)_err_malloc(n_seq * sizeof(int*)), *read_path_i = (int*)_err_calloc(n_seq, sizeof(int)); // per-read node list and its length
+    int i, j, cur_id, pre_id, out_id, *id;
+    for (i = 0; i < abg->node_n; ++i) in_degree[i] = abg->node[i].in_edge_n;
+    for (i = 0; i < n_seq; ++i) read_paths[i] = (int*)_err_malloc(abg->node_n * sizeof(int));
+
+    // output comment and header
+    int nl = 0;
+    for (i = 2; i < abg->node_n; ++i) nl += abg->node[i].in_edge_n;
+    fprintf(out_fp, "H\tVN:Z:1.0\tNS:i:%d\tNL:i:%d\tNP:i:%d\n", abg->node_n-2, nl - abg->node[ABPOA_SRC_NODE_ID].out_edge_n, n_seq + abpt->out_cons);
+
+    kdq_int_t *q = kdq_init_int();
+
+    // Breadth-First-Search
+    kdq_push_int(q, ABPOA_SRC_NODE_ID);
+    while ((id = kdq_shift_int(q)) != 0) {
+        cur_id = *id;
+        if (cur_id == ABPOA_SINK_NODE_ID) {
+            break; // done; the queue is released after the loop on every exit path
+        } else {
+            if (cur_id != ABPOA_SRC_NODE_ID) {
+                // output node
+                fprintf(out_fp, "S\t%d\t%c\n", cur_id-1, ab_char256_table[abg->node[cur_id].base]);
+                // output all links based pre_ids
+                for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) {
+                    pre_id = abg->node[cur_id].in_id[i];
+                    if (pre_id != ABPOA_SRC_NODE_ID)
+                        fprintf(out_fp, "L\t%d\t+\t%d\t+\t0M\n", pre_id-1, cur_id-1);
+                }
+                // add node id to read path
+                int b, read_id; uint64_t num, tmp;
+                b = 0; // read id represented by bit 0 of the current 64-bit word
+                for (i = 0; i < abg->node[cur_id].read_ids_n; ++i) {
+                    for (j = 0; j < abg->node[cur_id].out_edge_n; ++j) {
+                        num = abg->node[cur_id].read_ids[j][i];
+                        while (num) {
+                            tmp = num & -num; // isolate the lowest set bit
+                            read_id = ilog2_64(tmp);
+                            read_paths[b+read_id][read_path_i[b+read_id]++] = cur_id-1;
+                            num ^= tmp;
+                        }
+                    }
+                    b += 64;
+                }
+            }
+            for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) {
+                out_id = abg->node[cur_id].out_id[i];
+                if (--in_degree[out_id] == 0) { // enqueue a successor once all of its in-edges were consumed
+                    kdq_push_int(q, out_id);
+                }
+            }
+        }
+    }
+    kdq_destroy_int(q); // also reached when the queue drains before the sink is dequeued
+    // output read paths
+    for (i = 0; i < n_seq; ++i) {
+        if (abs->name[i].l > 0) fprintf(out_fp, "P\t%s\t", abs->name[i].s);
+        else fprintf(out_fp, "P\t%d\t", i+1);
+        if (abs->is_rc[i]) { // reverse-complemented read: nodes in reverse order with '-' orientation
+            for (j = read_path_i[i]-1; j >= 0; --j) {
+                fprintf(out_fp, "%d-", read_paths[i][j]);
+                if (j != 0) fprintf(out_fp, ",");
+                else fprintf(out_fp, "\t*\n");
+            }
+        } else {
+            for (j = 0; j < read_path_i[i]; ++j) {
+                fprintf(out_fp, "%d+", read_paths[i][j]);
+                if (j != read_path_i[i]-1) fprintf(out_fp, ",");
+                else fprintf(out_fp, "\t*\n");
+            }
+        }
+    }
+    if (abpt->out_cons) {
+        abpoa_generate_consensus(ab, abpt);
+        abpoa_cons_t *abc = ab->abc;
+        int cons_i;
+        for (cons_i = 0; cons_i < abc->n_cons; ++cons_i) {
+            fprintf(out_fp, "P\tConsensus_sequence");
+            if (abc->n_cons > 1) fprintf(out_fp, "_%d", cons_i+1);
+            fprintf(out_fp, "\t");
+            for (i = 0; i < abc->cons_len[cons_i]; ++i) {
+                cur_id = abc->cons_node_ids[cons_i][i];
+                fprintf(out_fp, "%d+", cur_id-1);
+                if (i != abc->cons_len[cons_i]-1) fprintf(out_fp, ",");
+                else fprintf(out_fp, "\t*\n");
+            }
+
+        }
+    }
+    free(in_degree);
+    for (i = 0; i < n_seq; ++i) free(read_paths[i]);
+    free(read_paths); free(read_path_i);
+}
+
+int abpoa_cons_phred_score(int n_cov, int n_seq) { // returns 33 plus a rounded -10*log10(p), p derived from n_cov/n_seq
+    if (!(n_cov <= n_seq)) err_fatal(__func__, "Error: unexpected n_cov/n_seq (%d/%d).", n_cov, n_seq);
+    const double logit = 13.8 * (1.25 * n_cov / n_seq - 0.25); // linear in the coverage fraction
+    const double sigmoid = 1.0 / (1.0 + pow(NAT_E, -1 * logit));
+    const double p = 1 - sigmoid;
+    return (33 + (int)(-10 * log10(p) + 0.499));
+}
+
+int get_read_ids_clu_count(uint64_t *cur_read_ids, int read_ids_n, uint64_t *clu_read_ids) {
+    int shared = 0, word_i = 0; // shared: reads whose bit is set in both bitsets
+    while (word_i < read_ids_n) {
+        const uint64_t both = cur_read_ids[word_i] & clu_read_ids[word_i];
+        shared += get_bit_cnt4(ab_bit_table16, both); ++word_i;
+    }
+    return shared;
+}
+
+int get_read_ids_clu_weight(uint64_t *cur_read_ids, int read_ids_n, uint64_t *clu_read_ids, uint8_t use_qv, int *read_weight, int m_read) {
+    // Without quality weights the result is simply the number of reads shared by both bitsets.
+    if (!use_qv) return get_read_ids_clu_count(cur_read_ids, read_ids_n, clu_read_ids);
+    int weight = 0, k;
+    // first term: one unit per read whose bit is set in both bitsets
+    for (k = 0; k < read_ids_n; ++k) {
+        const uint64_t both = cur_read_ids[k] & clu_read_ids[k];
+        weight += get_bit_cnt4(ab_bit_table16, both);
+    }
+    // second term: add read_weight[r] for every shared read r that has a positive weight
+    for (k = 0; k < m_read; ++k) {
+        if (read_weight[k] <= 0) continue;
+        const int word_i = k / 64, bit_i = k & 0x3f;
+        const uint64_t mask = (uint64_t)1 << bit_i;
+        if ((cur_read_ids[word_i] & clu_read_ids[word_i] & mask) != 0) weight += read_weight[k];
+    }
+    return weight;
+}
+
+int abpoa_consensus_cov(abpoa_graph_t *abg, int id, uint64_t *clu_read_ids) {
+    // Coverage of node `id` within a cluster: the larger of the cluster read counts entering and leaving it.
+    abpoa_node_t *cur = &abg->node[id];
+    int e, k, n_in = 0, n_out = 0;
+    for (e = 0; e < cur->in_edge_n; ++e) {
+        abpoa_node_t *pre = &abg->node[cur->in_id[e]];
+        // locate the predecessor's out-edge that points at `id`; only the first match is counted
+        for (k = 0; k < pre->out_edge_n; ++k) {
+            if (pre->out_id[k] == id) break;
+        }
+        if (k < pre->out_edge_n)
+            n_in += get_read_ids_clu_count(pre->read_ids[k], pre->read_ids_n, clu_read_ids);
+    }
+    for (e = 0; e < cur->out_edge_n; ++e) {
+        n_out += get_read_ids_clu_count(cur->read_ids[e], cur->read_ids_n, clu_read_ids);
+    }
+    return MAX_OF_TWO(n_in, n_out);
+}
+
+void abpoa_set_hb_cons(abpoa_graph_t *abg, int **max_out_id, int n_cons, uint64_t **clu_read_ids, int src_id, int sink_id, abpoa_cons_t *abc) {
+    // Record, for each of the n_cons clusters, the path stored in max_out_id[c] as consensus c.
+    int c, len, node_id;
+    abc->n_cons = n_cons;
+    for (c = 0; c < n_cons; ++c) {
+        const int *next = max_out_id[c]; // next[v]: chosen successor of node v for cluster c
+        // follow the chosen successors from the source up to the sink; neither endpoint is recorded
+        for (len = 0, node_id = next[src_id]; node_id != sink_id; node_id = next[node_id], ++len) {
+            const int cov = abpoa_consensus_cov(abg, node_id, clu_read_ids[c]);
+            abc->cons_node_ids[c][len] = node_id;
+            abc->cons_base[c][len] = abg->node[node_id].base;
+            abc->cons_cov[c][len] = cov;
+            abc->cons_phred_score[c][len] = abpoa_cons_phred_score(cov, abc->clu_n_seq[c]);
+        }
+        abc->cons_len[c] = len;
+    }
+}
+
+void abpoa_set_hb_cons1(abpoa_graph_t *abg, int *max_out_id, int src_id, int sink_id, abpoa_cons_t *abc) {
+    // Single-consensus variant: record the path stored in max_out_id as consensus 0.
+    int len, node_id;
+    abc->n_cons = 1;
+    for (len = 0, node_id = max_out_id[src_id]; node_id != sink_id; node_id = max_out_id[node_id], ++len) {
+        abpoa_node_t *nd = &abg->node[node_id];
+        const int cov = nd->n_read; // coverage is the node's own read count
+        abc->cons_node_ids[0][len] = node_id;
+        abc->cons_base[0][len] = nd->base;
+        abc->cons_cov[0][len] = cov;
+        abc->cons_phred_score[0][len] = abpoa_cons_phred_score(cov, abc->n_seq);
+    }
+    abc->cons_len[0] = len;
+}
+
+// heaviest_bundling
+// 1. argmax{cur->weight}
+// 2. argmax{out_node->weight}
+void abpoa_heaviest_bundling(abpoa_graph_t *abg, int src_id, int sink_id, int *out_degree, abpoa_cons_t *abc) { // note: out_degree is decremented in place
+    int *id, i, cur_id, in_id, out_id, max_id; int max_w, out_w;
+    int *score = (int*)_err_malloc(abg->node_n * sizeof(int)); // accumulated weight of the chosen path from each node
+    int *max_out_id = (int*)_err_malloc(abg->node_n * sizeof(int)); // chosen successor of each node
+
+    kdq_int_t *q = kdq_init_int();
+    kdq_push_int(q, sink_id);
+    // reverse Breadth-First-Search
+    while ((id = kdq_shift_int(q)) != 0) {
+        cur_id = *id;
+        if (cur_id == sink_id) {
+            max_out_id[cur_id] = -1;
+            score[cur_id] = 0;
+        } else {
+            max_id = -1;
+            if (cur_id == src_id) {
+                int path_score = -1, path_max_w = -1;
+                for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) {
+                    out_id = abg->node[cur_id].out_id[i];
+                    out_w = abg->node[cur_id].out_weight[i];
+                    if (out_w > path_max_w || (out_w == path_max_w && score[out_id] > path_score)) { // heavier edge, or equal weight with a better score
+                        max_id = out_id;
+                        path_score = score[out_id];
+                        path_max_w = out_w;
+                    }
+                }
+                max_out_id[cur_id] = max_id;
+                break; // source handled; the queue is released after the loop
+            } else {
+                max_w = INT32_MIN;
+                for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) {
+                    out_id = abg->node[cur_id].out_id[i];
+                    out_w = abg->node[cur_id].out_weight[i];
+                    if (max_w < out_w) {
+                        max_w = out_w; max_id = out_id;
+                    } else if (max_w == out_w && score[max_id] <= score[out_id]) { // equal weight: a later edge with an equal or higher score wins
+                        max_id = out_id;
+                    }
+                }
+                score[cur_id] = max_w + score[max_id];
+                max_out_id[cur_id] = max_id;
+            }
+        }
+        for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) {
+            in_id = abg->node[cur_id].in_id[i];
+            if (--out_degree[in_id] == 0) kdq_push_int(q, in_id); // enqueue a predecessor once all of its out-edges were processed
+        }
+    }
+    kdq_destroy_int(q); // also reached when the queue drains before the source is dequeued
+    abc->clu_n_seq[0] = abc->n_seq;
+    // set cons read ids
+    for (i = 0; i < abc->n_seq; ++i) abc->clu_read_ids[0][i] = i;
+    abpoa_set_hb_cons1(abg, max_out_id, src_id, sink_id, abc);
+    free(score); free(max_out_id);
+}
+
+void set_clu_read_ids(abpoa_cons_t *abc, uint64_t **read_ids, int cons_i, int n_seq) {
+    // Expand cluster cons_i's read bitset into an ascending list of read ids, then check its length.
+    const uint64_t *bits = read_ids[cons_i]; int *ids = abc->clu_read_ids[cons_i];
+    int read_id, n_found = 0;
+    for (read_id = 0; read_id < n_seq; ++read_id) {
+        const uint64_t mask = (uint64_t)1 << (read_id & 0x3f);
+        if ((bits[read_id / 64] & mask) != 0) ids[n_found++] = read_id;
+    }
+    if (n_found != abc->clu_n_seq[cons_i])
+        err_fatal(__func__, "Error in set cluster read ids. (%d, %d)", n_found, abc->clu_n_seq[cons_i]);
+}
+
+void abpoa_multip_heaviest_bundling(abpoa_graph_t *abg, abpoa_para_t *abpt, int src_id, int sink_id, int *out_degree, int n_clu, int read_ids_n, uint64_t **clu_read_ids, abpoa_cons_t *abc) {
+    int *id, cons_i, i, cur_id, in_id, out_id, max_id; int max_w, out_w;
+
+    int *_out_degree = (int*)_err_malloc(abg->node_n * sizeof(int)); // scratch copy so out_degree itself is left unmodified
+    int *score = (int*)_err_malloc(abg->node_n * sizeof(int)); // accumulated weight of the chosen path from each node
+    int **max_out_id = (int**)_err_malloc(n_clu * sizeof(int*)); // per cluster: chosen successor of each node
+    for (i = 0; i < n_clu; ++i) max_out_id[i] = (int*)_err_malloc(abg->node_n * sizeof(int));
+
+    for (cons_i = 0; cons_i < n_clu; cons_i++) {
+        for (i = 0; i < abg->node_n; ++i) _out_degree[i] = out_degree[i];
+        abc->clu_n_seq[cons_i] = get_read_cnt(clu_read_ids[cons_i], read_ids_n);
+        set_clu_read_ids(abc, clu_read_ids, cons_i, abc->n_seq);
+        kdq_int_t *q = kdq_init_int();
+        kdq_push_int(q, sink_id);
+        // reverse Breadth-First-Search
+        while ((id = kdq_shift_int(q)) != 0) {
+            cur_id = *id;
+            if (cur_id == sink_id) {
+                max_out_id[cons_i][cur_id] = -1;
+                score[cur_id] = 0;
+            } else {
+                max_id = -1;
+                if (cur_id == src_id) {
+                    int path_score = -1, path_max_w = -1;
+                    for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) {
+                        out_id = abg->node[cur_id].out_id[i];
+                        out_w = get_read_ids_clu_weight(abg->node[cur_id].read_ids[i], abg->node[cur_id].read_ids_n, clu_read_ids[cons_i], abpt->use_qv, abg->node[cur_id].read_weight, abg->node[cur_id].m_read);
+                        // out_w = abg->node[cur_id].out_weight[i];
+                        if (out_w > path_max_w || (out_w == path_max_w && score[out_id] > path_score)) {
+                            max_id = out_id;
+                            path_score = score[out_id];
+                            path_max_w = out_w;
+                        }
+                    }
+                    max_out_id[cons_i][cur_id] = max_id;
+                    break; // source handled; this cluster's queue is released right after the loop
+                } else {
+                    max_w = INT32_MIN;
+                    for (i = 0; i < abg->node[cur_id].out_edge_n; ++i) {
+                        out_id = abg->node[cur_id].out_id[i];
+                        out_w = get_read_ids_clu_weight(abg->node[cur_id].read_ids[i], abg->node[cur_id].read_ids_n, clu_read_ids[cons_i], abpt->use_qv, abg->node[cur_id].read_weight, abg->node[cur_id].m_read);
+                        if (max_w < out_w) {
+                            max_w = out_w;
+                            max_id = out_id;
+                        } else if (max_w == out_w && score[max_id] <= score[out_id]) { // equal weight: a later edge with an equal or higher score wins
+                            max_id = out_id;
+                        }
+                    }
+                    score[cur_id] = max_w + score[max_id];
+                    max_out_id[cons_i][cur_id] = max_id;
+                }
+            }
+            for (i = 0; i < abg->node[cur_id].in_edge_n; ++i) {
+                in_id = abg->node[cur_id].in_id[i];
+                if (--_out_degree[in_id] == 0)
+                    kdq_push_int(q, in_id);
+            }
+        }
+        kdq_destroy_int(q); // also reached when the queue drains before the source is dequeued
+    }
+    abpoa_set_hb_cons(abg, max_out_id, n_clu, clu_read_ids, src_id, sink_id, abc);
+
+    free(score); free(_out_degree);
+    for (i = 0; i < n_clu; ++i) free(max_out_id[i]); free(max_out_id);
+}
+
+void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) {
+    // Writes every consensus to out_fp: a FASTA record, or a FASTQ record when abpt->out_fq is set.
+    // With more than one consensus, each header carries a 1-based index followed by the ids of the
+    // reads in that cluster, comma separated.
+    if (out_fp == NULL) return;
+    abpoa_cons_t *abc = ab->abc;
+    int cons_i, k, part;
+
+    for (cons_i = 0; cons_i < abc->n_cons; ++cons_i) {
+        // part 0: header + bases; part 1 (FASTQ only): '+' header + quality characters
+        const int n_parts = abpt->out_fq ? 2 : 1;
+        for (part = 0; part < n_parts; ++part) {
+            // header line
+            if (part == 1) fprintf(out_fp, "+Consensus_sequence");
+            else if (abpt->out_fq) fprintf(out_fp, "@Consensus_sequence");
+            else fprintf(out_fp, ">Consensus_sequence");
+            if (abc->n_cons > 1) {
+                fprintf(out_fp, "_%d ", cons_i+1);
+                for (k = 0; k < abc->clu_n_seq[cons_i]; ++k) {
+                    if (k > 0) fprintf(out_fp, ",");
+                    fprintf(out_fp, "%d", abc->clu_read_ids[cons_i][k]);
+                }
+            }
+            fprintf(out_fp, "\n");
+
+            // payload line: one character per consensus position
+            for (k = 0; k < abc->cons_len[cons_i]; ++k) {
+                if (part == 1) fprintf(out_fp, "%c", abc->cons_phred_score[cons_i][k]);
+                else fprintf(out_fp, "%c", ab_char256_table[abc->cons_base[cons_i][k]]);
+            }
+            fprintf(out_fp, "\n");
+        }
+    }
+}
+
+abpoa_cons_t *abpoa_allocate_cons(abpoa_cons_t *abc, int n_node, int n_seq, int n_cons) {
+    const size_t node_ints = n_node * sizeof(int), ptrs = n_cons * sizeof(int*); int c; // byte sizes of one int row / one pointer table
+    abc->n_seq = n_seq; abc->n_cons = n_cons;
+    abc->clu_n_seq = (int*)_err_calloc(n_cons, sizeof(int)); // zero-initialised
+    abc->cons_len = (int*)_err_calloc(n_cons, sizeof(int)); // zero-initialised
+    abc->cons_node_ids = (int**)_err_malloc(ptrs);
+    abc->cons_base = (uint8_t**)_err_malloc(n_cons * sizeof(uint8_t*));
+    abc->cons_cov = (int**)_err_malloc(ptrs);
+    abc->clu_read_ids = (int**)_err_malloc(ptrs);
+    abc->cons_phred_score = (int**)_err_malloc(ptrs);
+    for (c = 0; c < n_cons; ++c) { // per-consensus rows; their contents are not initialised here
+        abc->cons_node_ids[c] = (int*)_err_malloc(node_ints);
+        abc->cons_base[c] = (uint8_t*)_err_malloc(n_node * sizeof(uint8_t));
+        abc->cons_cov[c] = (int*)_err_malloc(node_ints);
+        abc->clu_read_ids[c] = (int*)_err_malloc(n_seq * sizeof(int));
+        abc->cons_phred_score[c] = (int*)_err_malloc(node_ints);
+    }
+    return abc;
+}
+
+int abpoa_check_iden_read_ids(int **rc_weight, uint64_t ***read_ids, int m, int read_ids_n, int pos1, int pos2) {
+    // Checks whether the entries at pos1 can all be matched to entries at pos2.
+    // Returns 1 when every pos1 entry with non-zero weight can be paired with a distinct,
+    // not yet used pos2 entry of equal weight and equal read bitset; returns 0 otherwise.
+    int a, b, w, all_matched = 1;
+    uint8_t *taken = (uint8_t*)_err_calloc(m, sizeof(uint8_t)); // taken[b] == 1: pos2 entry b is already paired
+
+    for (a = 0; a < m && all_matched; ++a) {
+        const int weight_a = rc_weight[pos1][a];
+        if (weight_a == 0) continue; // entries with zero weight need no partner
+
+        all_matched = 0; // stays 0 unless a partner for entry a is found
+        for (b = 0; b < m; ++b) {
+            if (taken[b]) continue;
+            if (rc_weight[pos2][b] != weight_a) continue;
+            // weights agree: compare the read bitsets word by word
+            for (w = 0; w < read_ids_n; ++w) {
+                if (read_ids[pos1][a][w] != read_ids[pos2][b][w]) break;
+            }
+            if (w < read_ids_n) continue; // a word differs, try the next pos2 entry
+
+            taken[b] = 1;
+            all_matched = 1;
+            break;
+        }
+    }
+    free(taken);
+    return all_matched;
+}
+
+// return: 1 if redundant else 0
+int check_redundent_hap(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int n_clu, int new_clu_i, int n_het_pos, int read_id_i, uint64_t read_id) {
+    // Looks for an existing cluster (scanning from n_clu-1 down to 0) whose haplotype equals the
+    // candidate stored at new_clu_i over all n_het_pos positions. The read is then credited to the
+    // first match found, or to the candidate cluster itself when there is none.
+    int c, p, target = new_clu_i, found = 0;
+
+    for (c = n_clu-1; c >= 0 && !found; --c) {
+        for (p = 0; p < n_het_pos; ++p) {
+            if (clu_haps[c][p] != clu_haps[new_clu_i][p]) break;
+        }
+        if (p >= n_het_pos) { // no position disagreed
+            target = c;
+            found = 1;
+        }
+    }
+
+    // credit the read: bump the size and OR its mask into word read_id_i
+    clu_size[target] += 1;
+    clu_read_ids[target][read_id_i] |= read_id;
+    return found;
+}
+
+int reassign_hap_by_min_w(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int read_ids_n, int n_clu, int min_w, int n_het_pos) { // merges small clusters into large ones; returns n_clu minus the number merged
+    int i, j, k, n_reassign = 0;
+    for (i = 0; i < n_clu; ++i) {
+        if (clu_size[i] >= min_w || clu_size[i] == 0) continue; // only non-empty clusters below min_w are candidates
+        int reassign_i = -1, max_iden_pos = 0;
+        for (j = 0; j < n_clu; ++j) {
+            int n_iden_pos = 0;
+            if (clu_size[j] < min_w) continue; // targets must currently hold at least min_w reads
+            // i < min_w, j >= min_w
+            for (k = 0; k < n_het_pos; ++k) {
+                if (clu_haps[i][k] == clu_haps[j][k]) n_iden_pos++;
+            }
+            if (n_iden_pos > max_iden_pos) { // strict '>': the first best target wins, and at least one position must agree
+                max_iden_pos = n_iden_pos;
+                reassign_i = j;
+            }
+        }
+        if (reassign_i >= 0) {
+            for (j = 0; j < read_ids_n; ++j) { // move all read bits of cluster i to the target
+                clu_read_ids[reassign_i][j] |= clu_read_ids[i][j];
+                clu_read_ids[i][j] = 0;
+            }
+            clu_size[reassign_i] += clu_size[i];
+            clu_size[i] = 0;
+            n_reassign += 1;
+        } // else: cluster i has no suitable target and is left unchanged
+    }
+    return n_clu - n_reassign;
+}
+
+int reassign_max_n_hap1(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int read_ids_n, int n_clu, int *clu_poss, int max_n_cons, int n_het_pos) { // folds every cluster not listed in clu_poss into a listed one, or empties it
+    int i, j, k, n_reassign = 0;
+    for (i = 0; i < n_clu; ++i) {
+        int is_clu = 0;
+        if (clu_size[i] == 0) continue;
+        for (j = 0; j < max_n_cons; ++j) { // is cluster i one of the kept clusters?
+            if (i == clu_poss[j]) {
+                is_clu = 1;
+                break;
+            }
+        }
+        if (is_clu) continue;
+
+        int reassign_i = -1, max_iden_pos = 0;
+        for (j = 0; j < max_n_cons; ++j) {
+            int clu_i = clu_poss[j], n_iden_pos = 0;
+            // count the het positions where cluster i agrees with kept cluster clu_i
+            for (k = 0; k < n_het_pos; ++k) {
+                if (clu_haps[i][k] == clu_haps[clu_i][k]) n_iden_pos++;
+            }
+            if (n_iden_pos > max_iden_pos) { // strict '>': the first best kept cluster wins, and at least one position must agree
+                max_iden_pos = n_iden_pos;
+                reassign_i = clu_i;
+            }
+        }
+        if (reassign_i >= 0) {
+            for (j = 0; j < read_ids_n; ++j) { // move all read bits of cluster i to the target
+                clu_read_ids[reassign_i][j] |= clu_read_ids[i][j];
+                clu_read_ids[i][j] = 0;
+            }
+            clu_size[reassign_i] += clu_size[i];
+            clu_size[i] = 0;
+            n_reassign += 1;
+        } else {
+            clu_size[i] = 0; // no kept cluster agrees anywhere: size is zeroed, read bits stay, and it is not counted in n_reassign
+        }
+    }
+    return n_clu - n_reassign;
+}
+
+typedef struct {
+    int size, pos; // cluster size and its index in the cluster arrays
+} clu_hap_tuple_t;
+// qsort comparator, descending by size; compares instead of subtracting so the result cannot overflow
+int tup_cmpfunc (const void * a, const void * b) {
+    const int sa = ((const clu_hap_tuple_t*)a)->size, sb = ((const clu_hap_tuple_t*)b)->size;
+    return (sa < sb) - (sa > sb); // positive when a is smaller, so larger sizes sort first
+}
+
+int reassign_max_n_hap(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int read_ids_n, int n_clu, int n_het_pos, int max_n_cons) { // repeatedly folds clusters into the max_n_cons largest; returns the last cluster count
+    int i;
+    clu_hap_tuple_t *tup = (clu_hap_tuple_t*)_err_malloc(n_clu * sizeof(clu_hap_tuple_t));
+    int *clu_poss = (int*)_err_malloc(max_n_cons * sizeof(int)); // indices of the clusters to keep
+
+    while (n_clu > max_n_cons) {
+        for (i = 0; i < n_clu; ++i) {
+            tup[i].size = clu_size[i];
+            tup[i].pos = i;
+        }
+        qsort(tup, n_clu, sizeof(clu_hap_tuple_t), tup_cmpfunc); // largest clusters first
+        // keep the max_n_cons largest clusters as reassignment targets
+        for (i = 0; i < max_n_cons; ++i) clu_poss[i] = tup[i].pos;
+        int new_n_clu = reassign_max_n_hap1(clu_haps, clu_size, clu_read_ids, read_ids_n, n_clu, clu_poss, max_n_cons, n_het_pos);
+        if (new_n_clu == n_clu) { // no further reassignment, but still have more than _max_n_cons_ clus
+            err_func_printf(__func__, "%d small clusters of sequences remain un-assigned.", n_clu-max_n_cons);
+            break;
+        }
+        n_clu = new_n_clu; // NOTE(review): later passes then scan only the first n_clu slots -- confirm this is intended
+    }
+    free(tup); free(clu_poss);
+    return n_clu;
+}
+
+int reassign_hap(int **clu_haps, int *clu_size, uint64_t **clu_read_ids, int read_ids_n, int n_clu, int min_w, int max_n_cons, int n_het_pos) {
+    // assign haplotype with reads < min_w to haplotype with reads >= min_w
+    int new_n_clu = reassign_hap_by_min_w(clu_haps, clu_size, clu_read_ids, read_ids_n, n_clu, min_w, n_het_pos);
+    if (new_n_clu > max_n_cons) // keep at most _max_n_cons_
+        new_n_clu = reassign_max_n_hap(clu_haps, clu_size, clu_read_ids, read_ids_n, n_clu, n_het_pos, max_n_cons);
+    // compact the non-empty clusters to the front; the returned count is recomputed here as pos_i
+    int i, j, pos_i;
+    for (i = pos_i = 0; i < n_clu; ++i) {
+        if (clu_size[i] == 0) continue;
+        if (i == pos_i) { // already in place
+            pos_i++; continue;
+        }
+        // move i to pos_i: the size is copied once, outside the per-word loop, so it is set even if read_ids_n is 0
+        clu_size[pos_i] = clu_size[i];
+        for (j = 0; j < read_ids_n; ++j) {
+            clu_read_ids[pos_i][j] = clu_read_ids[i][j];
+        }
+        pos_i++;
+    }
+    if (pos_i > max_n_cons) err_fatal_core(__func__, "Error: collected %d clusters.", pos_i);
+    return pos_i; // number of non-empty clusters now stored in slots [0, pos_i)
+}
+
+// Collect a minimized set of het bases. read_weight is NOT used here, no matter use_qv is set or not.
+// Fills read_ids/rc_weight for the visited nodes, stores the kept column indices in het_poss and returns their count.
+int abpoa_set_het_row_column_ids_weight(abpoa_graph_t *abg, uint64_t ***read_ids, int *het_poss, int **rc_weight, int msa_l, int n_seq, int m, int min_w, int read_ids_n) {
+    int i, j, k, n, rank;
+    uint64_t b, one = 1, *whole_read_ids = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t));
+    for (i = 0; i < n_seq; ++i) { // bit set with one bit per sequence: word i/64, bit i%64
+        j = i / 64; b = i & 0x3f;
+        whole_read_ids[j] |= (one << b);
+    }
+    for (i = 0; i < msa_l; ++i) { // slot m-1 of every column starts out holding all reads
+        for (j = 0; j < read_ids_n; ++j) {
+            read_ids[i][m-1][j] = whole_read_ids[j];
+        }
+    } free(whole_read_ids);
+
+    uint8_t *node_map = (uint8_t*)_err_calloc(abg->node_n, sizeof(uint8_t)); // 1 once a node has been handled
+    int *n_branch = (int*)_err_calloc(msa_l, sizeof(int)), n_het_pos = 0; // per column: number of accepted branches
+    for (i = 2; i < abg->node_n; ++i) { // node ids 0 and 1 are skipped (presumably src/sink -- confirm)
+        if (abg->node[i].out_edge_n < 2) continue; // only nodes where the graph branches
+
+        for (j = 0; j < abg->node[i].out_edge_n; ++j) {
+            int out_id = abg->node[i].out_id[j];
+            if (node_map[out_id]) continue; // each successor node is processed only once
+            else node_map[out_id] = 1;
+            int sum_out_w = 0;
+            for (k = 0; k < abg->node[out_id].out_edge_n; ++k) // NOTE(review): adds n_read once per out edge,
+                sum_out_w += abg->node[out_id].n_read;         // i.e. out_edge_n * n_read -- confirm this is intended
+            if (sum_out_w < min_w || sum_out_w > n_seq-min_w) continue; // weight must lie in [min_w, n_seq-min_w]
+            rank = abpoa_graph_node_id_to_msa_rank(abg, out_id); // rank is 1-based here: arrays are indexed with rank-1
+            n_branch[rank-1] += 1;
+            // assign seq
+            for (n = 0; n < abg->node[out_id].out_edge_n; ++n) {
+                for (k = 0; k < abg->node[out_id].read_ids_n; ++k) {
+                    b = abg->node[out_id].read_ids[n][k];
+                    rc_weight[rank-1][abg->node[out_id].base] += get_bit_cnt4(ab_bit_table16, b); // presumably the number of set bits in b -- confirm
+                    read_ids[rank-1][abg->node[out_id].base][k] |= b; // add the reads to this base's set
+                    read_ids[rank-1][m-1][k] ^= b; // toggle the same reads in slot m-1 (clears them while they are still set)
+                }
+            }
+            rc_weight[rank-1][m-1] -= rc_weight[rank-1][abg->node[out_id].base];
+        }
+    }
+    for (rank = 0; rank < msa_l; ++rank) { // from here on rank is a 0-based column index
+        if (rc_weight[rank][m-1] >= min_w && rc_weight[rank][m-1] <= n_seq-min_w) n_branch[rank]++; // slot m-1 counts as a branch too
+        if (n_branch[rank] > 1) {
+            // filter out identical read_ids
+            int iden = 0;
+            for (i = n_het_pos-1; i >= 0; i--) { // compare against every het column kept so far, newest first
+                int het_pos = het_poss[i];
+                // remove het bases that share the identical read groups
+                iden = abpoa_check_iden_read_ids(rc_weight, read_ids, m, read_ids_n, rank, het_pos);
+                if (iden == 1) break;
+            }
+            if (iden == 1) continue;
+
+            het_poss[n_het_pos++] = rank;
+#ifdef __DEBUG__
+            fprintf(stderr, "%d\t", rank);
+            for (j = 0; j < m; ++j) {
+                fprintf(stderr, "%c: %d\t", "ACGT-"[j], rc_weight[rank][j]);
+            } fprintf(stderr, "\n");
+#endif
+        }
+    }
+    free(n_branch); free(node_map);
+    return n_het_pos;
+}
+
+// group reads into clusters based on all het bases
+// the initial number of clusters could be > max_n_cons; returns the final number, *_m_clu receives the allocated capacity of *clu_read_ids
+int abpoa_collect_clu_hap_read_ids(int *het_poss, int n_het_pos, uint64_t ***read_ids, int read_ids_n, int n_seq, int m, int min_w, int max_n_cons, uint64_t ***clu_read_ids, int *_m_clu) {
+    if (n_het_pos == 0) { *clu_read_ids = NULL; *_m_clu = 0; return 1; } // nothing allocated: leave both outputs in a defined state
+    int i, j, k, n_clu = 0, m_clu = 2;
+    int **clu_haps = (int**)_err_malloc(2 * sizeof(int*));
+    int *clu_size = (int*)_err_calloc(2, sizeof(int));
+    *clu_read_ids = (uint64_t**)_err_malloc(2 * sizeof(uint64_t*)); // elements are uint64_t* rows
+    for (i = 0; i < 2; ++i) {
+        clu_haps[i] = (int*)_err_calloc(n_het_pos, sizeof(int));
+        (*clu_read_ids)[i] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t));
+    }
+
+    for (i = 0; i < n_seq; ++i) { // collect haplotype for each sequence
+        int read_id_i = i / 64; uint64_t read_id = 1ULL << (i & 0x3f); // word index and bit mask of read i
+        for (j = 0; j < n_het_pos; ++j) {
+            int het_pos = het_poss[j];
+            for (k = 0; k < m; ++k) { // first base slot whose read set contains read i
+                if (read_ids[het_pos][k][read_id_i] & read_id) {
+                    clu_haps[n_clu][j] = k; break;
+                }
+            }
+        }
+        // a return value of 0 keeps the candidate in slot n_clu as a new cluster (check_redundent_hap is defined elsewhere)
+        if (check_redundent_hap(clu_haps, clu_size, *clu_read_ids, n_clu, n_clu, n_het_pos, read_id_i, read_id) == 0) {
+            if (++n_clu == m_clu) { // out of slots: double the capacity of all three arrays
+                m_clu <<= 1;
+                clu_haps = (int**)_err_realloc(clu_haps, m_clu * sizeof(int*));
+                clu_size = (int*)_err_realloc(clu_size, m_clu * sizeof(int));
+                (*clu_read_ids) = (uint64_t**)_err_realloc(*clu_read_ids, m_clu * sizeof(uint64_t*));
+                for (j = n_clu; j < m_clu; ++j) {
+                    clu_haps[j] = (int*)_err_calloc(n_het_pos, sizeof(int));
+                    clu_size[j] = 0;
+                    (*clu_read_ids)[j] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t)); // mem may lost
+                }
+            }
+        }
+    }
+    if (n_clu < 2) err_fatal(__func__, "# haplotypes: %d\n", n_clu);
+#ifdef __DEBUG__
+    fprintf(stderr, "n_clu: %d\n", n_clu);
+    for (i = 0; i < n_clu; ++i) {
+        for (j = 0; j < n_het_pos; ++j) {
+            fprintf(stderr, "%d\t", clu_haps[i][j]);
+        }
+        fprintf(stderr, "\tsize: %d\n", clu_size[i]);
+    }
+#endif
+
+    // assign haplotype with reads < min_w to haplotype with reads >= min_w
+    // keep at most _max_n_cons_ haps and read ids, weight need to >= min_w
+    n_clu = reassign_hap(clu_haps, clu_size, *clu_read_ids, read_ids_n, n_clu, min_w, max_n_cons, n_het_pos);
+#ifdef __DEBUG__
+    fprintf(stderr, "After re-assign: n_clu: %d\n", n_clu);
+    for (i = 0; i < n_clu; ++i) {
+        fprintf(stderr, "%d:\tsize: %d\n", i, clu_size[i]);
+    }
+#endif
+    for (i = 0; i < m_clu; ++i) free(clu_haps[i]); free(clu_haps); free(clu_size); // *clu_read_ids (m_clu rows) is owned by the caller
+    *_m_clu = m_clu;
+    return n_clu;
+}
+
+// read_weight is NOT used here
+// cluster reads into _n_clu_ groups based on heterogeneous bases; returns n_clu, *_m_clu receives the capacity of *clu_read_ids
+int abpoa_multip_read_clu(abpoa_graph_t *abg, int src_id, int sink_id, int n_seq, int m, int max_n_cons, double min_freq, uint64_t ***clu_read_ids, int *_m_clu) {
+    abpoa_set_msa_rank(abg, src_id, sink_id);
+    int i, j, n_clu, m_clu = 0, read_ids_n = (n_seq-1)/64+1; // m_clu = 0: *_m_clu stays defined when nothing gets allocated
+    int msa_l = abg->node_id_to_msa_rank[sink_id]-1, min_w = MAX_OF_TWO(1, n_seq * min_freq); // TODO fastq-qual weight
+
+    // read_ids: support reads for each base (A/C/G/T) at each position
+    uint64_t ***read_ids = (uint64_t***)_err_malloc(sizeof(uint64_t**) * msa_l);
+    for (i = 0; i < msa_l; ++i) {
+        read_ids[i] = (uint64_t**)_err_malloc(sizeof(uint64_t*) * m);
+        for (j = 0; j < m; ++j) read_ids[i][j] = (uint64_t*)_err_calloc(read_ids_n, sizeof(uint64_t));
+    }
+
+    // is rc_weight necessary?
+    int **rc_weight = (int**)_err_malloc(msa_l * sizeof(int*));
+    for (i = 0; i < msa_l; ++i) {
+        rc_weight[i] = (int*)_err_calloc(m, sizeof(int)); // ACGT
+        rc_weight[i][m-1] = n_seq; // slot m-1 starts with the weight of all reads
+    }
+    // find min set of het nodes
+    int *het_poss = (int*)_err_calloc(msa_l, sizeof(int));
+    int n_het_pos = abpoa_set_het_row_column_ids_weight(abg, read_ids, het_poss, rc_weight, msa_l, n_seq, m, min_w, read_ids_n);
+
+    if (n_het_pos < 1) n_clu = 1; // no het position: single cluster, *clu_read_ids is not allocated
+    // collect at most _max_n_cons_ haplotypes and corresponding read ids
+    else n_clu = abpoa_collect_clu_hap_read_ids(het_poss, n_het_pos, read_ids, read_ids_n, n_seq, m, min_w, max_n_cons, clu_read_ids, &m_clu);
+
+    for (i = 0; i < msa_l; ++i) { // release the per-column work arrays
+        for (j = 0; j < m; ++j) free(read_ids[i][j]);
+        free(read_ids[i]); free(rc_weight[i]);
+    } free(read_ids); free(rc_weight); free(het_poss);
+
+    *_m_clu = m_clu;
+    return n_clu;
+}
+
+// topological sort should always be done first, then the consensus is generated
+void abpoa_generate_consensus(abpoa_t *ab, abpoa_para_t *abpt) {
+    abpoa_graph_t *graph = ab->abg;
+    // nothing to do: consensus was already called, or the graph has at most 2 nodes
+    if (graph->is_called_cons == 1 || graph->node_n <= 2) return;
+    int idx = 0, *out_degree = (int*)_err_malloc(graph->node_n * sizeof(int));
+    while (idx < graph->node_n) { // copy of every node's out-edge count
+        out_degree[idx] = graph->node[idx].out_edge_n; ++idx;
+    }
+
+    int n_seq = ab->abs->n_seq, read_ids_n = (n_seq-1)/64+1; // one bit per read, packed in 64-bit words
+    int n_clu, m_clu; uint64_t **clu_read_ids;
+
+    n_clu = abpt->max_n_cons > 1 ? abpoa_multip_read_clu(graph, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, n_seq, abpt->m, abpt->max_n_cons, abpt->min_freq, &clu_read_ids, &m_clu) : 1;
+
+    abpoa_cons_t *abc = ab->abc;
+    abpoa_allocate_cons(abc, graph->node_n, ab->abs->n_seq, n_clu);
+    if (n_clu <= 1) { // single consensus
+        abpoa_heaviest_bundling(graph, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, out_degree, abc);
+    } else { // one consensus per read cluster; the cluster bit sets are released afterwards
+        abpoa_multip_heaviest_bundling(graph, abpt, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, out_degree, n_clu, read_ids_n, clu_read_ids, abc);
+        for (idx = 0; idx < m_clu; ++idx) free(clu_read_ids[idx]);
+        free(clu_read_ids);
+    }
+    free(out_degree); graph->is_called_cons = 1;
+}
diff --git a/src/abpoa_output.h b/src/abpoa_output.h
new file mode 100644
index 0000000..9952c56
--- /dev/null
+++ b/src/abpoa_output.h
@@ -0,0 +1,15 @@
+#ifndef ABPOA_OUTPUT_H /* include guard */
+#define ABPOA_OUTPUT_H
+
+#ifdef __cplusplus
+extern "C" { /* C linkage for the declarations below when included from C++ */
+#endif
+
+void set_65536_table(void); /* NOTE(review): presumably fills a 65536-entry lookup table -- confirm in abpoa_output.c */
+void set_bit_table16(void); /* NOTE(review): presumably fills the 16-bit bit-count table (ab_bit_table16) -- confirm */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ABPOA_OUTPUT_H */
\ No newline at end of file
diff --git a/src/abpoa_plot.c b/src/abpoa_plot.c
new file mode 100644
index 0000000..d35fbcf
--- /dev/null
+++ b/src/abpoa_plot.c
@@ -0,0 +1,121 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "abpoa.h"
+#include "abpoa_graph.h"
+#include "utils.h"
+
+/* example of dot file for graphviz */
+/*
+   digraph test1 {
+       a -> b -> c;
+       a -> {x y};
+       b [shape=box];
+       c [label="hello\nworld",color=blue,fontsize=24,
+         fontname="Palatino-Italic",fontcolor=red,style=filled];
+       a -> z [label="hi", weight=100];
+       x -> z [label="multi-line\nlabel"];
+       edge [style=dashed,color=red];
+       b -> x;
+       {rank=same; b x}
+   }
+   graph test2 {
+       a -- b -- c [style=dashed];
+       a -- {x y};
+       x -- c [w=10.0];
+       x -- y [w=5.0,len=3];
+   }
+*/
+
+
+extern char ab_nt256_table[256];
+// Dump the graph to "<out_pog>.dot" and render it to abpt->out_pog (.pdf/.png only) by running the graphviz `dot` command.
+// Every node is labelled "<base>\n<index>" (base is S/E for the source/sink node); edges are labelled with their weight.
+void abpoa_dump_pog(abpoa_t *ab, abpoa_para_t *abpt) {
+    char PROG[20] = "abpoa"; int font_size=24;
+
+    abpoa_graph_t *abg = ab->abg;
+    if (abg->is_topological_sorted == 0) abpoa_topological_sort(abg, abpt);
+
+    // all settings
+    // char node_color[5][10] = {"purple3", "red3", "seagreen4", "gold2", "gray"}; // ACGTN
+    char node_color[5][10] = {"pink1", "red1", "gold2", "seagreen4", "gray"}; // ACGTN
+    // float dpi_size = 3000, graph_width = 100, graph_height = 6;
+    float node_width=1;
+    char rankdir[5] = "LR", node_style[10]="filled", node_fixedsize[10]="true", node_shape[10]="circle";
+    int show_aligned_mismatch = 1;
+
+    int i, j, id, index, out_id; char base;
+    char **node_label = (char**)_err_malloc(abg->node_n * sizeof(char*));
+    for (i = 0; i < abg->node_n; ++i) node_label[i] = (char*)_err_malloc(sizeof(char) * 128);
+
+    char *dot_fn = (char*)_err_malloc(strlen(abpt->out_pog) + 10); // room for ".dot" and the NUL; checked like the other allocations
+    strcpy(dot_fn, abpt->out_pog);
+    FILE *fp = xopen(strcat(dot_fn, ".dot"), "w");
+    fprintf(fp, "// %s graph dot file.\n// %d nodes.\n", PROG, abg->node_n);
+    // fprintf(fp, "digraph ABPOA_graph {\n\tgraph [dpi=%f]; size=\"%f,%f\";\n\trankdir=\"%s\";\n\tnode [width=%f, style=%s, fixedsize=%s, shape=%s];\n", dpi_size, graph_width, graph_height, rankdir, node_width, node_style, node_fixedsize, node_shape);
+    fprintf(fp, "digraph ABPOA_graph {\n\tgraph [rankdir=\"%s\"];\n\tnode [width=%f, style=%s, fixedsize=%s, shape=%s];\n", rankdir, node_width, node_style, node_fixedsize, node_shape);
+
+    for (i = 0; i < abg->node_n; ++i) { // pass 1: build the label of every node and emit the node statements
+        id = abpoa_graph_index_to_node_id(abg, i);
+        index = i;
+        if (id == ABPOA_SRC_NODE_ID) {
+            base = 'S';
+            //sprintf(node_label[id], "\"%c\n(%d,%d,%d)\"", base, index, rank, id);
+            // only show seq
+            sprintf(node_label[id], "\"%c\n%d\"", base,index);
+            fprintf(fp, "%s [color=%s, fontsize=%d]\n", node_label[id], node_color[4], font_size);
+        } else if (id == ABPOA_SINK_NODE_ID) {
+            base = 'E';
+            //sprintf(node_label[id], "\"%c\n(%d,%d,%d)\"", base, index, rank, id);
+            // only show seq
+            sprintf(node_label[id], "\"%c\n%d\"", base,index);
+            fprintf(fp, "%s [color=%s, fontsize=%d]\n", node_label[id], node_color[4], font_size);
+        } else {
+            base = ab_nt256_table[abg->node[id].base];
+            //sprintf(node_label[id], "\"%c\n(%d,%d,%d)\"", base, index, rank, id);
+            // only show seq; base codes outside the 5-entry colour table fall back to gray instead of reading past it
+            sprintf(node_label[id], "\"%c\n%d\"", base,index);
+            fprintf(fp, "%s [color=%s, fontsize=%d]\n", node_label[id], node_color[abg->node[id].base < 5 ? abg->node[id].base : 4], font_size);
+        }
+    }
+    int x_index = -1; // largest index already covered by a dashed mismatch group
+    for (i = 0; i < abg->node_n; ++i) { // pass 2: edges and groups of aligned nodes
+        id = abpoa_graph_index_to_node_id(abg, i);
+        // out_edge
+        for (j = 0; j < abg->node[id].out_edge_n; ++j) {
+            out_id = abg->node[id].out_id[j];
+            fprintf(fp, "\t%s -> %s [label=\"%d\", penwidth=%d]\n", node_label[id], node_label[out_id], abg->node[id].out_weight[j], abg->node[id].out_weight[j]+1);
+        }
+        if (abg->node[id].aligned_node_n > 0) {
+            fprintf(fp, "\t{rank=same; %s ", node_label[id]);
+            for (j = 0; j < abg->node[id].aligned_node_n; ++j)
+                fprintf(fp, "%s ", node_label[abg->node[id].aligned_node_id[j]]);
+            fprintf(fp, "};\n");
+            if (show_aligned_mismatch) {
+                if (i > x_index) {
+                    x_index = i;
+                    // mismatch dashed line
+                    fprintf(fp, "\t{ edge [style=dashed, arrowhead=none]; %s ", node_label[id]);
+                    for (j = 0; j < abg->node[id].aligned_node_n; ++j) {
+                        fprintf(fp, "-> %s ", node_label[abg->node[id].aligned_node_id[j]]);
+                        index = abpoa_graph_node_id_to_index(abg, abg->node[id].aligned_node_id[j]);
+                        x_index = index > x_index ? index : x_index;
+                    }
+                    fprintf(fp, "}\n");
+                }
+            }
+        }
+    }
+    fprintf(fp, "}\n");
+
+    for (i = 0; i < abg->node_n; ++i) free(node_label[i]); free(node_label);
+    err_fclose(fp);
+
+    char cmd[1024], *type = strrchr(abpt->out_pog, '.'); // type is NULL when out_pog contains no '.' at all
+    if (type == NULL || (strcmp(type+1, "pdf") != 0 && strcmp(type+1, "png") != 0))
+        err_fatal_simple("POG can only be dump to .pdf/.png file");
+    int cmd_l = snprintf(cmd, sizeof(cmd), "dot %s -T%s > %s", dot_fn, type+1, abpt->out_pog); // bounded: never writes past cmd
+    if (cmd_l < 0 || cmd_l >= (int)sizeof(cmd)) err_fatal_simple("POG file name is too long.");
+    free(dot_fn);
+    if (system(cmd) != 0) err_fatal(__func__, "Fail to plot %s DAG.", PROG); // NOTE(review): out_pog reaches the shell unquoted
+}
diff --git a/src/abpoa_seed.c b/src/abpoa_seed.c
new file mode 100644
index 0000000..602dc31
--- /dev/null
+++ b/src/abpoa_seed.c
@@ -0,0 +1,745 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include "abpoa.h"
+#include "abpoa_seed.h"
+#include "utils.h"
+#include "kvec.h"
+#include "ksort.h"
+
+const char LogTable256[256] = {
+#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
+    -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+    LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
+    LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
+};
+
+static inline int ilog2_32(uint32_t v) { // picks the highest non-zero byte of v and adds that byte's bit offset to its LogTable256 entry
+    uint32_t hi16 = v >> 16, byte;
+    if (hi16 != 0) return (byte = hi16 >> 8) != 0 ? 24 + LogTable256[byte] : 16 + LogTable256[hi16];
+    return (byte = v >> 8) != 0 ? 8 + LogTable256[byte] : LogTable256[v];
+}
+
+#define ab_sort_key_128x(a) ((a).x)
+KRADIX_SORT_INIT(ab_128x, ab_u128_t, ab_sort_key_128x, 8)
+
+#define ab_sort_key_128y(a) ((a).y)
+KRADIX_SORT_INIT(ab_128y, ab_u128_t, ab_sort_key_128y, 8)
+
+#define ab_sort_key_64(a) (a)
+KRADIX_SORT_INIT(64, uint64_t, ab_sort_key_64, 8)
+
+/* from lh3/minimap2/sketch.c */
+/********** start *************/
+static inline uint64_t hash64(uint64_t key, uint64_t mask)
+{   // integer mixing function; every additive step is reduced to the bits selected by mask
+    key = ((key << 21) + ~key) & mask; // key = (key << 21) - key - 1;
+    key ^= key >> 24;
+    key = (key + (key << 3) + (key << 8)) & mask; // key * 265
+    key ^= key >> 14;
+    key = (key + (key << 2) + (key << 4)) & mask; // key * 21
+    key ^= key >> 28;
+    key += key << 31;
+    return key & mask;
+}
+
+typedef struct { // a simplified version of kdq: 32-slot circular buffer of ints
+    int front, count; // slot of the oldest element; number of stored elements
+    int a[32];
+} tiny_queue_t;
+
+static inline void tq_push(tiny_queue_t *q, int x) {
+    int slot = (q->front + q->count) & 0x1f; // first free slot, wrapping around the 32 entries
+    q->a[slot] = x; q->count++;
+}
+
+// remove and return the oldest element; returns -1 when the queue is empty
+static inline int tq_shift(tiny_queue_t *q) {
+    int oldest;
+    if (q->count == 0) return -1;
+    oldest = q->a[q->front];
+    q->front = (q->front + 1) & 0x1f;
+    q->count--;
+    return oldest;
+}
+
+/**
+ * Find symmetric (w,k)-minimizers on a DNA sequence
+ *
+ * @param km     thread-local memory pool; using NULL falls back to malloc()
+ * @param str    DNA sequence as base codes; codes >= 4 are treated as ambiguous and restart the k-mer
+ * @param len    length of $str
+ * @param w      find a minimizer for every $w consecutive k-mers (0 < w < 256)
+ * @param k      k-mer size (0 < k <= 28)
+ * @param rid    reference ID; will be copied to the output $p array
+ * @param is_hpc homopolymer-compressed or not
+ * @param both_strand non-zero: use the smaller of the forward and reverse k-mer; zero: forward k-mer only (strand 0)
+ * @param p      minimizers
+ *               p->a[i].x = hash64(kMer)<<8 | kmerSpan
+ *               p->a[i].y = rid<<32 | lastPos<<1 | strand
+ *               where lastPos is the position of the last base of the i-th minimizer, and strand tells top from bottom strand.
+ *               Callers may want to set "p->n = 0"; otherwise results are appended to p
+ */
+void mm_sketch(void *km, const uint8_t *str, int len, int w, int k, uint32_t rid, int is_hpc, int both_strand, ab_u128_v *p)
+{
+    uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
+    int i, j, l, buf_pos, min_pos, kmer_span = 0;
+    ab_u128_t buf[256], min = { UINT64_MAX, UINT64_MAX };
+    tiny_queue_t tq;
+
+    assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
+    memset(buf, 0xff, w * 16); // all-ones == { UINT64_MAX, UINT64_MAX }, the "empty" marker used below
+    memset(&tq, 0, sizeof(tiny_queue_t));
+    kv_resize(ab_u128_t, km, *p, p->n + len/w);
+
+    for (i = l = buf_pos = min_pos = 0; i < len; ++i) { // l: number of consecutive unambiguous bases seen so far
+        int c = str[i];
+        ab_u128_t info = { UINT64_MAX, UINT64_MAX };
+        if (c < 4) { // not an ambiguous base
+            uint32_t z;
+            if (is_hpc) {
+                int skip_len = 1;
+                if (i + 1 < len && str[i + 1] == c) {
+                    for (skip_len = 2; i + skip_len < len; ++skip_len)
+                        if (str[i + skip_len] != c)
+                            break;
+                    i += skip_len - 1; // put $i at the end of the current homopolymer run
+                }
+                tq_push(&tq, skip_len);
+                kmer_span += skip_len;
+                if (tq.count > k) kmer_span -= tq_shift(&tq);
+            } else kmer_span = l + 1 < k? l + 1 : k;
+            if (both_strand) {
+                kmer[0] = (kmer[0] << 2 | c) & mask;           // forward k-mer
+                kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
+                if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
+                z = kmer[0] < kmer[1]? 0 : 1; // strand
+            } else {
+                kmer[0] = (kmer[0] << 2 | c) & mask;           // forward k-mer
+                z = 0;
+            }
+            ++l;
+            if (l >= k && kmer_span < 256) { // a full k-mer whose span fits into the low 8 bits of x
+                info.x = hash64(kmer[z], mask) << 8 | kmer_span;
+                info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
+            }
+        } else l = 0, tq.count = tq.front = 0, kmer_span = 0; // ambiguous base: restart k-mer, queue and span
+        buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
+        if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
+            for (j = buf_pos + 1; j < w; ++j)
+                if (min.x == buf[j].x && buf[j].y != min.y) kv_push(ab_u128_t, km, *p, buf[j]);
+            for (j = 0; j < buf_pos; ++j)
+                if (min.x == buf[j].x && buf[j].y != min.y) kv_push(ab_u128_t, km, *p, buf[j]);
+        }
+        if (info.x <= min.x) { // a new minimum; then write the old min
+            if (l >= w + k && min.x != UINT64_MAX) kv_push(ab_u128_t, km, *p, min);
+            min = info, min_pos = buf_pos;
+        } else if (buf_pos == min_pos) { // old min has moved outside the window
+            if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(ab_u128_t, km, *p, min);
+            for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
+                if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
+            for (j = 0; j <= buf_pos; ++j)
+                if (min.x >= buf[j].x) min = buf[j], min_pos = j;
+            if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
+                for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
+                    if (min.x == buf[j].x && min.y != buf[j].y) kv_push(ab_u128_t, km, *p, buf[j]);
+                for (j = 0; j <= buf_pos; ++j)
+                    if (min.x == buf[j].x && min.y != buf[j].y) kv_push(ab_u128_t, km, *p, buf[j]);
+            }
+        }
+        if (++buf_pos == w) buf_pos = 0; // buf is used as a circular buffer of w entries
+    }
+    if (min.x != UINT64_MAX) // write the last pending minimizer
+        kv_push(ab_u128_t, km, *p, min);
+}
+
+// (w,k)-minimizers for an amino acid sequence: 5 bits per residue, forward k-mer only; same output layout as mm_sketch
+void mm_aa_sketch(void *km, const uint8_t *str, int len, int w, int k, uint32_t rid, int is_hpc, ab_u128_v *p)
+{
+    uint64_t mask = (1ULL<<5*k) - 1, kmer[2] = {0,0}; // only kmer[0] is used in this function
+    int i, j, l, buf_pos, min_pos, kmer_span = 0;
+    ab_u128_t buf[256], min = { UINT64_MAX, UINT64_MAX };
+    tiny_queue_t tq;
+
+    assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 11)); // 56 / 5 == 11
+    memset(buf, 0xff, w * 16); // all-ones == { UINT64_MAX, UINT64_MAX }, the "empty" marker used below
+    memset(&tq, 0, sizeof(tiny_queue_t));
+    kv_resize(ab_u128_t, km, *p, p->n + len/w);
+
+    for (i = l = buf_pos = min_pos = 0; i < len; ++i) { // l: number of consecutive accepted residues seen so far
+        int c = str[i];
+        ab_u128_t info = { UINT64_MAX, UINT64_MAX };
+        if (c < 26) { // residue codes 0..25 are accepted; anything else restarts the k-mer
+            uint32_t z;
+            if (is_hpc) {
+                int skip_len = 1;
+                if (i + 1 < len && str[i + 1] == c) {
+                    for (skip_len = 2; i + skip_len < len; ++skip_len)
+                        if (str[i + skip_len] != c)
+                            break;
+                    i += skip_len - 1; // put $i at the end of the current homopolymer run
+                }
+                tq_push(&tq, skip_len);
+                kmer_span += skip_len;
+                if (tq.count > k) kmer_span -= tq_shift(&tq);
+            } else kmer_span = l + 1 < k? l + 1 : k;
+            
+            kmer[0] = (kmer[0] << 5 | c) & mask; // only forward k-mer for aa seq
+            z = 0; // strand is always 0
+            ++l;
+            if (l >= k && kmer_span < 256) { // a full k-mer whose span fits into the low 8 bits of x
+                info.x = hash64(kmer[z], mask) << 8 | kmer_span;
+                info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
+            }
+        } else l = 0, tq.count = tq.front = 0, kmer_span = 0; // restart k-mer, queue and span
+        buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
+        if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
+            for (j = buf_pos + 1; j < w; ++j)
+                if (min.x == buf[j].x && buf[j].y != min.y) kv_push(ab_u128_t, km, *p, buf[j]);
+            for (j = 0; j < buf_pos; ++j)
+                if (min.x == buf[j].x && buf[j].y != min.y) kv_push(ab_u128_t, km, *p, buf[j]);
+        }
+        if (info.x <= min.x) { // a new minimum; then write the old min
+            if (l >= w + k && min.x != UINT64_MAX) kv_push(ab_u128_t, km, *p, min);
+            min = info, min_pos = buf_pos;
+        } else if (buf_pos == min_pos) { // old min has moved outside the window
+            if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(ab_u128_t, km, *p, min);
+            for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
+                if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
+            for (j = 0; j <= buf_pos; ++j)
+                if (min.x >= buf[j].x) min = buf[j], min_pos = j;
+            if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
+                for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
+                    if (min.x == buf[j].x && min.y != buf[j].y) kv_push(ab_u128_t, km, *p, buf[j]);
+                for (j = 0; j <= buf_pos; ++j)
+                    if (min.x == buf[j].x && min.y != buf[j].y) kv_push(ab_u128_t, km, *p, buf[j]);
+            }
+        }
+        if (++buf_pos == w) buf_pos = 0; // buf is used as a circular buffer of w entries
+    }
+    if (min.x != UINT64_MAX) // write the last pending minimizer
+        kv_push(ab_u128_t, km, *p, min);
+}
+/************ end *************/
+
+// tree_id_map: guide tree node id -> original input order id (filled by this function)
+/* mm: is unsorted on entry and gets sorted by x here
+ * a.x = kMer<<8 | kmerSpan
+ * a.y = rid<<32 | lastPos<<1 | strand  (layout written by mm_sketch; only rid = a.y>>32 is read here)
+ */
+int abpoa_build_guide_tree(abpoa_para_t *abpt, int n_seq, ab_u128_v *mm, int *tree_id_map) {
+    if (mm->n == 0) return 0; // no minimizers: tree_id_map is left untouched
+
+    if (abpt->verbose > 0) fprintf(stderr, "[%s] Building progressive guide tree ... ", __func__);
+    size_t i, _i, j; int rid1, rid2;                                          // mm_hit_n: minimizer hits between each two sequences
+                                                                              // 0: 0
+                                                                              // 1: 0 1
+                                                                              // 2: 0 1 2
+    int *mm_hit_n = (int*)_err_calloc((n_seq * (n_seq+1)) >> 1, sizeof(int)); //  ...
+                                                                              // n: 0 1 ... n-1 n
+                                                                              // (lower triangle incl. diagonal, stored row by row)
+                                                                              // # total minimizers of i: mm_hit_n[(i*(i+1))/2+i]
+                                                                              // # total hits for i and j (i>j): mm_hit_n[(i*(i+1)/2)+j]
+    radix_sort_ab_128x(mm->a, mm->a + mm->n); // sort mm by k-mer hash values
+    uint64_t last_x = mm->a[0].x;
+    int *mm_cnt = (int*)_err_malloc(n_seq * sizeof(int)); // per sequence: occurrences of the current minimizer
+    for (_i = 0, i = 1; i < mm->n; ++i) { // collect mm hits
+        if (mm->a[i].x != last_x) {
+            // now [_i, i-1] have the same minimizer k-mer
+            memset(mm_cnt, 0, n_seq * sizeof(int));
+            for (j = _i; j < i; ++j) {
+                // count mm->a[j]
+                rid1 = mm->a[j].y >> 32;
+                ++mm_cnt[rid1];
+                ++mm_hit_n[((rid1 * (rid1+1)) >> 1) + rid1];
+            }
+            for (rid1 = 0; rid1 < n_seq-1; ++rid1) { // every pair shares min(count1, count2) hits of this minimizer
+                for (rid2 = rid1+1; rid2 < n_seq; ++rid2) {
+                    mm_hit_n[((rid2 * (rid2 + 1)) >> 1) + rid1] += MIN_OF_TWO(mm_cnt[rid1], mm_cnt[rid2]);
+                }
+            }
+            // next minimizer
+            last_x = mm->a[i].x, _i = i;
+        }
+    }
+    // now [_i, i-1] have the same minimizer k-mer (last run, same counting as inside the loop)
+    memset(mm_cnt, 0, n_seq * sizeof(int));
+    for (j = _i; j < i; ++j) {
+        // count mm->a[j]
+        rid1 = mm->a[j].y >> 32;
+        ++mm_cnt[rid1];
+        ++mm_hit_n[((rid1 * (rid1+1)) >> 1) + rid1];
+    }
+    for (rid1 = 0; rid1 < n_seq-1; ++rid1) {
+        for (rid2 = rid1+1; rid2 < n_seq; ++rid2) {
+            mm_hit_n[((rid2 * (rid2 + 1)) >> 1) + rid1] += MIN_OF_TWO(mm_cnt[rid1], mm_cnt[rid2]);
+        }
+    }
+    free(mm_cnt);
+
+    // calculate jaccard similarity between each two sequences: shared hits / (total_i + total_j - shared hits)
+    double *jac_sim = (double*)_err_calloc((n_seq * (n_seq-1)) >> 1, sizeof(double)); // strict lower triangle
+    double max_jac = -1.0, jac; int max_i=-1, max_j=-1;
+    for (i = 1; i < (size_t)n_seq; ++i) {
+        for (j = 0; j < i; ++j) {
+            int tot_n = mm_hit_n[((i*(i+1))>>1)+i] + mm_hit_n[((j*(j+1))>>1)+j] - mm_hit_n[((i*(i+1))>>1)+j];
+            if (tot_n == 0) jac = 0;
+            else if (tot_n < 0) err_fatal(__func__, "Bug in progressive tree building. (1)");
+            else jac = (0.0+mm_hit_n[((i*(i+1))>>1)+j]) / tot_n;
+            jac_sim[((i * (i-1)) >> 1) + j] = jac; // jac_sim[rid2*(rid2-1)/2 + rid1]
+            if (jac > max_jac) {
+                max_jac = jac; max_i = i, max_j = j;
+            }
+        }
+    }
+
+    // build guide tree
+    // first pick two with the biggest jac (max_i, max_j)
+    int n_in_map = 2; tree_id_map[0] = max_j, tree_id_map[1] = max_i; // NOTE(review): both stay -1 when n_seq < 2 -- callers presumably ensure n_seq >= 2; confirm
+
+    // then, pick one with biggest jac sum with existing sequence in tree_id_map
+    while (n_in_map < n_seq) {
+        max_jac = -1.0, max_i = n_seq;
+        for (rid1 = 0; rid1 < n_seq; ++rid1) {
+            jac = 0.0;
+            for (i = 0; i < (size_t)n_in_map; ++i) {
+                rid2 = tree_id_map[i];
+                if (rid1 == rid2) { jac = -1.0; break; } // already in the tree: can never beat max_jac
+                else if (rid1 > rid2) jac += jac_sim[((rid1 * (rid1-1)) >> 1) + rid2];
+                else jac += jac_sim[((rid2 * (rid2-1)) >> 1) + rid1];
+            }
+            if (jac > max_jac) {
+                max_jac = jac;
+                max_i = rid1;
+            }
+        }
+        if (max_i == n_seq) err_fatal(__func__, "Bug in progressive tree building. (2)");
+        tree_id_map[n_in_map++] = max_i;
+    }
+
+    free(mm_hit_n); free(jac_sim);
+    if (abpt->verbose > 0) fprintf(stderr, "done!\n");
+    return 0;
+}
+
+// t's mm: is sorted, q's mm is unsorted (q's bucket is sorted in place here)
+//       | r1's minimizers | r2's minimizers | ... | rn's minimizers |
+// mm_c: | 0 | n_r1_mm | n_r1..2_mm | ... | n_r1..n-1_mm | n_r1..n_mm |
+// t is already in the graph, q is query sequence
+// merge sort for t and q's minimizer buckets; matching pairs are appended to anchors, returns anchors->n
+int collect_anchors1(void *km, ab_u64_v *anchors, ab_u128_v mm, int *mm_c, int tid, int qid, int qlen, int k) {
+    int i, j, _i, _j; uint64_t xi, xj, _xi, _xj, _yi, _yj, a;
+    i = mm_c[tid], j = mm_c[qid]; // start of t's and q's bucket in mm.a
+    // t's mm is already sorted XXX
+    radix_sort_ab_128x(mm.a + j, mm.a + mm_c[qid+1]);
+
+    while (i < mm_c[tid+1] && j < mm_c[qid+1]) {
+        xi = mm.a[i].x, xj = mm.a[j].x;
+        if (xi == xj) { // same minimizer: pair every t occurrence with every q occurrence
+            for (_i = i; _i < mm_c[tid+1]; ++_i) {
+                _xi = mm.a[_i].x;
+                if (_xi != xi) break;
+                _yi = mm.a[_i].y;
+                for (_j = j; _j < mm_c[qid+1]; ++_j) {
+                    _xj = mm.a[_j].x;
+                    if (_xj != xj) break;
+                    _yj = mm.a[_j].y;
+                    // t_strand<<63 | t_lastPos<<32 | q_lastPos
+                    if ((_yi & 1) == (_yj & 1)) { // same strand
+                        a = (uint64_t)((uint32_t)_yi>>1)<<32 | ((uint32_t)_yj>>1);
+                    } else { // different strand: bit 63 set, q position converted with qlen and k
+                        a = 1ULL<<63 | (uint64_t)((uint32_t)_yi>>1)<<32 | (qlen - (((uint32_t)_yj>>1)+1-k) - 1); // XXX qlen < pow(2,28)
+                    }
+                    kv_push(uint64_t, km, *anchors, a);
+                }
+            }
+            i = _i, j = _j; // continue after both runs of this minimizer
+        } else if (xi < xj) ++i;
+        else if (xi > xj) ++j;
+    }
+    // sort by tpos
+    radix_sort_64(anchors->a, anchors->a + anchors->n);
+    return anchors->n;
+}
+
+int get_local_chain_score(int j_end_tpos, int j_end_qpos, int i_end_anchor_i, ab_u64_v *anchors, int *pre_id, int *score) {
+    int cur = i_end_anchor_i;
+    // follow the pre_id links back from the end anchor until an anchor lying at or before
+    // (j_end_tpos, j_end_qpos) in both coordinates is found, or the links end with -1
+    for (;;) {
+        int tpos = (anchors->a[cur] >> 32) & 0x7fffffff; // bits 32..62 of the anchor
+        int qpos = (int32_t)anchors->a[cur];             // low 32 bits of the anchor
+        if (tpos <= j_end_tpos && qpos <= j_end_qpos) break;
+        if ((cur = pre_id[cur]) == -1) break;
+    }
+    // no such anchor: the full score counts; otherwise score[cur] is subtracted
+    if (cur == -1) return score[i_end_anchor_i];
+    return score[i_end_anchor_i] - score[cur];
+}
+
+
+// local chains:
+//   x: strand | end_tpos | end_qpos
+//   y: end_anchor_i | start_anchor_i
+int abpoa_dp_chaining_of_local_chains(void *km, ab_u128_t *local_chains, int n_local_chains, ab_u64_v *anchors, int *score, int *pre_id, ab_u64_v *par_anchors, int min_w, int tlen, int qlen) { // chains the local chains by DP and appends the selected anchors to *par_anchors; always returns 0
+    int i, j, st, score1, global_max_score=INT32_MIN, global_max_i=-1;
+    int *chain_score = (int*)kmalloc(km, n_local_chains * 4), *pre_chain_id = (int*)kmalloc(km, n_local_chains * 4);
+    size_t _n = par_anchors->n; // anchors already in par_anchors; only entries from _n on are touched below
+
+    for (i = st = 0; i < n_local_chains; ++i) {
+        uint64_t ix = local_chains[i].x, iy = local_chains[i].y;
+        int istrand = ix >> 63, i_end_qpos = (int32_t)ix, i_end_anchor_i = iy >> 32, i_start_anchor_i = (int32_t)iy;
+        int i_start_tpos = (anchors->a[i_start_anchor_i] >> 32) & 0x7fffffff, i_start_qpos = (int32_t)anchors->a[i_start_anchor_i];
+        int max_j = -1, max_score = score[i_end_anchor_i];
+        while (st < i) { // advance st to the first chain on the same strand as chain i
+            if ((local_chains[st].x) >> 63 != istrand) ++st;
+            else break;
+        }
+        for (j = i-1; j >= st; --j) { // try each earlier chain j as predecessor of chain i
+            uint64_t jx = local_chains[j].x;
+            int j_end_tpos = (jx >> 32) & 0x7fffffff, j_end_qpos = (int32_t)jx; //, j_end_anchor_i = iy >> 32;
+            if (j_end_qpos >= i_end_qpos) continue;
+
+            if (i_start_tpos > j_end_tpos && i_start_qpos > j_end_qpos) score1 = chain_score[j] + score[i_end_anchor_i]; // no overlap: chain i contributes its full score
+            else score1 = chain_score[j] + get_local_chain_score(j_end_tpos, j_end_qpos, i_end_anchor_i, anchors, pre_id, score); // overlap: only the part of chain i after j's end counts
+
+            if (score1 > max_score) {
+                max_score = score1; max_j = j;
+            }
+        }
+        chain_score[i] = max_score; pre_chain_id[i] = max_j;
+        if (max_score > global_max_score) {
+            global_max_score = max_score;
+            global_max_i = i;
+        }
+    }
+    if (global_max_i < 0) { kfree(km, chain_score); kfree(km, pre_chain_id); return 0; } // nothing to chain: release the scratch arrays instead of leaking them
+    // collect anchors based on global_max_i
+    int cur_i = global_max_i, pre_i = pre_chain_id[global_max_i];
+    uint64_t cur_y = local_chains[cur_i].y, pre_x, pre_y;
+    int last_tpos=tlen, last_qpos=qlen;
+    while (pre_i != -1) { // collect valid anchors in local_chains[cur_i], constrained by local_chains[pre_i]
+        pre_x = local_chains[pre_i].x, pre_y = local_chains[pre_i].y;
+        int pre_end_tpos = (pre_x >> 32) & 0x7fffffff, pre_end_qpos = (int32_t)pre_x;
+        i = cur_y >> 32; // end anchor of the current chain
+        while (i != -1) {
+            int cur_tpos = (anchors->a[i] >> 32) & 0x7fffffff, cur_qpos = (int32_t)anchors->a[i];
+            if (cur_tpos > pre_end_tpos && cur_qpos > pre_end_qpos) {
+                if (last_tpos - cur_tpos >= min_w && last_qpos - cur_qpos >= min_w) { // keep anchors at least min_w apart in both t and q
+                    kv_push(uint64_t, 0, *par_anchors, anchors->a[i]);
+                    last_tpos = cur_tpos, last_qpos = cur_qpos;
+                }
+            } else break;
+            i = pre_id[i];
+        }
+        cur_i = pre_i, pre_i = pre_chain_id[pre_i], cur_y = pre_y;
+    }
+    // collect anchors of last chain: local_chains[cur_i]
+    i = cur_y >> 32;
+    while (i != -1) {
+        int cur_tpos = (anchors->a[i] >> 32) & 0x7fffffff, cur_qpos = (int32_t)anchors->a[i];
+        if (last_tpos - cur_tpos >= min_w && last_qpos - cur_qpos >= min_w) {
+            kv_push(uint64_t, 0, *par_anchors, anchors->a[i]);
+            last_tpos = cur_tpos, last_qpos = cur_qpos;
+        }
+        i = pre_id[i];
+    }
+    // reverse order of par_anchors
+    for (i = 0; i < (int)(par_anchors->n-_n) >> 1; ++i) { // anchors were pushed from chain end to chain start
+        uint64_t tmp = par_anchors->a[_n+i];
+        par_anchors->a[_n+i] = par_anchors->a[par_anchors->n-i-1];
+        par_anchors->a[par_anchors->n-i-1] = tmp;
+    }
+
+#ifdef __DEBUG__
+    for (i = _n; i < par_anchors->n; ++i) {
+        uint64_t ia = par_anchors->a[i];
+        // strand, rpos, qpos
+        fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia));
+    }
+#endif
+    kfree(km, chain_score), kfree(km, pre_chain_id);
+    return 0;
+}
+
+// for DP chaining
+static int get_chain_score(int max_bw, int *score, int i_qpos, int i_tpos, int j_qpos, int j_tpos, int k) { // writes the score of linking anchor j -> i into *score; returns 0 if the link is rejected, 1 otherwise
+    int delta_q, delta_t, delta_tq, min_d;
+
+    delta_q = i_qpos - j_qpos; delta_t = i_tpos - j_tpos; // distance between the two anchors on q and on t
+    min_d = MIN_OF_THREE(delta_q, delta_t, k);
+    *score = min_d; // note: *score is written even when 0 is returned below
+    if (delta_q >= delta_t) { // delta_tq = |delta_q - delta_t|; reject when it exceeds max_bw
+        if ((delta_tq = delta_q - delta_t) > max_bw) return 0;
+    } else {
+        if ((delta_tq = delta_t - delta_q) > max_bw) return 0;
+    }
+    *score -= ((ilog2_32(delta_tq) >> 1) + delta_tq * 0.01 * k); // penalty grows with delta_tq; computed in double, then truncated back to int
+    return 1;
+}
+
+// Dynamic Programming-based Chaining for global alignment mode
+// anchors:
+//          strand<<63 | tpos<<32 | qpos
+int abpoa_dp_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, abpoa_para_t *abpt, int tlen, int qlen) { // DP-chains *anchors, extracts local chains, and appends the final anchors to *par_anchors; always returns 0
+    int i, j, st, n_a = anchors->n;
+    int *score = (int32_t*)kmalloc(km, n_a * 4), *pre_id = (int32_t*)kmalloc(km, n_a * 4), *end_pos = (int32_t*)kmalloc(km, n_a * 4);
+    memset(end_pos, 0, n_a * 4);
+
+    int max_bw = 100, max_dis = 100, max_skip_anchors = 25, max_non_best_anchors = 50, min_local_chain_score = 100;
+    int min_w = abpt->min_w+abpt->k;
+    int i_qpos, i_tpos, i_tstrand, j_qpos, j_tpos;
+    for (i = st = 0; i < (int)anchors->n; ++i) { // score[i]/pre_id[i]: best chain ending at anchor i and its predecessor (-1 = none)
+        uint64_t ia = anchors->a[i];
+        i_qpos = (int32_t)ia, i_tpos = (ia >> 32) & 0x7fffffff, i_tstrand = ia >> 63;
+        int max_j = -1, n_skip=0, non_best_iter_n = 0, max_score=abpt->k, _score;
+        while (st < i) { // drop candidates on another strand or more than max_dis behind on t
+            uint64_t st_a = anchors->a[st];
+            if ((st_a >> 63) != i_tstrand || (int)((st_a >> 32) & 0x7fffffff) + max_dis < i_tpos) ++st;
+            else break;
+        }
+
+        for (j = i-1; j >= st; --j) { // check if j is i's optimal pre anchor
+            uint64_t ja = anchors->a[j];
+            j_qpos = (uint32_t)ja; j_tpos = (ja >> 32) & 0x7fffffff;
+            if (j_qpos >= i_qpos || j_qpos + max_dis < i_qpos) continue;
+            if (!get_chain_score(max_bw, &_score, i_qpos, i_tpos, j_qpos, j_tpos, abpt->k)) continue;
+            _score += score[j];
+            if (_score > max_score) {
+                max_score = _score; max_j = j;
+                non_best_iter_n = 0;
+                if (n_skip > 0) --n_skip;
+            } else if (end_pos[j] == i) { // j was already marked as predecessor of a candidate examined for this i
+                if (++n_skip > max_skip_anchors) break;
+            } else if (++non_best_iter_n > max_non_best_anchors) break;
+
+            if (pre_id[j] >= 0) end_pos[pre_id[j]] = i;
+        }
+#ifdef __DEBUG__
+        fprintf(stderr, "%d pre_id: %d, score: %d, tpos: %d, qpos: %d\n", i, max_j, max_score, i_tpos, i_qpos);
+#endif
+        score[i] = max_score, pre_id[i] = max_j;
+    }
+
+    memset(end_pos, 0, n_a * 4);
+    int n_local_chains = 0;
+    for (i = n_a-1; i >= 0; --i) { // end_pos: 1 = some anchor's predecessor, 2 = chain end with score >= min_local_chain_score
+        if (pre_id[i] >= 0) end_pos[pre_id[i]] = 1;
+        if (end_pos[i] == 0 && score[i] >= min_local_chain_score) {
+            end_pos[i] = 2;
+            ++n_local_chains;
+        }
+    }
+    // collect local chains
+    // x: score
+    // y: e_a_i
+    ab_u128_t *local_chains = (ab_u128_t*)kmalloc(km, n_local_chains * sizeof(ab_u128_t));
+    for (i = n_local_chains = 0; i < n_a; ++i) {
+        if (end_pos[i] == 2) {
+            local_chains[n_local_chains].x = score[i];
+            local_chains[n_local_chains++].y = i;
+        }
+    }
+    radix_sort_ab_128x(local_chains, local_chains + n_local_chains);
+
+    // collect local chains
+    // x: strand | endpos | score
+    // y: s_a_i | e_a_i
+    int32_t *anchor_map = end_pos; memset(anchor_map, 0, n_a * 4);
+    int start_id, end_id, tot_chain_i; uint64_t strand, tpos, qpos;
+
+    for (i = tot_chain_i = n_local_chains-1; i >=0; --i) { // i indexes local_chains, j indexes anchors; the strand must come from the chain's end anchor j, not from anchors->a[i]
+        j = local_chains[i].y; end_id = j; strand = anchors->a[j] >> 63; tpos = (anchors->a[j] >> 32) & 0x7fffffff, qpos = (int32_t)anchors->a[j];
+        do {
+            start_id = j;
+            anchor_map[j] = 1;
+            j = pre_id[j];
+        } while (j >= 0 && anchor_map[j] == 0);
+
+        if (j < 0) { // reach the start of the chain
+            local_chains[tot_chain_i].x = strand << 63 | tpos << 32 | qpos;
+            local_chains[tot_chain_i--].y = (uint64_t)end_id << 32 | start_id;
+        } 
+        // not keep branched chains
+        /*else if ((int32_t)local_chains[i].x - score[j] >= min_local_chain_score) { // anchor_map == 1, anchor was already used in other chain
+            local_chains[tot_chain_i].x = strand << 63 | tpos << 32 | qpos;
+            local_chains[tot_chain_i--].y = (uint64_t)end_id << 32 | ((int32_t)local_chains[i].x - score[j]);
+            pre_id[start_id] = -1;
+        }*/
+    }
+
+    radix_sort_ab_128x(local_chains+tot_chain_i+1, local_chains + n_local_chains); // kept chains occupy the tail (tot_chain_i, n_local_chains)
+    abpoa_dp_chaining_of_local_chains(km, local_chains+tot_chain_i+1, n_local_chains-1-tot_chain_i, anchors, score, pre_id, par_anchors, min_w, tlen, qlen);
+
+    kfree(km, score); kfree(km, pre_id); kfree(km, end_pos); kfree(km, local_chains);
+    return 0;
+}
+
+int bin_search_min_larger(int *lis, int left, int right, int key) {
+    // Bisect (left, right]: `left` itself is never read; returns the lowest probed index whose value is >= key, else `right`.
+    int lo = left, hi = right;
+    while (hi - lo > 1) {
+        int probe = lo + ((hi - lo) >> 1);
+        if (lis[probe] < key) lo = probe;
+        else hi = probe;
+    }
+
+    return hi;
+}
+
+// rank: qpos<<32 | tpos_rank
+int LIS(void *km, int tot_n, uint64_t *rank, int n) { // longest increasing subsequence over the low 32 bits of rank[0..n); overwrites rank[0..n_lis) with the chosen values and returns n_lis
+    int *pre_rank = (int*)kcalloc(km, tot_n+1, sizeof(int)); // pre_rank[r]: value preceding r in the subsequence ending at r; 0 = none, so values must be >= 1 and <= tot_n
+    int *lis = (int*)kmalloc(km, n * sizeof(int)); // lis[k]: smallest value seen so far that ends an increasing subsequence of length k+1
+    int i, n_lis, irank, idx;
+    lis[0] = (uint32_t)rank[0]; n_lis = 1; // reads rank[0] unconditionally, so callers must pass n >= 1
+
+    // calculate LIS length
+    for (i = 1; i < n; ++i) {
+        irank = (uint32_t)rank[i];
+        if (irank < lis[0]) { // new smallest start
+            lis[0] = irank;
+        } else if (irank > lis[n_lis-1]) { // extends the longest subsequence
+            lis[n_lis] = irank;
+            pre_rank[irank] = lis[n_lis-1];
+            ++n_lis;
+        } else { // replaces the first tail that is >= irank
+            idx = bin_search_min_larger(lis, -1, n_lis-1, irank);
+            lis[idx] = irank;
+            if (idx > 0) pre_rank[irank] = lis[idx-1];
+        }
+    }
+    // collect LIS, store ids in rank
+    int r = lis[n_lis-1]; i = n_lis-1;
+    while (r != 0) { // follow pre_rank back from the last tail, filling rank[] from the end
+        if (i < 0) err_fatal_simple("Error in LIS.");
+        rank[i--] = r;
+        r = pre_rank[r];
+    }
+    kfree(km, pre_rank); kfree(km, lis);
+    return n_lis;
+}
+
+// XXX TODO use dp-based chaining
+// XXX TODO remove q_span
+
+// Longest Increasing Subsequence-based Chaining (only works for global alignment mode)
+// input:
+//   anchors: (sorted by tpos)
+//          strand<<63 | tpos<<32 | qpos
+// output:
+//   anchor list size: n
+//   list of anchors: anchors
+int LIS_chaining(void *km, ab_u64_v *anchors, ab_u64_v *par_anchors, int min_w) { // appends the LIS-filtered anchors of the better strand to *par_anchors; always returns 0
+    size_t i, j, n_a = anchors->n, n_for, n_rev;
+    uint64_t *for_rank = (uint64_t*)kmalloc(km, sizeof(uint64_t) * n_a);
+    uint64_t *rev_rank = (uint64_t*)kmalloc(km, sizeof(uint64_t) * n_a);
+    uint64_t qpos;
+
+    n_for = 0, n_rev = 0;
+    for (i = 0; i < n_a; ++i) { // split anchors by strand; each entry is qpos<<32 | (1-based index into anchors)
+        uint64_t ia = anchors->a[i];
+        qpos = (uint32_t)ia;
+        if (ia >> 63) { // reverse
+            rev_rank[n_rev++] = qpos << 32 | (i+1);
+        } else { // forward
+            for_rank[n_for++] = qpos << 32 | (i+1);
+        }
+    }
+
+    if (n_for > 0) { // sort by qpos, then keep the longest run whose anchor indices increase
+        radix_sort_64(for_rank, for_rank + n_for);
+        n_for = LIS(km, n_a, for_rank, n_for);
+    }
+    if (n_rev > 0) {
+        radix_sort_64(rev_rank, rev_rank + n_rev);
+        n_rev = LIS(km, n_a, rev_rank, n_rev);
+    }
+
+    size_t n; uint64_t *rank;
+    if (n_for > n_rev) { // keep the strand with the longer subsequence (reverse wins ties)
+        n = n_for; rank = for_rank; kfree(km, rev_rank);
+    } else {
+        n = n_rev; rank = rev_rank; kfree(km, for_rank);
+    }
+    // filter anchors
+    int last_tpos = -1, last_qpos = -1, cur_tpos, cur_qpos;
+#ifdef __DEBUG__
+    size_t _n = par_anchors->n;
+#endif
+    for (i = 0; i < n; ++i) {
+        j = (int)rank[i]-1; // back to a 0-based index into anchors
+        cur_tpos = (anchors->a[j] >> 32) & 0x7fffffff;
+        if (cur_tpos - last_tpos < min_w) continue;
+        cur_qpos = (uint32_t)anchors->a[j];
+        if (cur_qpos - last_qpos < min_w) continue;
+
+        kv_push(uint64_t, 0, *par_anchors, anchors->a[j]); // store LIS anchors into par_anchors
+        last_tpos = cur_tpos; last_qpos = cur_qpos;
+    }
+#ifdef __DEBUG__
+    for (i = _n; i < par_anchors->n; ++i) {
+        uint64_t ia = par_anchors->a[i];
+        // strand, rpos, qpos
+        fprintf(stderr, "%c\t%ld\t%d\n", "+-"[ia >> 63], (ia>>32) & 0x7fffffff, ((uint32_t)ia));
+    }
+#endif
+    kfree(km, rank); return 0; // free the surviving rank buffer too; the other one was released above
+}
+
+int abpoa_collect_mm(void *km, uint8_t **seqs, int *seq_lens, int n_seq, abpoa_para_t *abpt, ab_u128_v *mm, int *mm_c) { // sketches every sequence into *mm and records the bucket boundaries in mm_c; returns mm->n
+    if (abpt->verbose > 0) fprintf(stderr, "[%s] Collecting minimizers ... ", __func__);
+    int i;
+    mm_c[0] = 0; // mm_c needs n_seq+1 slots: sequence i owns mm->a[mm_c[i] .. mm_c[i+1])
+    for (i = 0; i < n_seq; ++i) { // collect minimizers
+        if (abpt->m > 5) mm_aa_sketch(km, seqs[i], seq_lens[i], abpt->w, abpt->k, i, 0, mm); // abpt->m > 5 selects the aa sketcher
+        else mm_sketch(km, seqs[i], seq_lens[i], abpt->w, abpt->k, i, 0, abpt->amb_strand, mm);
+        mm_c[i+1] = mm->n;
+    }
+    if (abpt->verbose > 0) fprintf(stderr, "done!\n");
+    return mm->n;
+}
+
+// split guide tree and seeding and partition
+int abpoa_build_guide_tree_partition(uint8_t **seqs, int *seq_lens, int n_seq, abpoa_para_t *abpt, int *read_id_map, ab_u64_v *par_anchors, int *par_c) { // fills read_id_map, and (unless seeding is disabled) par_anchors/par_c; always returns 0
+    int i; void *km = km_init();
+    for (i = 0; i < n_seq; ++i) read_id_map[i] = i; // identity order unless the guide tree below rewrites it
+    ab_u128_v mm1 = {0, 0, 0}; int *mm_c = (int*)_err_malloc((n_seq+1) * sizeof(int));
+    abpoa_collect_mm(km, seqs, seq_lens, n_seq, abpt, &mm1, mm_c);
+
+    if (abpt->progressive_poa && n_seq > 2) {
+        // copy mm1 to mm2
+        ab_u128_v mm2 = {0, 0, 0};
+        for (i = 0; i < (int)mm1.n; ++i) kv_push(ab_u128_t, km, mm2, mm1.a[i]);
+        // use mm2 to build guide tree
+        abpoa_build_guide_tree(abpt, n_seq, &mm2, read_id_map);
+        kfree(km, mm2.a);
+    }
+    if (abpt->disable_seeding || n_seq < 2) { // par_anchors and par_c are left untouched on this path
+        kfree(km, mm1.a); free(mm_c); km_destroy(km);
+        return 0; // no anchor
+    }
+    // partition into small windows
+    int qid, tid;
+    tid = read_id_map[0];
+    radix_sort_ab_128x(mm1.a + mm_c[tid], mm1.a + mm_c[tid+1]); // sort the first target's bucket; later targets were sorted when they served as the query
+
+    par_c[0] = 0;
+    for (i = 1; i < n_seq; ++i) { // pair each read with its predecessor in read_id_map order
+        tid = read_id_map[i-1]; qid = read_id_map[i];
+        ab_u64_v anchors = {0, 0, 0};
+        // collect minimizer hit anchors between t and q
+        collect_anchors1(km, &anchors, mm1, mm_c, tid, qid, seq_lens[qid], abpt->k);
+        // filtering and only keep LIS anchors
+#ifdef __DEBUG__
+        fprintf(stderr, "%d vs %d (tot_n: %ld)\n", tid, qid, anchors.n);
+#endif
+        // alignment mode: different chaining result for global/local/extend
+        abpoa_dp_chaining(km, &anchors, par_anchors, abpt, seq_lens[tid], seq_lens[qid]);
+        par_c[i] = par_anchors->n; // running total: pair i's anchors are par_anchors->a[par_c[i-1] .. par_c[i])
+        kfree(km, anchors.a);
+    }
+
+    kfree(km, mm1.a); free(mm_c); km_destroy(km);
+    return 0; // par_anchors->n;
+}
diff --git a/src/abpoa_seed.h b/src/abpoa_seed.h
new file mode 100644
index 0000000..084f2a0
--- /dev/null
+++ b/src/abpoa_seed.h
@@ -0,0 +1,25 @@
+#ifndef _ABPOA_SEED_H
+#define _ABPOA_SEED_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "abpoa.h"
+
+// emulate 128-bit integers and arrays
+typedef struct { uint64_t x, y; } ab_u128_t; // two 64-bit words
+typedef struct { size_t n, m; ab_u128_t *a; } ab_u128_v; // array of ab_u128_t: n = entries in use; m is presumably the allocated capacity (kvec-style) — confirm
+
+typedef struct { size_t n, m; uint64_t *a; } ab_u64_v; // same layout for plain 64-bit values
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int abpoa_build_guide_tree_partition(uint8_t **seqs, int *seq_lens, int n_seq, abpoa_para_t *abpt, int *read_id_map, ab_u64_v *par_anchors, int *par_c);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/src/abpoa_seq.c b/src/abpoa_seq.c
new file mode 100644
index 0000000..db38b00
--- /dev/null
+++ b/src/abpoa_seq.c
@@ -0,0 +1,660 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "abpoa_seq.h"
+#include "abpoa_align.h"
+#include "abpoa_graph.h"
+#include "utils.h"
+#include "kstring.h"
+#include "khash.h"
+
+KHASH_MAP_INIT_STR(abstr, uint32_t)
+
+// for nt
+// AaCcGgTtNn ==> 0,1,2,3,4
+unsigned char ab_nt4_table[256] = {
+       0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4 /*'-'*/, 4, 4,
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+       4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
+};
+
+// 65,97=>A, 67,99=>C, 71,103=>G, 84,85,116,117=>T, else=>N
+const char ab_nt256_table[256] = {
+       'A', 'C', 'G', 'T',  'N', '-', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', '-',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'A', 'N', 'C',  'N', 'N', 'N', 'G',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'T', 'T', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'A', 'N', 'C',  'N', 'N', 'N', 'G',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'T', 'T', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',
+       'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N',  'N', 'N', 'N', 'N'
+};
+
+// for aa
+// AaCcGgTtNn ... ==> 0,1,2,3,4 ...
+// BbDdEeFf   ... ==> 5,6,7,8 ...
+unsigned char ab_aa26_table[256] = {
+	 0,  1,  2,  3,   4,  5,  6,  7,   8,  9, 10, 11,  12, 13, 14, 15, 
+	16, 17, 18, 19,  20, 21, 22, 23,  24, 25, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26,  0,  5,  1,   6,  7,  8,  2,   9, 10, 11, 12,  13, 14,  4, 15, 
+	16, 17, 18, 19,   3, 20, 21, 22,  23, 24, 25, 26,  26, 26, 26, 26, 
+	26,  0,  5,  1,   6,  7,  8,  2,   9, 10, 11, 12,  13, 14,  4, 15, 
+	16, 17, 18, 19,   3, 20, 21, 22,  23, 24, 25, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26
+};
+
+// 0/1/2/3/4=>ACGTN
+// 5/6/7/8=>BDEF ...
+const char ab_aa256_table[256] = {
+	'A', 'C', 'G', 'T',  'N', 'B', 'D', 'E',  'F', 'H', 'I',  'J', 'K', 'L', 'M', 'O',
+	'P', 'Q', 'R', 'S',  'U', 'V', 'W', 'X',  'Y', 'Z', '*', '-',  '*', '*', '*', '*',
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*', 
+	'*', 'A', 'B', 'C',  'D', 'E', 'F', 'G',  'H', 'I', 'J', 'K',  'L', 'M', 'N', 'O', 
+	'P', 'Q', 'R', 'S',  'T', 'U', 'V', 'W',  'X', 'Y', 'Z', '*',  '*', '*', '*', '*', 
+	'*', 'A', 'B', 'C',  'D', 'E', 'F', 'G',  'H', 'I', 'J', 'K',  'L', 'M', 'N', 'O', 
+	'P', 'Q', 'R', 'S',  'T', 'U', 'V', 'W',  'X', 'Y', 'Z', '*',  '*', '*', '*', '*', 
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*', 
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*', 
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*', 
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*', 
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*', 
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*', 
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*', 
+	'*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*',  '*', '*', '*', '*'
+};
+
+char ab_char26_table[256];
+char ab_char256_table[256];
+
+abpoa_seq_t *abpoa_init_seq(void) { // allocate an empty record container with CHUNK_READ_N slots
+    abpoa_seq_t *abs = (abpoa_seq_t*)_err_malloc(sizeof(abpoa_seq_t));
+    abs->n_seq = 0; abs->m_seq = CHUNK_READ_N; // n_seq: records stored, m_seq: slots allocated
+    abs->seq = (abpoa_str_t*)_err_calloc(abs->m_seq, sizeof(abpoa_str_t)); // presumably zero-filled, so every slot starts with m == 0 (abpoa_free_seq relies on that)
+    abs->name = (abpoa_str_t*)_err_calloc(abs->m_seq, sizeof(abpoa_str_t));
+    abs->comment = (abpoa_str_t*)_err_calloc(abs->m_seq, sizeof(abpoa_str_t));
+    abs->qual = (abpoa_str_t*)_err_calloc(abs->m_seq, sizeof(abpoa_str_t));
+    abs->is_rc = (uint8_t*)_err_calloc(abs->m_seq, sizeof(uint8_t));
+    return abs;
+}
+
+void abpoa_free_seq(abpoa_seq_t *abs) { // release every string buffer, the five parallel arrays, and abs itself
+    int i;
+    for (i = 0; i < abs->m_seq; ++i) { // iterate over all allocated slots, not just the n_seq used ones
+        if (abs->seq[i].m > 0) free(abs->seq[i].s); // m > 0 marks a slot that owns a buffer
+        if (abs->name[i].m > 0) free(abs->name[i].s);
+        if (abs->comment[i].m > 0) free(abs->comment[i].s);
+        if (abs->qual[i].m > 0) free(abs->qual[i].s);
+    }
+    free(abs->seq); free(abs->name); free(abs->comment); free(abs->qual); 
+    free(abs->is_rc); free(abs);
+}
+
+void abpoa_cpy_str(abpoa_str_t *str, char *s, int l) { // store a NUL-terminated copy of s[0..l) in *str
+    if (l > 0) { // for l <= 0 *str is left exactly as it was
+        str->l = l; str->m = l + 1; // m counts the terminating NUL
+        str->s = (char*)_err_malloc(str->m * sizeof(char)); // any buffer previously held by *str is not freed here
+        memcpy(str->s, s, l);
+        str->s[str->l] = 0;
+    }
+}
+
+void abpoa_cpy_seq(abpoa_seq_t *abs, int seq_i, kseq_t *kseq) { // duplicate the four kseq strings into slot seq_i of abs
+    abpoa_cpy_str(&abs->seq[seq_i], kseq->seq.s, kseq->seq.l);
+    abpoa_cpy_str(&abs->name[seq_i], kseq->name.s, kseq->name.l);
+    abpoa_cpy_str(&abs->comment[seq_i], kseq->comment.s, kseq->comment.l);
+    abpoa_cpy_str(&abs->qual[seq_i], kseq->qual.s, kseq->qual.l);
+}
+
+abpoa_seq_t *abpoa_realloc_seq(abpoa_seq_t *abs) { // grow the parallel arrays when they are full; returns abs
+    if (abs->n_seq >= abs->m_seq) {
+        int m_seq = MAX_OF_TWO(abs->n_seq, abs->m_seq << 1); // double the capacity (or jump to n_seq if that is larger)
+        abs->seq = (abpoa_str_t*)_err_realloc(abs->seq, m_seq * sizeof(abpoa_str_t));
+        abs->name = (abpoa_str_t*)_err_realloc(abs->name, m_seq * sizeof(abpoa_str_t));
+        abs->comment = (abpoa_str_t*)_err_realloc(abs->comment, m_seq * sizeof(abpoa_str_t));
+        abs->qual = (abpoa_str_t*)_err_realloc(abs->qual, m_seq * sizeof(abpoa_str_t));
+        abs->is_rc = (uint8_t*)_err_realloc(abs->is_rc, m_seq * sizeof(uint8_t));
+        int i;
+        for (i = abs->m_seq; i < m_seq; ++i) { // mark the new slots empty; their .s pointers stay unset
+            abs->seq[i].l = abs->seq[i].m = 0;
+            abs->name[i].l = abs->name[i].m = 0;
+            abs->comment[i].l = abs->comment[i].m = 0;
+            abs->qual[i].l = abs->qual[i].m = 0;
+            abs->is_rc[i] = 0;
+        }
+        abs->m_seq = m_seq;
+    }
+    return abs;
+}
+
+int abpoa_read_nseq(abpoa_seq_t *abs, kseq_t *kseq, int chunk_read_n) {
+    // Read at most chunk_read_n records from kseq, appending each to abs; returns how many were read.
+    int n_read;
+    for (n_read = 0; n_read < chunk_read_n && kseq_read(kseq) >= 0; ++n_read) {
+        abpoa_realloc_seq(abs); // grows the arrays if slot abs->n_seq does not exist yet
+        abpoa_cpy_seq(abs, abs->n_seq, kseq);
+        abs->n_seq += 1;
+    }
+    return n_read;
+}
+
+int abpoa_read_seq(abpoa_seq_t *abs, kseq_t *kseq) {
+    // Read records from kseq until kseq_read() reports < 0, appending each to abs; returns how many were read.
+    int n_read = 0;
+    for (; kseq_read(kseq) >= 0; ++n_read) {
+        abpoa_realloc_seq(abs); // grows the arrays if slot abs->n_seq does not exist yet
+        abpoa_cpy_seq(abs, abs->n_seq, kseq);
+        abs->n_seq += 1;
+    }
+    return n_read;
+}
+
+static long int _strtol10(const char *str, char **endptr) { // base-10 parse with an optional leading '+'/'-'; no whitespace or overflow handling
+    long int res = 0; unsigned d; int neg;
+    char *s = (char*)str + (str[0] == '-' || str[0] == '+'); // step over the sign so negative 'i' values are not read as 0
+    for (neg = (str[0] == '-'), d = s[0]-'0'; d < 10; ++s, d = s[0]-'0')
+        res = res * 10 + d;
+    if (endptr) *endptr = s; // first character after the digits
+    return neg ? -res : res;
+}
+
+static unsigned long int _strtoul10(const char *str, char **endptr) {
+    // Accumulate the leading decimal digits of str; no sign, whitespace or overflow handling.
+    unsigned long int value = 0;
+    char *cur = (char*)str;
+    while ((unsigned)(cur[0]-'0') < 10) value = value * 10 + (unsigned)(*cur++ - '0');
+    if (endptr) *endptr = cur; // first character that is not a digit
+    return value;
+}
+
+int gfa_aux_parse(char *s, uint8_t **data, int *max) // encodes the tab-separated TAG:TYPE:VALUE fields of s into *data (capacity *max); returns the encoded length
+{
+	char *q, *p;
+	kstring_t str;
+	if (s == 0) return 0;
+	str.l = 0, str.m = *max, str.s = (char*)*data; // append into the caller's buffer
+	if (*s == '\t') ++s;
+	for (p = q = s;; ++p) { // q: start of the current field, p: scan position
+		if (*p == 0 || *p == '\t') {
+			int c = *p;
+			*p = 0; // terminate the field in place (s is modified)
+			if (p - q >= 5 && q[2] == ':' && q[4] == ':' && (q[3] == 'I' || q[3] == 'A' || q[3] == 'i' || q[3] == 'f' || q[3] == 'Z' || q[3] == 'B')) {
+				int type = q[3];
+				kputsn_(q, 2, &str); // two-character tag
+				q += 5; // q now points at the value
+				if (type == 'A') {
+					kputc_('A', &str);
+					kputc_(*q, &str);
+                } else if (type == 'I') {
+					uint32_t x;
+					// x = strtol(q, &q, 10);
+                    x = _strtoul10(q, &q);
+					kputc_(type, &str); kputsn_((char*)&x, 4, &str);
+				} else if (type == 'i') {
+					int32_t x;
+					// x = strtol(q, &q, 10);
+                    x = _strtol10(q, &q);
+					kputc_(type, &str); kputsn_((char*)&x, 4, &str);
+				} else if (type == 'f') {
+					float x;
+					x = strtod(q, &q);
+					kputc_('f', &str); kputsn_(&x, 4, &str);
+				} else if (type == 'Z') {
+					kputc_('Z', &str); kputsn_(q, p - q + 1, &str); // note that this include the trailing NULL
+				} else if (type == 'B') {
+					type = *q++; // q points to the first ',' following the typing byte
+					if (p - q >= 2 && (type == 'c' || type == 'C' || type == 's' || type == 'S' || type == 'i' || type == 'I' || type == 'f')) { // accept only the known subtypes; 'f' was previously tested with '!=', which dropped float arrays and let unknown subtypes through
+						int32_t n;
+						char *r;
+						for (r = q, n = 0; *r; ++r) // element count = number of commas
+							if (*r == ',') ++n;
+						kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str);
+						// TODO: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_()
+						if (type == 'c')      while (q + 1 < p) { int8_t   x = strtol(q + 1, &q, 0); kputc_(x, &str); }
+						else if (type == 'C') while (q + 1 < p) { uint8_t  x = strtol(q + 1, &q, 0); kputc_(x, &str); }
+						else if (type == 's') while (q + 1 < p) { int16_t  x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
+						else if (type == 'S') while (q + 1 < p) { uint16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
+						// else if (type == 'i') while (q + 1 < p) { int32_t  x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
+						else if (type == 'i') while (q + 1 < p) { int32_t  x = _strtol10(q + 1, &q); kputsn_(&x, 4, &str); }
+						// else if (type == 'I') while (q + 1 < p) { uint32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
+						else if (type == 'I') while (q + 1 < p) { uint32_t x = _strtoul10(q + 1, &q); kputsn_(&x, 4, &str); }
+						else if (type == 'f') while (q + 1 < p) { float    x = strtod(q + 1, &q);    kputsn_(&x, 4, &str); }
+					}
+				} // should not be here, as we have tested all types
+			}
+			q = p + 1;
+			if (c == 0) break;
+		}
+	}
+	if (str.l > 0 && str.l == str.m) ks_resize(&str, str.l + 1); // make room for the terminating NUL
+	if (str.s) str.s[str.l] = 0;
+	*max = str.m, *data = (uint8_t*)str.s; // hand the (possibly reallocated) buffer back to the caller
+	return str.l;
+}
+
+static inline int gfa_aux_type2size(int x) // byte width of one value of aux type code x; 0 for any other code
+{
+	if (x == 'C' || x == 'c' || x == 'A') return 1;
+	else if (x == 'S' || x == 's') return 2;
+	else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4; // 'F': callers that upper-case the type byte look floats up under this code
+	else return 0;
+}
+
+#define __skip_tag(s) do { /* advance s past one encoded aux value; on entry s points at the type byte */ \
+		int type = toupper(*(s)); /* the type byte is upper-cased before the tests below */ \
+		++(s); \
+		if (type == 'Z') { while (*(s)) ++(s); ++(s); } /* string: skip through the terminating NUL */ \
+		else if (type == 'B') (s) += 5 + gfa_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); /* array: subtype byte + 4-byte count + payload */ \
+		else (s) += gfa_aux_type2size(type); /* scalar: width looked up from the upper-cased type */ \
+	} while(0)
+
+uint8_t *gfa_aux_get(int l_data, const uint8_t *data, const char tag[2]) // returns a pointer to the type byte of the first entry whose tag matches, or 0 if none
+{
+	const uint8_t *s = data;
+	int y = tag[0]<<8 | tag[1]; // pack the two tag characters into one int for comparison
+	while (s < data + l_data) {
+		int x = (int)s[0]<<8 | s[1];
+		s += 2; // s now points at the entry's type byte
+		if (x == y) return (uint8_t*)s;
+		__skip_tag(s); // not a match: step over this entry's type byte and value
+	}
+	return 0;
+}
+
+int gfa_aux_del(int l_data, uint8_t *data, uint8_t *s) // removes the entry whose type byte is at s (as returned by gfa_aux_get); returns the new data length
+{
+	uint8_t *p;
+	p = s - 2; // start of the entry: its two-character tag
+	__skip_tag(s); // s now points just past the entry
+	memmove(p, s, l_data - (s - data)); // slide the remaining bytes down over it
+	return l_data - (s - p);
+}
+
+int abpoa_gfa_parse_H(abpoa_graph_t *abg, int *n_s, int *n_l, int *n_p, char *s) { // reads the NS/NL/NP integer tags of a GFA header line into *n_s/*n_l/*n_p and sizes abg->node; returns -1 on a malformed line, else 0
+    if (s[1] != '\t' || s[2] == '\0') return -1; // need "H<TAB>" followed by at least one character (NUL test, not the digit '0')
+    int l_aux, m_aux = 0; uint8_t *aux = 0, *info;
+    l_aux = gfa_aux_parse(s + 2, &aux, &m_aux);
+
+    info = gfa_aux_get(l_aux, aux, "NS");
+    if (info == 0 || info[0] != 'i') err_fatal_simple("Error: no \"NS\" tag in GFA header.");
+    *n_s = *(int32_t*)(info+1); // the value follows the type byte
+    abg->node_m = *n_s + 2;
+    abg->node = (abpoa_node_t*)_err_realloc(abg->node, abg->node_m * sizeof(abpoa_node_t)); // element size, not pointer size; NOTE(review): slots added here are not initialised in this function — confirm they are set up before use
+    l_aux = gfa_aux_del(l_aux, aux, info);
+
+    info = gfa_aux_get(l_aux, aux, "NL");
+    if (info == 0 || info[0] != 'i') err_fatal_simple("Error: no \"NL\" tag in GFA header.");
+    *n_l = *(int32_t*)(info+1);
+    l_aux = gfa_aux_del(l_aux, aux, info);
+
+    info = gfa_aux_get(l_aux, aux, "NP");
+    if (info == 0 || info[0] != 'i') err_fatal_simple("Error: no \"NP\" tag in GFA header.");
+    *n_p = *(int32_t*)(info+1);
+    l_aux = gfa_aux_del(l_aux, aux, info);
+
+    if (aux) free(aux);
+    return 0;
+}
+
+typedef struct { // GFA segments collected so far
+    int n, m; // n: segments stored, m: slots allocated in seq/name
+    kstring_t *seq, *name; // parallel arrays: sequence and name of each segment
+    khash_t(abstr) *h; // segment name -> index into seq/name
+} seg_seq_t;
+
+seg_seq_t *seg_seq_init(void) { // allocate an empty segment store with a fresh name hash
+    seg_seq_t *s = (seg_seq_t*)_err_malloc(sizeof(seg_seq_t));
+    s->n = s->m = 0; s->seq = 0, s->name = 0; // arrays are allocated on first use by seg_seq_realloc
+    s->h = kh_init(abstr);
+    return s;
+}
+
+seg_seq_t *seg_seq_realloc(seg_seq_t *r) { // grow seq/name when they are full; returns r
+    if (r->n >= r->m) {
+        int m;
+        if (r->m == 0) m = 1; // first allocation
+        else m = MAX_OF_TWO(r->n, (r->m) << 1); // otherwise double
+        r->seq = (kstring_t*)_err_realloc(r->seq, m * sizeof(kstring_t));
+        r->name = (kstring_t*)_err_realloc(r->name, m * sizeof(kstring_t));
+        int i;
+        for (i = r->m; i < m; ++i) { // zero the new slots so they read as empty strings
+            r->seq[i] = (kstring_t){0,0,0};
+            r->name[i] = (kstring_t){0,0,0};
+        }
+        r->m = m;
+    }
+    return r;
+}
+
+void seg_seq_free(seg_seq_t *s) { // release all segment strings, both arrays, the hash, and s itself
+    if (s->m > 0) {
+        int i;
+        for (i = 0; i < s->m; ++i) { // every allocated slot; m != 0 marks a slot that owns a buffer
+            if (s->seq[i].m) free(s->seq[i].s);
+            if (s->name[i].m) free(s->name[i].s);
+        }
+        free(s->seq); free(s->name);
+    }
+    kh_destroy(abstr, s->h);
+    free(s);
+}
+
+int abpoa_gfa_parse_S(seg_seq_t *segs,  char *s) { // appends the name and sequence of one GFA "S" line to segs; returns -1 on a malformed prefix, else 0
+    if (s[1] != '\t' || s[2] == '\0') return -1;
+    char *deli_s, *info_s, *seq = 0;
+    int i, seq_len, seg_name_len, is_ok = 0;
+    char *seg_name=0;
+
+    for (i = 0, deli_s = info_s = s + 2;; ++deli_s) { // i: field index, info_s: field start, deli_s: scan position
+        if (*deli_s == 0 || *deli_s == '\t') {
+            int c = *deli_s;
+            *deli_s = 0; // terminate the field in place (s is modified)
+            if (i == 0) { // first field: segment name
+                seg_name = info_s;
+                seg_name_len = deli_s - info_s;
+            } else if (i == 1) { // second field: sequence; later fields are ignored
+                seq = info_s;
+                seq_len = deli_s - info_s;
+                is_ok = 1;
+                break;
+            }
+            if (c == 0) break;
+            ++i, info_s = deli_s + 1;
+        }
+    }
+
+    if (is_ok) {
+        seg_seq_realloc(segs); // make sure slot segs->n exists
+        kputsn(seg_name, seg_name_len, segs->name+segs->n);
+        kputsn(seq, seq_len, segs->seq+segs->n);
+        int absent;
+        khint_t pos = kh_put(abstr, segs->h, segs->name[segs->n].s, &absent); // the hash key points at the stored copy of the name
+        if (absent) kh_val(segs->h, pos) = segs->n;
+        else err_fatal(__func__, "Duplicated chromosome: \"%s\".", seg_name);
+        ++segs->n;
+    } else err_fatal(__func__, "Error: no seq in GFA segment line (%s).", seg_name);
+    return 0;
+}
+
+/*int abpoa_gfa_parse_S(abpoa_graph_t *abg, char *s) {
+    if (s[1] != '\t' || s[2] == '\0') return -1;
+    char *deli_s, *info_s, *seq = 0;
+    int i, seq_len, is_ok = 0;
+    char *seg_name=0;
+
+    for (i = 0, deli_s = info_s = s + 2;; ++deli_s) {
+        if (*deli_s == 0 || *deli_s == '\t') {
+            int c = *deli_s;
+            *deli_s = 0;
+            if (i == 0) {
+                seg_name = info_s;
+                abpoa_realloc_seq(seg_names);
+                abpoa_cpy_str(seg_names->name+seg_names->n_seq, seg_name, deli_s - info_s);
+                seg_names->n_seq++;
+            } else if (i == 1) {
+                seq = info_s;
+                seq_len = deli_s - info_s;
+                is_ok = 1;
+                break;
+            }
+            if (c == 0) break;
+            ++i, info_s = deli_s + 1;
+        }
+    }
+
+    if (is_ok) {
+        int seg_id, absent;
+        for (i = 0; i < seq_len; ++i) {
+            seg_id = abpoa_add_graph_node(abg, ab_char26_table[(int)(seq[i])]);
+            if (i == 0) {
+                khint_t pos = kh_put(str, seg_name2in_id, seg_names->name[seg_names->n_seq-1].s, &absent);
+                if (absent) kh_val(seg_name2in_id, pos) = seg_id;
+                else err_fatal(__func__, "Error: duplicated seg name (%s).", seg_name);
+            }
+            if (i == seq_len-1) {
+                khint_t pos = kh_put(str, seg_name2out_id,  seg_names->name[seg_names->n_seq-1].s, &absent);
+                if (absent) kh_val(seg_name2out_id, pos) = seg_id;
+                else err_fatal(__func__, "Error: duplicated seg name (%s).", seg_name);
+            }
+        }
+    } else err_fatal(__func__, "Error: no seq in GFA segment line (%s).", seg_name);
+    return 0;
+}*/
+
+int abpoa_gfa_parse_P(abpoa_graph_t *abg, abpoa_seq_t *abs, seg_seq_t *segs, int add_read_id, int p_i, int p_n, khash_t(abstr) *seg_name2in_id, khash_t(abstr) *seg_name2out_id, char *s) {
+    // Parse one GFA "P" line in place: add the nodes (on first use of a segment) and edges of the path to abg as read p_i, then append the path name and strand to abs. Returns 0, or -1 when the line has no tab-separated fields; every other error is fatal.
+    if (s[1] != '\t' || s[2] == '\0') return -1;
+    char *deli_s, *info_s, *path = 0, *path_name=0;
+    int i, is_ok = 0, is_rc = -1, path_name_len=0;
+    kstring_t *seg_seq, *seg_name; int read_ids_n = 1 + ((p_n-1) >> 6);
+
+    for (i = 0, deli_s = info_s = s + 2;; ++deli_s) { // split off field 0 (path name) and field 1 (segment list)
+        if (*deli_s == 0 || *deli_s == '\t') {
+            int c = *deli_s;
+            *deli_s = 0;
+            if (i == 0) {
+                path_name = info_s;
+                path_name_len = deli_s - info_s;
+            } else if (i == 1) {
+                path = info_s;
+                is_ok = 1;
+                break;
+            }
+            if (c == 0) break;
+            ++i, info_s = deli_s + 1;
+        }
+    }
+
+    if (is_ok) {
+        char *deli_s, *info_s, *_seg_name; khint_t pos, seg_pos; int absent;
+        int id, in_id=-1, out_id=-1, last_id = ABPOA_SRC_NODE_ID, next_id = ABPOA_SINK_NODE_ID;
+        for (deli_s = info_s = path; ; ++deli_s) { // each segment is "<name>+" or "<name>-"; the orientation sign ends the name
+            if (*deli_s == '+') {
+                if (is_rc == 1) err_fatal(__func__, "Error: path has both \'+\' and \'-\' seg. (%s)", path_name);
+                is_rc = 0; *deli_s = 0; _seg_name = info_s;
+                seg_pos = kh_get(abstr, segs->h, _seg_name);
+                if (seg_pos == kh_end(segs->h)) err_fatal(__func__, "Error: seg (%s) not exist.", info_s);
+                seg_name = segs->name + kh_val(segs->h, seg_pos);
+                seg_seq = segs->seq + kh_val(segs->h, seg_pos);
+
+                // check if seg already exist
+                pos = kh_put(abstr, seg_name2in_id, seg_name->s, &absent);
+                if (absent) { // add node for seg_seq
+                    for (i = 0; i < (int)seg_seq->l; ++i) { // NOTE(review): an empty segment leaves in_id/out_id at their previous values
+                        id = abpoa_add_graph_node(abg, ab_char26_table[(int)(seg_seq->s[i])]);
+                        if (i == 0) in_id = id;
+                        if (i == (int)seg_seq->l-1) out_id = id;
+                    }
+                    kh_val(seg_name2in_id, pos) = in_id;
+                    pos = kh_put(abstr, seg_name2out_id, seg_name->s, &absent);
+                    kh_val(seg_name2out_id, pos) = out_id;
+                } else {
+                    in_id = kh_val(seg_name2in_id, pos);
+                    pos = kh_get(abstr, seg_name2out_id, seg_name->s); // pure lookup: the key was stored together with the in id
+                    out_id = kh_val(seg_name2out_id, pos);
+                }
+                // add edge
+                abpoa_add_graph_edge(abg, last_id, in_id, 1, 1, add_read_id, 0, p_i, read_ids_n, p_n);
+                if (in_id < out_id) {
+                    for (i = 0; i < out_id - in_id; ++i)
+                        abpoa_add_graph_edge(abg, in_id+i, in_id+i+1, 1, 1, add_read_id, 0, p_i, read_ids_n, p_n);
+                } else if (in_id > out_id) err_fatal(__func__, "Error: in_id (%d) > out_id (%d).", in_id, out_id);
+
+                last_id = out_id;
+                info_s = deli_s + 2; // skip the sign and the separator that follows it
+            } else if (*deli_s == '-') { // reverse path: segments are linked backwards, starting from the sink
+                if (is_rc == 0) err_fatal(__func__, "Error: path has both \'+\' and \'-\' seg. (%s)", path_name);
+                is_rc = 1; *deli_s = 0; _seg_name = info_s;
+                seg_pos = kh_get(abstr, segs->h, _seg_name);
+                if (seg_pos == kh_end(segs->h)) err_fatal(__func__, "Error: seg (%s) not exist.", info_s);
+                seg_name = segs->name + kh_val(segs->h, seg_pos);
+                seg_seq = segs->seq + kh_val(segs->h, seg_pos);
+
+                // check if seg exist
+                pos = kh_put(abstr, seg_name2in_id, seg_name->s, &absent);
+                if (absent) { // add node for seg_seq
+                    for (i = 0; i < (int)seg_seq->l; ++i) {
+                        id = abpoa_add_graph_node(abg, ab_char26_table[(int)(seg_seq->s[i])]);
+                        if (i == 0) in_id = id;
+                        if (i == (int)seg_seq->l-1) out_id = id;
+                    }
+                    kh_val(seg_name2in_id, pos) = in_id;
+                    pos = kh_put(abstr, seg_name2out_id, seg_name->s, &absent);
+                    kh_val(seg_name2out_id, pos) = out_id;
+                } else { // an iterator is only valid for its own table: the two hashes may be resized at different times
+                    in_id = kh_val(seg_name2in_id, pos);
+                    pos = kh_get(abstr, seg_name2out_id, seg_name->s); out_id = kh_val(seg_name2out_id, pos);
+                }
+                // add edge
+                abpoa_add_graph_edge(abg, out_id, next_id, 1, 1, add_read_id, 0, p_i, read_ids_n, p_n);
+                if (in_id < out_id) {
+                    for (i = 0; i < out_id - in_id; ++i)
+                        abpoa_add_graph_edge(abg, in_id+i, in_id+i+1, 1, 1, add_read_id, 0, p_i, read_ids_n, p_n);
+                } else if (in_id > out_id) err_fatal(__func__, "Error: in_id (%d) > out_id (%d).", in_id, out_id);
+
+                next_id = in_id;
+                info_s = deli_s + 2;
+            } else if (*deli_s == 0 || *deli_s == '\t') break;
+        }
+        if (is_rc) abpoa_add_graph_edge(abg, ABPOA_SRC_NODE_ID, next_id, 1, 1, add_read_id, 0, p_i, read_ids_n, p_n);
+        else abpoa_add_graph_edge(abg, last_id, ABPOA_SINK_NODE_ID, 1, 1, add_read_id, 0, p_i, read_ids_n, p_n);
+        // set abs
+        abpoa_realloc_seq(abs);
+        abpoa_cpy_str(abs->name+abs->n_seq, path_name, path_name_len);
+        abs->is_rc[abs->n_seq] = is_rc; abs->n_seq++;
+    } else err_fatal(__func__, "Error: no path in GFA path line (%s).", path_name);
+    return 0;
+}
+
+int abpoa_fa_parse_seq(abpoa_graph_t *abg, abpoa_seq_t *abs, kstring_t *seq, kstring_t *name, int add_read_id, int p_i, int p_n, int **rank2node_id) { // thread one FASTA row (with '-' gaps) through the graph as read p_i and append its name to abs; returns 0
+    if (*rank2node_id == 0) {
+        *rank2node_id = (int*)_err_calloc(seq->l, sizeof(int)); // column -> node id table, sized by the row seen on the first call; 0 means "no node yet"
+    } // NOTE(review): nothing in this function checks that later rows are no longer than that first row
+    char *s = seq->s;
+    int32_t read_ids_n = 1 + ((p_n-1) >> 6);
+    int32_t i, rank, last_id = ABPOA_SRC_NODE_ID, cur_id, aln_id; uint8_t base;
+    for (i = 0; s[i]; ++i) {
+        if (s[i] == '-') continue; // gap
+        else {
+            base = ab_char26_table[(int)(s[i])];
+            rank = i; // the column index identifies the alignment position
+            cur_id = (*rank2node_id)[rank];
+            if (cur_id == 0) { // first base seen in this column: create its node
+                cur_id = abpoa_add_graph_node(abg, base);
+                (*rank2node_id)[rank] = cur_id;
+            } else {
+                if (abg->node[cur_id].base != base) { // column node carries another base: use the aligned node for this base
+                    aln_id = abpoa_get_aligned_id(abg, cur_id, base);
+                    if (aln_id == -1) { // no such aligned node yet: create and register one
+                        aln_id = abpoa_add_graph_node(abg, base);
+                        abpoa_add_graph_aligned_node(abg, cur_id, aln_id);
+                    }
+                    cur_id = aln_id;
+                }
+            }
+            abpoa_add_graph_edge(abg, last_id, cur_id, 1, 1, add_read_id, 0, p_i, read_ids_n, p_n);
+            last_id = cur_id;
+        }
+    }
+    abpoa_add_graph_edge(abg, last_id, ABPOA_SINK_NODE_ID, 1, 1, add_read_id, 0, p_i, read_ids_n, p_n); // close the read at the sink
+    abpoa_realloc_seq(abs);
+    abpoa_cpy_str(abs->name + abs->n_seq, name->s, name->l); abs->n_seq++;
+    return 0;
+}
+
+abpoa_t *abpoa_restore_graph(abpoa_t *ab, abpoa_para_t *abpt) {
+    // Rebuild ab's graph and read list from abpt->incr_fn ("-" reads stdin): GFA S/P lines, or FASTA rows once a '>' line is seen. Returns ab (unchanged when no file is set), or NULL when the file cannot be opened.
+    char *fn = abpt->incr_fn;
+    if (fn == NULL) return ab;
+    gzFile fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r"); if (fp == 0) return NULL;
+    kstream_t *ks = ks_init(fp); kstring_t s={0,0,0}; int dret, line_n=0, read_name_e;
+    seg_seq_t *seqs = seg_seq_init();
+    khash_t(abstr) *seg_name2in_id = kh_init(abstr), *seg_name2out_id = kh_init(abstr);
+    int add_read_id = abpt->use_read_ids;
+    int p_i = -1, is_fa = 0, *rank2node_id=0; size_t msa_l = 0; // msa_l: length of the row that sized rank2node_id
+    abpoa_graph_t *abg = ab->abg; abpoa_seq_t *abs = ab->abs;
+    while (ks_getuntil(ks, KS_SEP_LINE, &s, &dret) >= 0) {
+        line_n++;
+        int ret = 0;
+        if (is_fa) {
+            if (s.l > 0 && s.s[0] == '>') { // name
+                kstring_t *row = seqs->seq + seqs->n;
+                if (row->l > 0) { // flush the record that this header terminates
+                    if (rank2node_id == 0) msa_l = row->l; // the first row determines the size of rank2node_id
+                    else if (row->l > msa_l) err_fatal(__func__, "Error: sequence \"%s\" is longer than the first sequence in the MSA.", seqs->name[seqs->n].s);
+                    ret = abpoa_fa_parse_seq(abg, abs, row, seqs->name+seqs->n, add_read_id, p_i, p_i+1, &rank2node_id);
+                    seqs->n++; p_i++;
+                } else seqs->name[seqs->n].l = 0; // header without sequence: drop its name, reuse its slot and read id
+                // store the new record's name: header text up to the first whitespace
+                read_name_e = 1;
+                while (read_name_e < (int)s.l && !isspace(s.s[read_name_e])) read_name_e++;
+                seg_seq_realloc(seqs);
+                kputsn(s.s+1, read_name_e-1, seqs->name + seqs->n);
+            } else { // seq
+                // sequence line: append to the current record, so multi-line rows are supported
+                kputsn(s.s, s.l, seqs->seq + seqs->n);
+            }
+        } else {
+            if (s.l > 0 && s.s[0] == '>') { // first FASTA header: switch to FASTA mode for the rest of the file
+                read_name_e = 1;
+                while (read_name_e < (int)s.l && !isspace(s.s[read_name_e])) read_name_e++;
+                seg_seq_realloc(seqs);
+                kputsn(s.s+1, read_name_e-1, seqs->name + seqs->n);
+                is_fa = 1; p_i++;
+            }
+            // GFA mode: only S- and P-lines are used, every other line is ignored
+            // a P-line can only reference segments whose S-line has already been read
+            else if (s.s[0] == 'S') ret = abpoa_gfa_parse_S(seqs, s.s); // remember segment name and sequence
+            // each P-line becomes one restored read with read id p_i
+            else if (s.s[0] == 'P') {
+                p_i++;
+                ret = abpoa_gfa_parse_P(abg, abs, seqs, add_read_id, p_i, p_i+1, seg_name2in_id, seg_name2out_id, s.s);
+            }
+        }
+        if (ret < 0) err_fatal(__func__, "Error in %c-line at line %ld (error code %d)", s.s[0], (long)line_n, ret);
+    }
+    if (is_fa && seqs->seq[seqs->n].l > 0) { // last seq; skipped when the file ends on a header without sequence
+        if (rank2node_id != 0 && seqs->seq[seqs->n].l > msa_l) err_fatal(__func__, "Error: sequence \"%s\" is longer than the first sequence in the MSA.", seqs->name[seqs->n].s);
+        abpoa_fa_parse_seq(abg, abs, seqs->seq+seqs->n, seqs->name+seqs->n, add_read_id, p_i, p_i+1, &rank2node_id); seqs->n++;
+    }
+    if (s.m) free(s.s);
+    ks_destroy(ks); gzclose(fp);
+    seg_seq_free(seqs); kh_destroy(abstr, seg_name2in_id); kh_destroy(abstr, seg_name2out_id);
+    if (rank2node_id) free(rank2node_id);
+    if (abs->n_seq == 0) {
+        err_func_printf(__func__, "Warning: no graph/sequence restored from file \'%s\'.\n", fn);
+        abg->node_n = 2;
+    }
+    abg->is_called_cons = abg->is_set_msa_rank = abg->is_topological_sorted = 0;
+    return ab;
+}
diff --git a/src/abpoa_seq.h b/src/abpoa_seq.h
new file mode 100644
index 0000000..751175c
--- /dev/null
+++ b/src/abpoa_seq.h
@@ -0,0 +1,23 @@
+#ifndef _ABPOA_SEQ_H
+#define _ABPOA_SEQ_H
+#include <zlib.h>
+#include "abpoa.h"
+#include "kseq.h"
+
+KSEQ_INIT(gzFile, gzread)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+abpoa_seq_t *abpoa_realloc_seq(abpoa_seq_t *abs);
+void abpoa_cpy_str(abpoa_str_t *str, char *s, int l);
+abpoa_seq_t *abpoa_init_seq(void);
+void abpoa_free_seq(abpoa_seq_t *abs);
+int abpoa_read_seq(abpoa_seq_t *abs, kseq_t *kseq);
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/src/kalloc.c b/src/kalloc.c
new file mode 100644
index 0000000..8499552
--- /dev/null
+++ b/src/kalloc.c
@@ -0,0 +1,205 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "kalloc.h"
+
+/* In kalloc, a *core* is a large chunk of contiguous memory. Each core is
+ * associated with a master header, which keeps the size of the current core
+ * and the pointer to next core. Kalloc allocates small *blocks* of memory from
+ * the cores and organizes free memory blocks in a circular single-linked list.
+ *
+ * In the following diagram, "@" stands for the header of a free block (of type
+ * header_t), "#" for the header of an allocated block (of type size_t), "-"
+ * for free memory, and "+" for allocated memory.
+ *
+ * master        This region is core 1.          master           This region is core 2.
+ *      |                                             |
+ *      *@-------#++++++#++++++++++++@--------        *@----------#++++++++++++#+++++++@------------
+ *       |                           |                 |                               |
+ *       p=p->ptr->ptr->ptr->ptr     p->ptr            p->ptr->ptr                     p->ptr->ptr->ptr
+ */
+typedef struct header_t { /* header of a free block; the first unit of every core reuses it as the core header */
+	size_t size; /* size of the block (or core), counted in sizeof(header_t) units */
+	struct header_t *ptr; /* next free block in the circular list; in a core header, the next core */
+} header_t;
+
+typedef struct { /* state of one allocator instance */
+	void *par; /* allocator handle used to obtain and release cores and this struct (NULL: libc) */
+	size_t min_core_size; /* core sizes are rounded up to a multiple of this many units */
+	header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */
+} kmem_t;
+
+static void panic(const char *s)
+{ /* unrecoverable allocator error: print the message and a newline to stderr, then abort */
+	fputs(s, stderr); fputc('\n', stderr);
+	abort();
+}
+
+void *km_init2(void *km_par, size_t min_core_size)
+{ /* create an allocator whose memory comes from km_par (NULL: libc); min_core_size 0 selects the default 0x80000 */
+	kmem_t *km = (kmem_t*)kcalloc(km_par, 1, sizeof(kmem_t));
+	if (km == NULL) panic("[km_init2] insufficient memory"); /* calloc can fail when km_par is NULL; do not dereference NULL */
+	km->par = km_par;
+	km->min_core_size = min_core_size > 0? min_core_size : 0x80000;
+	return (void*)km;
+}
+
+void *km_init(void) { return km_init2(0, 0); } /* allocator with no parent and the default core size */
+
+void km_destroy(void *_km)
+{ /* hand every core, then the allocator itself, back to the parent (libc when there is none) */
+	kmem_t *km = (kmem_t*)_km;
+	header_t *core, *next;
+	void *parent;
+	if (!km) return;
+	parent = km->par;
+	/* core headers form a NULL-terminated singly linked list */
+	for (core = km->core_head; core != NULL; core = next) {
+		next = core->ptr; /* read the link before the core is released */
+		kfree(parent, core);
+	}
+	kfree(parent, km);
+}
+
+static header_t *morecore(kmem_t *km, size_t nu)
+{ /* obtain a core with room for at least nu units from the parent and donate it to the free list; returns the new loop head */
+	header_t *q;
+	size_t bytes, *p;
+	nu = (nu + 1 + (km->min_core_size - 1)) / km->min_core_size * km->min_core_size; /* the first +1 for core header */
+	bytes = nu * sizeof(header_t);
+	q = (header_t*)kmalloc(km->par, bytes);
+	if (!q) panic("[morecore] insufficient memory");
+	q->ptr = km->core_head, q->size = nu, km->core_head = q; /* push the core onto the core list */
+	p = (size_t*)(q + 1); /* the unit right after the core header */
+	*p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */
+	kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */
+	return km->loop_head;
+}
+
+void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */
+{
+	header_t *p, *q;
+	kmem_t *km = (kmem_t*)_km;
+	/* Return block ap to allocator _km (libc free when _km is NULL); freeing NULL is a no-op. */
+	if (!ap) return;
+	if (km == NULL) {
+		free(ap);
+		return;
+	}
+	p = (header_t*)((size_t*)ap - 1); /* the size word stored just before ap becomes the start of a free-block header */
+	p->size = *((size_t*)ap - 1); /* size is the first member of header_t, so this reads and writes the same word */
+	/* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
+	 *
+	 * a) "p>q && p<q->ptr": @------#++++++++#+++++++@-------    @---------------#+++++++@-------
+	 *    (can also be in    |      |                |        -> |                       |
+	 *     two cores)        q      p           q->ptr           q                  q->ptr
+	 *
+	 *                       @--------    #+++++++++@--------    @--------    @------------------
+	 *                       |            |         |         -> |            |
+	 *                       q            p    q->ptr            q       q->ptr
+	 *
+	 * b) "q>=q->ptr && (p>q || p<q->ptr)":  @-------#+++++   @--------#+++++++     @-------#+++++   @----------------
+	 *                                       |                |        |         -> |                |
+	 *                                  q->ptr                q        p       q->ptr                q
+	 *
+	 *                                       #+++++++@-----   #++++++++@-------     @-------------   #++++++++@-------
+	 *                                       |       |                 |         -> |                         |
+	 *                                       p  q->ptr                 q       q->ptr                         q
+	 */
+	for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr)
+		if (q >= q->ptr && (p > q || p < q->ptr)) break;
+	if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
+		p->size += q->ptr->size;
+		p->ptr = q->ptr->ptr;
+	} else if (p + p->size > q->ptr && q->ptr >= p) {
+		panic("[kfree] The end of the allocated block enters a free block.");
+	} else p->ptr = q->ptr; /* backup q->ptr */
+
+	if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */
+		q->size += p->size;
+		q->ptr = p->ptr;
+		km->loop_head = q;
+	} else if (q + q->size > p && p >= q) {
+		panic("[kfree] The end of a free block enters the allocated block.");
+	} else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */
+}
+
+void *kmalloc(void *_km, size_t n_bytes)
+{ /* allocate n_bytes from allocator _km (libc malloc when _km is NULL); a zero-byte request returns NULL */
+	kmem_t *km = (kmem_t*)_km;
+	size_t n_units;
+	header_t *p, *q;
+
+	if (n_bytes == 0) return 0;
+	if (km == NULL) return malloc(n_bytes);
+	n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t); /* header+n_bytes requires at least this number of units */
+
+	if (!(q = km->loop_head)) /* the first time when kmalloc() is called, initialize it */
+		q = km->loop_head = km->base.ptr = &km->base;
+	for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */
+		if (p->size >= n_units) { /* p->size is the size of current block. This line means the current block is large enough. */
+			if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */
+			else { /* split the block. NB: memory is allocated at the end of the block! */
+				p->size -= n_units; /* reduce the size of the free block */
+				p += p->size; /* p points to the allocated block */
+				*(size_t*)p = n_units; /* set the size */
+			}
+			km->loop_head = q; /* set the end of chain */
+			return (size_t*)p + 1; /* user memory starts right after the size word */
+		}
+		if (p == km->loop_head) { /* then ask for more "cores" */
+			if ((p = morecore(km, n_units)) == 0) return 0;
+		}
+	}
+}
+
+void *kcalloc(void *_km, size_t count, size_t size)
+{ /* zero-filled array of count elements of size bytes; NULL for an empty request or when count*size overflows */
+	void *p;
+	if (size == 0 || count == 0) return 0;
+	if (_km == NULL) return calloc(count, size); /* no allocator handle: defer to libc */
+	if (count > (size_t)-1 / size) return 0; /* count*size would wrap and under-allocate before the memset */
+	p = kmalloc(_km, count * size);
+	memset(p, 0, count * size);
+	return p;
+}
+
+void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle
+{ /* resize block ap to n_bytes; n_bytes == 0 frees ap and returns NULL, ap == NULL behaves like kmalloc */
+	kmem_t *km = (kmem_t*)_km;
+	size_t cap, *p, *q;
+
+	if (n_bytes == 0) {
+		kfree(km, ap); return 0;
+	}
+	if (km == NULL) return realloc(ap, n_bytes);
+	if (ap == NULL) return kmalloc(km, n_bytes);
+	p = (size_t*)ap - 1; /* size word stored in front of the user memory */
+	cap = (*p) * sizeof(header_t) - sizeof(size_t); /* usable bytes of the current block */
+	if (cap >= n_bytes) return ap; /* TODO: this prevents shrinking */
+	q = (size_t*)kmalloc(km, n_bytes);
+	memcpy(q, ap, cap); /* cap < n_bytes here, so the whole old payload fits */
+	kfree(km, ap);
+	return q;
+}
+
+void km_stat(const void *_km, km_stat_t *s)
+{ /* fill *s with free-list and core statistics; *s stays all zero for a NULL or never-used allocator */
+	kmem_t *km = (kmem_t*)_km;
+	header_t *p;
+	memset(s, 0, sizeof(km_stat_t));
+	if (km == NULL || km->loop_head == NULL) return;
+	for (p = km->loop_head;; p = p->ptr) { /* one full turn around the circular free list */
+		s->available += p->size * sizeof(header_t);
+		if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the blocks in the loop. It is zero-sized. */
+		if (p->ptr > p && p + p->size > p->ptr)
+			panic("[km_stat] The end of a free block enters another free block.");
+		if (p->ptr == km->loop_head) break;
+	}
+	for (p = km->core_head; p != NULL; p = p->ptr) { /* walk the list of cores */
+		size_t size = p->size * sizeof(header_t);
+		++s->n_cores;
+		s->capacity += size;
+		s->largest = s->largest > size? s->largest : size;
+	}
+}
diff --git a/src/kalloc.h b/src/kalloc.h
new file mode 100644
index 0000000..8c190b1
--- /dev/null
+++ b/src/kalloc.h
@@ -0,0 +1,38 @@
+#ifndef _KALLOC_H_
+#define _KALLOC_H_
+
+#include <stddef.h> /* for size_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+	size_t capacity, available, n_blocks, n_cores, largest;
+} km_stat_t;
+
+void *kmalloc(void *km, size_t size);
+void *krealloc(void *km, void *ptr, size_t size);
+void *kcalloc(void *km, size_t count, size_t size);
+void kfree(void *km, void *ptr);
+
+void *km_init(void);
+void *km_init2(void *km_par, size_t min_core_size);
+void km_destroy(void *km);
+void km_stat(const void *_km, km_stat_t *s);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#define KMALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kmalloc((km), (len) * sizeof(*(ptr)))) /* ptr = array of len elements, element size taken from *ptr */
+#define KCALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kcalloc((km), (len), sizeof(*(ptr)))) /* same through kcalloc */
+#define KREALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))krealloc((km), (ptr), (len) * sizeof(*(ptr)))) /* resize ptr to len elements */
+/* KEXPAND: set capacity m to 16 when it is below 4, otherwise grow it by half, then resize array a to m elements. */
+#define KEXPAND(km, a, m) do { \
+		(m) = (m) >= 4? (m) + ((m)>>1) : 16; \
+		KREALLOC((km), (a), (m)); \
+	} while (0)
+
+#endif
diff --git a/src/kdq.h b/src/kdq.h
new file mode 100644
index 0000000..edd55b5
--- /dev/null
+++ b/src/kdq.h
@@ -0,0 +1,128 @@
+#ifndef __AC_KDQ_H
+#define __AC_KDQ_H
+
+#include <stdlib.h>
+#include <string.h>
+
+#define __KDQ_TYPE(type) \
+	typedef struct { \
+		size_t front:58, bits:6, count, mask; \
+		type *a; \
+	} kdq_##type##_t;
+
+#define kdq_t(type) kdq_##type##_t
+#define kdq_size(q) ((q)->count)
+#define kdq_first(q) ((q)->a[(q)->front])
+#define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask])
+#define kdq_at(q, i) ((q)->a[((q)->front + (i)) & (q)->mask])
+
+#define __KDQ_IMPL(type, SCOPE) /* emits the deque functions for element type `type`; the buffer always holds 1<<bits elements */ \
+	SCOPE kdq_##type##_t *kdq_init_##type() \
+	{ \
+		kdq_##type##_t *q; \
+		q = (kdq_##type##_t*)calloc(1, sizeof(kdq_##type##_t)); \
+		q->bits = 2, q->mask = (1ULL<<q->bits) - 1; \
+		q->a = (type*)malloc((1<<q->bits) * sizeof(type)); \
+		return q; \
+	} \
+	SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \
+	{ \
+		if (q == 0) return; \
+		free(q->a); free(q); \
+	} \
+	SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) /* set the capacity to 1<<new_bits, raised when that cannot hold count elements; returns the bits in effect */ \
+	{ \
+		size_t new_size = 1ULL<<new_bits, old_size = 1ULL<<q->bits; \
+		if (new_size < q->count) { /* not big enough */ \
+			int i; \
+			for (i = 0; i < 64; ++i) \
+				if (1ULL<<i > q->count) break; \
+			new_bits = i, new_size = 1ULL<<new_bits; \
+		} \
+		if (new_bits == q->bits) return q->bits; /* unchanged */ \
+		if (new_bits > q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); \
+		if (q->front + q->count <= old_size) { /* unwrapped */ \
+			if (q->front + q->count > new_size) /* only happens for shrinking: pack the elements at the start of the buffer */ \
+				memmove(q->a, q->a + q->front, q->count * sizeof(type)), q->front = 0; \
+		} else { /* wrapped */ \
+			memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \
+			q->front = new_size - (old_size - q->front); \
+		} \
+		if (new_bits < q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); /* shrink after the data moved; must be tested before bits is overwritten */ \
+		q->bits = new_bits, q->mask = (1ULL<<q->bits) - 1, q->front &= q->mask; /* the mask on front only matters for an empty queue whose front sat at new_size */ \
+		return q->bits; \
+	} \
+	SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) /* append a slot at the back and return its address */ \
+	{ \
+		if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
+		return &q->a[((q->count++) + q->front) & (q)->mask]; \
+	} \
+	SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) /* append v at the back */ \
+	{ \
+		if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
+		q->a[((q->count++) + q->front) & (q)->mask] = v; \
+	} \
+	SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) /* prepend a slot at the front and return its address */ \
+	{ \
+		if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
+		++q->count; \
+		q->front = q->front? q->front - 1 : (1ULL<<q->bits) - 1; \
+		return &q->a[q->front]; \
+	} \
+	SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) /* prepend v at the front */ \
+	{ \
+		type *p; \
+		p = kdq_unshiftp_##type(q); \
+		*p = v; \
+	} \
+	SCOPE type *kdq_pop_##type(kdq_##type##_t *q) /* remove the last element; returns its address, or 0 when empty */ \
+	{ \
+		return q->count? &q->a[((--q->count) + q->front) & q->mask] : 0; \
+	} \
+	SCOPE type *kdq_shift_##type(kdq_##type##_t *q) /* remove the first element; returns its address, or 0 when empty */ \
+	{ \
+		type *d = 0; \
+		if (q->count == 0) return 0; \
+		d = &q->a[q->front++]; \
+		q->front &= q->mask; \
+		--q->count; \
+		return d; \
+	}
+
+#define KDQ_INIT2(type, SCOPE) \
+	__KDQ_TYPE(type) \
+	__KDQ_IMPL(type, SCOPE)
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+#define KDQ_INIT(type) KDQ_INIT2(type, static inline klib_unused)
+
+#define KDQ_DECLARE(type) \
+	__KDQ_TYPE(type) \
+	kdq_##type##_t *kdq_init_##type(); \
+	void kdq_destroy_##type(kdq_##type##_t *q); \
+	int kdq_resize_##type(kdq_##type##_t *q, int new_bits); \
+	type *kdq_pushp_##type(kdq_##type##_t *q); \
+	void kdq_push_##type(kdq_##type##_t *q, type v); \
+	type *kdq_unshiftp_##type(kdq_##type##_t *q); \
+	void kdq_unshift_##type(kdq_##type##_t *q, type v); \
+	type *kdq_pop_##type(kdq_##type##_t *q); \
+	type *kdq_shift_##type(kdq_##type##_t *q);
+
+#define kdq_init(type) kdq_init_##type()
+#define kdq_destroy(type, q) kdq_destroy_##type(q)
+#define kdq_resize(type, q, new_bits) kdq_resize_##type(q, new_bits)
+#define kdq_pushp(type, q) kdq_pushp_##type(q)
+#define kdq_push(type, q, v) kdq_push_##type(q, v)
+#define kdq_pop(type, q) kdq_pop_##type(q)
+#define kdq_unshiftp(type, q) kdq_unshiftp_##type(q)
+#define kdq_unshift(type, q, v) kdq_unshift_##type(q, v)
+#define kdq_shift(type, q) kdq_shift_##type(q)
+
+#endif
diff --git a/src/khash.h b/src/khash.h
new file mode 100644
index 0000000..6373a93
--- /dev/null
+++ b/src/khash.h
@@ -0,0 +1,615 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+	int ret, is_missing;
+	khiter_t k;
+	khash_t(32) *h = kh_init(32);
+	k = kh_put(32, h, 5, &ret);
+	kh_value(h, k) = 10;
+	k = kh_get(32, h, 10);
+	is_missing = (k == kh_end(h));
+	k = kh_get(32, h, 5);
+	kh_del(32, h, k);
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) kh_value(h, k) = 1;
+	kh_destroy(32, h);
+	return 0;
+}
+*/
+
+/*
+  2013-05-02 (0.2.8):
+
+	* Use quadratic probing. When the capacity is power of 2, stepping function
+	  i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+	  hashing on cache performance and is more robust than linear probing.
+
+	  In theory, double hashing should be more robust than quadratic probing.
+	  However, my implementation is probably not for large hash tables, because
+	  the second hash function is closely tied to the first hash function,
+	  which reduce the effectiveness of double hashing.
+
+	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Allow to optionally use linear probing which usually has better
+	  performance for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
+  2008-09-19 (0.2.3):
+
+	* Corrected the example
+	* Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+	* Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+	* Added kh_clear()
+	* Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+	* Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+	* Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+	* Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "kalloc.h"
+
+/* Compiler-specific configuration: fixed-width integer types, inline keyword, unused attribute. */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif /* NOTE: khint32_t stays undefined if neither unsigned int nor unsigned long is 32 bits wide */
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+typedef khint32_t khint_t; /* bucket index / bucket count type */
+typedef khint_t khiter_t; /* iterator type: a plain bucket index */
+/* Bucket state: 2 bits per bucket, 16 buckets per 32-bit flag word; bit value 2 = empty, bit value 1 = deleted. */
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+/* Number of 32-bit flag words needed for m buckets (at least one). */
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+/* Round the lvalue x up, in place, to a power of two; a power of two is left unchanged. */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+/* Fraction of n_buckets used to compute upper_bound, the occupancy level at which kh_put resizes. */
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) /* declares the table struct kh_<name>_t */ \
+	typedef struct kh_##name##_s { \
+		khint_t n_buckets, size, n_occupied, upper_bound; /* size: live entries; n_occupied: live entries plus deleted buckets since the last resize/clear; upper_bound: n_occupied level at which kh_put resizes */ \
+		khint32_t *flags; /* 2 state bits per bucket, read with __ac_isempty()/__ac_isdel() */ \
+		khkey_t *keys; \
+		khval_t *vals; /* only (re)allocated by kh_resize when kh_is_map is non-zero */ \
+	} kh_##name##_t;
+/* extern prototypes matching the functions that __KHASH_IMPL generates for the same name. */
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
+	extern kh_##name##_t *kh_init_##name(void);							\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	extern void kh_clear_##name(kh_##name##_t *h);						\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	SCOPE kh_##name##_t *kh_init_##name(void) { /* new empty table: zero-filled struct, no buckets allocated yet */ \
+		return (kh_##name##_t*)kcalloc(0, 1, sizeof(kh_##name##_t));	\
+	}																	\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h) /* frees keys, flags, vals and h itself; a NULL h is a no-op */ \
+	{																	\
+		if (h) {														\
+			kfree(0, (void *)h->keys); kfree(0, h->flags);				\
+			kfree(0, (void *)h->vals);									\
+			kfree(0, h);												\
+		}																\
+	}																	\
+	SCOPE void kh_clear_##name(kh_##name##_t *h) /* forget all entries but keep the allocated arrays */ \
+	{																	\
+		if (h && h->flags) {											\
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); /* 0xaa sets the "empty" bit and clears the "deleted" bit of every bucket */ \
+			h->size = h->n_occupied = 0;								\
+		}																\
+	}																	\
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) /* bucket index holding key, or h->n_buckets if key is absent */ \
+	{																	\
+		if (h->n_buckets) {												\
+			khint_t k, i, last, mask, step = 0; \
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			last = i; \
+			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+				i = (i + (++step)) & mask; /* the probe offset grows by 1, 2, 3, ... on successive misses */ \
+				if (i == last) return h->n_buckets; /* back at the first probed bucket: not found */ \
+			}															\
+			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
+		} else return 0; /* no buckets: 0 equals h->n_buckets here, i.e. "absent" */ \
+	}																	\
+	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) /* returns 0 on success or when nothing is done, -1 when an allocation on the expand path fails */ \
+	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+		khint32_t *new_flags = 0;										\
+		khint_t j = 1; /* set to 0 when the requested size cannot hold the current h->size entries */ \
+		{																\
+			kroundup32(new_n_buckets); /* bucket counts are powers of two so that `& mask` selects a bucket */ \
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)kmalloc(0, __ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				if (!new_flags) return -1;								\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					khkey_t *new_keys = (khkey_t*)krealloc(0, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (!new_keys) { kfree(0, new_flags); return -1; }	\
+					h->keys = new_keys;									\
+					if (kh_is_map) {									\
+						khval_t *new_vals = (khval_t*)krealloc(0, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+						if (!new_vals) { kfree(0, new_flags); return -1; } \
+						h->vals = new_vals;								\
+					}													\
+				} /* otherwise shrink */								\
+			}															\
+		}																\
+		if (j) { /* rehashing is needed */								\
+			for (j = 0; j != h->n_buckets; ++j) {						\
+				if (__ac_iseither(h->flags, j) == 0) {					\
+					khkey_t key = h->keys[j];							\
+					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
+					if (kh_is_map) val = h->vals[j];					\
+					__ac_set_isdel_true(h->flags, j);					\
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+						khint_t k, i, step = 0; \
+						k = __hash_func(key);							\
+						i = k & new_mask;								\
+						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+						__ac_set_isempty_false(new_flags, i);			\
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
+							h->keys[i] = key;							\
+							if (kh_is_map) h->vals[i] = val;			\
+							break;										\
+						}												\
+					}													\
+				}														\
+			}															\
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+				h->keys = (khkey_t*)krealloc(0, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); /* NOTE: result not checked on this path */ \
+				if (kh_is_map) h->vals = (khval_t*)krealloc(0, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+			}															\
+			kfree(0, h->flags); /* free the working space */			\
+			h->flags = new_flags;										\
+			h->n_buckets = new_n_buckets;								\
+			h->n_occupied = h->size; /* deleted buckets are gone after rehashing */ \
+			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+		}																\
+		return 0;														\
+	}																	\
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) /* returns the bucket of key; *ret: -1 resize failed (h->n_buckets returned), 0 key already present, 1 stored in a never-used bucket, 2 stored in a deleted bucket. The value slot is not written. */ \
+	{																	\
+		khint_t x;														\
+		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) {							\
+				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+					*ret = -1; return h->n_buckets;						\
+				}														\
+			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+				*ret = -1; return h->n_buckets;							\
+			}															\
+		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+		{																\
+			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; /* x and site start at n_buckets, meaning "not chosen yet" */ \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
+			else {														\
+				last = i; \
+				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+					if (__ac_isdel(h->flags, i)) site = i; /* remember a deleted bucket on the probe path for reuse */ \
+					i = (i + (++step)) & mask; \
+					if (i == last) { x = site; break; }					\
+				}														\
+				if (x == h->n_buckets) {								\
+					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+					else x = i;											\
+				}														\
+			}															\
+		}																\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size; ++h->n_occupied;									\
+			*ret = 1;													\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size;													\
+			*ret = 2;													\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+		return x;														\
+	}																	\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) /* mark bucket x deleted; no-op for h->n_buckets or a bucket without data. n_occupied is not decremented. */ \
+	{																	\
+		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
+			__ac_set_isdel_true(h->flags, x);							\
+			--h->size;													\
+		}																\
+	}
+
+#define KHASH_DECLARE(name, khkey_t, khval_t) /* table type plus extern prototypes, no function bodies */ \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
+/* Full instantiation: table type plus function bodies; SCOPE is prepended to every generated function. */
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+/* KHASH_INIT2 with SCOPE fixed to `static kh_inline klib_unused`. */
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{ /* hash = 31 * hash + next char, seeded with the first char; the empty string hashes to 0 */
+	khint_t hash = (khint_t)*s;
+	if (hash != 0) while (*++s) hash = hash * 31 + (khint_t)*s;
+	return hash;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{ /* Scramble the bits of a 32-bit integer with a fixed sequence of shift/add/xor steps. */
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) /* argument parenthesized so that an expression such as a + b is cast as a whole */
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: -1 if the operation failed;
+                0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+				the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(kvar) = kh_key(h,__i);								\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)										\
+	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)								\
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)										\
+	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)								\
+	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash set containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)										\
+	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)								\
+	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/src/kseq.h b/src/kseq.h
new file mode 100644
index 0000000..95b5285
--- /dev/null
+++ b/src/kseq.h
@@ -0,0 +1,247 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+#  include "malloc_wrap.h"
+#endif
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2
+
+#define __KS_TYPE(type_t) /* buffered reader state wrapped around a handle of type type_t */ \
+	typedef struct __kstream_t {				\
+		unsigned char *buf; /* read buffer, allocated by ks_init */ \
+		int begin, end, is_eof, last_char; /* buf[begin..end) is the unread data; last_char is used by kseq_read to remember a header character it already consumed */ \
+		type_t f; /* handle passed to __read */ \
+	} kstream_t;
+/* ks_eof: end of input reached and buffer drained. ks_rewind: reset the buffering state only (f is not touched). */
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize) /* generates ks_init() and ks_destroy() */ \
+	static inline kstream_t *ks_init(type_t f) /* new stream over f with a __bufsize-byte buffer */ \
+	{																\
+		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); /* NOTE(review): neither this calloc nor the malloc below is checked for NULL */ \
+		ks->f = f;													\
+		ks->buf = (unsigned char*)malloc(__bufsize);				\
+		return ks;													\
+	}																\
+	static inline void ks_destroy(kstream_t *ks) /* frees the buffer and the stream; f itself is left alone; NULL is a no-op */ \
+	{																\
+		if (ks) {													\
+			free(ks->buf);											\
+			free(ks);												\
+		}															\
+	}
+
+#define __KS_GETC(__read, __bufsize)						\
+	static inline int ks_getc(kstream_t *ks) /* next byte as 0..255, or -1 once the input is exhausted */ \
+	{														\
+		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
+		if (ks->begin >= ks->end) { /* buffer drained: refill it */ \
+			ks->begin = 0;									\
+			ks->end = __read(ks->f, ks->buf, __bufsize);	\
+			if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; return -1;} /* 0 = end of file; a negative result is a read error and is treated as end of input instead of returning a stale buffer byte */ \
+		}													\
+		return (int)ks->buf[ks->begin++];					\
+	}
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m;
+	char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize)								\
+	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) /* copy input up to (not including) the next delimiter into str; returns str->l, or -1 if the input was already exhausted. delimiter is a KS_SEP_* code or a character value > KS_SEP_MAX. */ \
+	{																	\
+		int gotany = 0;													\
+		if (dret) *dret = 0; /* stays 0 when the input ends before a delimiter is seen */ \
+		str->l = append? str->l : 0;									\
+		for (;;) {														\
+			int i;														\
+			if (ks->begin >= ks->end) {									\
+				if (!ks->is_eof) {										\
+					ks->begin = 0;										\
+					ks->end = __read(ks->f, ks->buf, __bufsize);		\
+					if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; break; } /* 0 = end of file; a negative result is a read error and is treated as end of input so this loop cannot spin forever */ \
+				} else break;											\
+			}															\
+			if (delimiter == KS_SEP_LINE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == '\n') break; \
+			} else if (delimiter > KS_SEP_MAX) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (ks->buf[i] == delimiter) break;					\
+			} else if (delimiter == KS_SEP_SPACE) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i])) break;						\
+			} else if (delimiter == KS_SEP_TAB) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+			} else i = 0; /* never come to here! */						\
+			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* need room for the chunk plus the terminating NUL */ \
+				str->m = str->l + (i - ks->begin) + 1;					\
+				kroundup32(str->m);										\
+				str->s = (char*)realloc(str->s, str->m);				\
+			}															\
+			gotany = 1;													\
+			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+			str->l = str->l + (i - ks->begin);							\
+			ks->begin = i + 1; /* skip the delimiter itself */ \
+			if (i < ks->end) { /* delimiter found inside the buffer: done */ \
+				if (dret) *dret = ks->buf[i];							\
+				break;													\
+			}															\
+		}																\
+		if (!gotany && ks_eof(ks)) return -1;							\
+		if (str->s == 0) {												\
+			str->m = 1;													\
+			str->s = (char*)calloc(1, 1);								\
+		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; /* drop the '\r' of a "\r\n" line ending */ \
+		str->s[str->l] = '\0';											\
+		return str->l;													\
+	} \
+	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) /* ks_getuntil2 that overwrites str instead of appending */ \
+	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) /* instantiate kstream_t and ks_init/ks_destroy/ks_getc/ks_getuntil[2] for one handle type and read function */ \
+	__KS_TYPE(type_t)							\
+	__KS_BASIC(type_t, __bufsize)				\
+	__KS_GETC(__read, __bufsize)				\
+	__KS_GETUNTIL(__read, __bufsize)
+/* Zero the buffering state of the stream inside a kseq_t; the underlying handle is not repositioned by this macro. */
+#define kseq_rewind(ks) ((ks)->f->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t) /* generates kseq_init() and kseq_destroy() */ \
+	SCOPE kseq_t *kseq_init(type_t fd) /* new zero-filled record reader over fd */ \
+	{																	\
+		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); /* NOTE(review): result is not checked for NULL */ \
+		s->f = ks_init(fd);												\
+		return s;														\
+	}																	\
+	SCOPE void kseq_destroy(kseq_t *ks) /* frees the four string buffers, the stream and ks; fd is not closed here; NULL is a no-op */ \
+	{																	\
+		if (!ks) return;												\
+		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
+		ks_destroy(ks->f);												\
+        free(ks);                                                       \
+	}
+
+/* kseq_read() reads the next FASTA/FASTQ record into seq. Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   FASTQ quality string missing, or its length differs from the sequence length
+ */
+#define __KSEQ_READ(SCOPE) \
+	SCOPE int kseq_read(kseq_t *seq) \
+	{ \
+		int c; \
+		kstream_t *ks = seq->f; \
+		if (ks->last_char == 0) { /* then jump to the next header line */ \
+			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+			if (c == -1) return -1; /* end of file */ \
+			ks->last_char = c; \
+		} /* else: the first header char has been read in the previous call */ \
+		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+			seq->seq.m = 256; \
+			seq->seq.s = (char*)malloc(seq->seq.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { /* sequence lines run until the next header char, a '+' line or end of input */ \
+			if (c == '\n') continue; /* skip empty lines */ \
+			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+		} \
+		if (c == '>' || c == '@') ks->last_char = c; /* the first header char has been read */	\
+		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+			seq->seq.m = seq->seq.l + 2; \
+			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+		} \
+		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
+		if (c != '+') return seq->seq.l; /* FASTA */ \
+		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
+			seq->qual.m = seq->seq.m; \
+			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+		if (c == -1) return -2; /* error: no quality string */ \
+		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); /* append quality lines until as long as the sequence or input ends */ \
+		ks->last_char = 0;	/* we have not come to the next header line */ \
+		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+		return seq->seq.l; \
+	}
+#define __KSEQ_COPY(SCOPE) /* generates kseq_copy(): duplicate the name and sequence strings of kseq into *seq */ \
+    SCOPE void kseq_copy(kseq_t *seq, kseq_t kseq)\
+    { /* kseq.name.s and kseq.seq.s go straight to strdup, so both must be non-NULL */ \
+        seq->name.s = strdup(kseq.name.s); /* NOTE(review): seq->name.l and seq->name.m are not updated */ \
+        seq->seq.s = strdup(kseq.seq.s); /* NOTE(review): seq->seq.m is not updated; pointers previously held by *seq are overwritten without free */ \
+        seq->seq.l = kseq.seq.l; /* comment, qual and f of *seq are left untouched */ \
+    }
+
+#define __KSEQ_TYPE(type_t) /* one FASTA/FASTQ record (four strings) plus the stream it is read from */ \
+	typedef struct {							\
+		kstring_t name, comment, seq, qual;		\
+		kstream_t *f;							\
+	} kseq_t;
+/* Instantiate the stream layer (16384-byte buffer) and kseq_init/destroy/read/copy; SCOPE is prepended to the kseq_* functions. */
+#define KSEQ_INIT2(SCOPE, type_t, __read)		\
+	KSTREAM_INIT(type_t, __read, 16384)			\
+	__KSEQ_TYPE(type_t)							\
+	__KSEQ_BASIC(SCOPE, type_t)					\
+	__KSEQ_READ(SCOPE)                          \
+	__KSEQ_COPY(SCOPE)
+/* KSEQ_INIT2 with every kseq_* function declared static. */
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) /* types plus prototypes for kseq_* functions defined elsewhere; the return types must match __KSEQ_BASIC/__KSEQ_READ/__KSEQ_COPY */ \
+	__KS_TYPE(type_t) \
+	__KSEQ_TYPE(type_t) \
+	extern kseq_t *kseq_init(type_t fd); \
+	void kseq_destroy(kseq_t *ks); \
+	int kseq_read(kseq_t *seq);     \
+    void kseq_copy(kseq_t *seq, kseq_t kseq);
+
+#endif
diff --git a/src/ksort.h b/src/ksort.h
new file mode 100644
index 0000000..d7599d1
--- /dev/null
+++ b/src/ksort.h
@@ -0,0 +1,153 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+// This is a simplified version of ksort.h
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+typedef struct {
+	void *left, *right;
+	int depth;
+} ks_isort_stack_t;
+
+#define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; }
+
+#define KSORT_INIT(name, type_t, __sort_lt) \
+	void ks_heapdown_##name(size_t i, size_t n, type_t l[]) /* sift l[i] down inside the n-element binary heap l[] */ \
+	{ \
+		size_t k = i; \
+		type_t tmp = l[i]; \
+		while ((k = (k << 1) + 1) < n) { /* k becomes the left child of the previous k */ \
+			if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; /* choose the child that is not smaller than its sibling */ \
+			if (__sort_lt(l[k], tmp)) break; \
+			l[i] = l[k]; i = k; \
+		} \
+		l[i] = tmp; \
+	} \
+	void ks_heapmake_##name(size_t lsize, type_t l[]) /* arrange l[0..lsize) as a heap in which no child is greater (per __sort_lt) than its parent */ \
+	{ \
+		size_t i; \
+		for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) /* counts down to 0 and stops when the unsigned i wraps around */ \
+			ks_heapdown_##name(i, lsize, l); \
+	} \
+	type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) /* returns the element that belongs at index kk of the sorted order; arr is partially reordered in place */ \
+	{																	\
+		type_t *low, *high, *k, *ll, *hh, *mid;							\
+		low = arr; high = arr + n - 1; k = arr + kk;					\
+		for (;;) {														\
+			if (high <= low) return *k;									\
+			if (high == low + 1) {										\
+				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+				return *k;												\
+			}															\
+			mid = low + (high - low) / 2;								\
+			if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+			if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+			if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);	\
+			KSORT_SWAP(type_t, *mid, *(low+1));							\
+			ll = low + 1; hh = high;									\
+			for (;;) { /* partition around the pivot kept at *low */ \
+				do ++ll; while (__sort_lt(*ll, *low));					\
+				do --hh; while (__sort_lt(*low, *hh));					\
+				if (hh < ll) break;										\
+				KSORT_SWAP(type_t, *ll, *hh);							\
+			}															\
+			KSORT_SWAP(type_t, *low, *hh);								\
+			if (hh <= k) low = ll; /* continue in the side that still contains k */ \
+			if (hh >= k) high = hh - 1;									\
+		}																\
+	}																	\
+
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#define RS_MIN_SIZE 64 /* ranges of at most this many elements are handled by insertion sort */
+#define RS_MAX_BITS 8 /* key bits consumed per radix pass; the bucket array has 1<<RS_MAX_BITS slots */
+
+#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
+	typedef struct { \
+		rstype_t *b, *e; /* current fill position and end of one bucket */ \
+	} rsbucket_##name##_t; \
+	void rs_insertsort_##name(rstype_t *beg, rstype_t *end) /* insertion sort of [beg, end) by rskey() */ \
+	{ \
+		rstype_t *i; \
+		for (i = beg + 1; i < end; ++i) \
+			if (rskey(*i) < rskey(*(i - 1))) { \
+				rstype_t *j, tmp = *i; \
+				for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \
+					*j = *(j - 1); \
+				*j = tmp; \
+			} \
+	} \
+	void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) /* distribute [beg, end) in place by the n_bits key bits starting at bit s, then recurse on lower bits */ \
+	{ \
+		rstype_t *i; \
+		int size = 1<<n_bits, m = size - 1; \
+		rsbucket_##name##_t *k, b[1<<RS_MAX_BITS], *be = b + size; \
+		assert(n_bits <= RS_MAX_BITS); \
+		for (k = b; k != be; ++k) k->b = k->e = beg; \
+		for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; /* count the elements of each bucket */ \
+		for (k = b + 1; k != be; ++k) /* turn the counts into bucket [b, e) ranges */ \
+			k->e += (k-1)->e - beg, k->b = (k-1)->e; \
+		for (k = b; k != be;) { /* move every element into its bucket by following displacement chains */ \
+			if (k->b != k->e) { \
+				rsbucket_##name##_t *l; \
+				if ((l = b + (rskey(*k->b)>>s&m)) != k) { \
+					rstype_t tmp = *k->b, swap; \
+					do { \
+						swap = tmp; tmp = *l->b; *l->b++ = swap; \
+						l = b + (rskey(tmp)>>s&m); \
+					} while (l != k); \
+					*k->b++ = tmp; \
+				} else ++k->b; \
+			} else ++k; \
+		} \
+		for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; /* restore the bucket start pointers advanced above */ \
+		if (s) { /* lower key bits remain: sort inside each bucket */ \
+			s = s > n_bits? s - n_bits : 0; \
+			for (k = b; k != be; ++k) \
+				if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \
+				else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \
+		} \
+	} \
+	void radix_sort_##name(rstype_t *beg, rstype_t *end) /* entry point: sort [beg, end) by rskey(); sizeof_key is the key width in bytes */ \
+	{ \
+		if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
+		else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \
+	}
+
+#endif
diff --git a/src/kstring.c b/src/kstring.c
new file mode 100644
index 0000000..aa6c2ee
--- /dev/null
+++ b/src/kstring.c
@@ -0,0 +1,250 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include "kstring.h"
+
+/* Append printf-style output to s, growing s as needed. Returns the number of bytes appended, or -1 (s->s and s->l untouched) if formatting or allocation fails. */
+int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
+{
+	va_list args; int l;
+	va_copy(args, ap); // ap may have to be walked twice, so always format from a copy
+	l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
+	va_end(args);
+	if (l < 0) return -1; // formatting error: never let a negative length reach s->l
+	if ((size_t)l + 1 > s->m - s->l) { // output was truncated: room is needed for l bytes plus the NUL
+		if (ks_resize(s, s->l + l + 2) < 0) return -1; // ks_resize() keeps the old s->s on failure instead of overwriting it with NULL
+		va_copy(args, ap);
+		l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
+		va_end(args);
+		if (l < 0) return -1;
+	}
+	s->l += l;
+	return l;
+}
+
+int ksprintf(kstring_t *s, const char *fmt, ...)
+{
+	int n_appended; /* whatever kvsprintf() reports */
+	va_list va;
+	va_start(va, fmt);
+	n_appended = kvsprintf(s, fmt, va); /* formatting and buffer growth are delegated entirely */
+	va_end(va);
+	return n_appended;
+}
+
+char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux) // returns the start of the next token (its end is aux->p), or 0 once finished; str itself is never written to
+{
+	const unsigned char *p, *start, *sep = (unsigned char *) sep_in;
+	if (sep) { // set up the table
+		if (str == 0 && aux->finished) return 0; // no need to set up if we have finished
+		aux->finished = 0;
+		if (sep[0] && sep[1]) { // two or more separators: mark each one in the 256-bit table
+			aux->sep = -1;
+			aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
+			for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
+		} else aux->sep = sep[0]; // zero or one separator: compare against it directly
+	}
+	if (aux->finished) return 0;
+	else if (str) start = (unsigned char *) str, aux->finished = 0;
+	else start = (unsigned char *) aux->p + 1; // continue one past the separator that ended the previous token
+	if (aux->sep < 0) { // table lookup per byte
+		for (p = start; *p; ++p)
+			if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
+	} else { // single-byte comparison
+		for (p = start; *p; ++p)
+			if (*p == aux->sep) break;
+	}
+	aux->p = (const char *) p; // end of token
+	if (*p == 0) aux->finished = 1; // no more tokens
+	return (char*)start;
+}
+
+// Split s in place on delimiter (0 means "any whitespace"); empty fields are skipped and the field count is returned. s MUST BE a null terminated string.
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
+{
+	int i, n, max, last_char, last_start, *offsets, l;
+	n = 0; max = _max? *_max : 0; offsets = _offsets? *_offsets : 0; // both out-parameters are optional: a NULL _offsets means "count only"
+	l = strlen(s);
+	// With _offsets, each field is NUL-terminated and its start offset recorded; on allocation failure the array is freed, *_offsets/*_max are reset and 0 is returned.
+#define __ksplit_aux do {						\
+		if (_offsets) {						\
+			s[i] = 0;					\
+			if (n == max) {					\
+				int *tmp;				\
+				max = max? max<<1 : 2;			\
+				if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) {  \
+					offsets = tmp;			\
+				} else	{				\
+					free(offsets);			\
+					*_offsets = NULL; if (_max) *_max = 0; /* never leave a stale capacity next to a NULL array */ \
+					return 0;			\
+				}					\
+			}						\
+			offsets[n++] = last_start;			\
+		} else ++n;						\
+	} while (0)
+
+	for (i = 0, last_char = last_start = 0; i <= l; ++i) {
+		if (delimiter == 0) { // ctype functions need unsigned char values: a negative plain char is undefined behaviour
+			if (isspace((unsigned char)s[i]) || s[i] == 0) {
+				if (isgraph((unsigned char)last_char)) __ksplit_aux; // the end of a field
+			} else {
+				if (isspace((unsigned char)last_char) || last_char == 0) last_start = i;
+			}
+		} else {
+			if (s[i] == delimiter || s[i] == 0) {
+				if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
+			} else {
+				if (last_char == delimiter || last_char == 0) last_start = i;
+			}
+		}
+		last_char = s[i];
+	}
+	if (_max) *_max = max; if (_offsets) *_offsets = offsets;
+	return n;
+}
+
+// Append one line read through fgets_fn to s, without its "\n" or "\r\n"; returns 0, or EOF if nothing was read or the buffer could not be grown.
+int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp)
+{
+	size_t l0 = s->l; // length on entry; everything past it belongs to the new line
+	while (s->l == l0 || s->s[s->l-1] != '\n') { // keep reading chunks until one ends in a newline
+		if (s->m - s->l < 200 && ks_resize(s, s->m + 200) < 0) return EOF; // never hand fgets_fn a buffer that failed to grow
+		if (fgets_fn(s->s + s->l, s->m - s->l, fp) == NULL) break;
+		s->l += strlen(s->s + s->l);
+	}
+
+	if (s->l == l0) return EOF;
+
+	if (s->l > l0 && s->s[s->l-1] == '\n') {
+		s->l--;
+		if (s->l > l0 && s->s[s->l-1] == '\r') s->l--;
+	}
+	s->s[s->l] = '\0';
+	return 0;
+}
+
+/**********************
+ * Boyer-Moore search *
+ **********************/
+
+typedef unsigned char ubyte_t;
+// Build the Boyer-Moore tables for pat[0..m): one calloc'ed block holding bmGs (m ints) followed by bmBc (256 ints). Returns NULL if m <= 0 or allocation fails.
+// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
+static int *ksBM_prep(const ubyte_t *pat, int m)
+{
+	int i, *suff, *prep, *bmGs, *bmBc;
+	if (m <= 0 || (prep = (int*)calloc(m + 256, sizeof(int))) == 0) return 0; // an empty pattern would write suff[-1] below
+	bmGs = prep; bmBc = prep + m;
+	{ // preBmBc()
+		for (i = 0; i < 256; ++i) bmBc[i] = m;
+		for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
+	}
+	if ((suff = (int*)calloc(m, sizeof(int))) == 0) { free(prep); return 0; } // do not leak the half-built table
+	{ // suffixes()
+		int f = 0, g;
+		suff[m - 1] = m;
+		g = m - 1;
+		for (i = m - 2; i >= 0; --i) {
+			if (i > g && suff[i + m - 1 - f] < i - g)
+				suff[i] = suff[i + m - 1 - f];
+			else {
+				if (i < g) g = i;
+				f = i;
+				while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
+				suff[i] = f - g;
+			}
+		}
+	}
+	{ // preBmGs()
+		int j = 0;
+		for (i = 0; i < m; ++i) bmGs[i] = m;
+		for (i = m - 1; i >= 0; --i)
+			if (suff[i] == i + 1)
+				for (; j < m - 1 - i; ++j)
+					if (bmGs[j] == m)
+						bmGs[j] = m - 1 - i;
+		for (i = 0; i <= m - 2; ++i)
+			bmGs[m - 1 - suff[i]] = m - 1 - i;
+	}
+	free(suff); // only needed to derive bmGs
+	return prep;
+}
+
+// Boyer-Moore search for pat[0..m) in str[0..n): returns the first match, or 0 if there is none or the tables cannot be built. A non-NULL _prep caches the tables in *_prep (caller frees).
+void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
+{
+	int i, j, *prep = 0, *bmGs, *bmBc; void *hit = 0;
+	const ubyte_t *str = (const ubyte_t*)_str, *pat = (const ubyte_t*)_pat;
+	if (m <= 0) return (void*)_str; // an empty pattern matches at the start, as with strstr(); no table is built for it
+	if ((prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep) == 0) return 0; // table allocation failed
+	if (_prep && *_prep == 0) *_prep = prep;
+	bmGs = prep; bmBc = prep + m; j = 0;
+	while (j <= n - m) {
+		for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
+		if (i >= 0) { // mismatch at pat[i]: shift by the larger of the bad-character and good-suffix rules
+			int max = bmBc[str[i+j]] - m + 1 + i;
+			if (max < bmGs[i]) max = bmGs[i];
+			j += max;
+		} else { hit = (void*)(str + j); break; } // match: leave the loop so a private table is still freed below
+	}
+	if (_prep == 0) free(prep);
+	return hit;
+}
+
+char *kstrstr(const char *str, const char *pat, int **_prep)
+{ // kmemmem() on two NUL-terminated strings; both lengths come from strlen()
+	return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
+}
+
+char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
+{ // like kstrstr(), but the haystack length n is supplied by the caller instead of strlen(str)
+	return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
+}
+
+/***********************
+ * The main() function *
+ ***********************/
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+int main() // manual smoke test, compiled only when KSTRING_MAIN is defined; results are printed, not asserted
+{
+	kstring_t *s;
+	int *fields, n, i;
+	ks_tokaux_t aux;
+	char *p;
+	s = (kstring_t*)calloc(1, sizeof(kstring_t)); // zeroed, i.e. an empty kstring_t
+	// test ksprintf()
+	ksprintf(s, " abcdefg:    %d ", 100);
+	printf("'%s'\n", s->s);
+	// test ksplit()
+	fields = ksplit(s, 0, &n); // delimiter 0: split on whitespace, in place
+	for (i = 0; i < n; ++i)
+		printf("field[%d] = '%s'\n", i, s->s + fields[i]);
+	// test kstrtok()
+	s->l = 0; // reuse the buffer from the start
+	for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
+		kputsn(p, aux.p - p, s); // the token spans [p, aux.p)
+		kputc('\n', s);
+	}
+	printf("%s", s->s);
+	// free
+	free(s->s); free(s); free(fields);
+
+	{
+		static char *str = "abcdefgcdgcagtcakcdcd";
+		static char *pat = "cd";
+		char *ret, *s = str;
+		int *prep = 0; // filled by the first kstrstr() call and reused by the later ones
+		while ((ret = kstrstr(s, pat, &prep)) != 0) {
+			printf("match: %s\n", ret);
+			s = ret + prep[0]; // resume after the match, advancing by the first table entry
+		}
+		free(prep);
+	}
+	return 0;
+}
+#endif
diff --git a/src/kstring.h b/src/kstring.h
new file mode 100644
index 0000000..f13fcd9
--- /dev/null
+++ b/src/kstring.h
@@ -0,0 +1,277 @@
+/* The MIT License
+
+   Copyright (c) by Attractive Chaos <attractor@live.co.uk> 
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) /* rounds x up, in place, to a power of two; the shifts only smear across 32 bits */
+#endif
+
+#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4) /* GCC newer than 2.4: use the printf format attribute */
+#define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__printf__, fmt, arg)))
+#else /* otherwise the annotation expands to nothing */
+#define KS_ATTR_PRINTF(fmt, arg)
+#endif
+
+
+/* kstring_t is a simple non-opaque type whose fields are likely to be
+ * used directly by user code (but see also ks_str() and ks_len() below).
+ * A kstring_t object is initialised by either of
+ *       kstring_t str = { 0, 0, NULL };
+ *       kstring_t str; ...; str.l = str.m = 0; str.s = NULL;
+ * and either ownership of the underlying buffer should be given away before
+ * the object disappears (see ks_release() below) or the kstring_t should be
+ * destroyed with  free(str.s);  */
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m; /* l: bytes currently stored (append position); m: size passed to realloc for s */
+	char *s; /* heap buffer, NULL while nothing has been allocated */
+} kstring_t;
+#endif
+
+typedef struct {
+	uint64_t tab[4]; // 256-bit set of separator bytes, consulted by kstrtok() when sep < 0
+	int sep, finished; // sep: the single separator byte, or -1 to use tab; finished: set once the end of the string was reached
+	const char *p; // end of the current token
+} ks_tokaux_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	int kvsprintf(kstring_t *s, const char *fmt, va_list ap) KS_ATTR_PRINTF(2,0);
+	int ksprintf(kstring_t *s, const char *fmt, ...) KS_ATTR_PRINTF(2,3);
+	int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+	char *kstrstr(const char *str, const char *pat, int **_prep);
+	char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
+	void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);
+
+	/* kstrtok() is similar to strtok_r() except that str is not
+	 * modified and both str and sep can be NULL. For efficiency, it is
+	 * actually recommended to set both to NULL in the subsequent calls
+	 * if sep is not changed. */
+	char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);
+
+	/* kgetline() uses the supplied fgets()-like function to read a "\n"-
+	 * or "\r\n"-terminated line from fp.  The line read is appended to the
+	 * kstring without its terminator and 0 is returned; EOF is returned at
+	 * EOF or on error (determined by querying fp, as per fgets()). */
+	typedef char *kgets_func(char *, int, void *);
+	int kgetline(kstring_t *s, kgets_func *fgets, void *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline int ks_resize(kstring_t *s, size_t size) // make the capacity at least size (rounded up by kroundup32); returns 0, or -1 with s left untouched if realloc fails
+{
+	if (s->m < size) {
+		char *tmp;
+		size_t new_m = size; // work on a copy so that s->m only changes once the buffer really grew
+		kroundup32(new_m);
+		if ((tmp = (char*)realloc(s->s, new_m)) == NULL)
+			return -1; // s->s and s->m still describe the old, valid buffer
+		s->s = tmp;
+		s->m = new_m;
+	}
+	return 0;
+}
+
+static inline char *ks_str(kstring_t *s)
+{ // accessor for the buffer pointer; NULL if nothing has been allocated yet
+	return s->s;
+}
+
+static inline size_t ks_len(kstring_t *s)
+{ // accessor for the number of bytes currently stored
+	return s->l;
+}
+
+// Detach the underlying buffer and hand it to the caller, who then becomes
+// responsible for freeing it. The kstring_t is reset to the empty state, so
+// it can be reused straight away or simply go out of scope with no
+// free(str.s)  needed to prevent a memory leak.
+static inline char *ks_release(kstring_t *s)
+{
+	char *released = s->s;
+	s->s = NULL;
+	s->m = 0; s->l = 0;
+	return released;
+}
+
+static inline int kputsn(const char *p, int l, kstring_t *s) // append l bytes of p and NUL-terminate; returns l, or EOF (s untouched) if the buffer cannot grow
+{
+	if (s->l + l + 1 >= s->m) {
+		char *tmp;
+		size_t new_m = s->l + l + 2; // sized on a copy: s->m must not change unless realloc succeeds
+		kroundup32(new_m);
+		if ((tmp = (char*)realloc(s->s, new_m)) == NULL)
+			return EOF; // s->s and s->m still describe the old, valid buffer
+		s->s = tmp;
+		s->m = new_m;
+	}
+	memcpy(s->s + s->l, p, l);
+	s->l += l;
+	s->s[s->l] = 0;
+	return l;
+}
+
+static inline int kputs(const char *p, kstring_t *s)
+{ // append the whole NUL-terminated string p; the return value is kputsn()'s
+	return kputsn(p, strlen(p), s);
+}
+
+static inline int kputc(int c, kstring_t *s) // append one byte and NUL-terminate; returns c, or EOF (s untouched) if the buffer cannot grow
+{
+	if (s->l + 1 >= s->m) {
+		char *tmp;
+		size_t new_m = s->l + 2; // sized on a copy: s->m must not change unless realloc succeeds
+		kroundup32(new_m);
+		if ((tmp = (char*)realloc(s->s, new_m)) == NULL)
+			return EOF; // s->s and s->m still describe the old, valid buffer
+		s->s = tmp;
+		s->m = new_m;
+	}
+	s->s[s->l++] = c;
+	s->s[s->l] = 0;
+	return c;
+}
+
+static inline int kputc_(int c, kstring_t *s) // append one byte WITHOUT a terminating NUL; returns 1, or EOF (s untouched) if the buffer cannot grow
+{
+	if (s->l + 1 > s->m) {
+		char *tmp;
+		size_t new_m = s->l + 1; // sized on a copy: s->m must not change unless realloc succeeds
+		kroundup32(new_m);
+		if ((tmp = (char*)realloc(s->s, new_m)) == NULL)
+			return EOF; // s->s and s->m still describe the old, valid buffer
+		s->s = tmp;
+		s->m = new_m;
+	}
+	s->s[s->l++] = c;
+	return 1;
+}
+
+static inline int kputsn_(const void *p, int l, kstring_t *s) // append l raw bytes WITHOUT a terminating NUL; returns l, or EOF (s untouched) if the buffer cannot grow
+{
+	if (s->l + l > s->m) {
+		char *tmp;
+		size_t new_m = s->l + l; // sized on a copy: s->m must not change unless realloc succeeds
+		kroundup32(new_m);
+		if ((tmp = (char*)realloc(s->s, new_m)) == NULL)
+			return EOF; // s->s and s->m still describe the old, valid buffer
+		s->s = tmp;
+		s->m = new_m;
+	}
+	memcpy(s->s + s->l, p, l);
+	s->l += l;
+	return l;
+}
+
+static inline int kputw(int c, kstring_t *s) // append c in decimal and NUL-terminate; returns 0, or EOF (s untouched) if the buffer cannot grow
+{
+	char buf[16];
+	int i, l = 0;
+	unsigned int x = c;
+	if (c < 0) x = -x; // negate in unsigned arithmetic so that INT_MIN is handled too
+	do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); // digits are produced least significant first
+	if (c < 0) buf[l++] = '-';
+	if (s->l + l + 1 >= s->m) {
+		char *tmp;
+		size_t new_m = s->l + l + 2; // sized on a copy: s->m must not change unless realloc succeeds
+		kroundup32(new_m);
+		if ((tmp = (char*)realloc(s->s, new_m)) == NULL)
+			return EOF; // s->s and s->m still describe the old, valid buffer
+		s->s = tmp;
+		s->m = new_m;
+	}
+	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; // copy out in reverse to restore reading order
+	s->s[s->l] = 0;
+	return 0;
+}
+
+static inline int kputuw(unsigned c, kstring_t *s) // append c in decimal and NUL-terminate; returns 0, or EOF (s untouched) if the buffer cannot grow
+{
+	char buf[16];
+	int l, i;
+	unsigned x;
+	if (c == 0) return kputc('0', s) == EOF? EOF : 0; // same 0/EOF contract as the non-zero path, not the character code kputc() returns
+	for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; // digits are produced least significant first
+	if (s->l + l + 1 >= s->m) {
+		char *tmp;
+		size_t new_m = s->l + l + 2; // sized on a copy: s->m must not change unless realloc succeeds
+		kroundup32(new_m);
+		if ((tmp = (char*)realloc(s->s, new_m)) == NULL)
+			return EOF; // s->s and s->m still describe the old, valid buffer
+		s->s = tmp;
+		s->m = new_m;
+	}
+	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; // copy out in reverse to restore reading order
+	s->s[s->l] = 0;
+	return 0;
+}
+
+static inline int kputl(long c, kstring_t *s) // append c in decimal and NUL-terminate; returns 0, or EOF (s untouched) if the buffer cannot grow
+{
+	char buf[32];
+	int i, l = 0;
+	unsigned long x = c;
+	if (c < 0) x = -x; // negate in unsigned arithmetic so that LONG_MIN is handled too
+	do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); // digits are produced least significant first
+	if (c < 0) buf[l++] = '-';
+	if (s->l + l + 1 >= s->m) {
+		char *tmp;
+		size_t new_m = s->l + l + 2; // sized on a copy: s->m must not change unless realloc succeeds
+		kroundup32(new_m);
+		if ((tmp = (char*)realloc(s->s, new_m)) == NULL)
+			return EOF; // s->s and s->m still describe the old, valid buffer
+		s->s = tmp;
+		s->m = new_m;
+	}
+	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; // copy out in reverse to restore reading order
+	s->s[s->l] = 0;
+	return 0;
+}
+
+/*
+ * Splits the contents of 's' in place on 'delimiter' through ksplit_core(); *n receives the number
+ * of components and the returned array holds their start offsets. NULL on failure or if there are no components.
+ */
+static inline int *ksplit(kstring_t *s, int delimiter, int *n)
+{
+	int *field_starts = 0, capacity = 0;
+	*n = ksplit_core(s->s, delimiter, &capacity, &field_starts);
+	return field_starts;
+}
+
+#endif
diff --git a/src/kvec.h b/src/kvec.h
new file mode 100644
index 0000000..e865173
--- /dev/null
+++ b/src/kvec.h
@@ -0,0 +1,105 @@
+/* The MIT License
+
+   Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "kvec.h"
+int main() {
+	kvec_t(int) array;
+	kv_init(array);
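+	kv_push(int, 0, array, 10); // append; the second argument is the km handle given to krealloc() (0 is presumably the default allocator, see kalloc.h)
+	kv_resize(int, 0, array, 21); // dynamic: grow the capacity so that index 20 becomes addressable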
+	kv_A(array, 20) = 4; // static
+	kv_destroy(array);
+	return 0;
+}
+*/
+
+/*
+  2008-09-22 (0.1.0):
+
+	* The initial version.
+
+*/
+
+#ifndef AC_KVEC_H
+#define AC_KVEC_H
+
+#include <stdlib.h>
+#include "kalloc.h"
+
+#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) /* rounds x up, in place, to a power of two; the shifts only smear across 32 bits */
+
+#define kvec_t(type) struct { size_t n, m; type *a; } /* n: elements in use, m: allocated capacity, a: storage */
+#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) /* empty vector with no storage */
+#define kv_destroy(v) free((v).a) /* NOTE(review): storage is obtained via krealloc(km, ...); plain free() is presumably only right when km was 0 - confirm against kalloc.h */
+#define kv_A(v, i) ((v).a[(i)]) /* unchecked element access */
+#define kv_pop(v) ((v).a[--(v).n]) /* removes and yields the last element; no check for n == 0 */
+#define kv_size(v) ((v).n)
+#define kv_max(v) ((v).m)
+
+#define kv_resize(type, km, v, s) do { /* grow the capacity to at least s (rounded up by kv_roundup32); never shrinks */ \
+		if ((v).m < (s)) { \
+			(v).m = (s); \
+			kv_roundup32((v).m); \
+			(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); /* the krealloc() result is stored unchecked */ \
+		} \
+	} while (0)
+
+#define kv_copy(type, km, v1, v0) do { /* make v1 hold a copy of v0's n elements, growing v1 first if needed; memcpy() requires <string.h> at the point of use */ \
+		if ((v1).m < (v0).n) kv_resize(type, (km), (v1), (v0).n); \
+		(v1).n = (v0).n; \
+		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
+	} while (0)
+
+#define kv_push(type, km, v, x) do { /* append x, doubling the capacity (2 on the first push) when the vector is full */ \
+		if ((v).n == (v).m) { \
+			(v).m = (v).m? (v).m<<1 : 2; \
+			(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); /* the krealloc() result is stored unchecked */ \
+		} \
+		(v).a[(v).n++] = (x); \
+	} while (0)
+
+#define kv_pushp(type, km, v, p) do { /* append one uninitialised slot and store its address in *p; grows like kv_push */ \
+		if ((v).n == (v).m) { \
+			(v).m = (v).m? (v).m<<1 : 2; \
+			(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); /* the krealloc() result is stored unchecked */ \
+		} \
+		*(p) = &(v).a[(v).n++]; \
+	} while (0)
+
+#define kv_reverse(type, v, start) do { /* reverse, in place, the elements from index start to the end of v */ \
+		if ((v).m > 0 && (v).n > (start)) { \
+			size_t __i, __end = (v).n - (start); /* __end: number of elements in the reversed tail */ \
+			type *__a = (v).a + (start); \
+			for (__i = 0; __i < __end>>1; ++__i) { /* swap mirrored pairs up to the middle */ \
+				type __t = __a[__end - 1 - __i]; \
+				__a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
+			} \
+		} \
+	} while (0)
+
+#endif
diff --git a/src/simd_abpoa_align.c b/src/simd_abpoa_align.c
new file mode 100644
index 0000000..cd2c0df
--- /dev/null
+++ b/src/simd_abpoa_align.c
@@ -0,0 +1,1716 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "abpoa_align.h"
+#include "simd_instruction.h"
+#include "utils.h"
+
+typedef struct {
+    const int reg_n, bits_n, log_num, num_of_value, size; // going by the tables below: register bits, bits per value, log2(num_of_value), values per register, register bytes
+    int inf_min; // based on penalty of mismatch and GAP_OE1
+} SIMD_para_t;
+
+#define SIMDShiftOneNi8  1 // these four equal the byte width of one 8/16/32/64-bit value; presumably the byte shift that moves a register by one value - confirm at the use sites
+#define SIMDShiftOneNi16 2
+#define SIMDShiftOneNi32 4
+#define SIMDShiftOneNi64 8
+
+#ifdef __AVX512F__ // 512-bit registers
+SIMD_para_t _simd_p8  = {512,  8, 6, 64, 64, -1}; // inf_min is -1 in every table
+SIMD_para_t _simd_p16 = {512, 16, 5, 32, 64, -1};
+SIMD_para_t _simd_p32 = {512, 32, 4, 16, 64, -1};
+SIMD_para_t _simd_p64 = {512, 64, 3,  8, 64, -1};
+#define SIMDTotalBytes 64
+#elif defined(__AVX2__) // 256-bit registers
+SIMD_para_t _simd_p8  = {256,  8, 5, 32, 32, -1};
+SIMD_para_t _simd_p16 = {256, 16, 4, 16, 32, -1};
+SIMD_para_t _simd_p32 = {256, 32, 3,  8, 32, -1};
+SIMD_para_t _simd_p64 = {256, 64, 2,  4, 32, -1};
+#define SIMDTotalBytes 32
+#else // fallback: 128-bit registers
+SIMD_para_t _simd_p8  = {128,  8, 4, 16, 16, -1};
+SIMD_para_t _simd_p16 = {128, 16, 3,  8, 16, -1};
+SIMD_para_t _simd_p32 = {128, 32, 2,  4, 16, -1};
+SIMD_para_t _simd_p64 = {128, 64, 1,  2, 16, -1};
+#define SIMDTotalBytes 16
+#endif
+
+#define print_simd(s, str, score_t) { /* debug: print str, then SIMDTotalBytes worth of score_t values read from s, to stderr */ \
+    int _i; score_t *_a = (score_t*)(s);                             \
+    fprintf(stderr, "%s\t", str);                                    \
+    for (_i = 0; _i < SIMDTotalBytes / (int)sizeof(score_t); ++_i) { \
+        fprintf(stderr, "%d\t", _a[_i]); /* NOTE(review): %d assumes score_t is no wider than int */ \
+    } fprintf(stderr, "\n");                                         \
+}
+
+
+#define simd_abpoa_print_lg_matrix(score_t, beg_index, end_index) { /* debug: print the DP_H rows 0..end_index-beg_index-1 over columns dp_beg[j]..dp_end[j] */ \
+    for (j = 0; j < end_index-beg_index; ++j) {                     \
+        fprintf(stderr, "index: %d\t", j);                          \
+        dp_h = DP_H + j * dp_sn; /* row j starts dp_sn SIMD words after row j-1 */ \
+        _dp_h = (score_t*)dp_h;                                     \
+        for (i = dp_beg[j]; i <= dp_end[j]; ++i) {                  \
+            fprintf(stderr, "%d:(%d)\t", i, _dp_h[i]);              \
+        } fprintf(stderr, "\n");                                    \
+    }                                                               \
+}
+
+#define simd_abpoa_print_ag_matrix(score_t, beg_index, end_index) { /* debug: print (H, E1) for the DP_HEF rows beg_index..end_index-1 */ \
+    for (j = beg_index; j < end_index; ++j) {                        \
+        fprintf(stderr, "index: %d\t", j);                           \
+        dp_h = DP_HEF + j * 3 * dp_sn; dp_e1 = dp_h + dp_sn; /* three dp_sn-sized arrays per row; E1 follows H */ \
+        _dp_h = (score_t*)dp_h, _dp_e1 = (score_t*)dp_e1;            \
+        for (i = dp_beg[j]; i <= dp_end[j]; ++i) {                   \
+            fprintf(stderr, "%d:(%d,%d)\t", i, _dp_h[i], _dp_e1[i]); \
+        } fprintf(stderr, "\n");                                     \
+    }                                                                \
+}
+
+#define debug_simd_abpoa_print_cg_matrix_row(str, score_t, index_i) { /* debug: print (H,E1,E2,F1,F2) of the row the dp_* pointers currently address */ \
+    score_t *_dp_h = (score_t*)dp_h, *_dp_e1 = (score_t*)dp_e1;                                          \
+    score_t *_dp_e2 = (score_t*)dp_e2, *_dp_f1 = (score_t*)dp_f1, *_dp_f2 = (score_t*)dp_f2;             \
+    fprintf(stderr, "%s\tindex: %d\t", str, index_i);                                                    \
+    for (i = dp_beg[index_i]; i <= (dp_end[index_i]/16+1)*16-1; ++i) { /* the end column is rounded up to a multiple of 16 */ \
+        fprintf(stderr, "%d:(%d,%d,%d,%d,%d)\t", i, _dp_h[i], _dp_e1[i],_dp_e2[i], _dp_f1[i],_dp_f2[i]); \
+    } fprintf(stderr, "\n");                                                                             \
+}
+
+#define simd_abpoa_print_cg_matrix(score_t, beg_index, end_index) { /* debug: print (H,E1,E2,F1,F2) for the DP_H2E2F rows 0..end_index-beg_index-1 */ \
+    for (j = 0; j < end_index-beg_index; ++j) {                                                              \
+        fprintf(stderr, "index: %d\t", j);                                                                   \
+        dp_h=DP_H2E2F+j*5*dp_sn; dp_e1=dp_h+dp_sn; dp_e2=dp_e1+dp_sn; dp_f1=dp_e2+dp_sn; dp_f2=dp_f1+dp_sn; /* five dp_sn-sized arrays per row */ \
+        score_t *_dp_h=(score_t*)dp_h, *_dp_e1=(score_t*)dp_e1, *_dp_e2=(score_t*)dp_e2;                     \
+        score_t *_dp_f1=(score_t*)dp_f1, *_dp_f2=(score_t*)dp_f2;                                            \
+        for (i = dp_beg[j]; i <= dp_end[j]; ++i) {                                                           \
+            fprintf(stderr, "%d:(%d,%d,%d,%d,%d)\t", i, _dp_h[i], _dp_e1[i],_dp_e2[i], _dp_f1[i],_dp_f2[i]); \
+        } fprintf(stderr, "\n");                                                                             \
+    }                                                                                                        \
+}
+
+/* max_pos_left/right: left/right boundary of max column index for each row, based on the pre_nodes' DP score */
+/* === workflow of alignment === */
+/* a. global:
+ * (1) alloc mem
+ * (2) init for first row
+ * (3) DP for each row
+ * (3.2) if use_ada, update max_pos_left/right
+ * (4) find best_i/j, backtrack
+ * b. extend:
+ * (1) alloc mem
+ * (2) init for first row
+ * (3) DP for each row
+ * (3.2) find max of current row
+ * (3.3) z-drop, set_max_score
+ * (3.4) if use_ada, update max_pos_left/right
+ */
+
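+// backtrack order: Match/Mismatch, Deletion, Insertion; while indel_first is still set
+// (no match/mismatch taken yet), Deletion and Insertion are tried before Match/Mismatch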
+#define simd_abpoa_lg_backtrack(score_t) { /* traceback over DP_H from (best_i, best_j) using gap_ext1 only; fills res->graph_cigar and the res->node_s/e, query_s/e fields */ \
+    int i, j, k, pre_i, n_c = 0, s, is_match, m_c = 0, hit, id, _start_i, _start_j;                         \
+    SIMDi *dp_h; score_t *_dp_h=NULL, *_pre_dp_h; abpoa_cigar_t *cigar = 0;                                 \
+    i = best_i, j = best_j, _start_i = best_i, _start_j = best_j;                                           \
+    id = abpoa_graph_index_to_node_id(graph, i+beg_index);                                                  \
+    if (best_j < qlen) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, qlen-j, -1, qlen-1);         \
+    dp_h = DP_H + i * dp_sn; _dp_h = (score_t*)dp_h;                                                        \
+    int indel_first = 1; /* prefer to keep gaps at the end */                                               \
+    while (i > 0 && j > 0) {                                                                                \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE && _dp_h[j] == 0) break;                                   \
+        _start_i = i, _start_j = j;                                                                         \
+        int *pre_index_i = pre_index[i];                                                                    \
+        s = mat[m * graph->node[id].base + query[j-1]]; hit = 0;                                            \
+        is_match = graph->node[id].base == query[j-1];                                                      \
+        if (indel_first == 0) { /* match/mismatch */                                                        \
+            for (k = 0; k < pre_n[i]; ++k) { /* match/mismatch */                                           \
+                pre_i = pre_index_i[k];                                                                     \
+                if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue; /* column outside that row's band */ \
+                _pre_dp_h = (score_t*)(DP_H + pre_i * dp_sn);                                               \
+                if (_pre_dp_h[j-1] + s == _dp_h[j]) {                                                       \
+                    cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1);                  \
+                    i = pre_i; --j; hit = 1; id = abpoa_graph_index_to_node_id(graph, i+beg_index);         \
+                    dp_h = DP_H + i * dp_sn; _dp_h = (score_t*)dp_h;                                        \
+                    ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;                           \
+                    break;                                                                                  \
+                }                                                                                           \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0) { /* deletion */                                                                      \
+            for (k = 0; k < pre_n[i]; ++k) {                                                                \
+                pre_i = pre_index_i[k];                                                                     \
+                if (j < dp_beg[pre_i] || j > dp_end[pre_i]) continue;                                       \
+                _pre_dp_h = (score_t*)( DP_H + pre_i * dp_sn);                                              \
+                if (_pre_dp_h[j] - gap_ext1 == _dp_h[j]) {                                                  \
+                    cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);                    \
+                    i = pre_i; hit = 1; id = abpoa_graph_index_to_node_id(graph, i+beg_index);              \
+                    dp_h = DP_H + i * dp_sn; _dp_h = (score_t*)dp_h;                                        \
+                    break;                                                                                  \
+                }                                                                                           \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0) { /* insertion */                                                                     \
+            if (_dp_h[j-1] - gap_ext1 == _dp_h[j]) {                                                        \
+                cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, 1, id, j-1); j--;                   \
+                hit = 1; ++res->n_aln_bases;                                                                \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0 && indel_first == 1) { /* match/mismatch */                                            \
+            for (k = 0; k < pre_n[i]; ++k) {                                                                \
+                pre_i = pre_index_i[k];                                                                     \
+                if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue;                                   \
+                _pre_dp_h = (score_t*)(DP_H + pre_i * dp_sn);                                               \
+                if (_pre_dp_h[j-1] + s == _dp_h[j]) { /* match/mismatch */                                  \
+                    cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1);                  \
+                    i = pre_i; --j; hit = 1; id = abpoa_graph_index_to_node_id(graph, i+beg_index);         \
+                    dp_h = DP_H + i * dp_sn; _dp_h = (score_t*)dp_h;                                        \
+                    ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;                           \
+                    indel_first = 0; /* from here on match/mismatch is tried before the indels */           \
+                    break;                                                                                  \
+                }                                                                                           \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0) err_fatal_simple("Error in lg_backtrack."); /* no move reproduces _dp_h[j] */         \
+     /* fprintf(stderr, "%d, %d, (%d)\n", i, j, indel_first); */                                            \
+    }                                                                                                       \
+    if (j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, j, -1, j-1); /* leftover query prefix becomes one insertion */ \
+    /* reverse cigar */                                                                                     \
+    res->graph_cigar = abpt->rev_cigar ? cigar : abpoa_reverse_cigar(n_c, cigar);                           \
+    res->n_cigar = n_c; res->m_cigar = m_c;                                                                 \
+    res->node_e = abpoa_graph_index_to_node_id(graph, best_i+beg_index), res->query_e=best_j-1; /*0-based*/ \
+    res->node_s = abpoa_graph_index_to_node_id(graph, _start_i+beg_index), res->query_s=_start_j-1;         \
+    /*abpoa_print_cigar(n_c, *graph_cigar, graph);*/                                                        \
+}
+
+#define simd_abpoa_ag_backtrack(score_t) { /* traceback over DP_HEF from (best_i,best_j); fills res */      \
+    int i, j, k, pre_i, n_c = 0, s, is_match, m_c = 0, id, hit, cur_op = ABPOA_ALL_OP, _start_i, _start_j;  \
+    score_t *_dp_h, *_dp_e1, *_dp_f1, *_pre_dp_h, *_pre_dp_e1; abpoa_cigar_t *cigar = 0;                    \
+    i = best_i, j = best_j; _start_i = best_i, _start_j = best_j;                                           \
+    id = abpoa_graph_index_to_node_id(graph, i+beg_index);                                                  \
+    if (best_j < qlen) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, qlen-j, -1, qlen-1);         \
+    SIMDi *dp_h = DP_HEF + dp_sn * i * 3; _dp_h = (score_t*)dp_h; /* row i: H plane, E1 and F1 follow */    \
+    int indel_first = 1; /* prefer to keep gaps at the end */                                               \
+    while (i > 0 && j > 0) { /* pushes one cigar op per pass; stops at row 0 or column 0 */                 \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE && _dp_h[j] == 0) break; /* local mode: stop at score 0 */ \
+        _start_i = i, _start_j = j; /* last cell visited; reported as node_s/query_s */                     \
+        int *pre_index_i = pre_index[i]; /* DP rows of the pre_n[i] predecessors of row i */                \
+        s = mat[m * graph->node[id].base + query[j-1]]; hit = 0; /* score of node base vs query[j-1] */     \
+        is_match = graph->node[id].base == query[j-1];                                                      \
+        if (cur_op & ABPOA_M_OP && indel_first == 0) { /* match/mismatch, tried before indels */            \
+            for (k = 0; k < pre_n[i]; ++k) {                                                                \
+                pre_i = pre_index_i[k];                                                                     \
+                if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue; /* outside pre_i's range */       \
+                _pre_dp_h = (score_t*)(DP_HEF + dp_sn * pre_i * 3);                                         \
+                if (_pre_dp_h[j-1] + s == _dp_h[j]) { /* H[i][j] came from H[pre_i][j-1] + s */             \
+                    cur_op = ABPOA_ALL_OP; hit = 1;                                                         \
+                    cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1);                  \
+                    i = pre_i; --j; id = abpoa_graph_index_to_node_id(graph, i+beg_index);                  \
+                    dp_h = DP_HEF + dp_sn * i * 3; _dp_h = (score_t*)dp_h;                                  \
+                    ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;                           \
+                    break;                                                                                  \
+                }                                                                                           \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0 && cur_op & ABPOA_E1_OP) { /* deletion: step to a predecessor row, j unchanged */      \
+            for (k = 0; k < pre_n[i]; ++k) {                                                                \
+                pre_i = pre_index_i[k];                                                                     \
+                if (j < dp_beg[pre_i] || j > dp_end[pre_i]) continue; /* outside pre_i's range */           \
+                _pre_dp_e1 = (score_t*)(DP_HEF + dp_sn * (pre_i * 3 + 1)); /* E1 plane of pre_i */          \
+                if (cur_op & ABPOA_M_OP) { /* in H: does H[i][j] equal E1 of pre_i? */                      \
+                    if (_dp_h[j] == _pre_dp_e1[j]) {                                                        \
+                        _pre_dp_h = (score_t*)(DP_HEF + dp_sn * pre_i * 3); /* E1 opened from H? */         \
+                        if (_pre_dp_h[j] - gap_oe1 == _pre_dp_e1[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;      \
+                        else cur_op = ABPOA_E1_OP; /* otherwise keep following E1 */                        \
+                        hit = 1;                                                                            \
+                        cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);                \
+                        i = pre_i; id = abpoa_graph_index_to_node_id(graph, i+beg_index);                   \
+                        dp_h = DP_HEF + dp_sn * i * 3; _dp_h = (score_t*)dp_h;                              \
+                        break;                                                                              \
+                    }                                                                                       \
+                } else { /* in E1: does E1[i][j] extend E1 of pre_i? */                                     \
+                    _dp_e1 = (score_t*)(dp_h + dp_sn); /* E1 plane of row i */                              \
+                    if (_dp_e1[j] == _pre_dp_e1[j] - gap_ext1) {                                            \
+                        _pre_dp_h = (score_t*)(DP_HEF + dp_sn * pre_i * 3); /* E1 opened from H? */         \
+                        if (_pre_dp_h[j] - gap_oe1 == _pre_dp_e1[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;      \
+                        else cur_op = ABPOA_E1_OP; /* otherwise keep following E1 */                        \
+                        hit = 1;                                                                            \
+                        cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);                \
+                        i = pre_i; id = abpoa_graph_index_to_node_id(graph, i+beg_index);                   \
+                        dp_h = DP_HEF + dp_sn * i * 3; _dp_h = (score_t*)dp_h;                              \
+                        break;                                                                              \
+                    }                                                                                       \
+                }                                                                                           \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0 && cur_op & ABPOA_F_OP) { /* insertion: stay in row i, move to column j-1 */           \
+            _dp_f1 = (score_t*)(dp_h + dp_sn * 2); /* F1 plane of row i */                                  \
+            if (cur_op & ABPOA_M_OP) { /* in H: H[i][j] must equal F1[i][j] */                              \
+                if (_dp_h[j] == _dp_f1[j]) {                                                                \
+                    if (_dp_h[j-1] - gap_oe1 == _dp_f1[j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;       \
+                    else if (_dp_f1[j-1] - gap_ext1 == _dp_f1[j]) cur_op = ABPOA_F1_OP, hit = 1;            \
+                }                                                                                           \
+            } else { /* in F1: opened from H or extended from F1 */                                         \
+                if (_dp_h[j-1] - gap_oe1 == _dp_f1[j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;           \
+                else if (_dp_f1[j-1] - gap_ext1 == _dp_f1[j]) cur_op = ABPOA_F1_OP, hit = 1;                \
+            }                                                                                               \
+            if (hit == 1) { /* query[j-1] becomes an inserted base */                                       \
+                cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, 1, id, j-1); --j;                   \
+                ++res->n_aln_bases;                                                                         \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0 && cur_op & ABPOA_M_OP && indel_first == 1) { /* match/mismatch, tried after indels */ \
+            for (k = 0; k < pre_n[i]; ++k) {                                                                \
+                pre_i = pre_index_i[k];                                                                     \
+                if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue; /* outside pre_i's range */       \
+                _pre_dp_h = (score_t*)(DP_HEF + dp_sn * pre_i * 3);                                         \
+                if (_pre_dp_h[j-1] + s == _dp_h[j]) { /* H[i][j] came from H[pre_i][j-1] + s */             \
+                    cur_op = ABPOA_ALL_OP; hit = 1;                                                         \
+                    cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1);                  \
+                    i = pre_i; --j; id = abpoa_graph_index_to_node_id(graph, i+beg_index);                  \
+                    dp_h = DP_HEF + dp_sn * i * 3; _dp_h = (score_t*)dp_h;                                  \
+                    ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;                           \
+                    indel_first = 0; /* from now on matches are tried before indels */                      \
+                    break;                                                                                  \
+                }                                                                                           \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0) err_fatal_simple("Error in ag_backtrack."); /* no move reproduces the score */        \
+     /* fprintf(stderr, "%d, %d, %d (indel_first: %d)\n", i, j, cur_op, indel_first); */                    \
+    }                                                                                                       \
+    if (j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, j, -1, j-1); /* leftover query */    \
+    /* cigar was pushed end-to-start; reverse it unless abpt->rev_cigar is set */                           \
+    res->graph_cigar = abpt->rev_cigar ? cigar : abpoa_reverse_cigar(n_c, cigar);                           \
+    res->n_cigar = n_c; res->m_cigar = m_c;                                                                 \
+    res->node_e = abpoa_graph_index_to_node_id(graph, best_i+beg_index), res->query_e=best_j-1; /*0-based*/ \
+    res->node_s = abpoa_graph_index_to_node_id(graph, _start_i+beg_index), res->query_s=_start_j-1;         \
+    /*abpoa_print_cigar(n_c, *graph_cigar, graph);*/                                                        \
+}
+
+#define simd_abpoa_cg_backtrack(score_t) { /* traceback over DP_H2E2F from (best_i,best_j); fills res */    \
+    int i, j, k, pre_i, n_c = 0, s, is_match, m_c = 0, id, hit, cur_op = ABPOA_ALL_OP, _start_i, _start_j;  \
+    score_t *_dp_h, *_dp_e1, *_dp_e2, *_dp_f1, *_dp_f2, *_pre_dp_h, *_pre_dp_e1, *_pre_dp_e2;               \
+    abpoa_cigar_t *cigar = 0;                                                                               \
+    i = best_i, j = best_j, _start_i = best_i, _start_j = best_j;                                           \
+    id = abpoa_graph_index_to_node_id(graph, i+beg_index);                                                  \
+    if (best_j < qlen) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, qlen-j, -1, qlen-1);         \
+    SIMDi *dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h; /* row i: H, E1, E2, F1, F2 planes */   \
+    int indel_first = 1; /* prefer to keep gaps at the end */                                               \
+    while (i > 0 && j > 0) { /* pushes one cigar op per pass; stops at row 0 or column 0 */                 \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE && _dp_h[j] == 0) break; /* local mode: stop at score 0 */ \
+        _start_i = i, _start_j = j; /* last cell visited; reported as node_s/query_s */                     \
+        int *pre_index_i = pre_index[i]; /* DP rows of the pre_n[i] predecessors of row i */                \
+        s = mat[m * graph->node[id].base + query[j-1]]; hit = 0; /* score of node base vs query[j-1] */     \
+        is_match = graph->node[id].base == query[j-1];                                                      \
+        if (cur_op & ABPOA_M_OP && indel_first == 0) { /* match/mismatch, tried before indels */            \
+            for (k = 0; k < pre_n[i]; ++k) {                                                                \
+                pre_i = pre_index_i[k];                                                                     \
+                if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue; /* outside pre_i's range */       \
+                _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5);                                       \
+                if (_pre_dp_h[j-1] + s == _dp_h[j]) { /* H[i][j] came from H[pre_i][j-1] + s */             \
+                    cur_op = ABPOA_ALL_OP; hit = 1;                                                         \
+                    cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1);                  \
+                    i = pre_i; --j; id = abpoa_graph_index_to_node_id(graph, i+beg_index); hit = 1;         \
+                    dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h;                                \
+                    ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;                           \
+                    break;                                                                                  \
+                }                                                                                           \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0 && cur_op & ABPOA_E_OP) { /* deletion: step to a predecessor row, j unchanged */       \
+            for (k = 0; k < pre_n[i]; ++k) {                                                                \
+                pre_i = pre_index_i[k];                                                                     \
+                if (j < dp_beg[pre_i] || j > dp_end[pre_i]) continue; /* outside pre_i's range */           \
+                if (cur_op & ABPOA_E1_OP) { /* first gap penalty: E1 */                                     \
+                    _pre_dp_e1 = (score_t*)(DP_H2E2F + dp_sn * (pre_i * 5 + 1)); /* E1 plane of pre_i */    \
+                    if (cur_op & ABPOA_M_OP) { /* in H: does H[i][j] equal E1 of pre_i? */                  \
+                        if (_dp_h[j] == _pre_dp_e1[j]) {                                                    \
+                            _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); /* E1 opened from H? */   \
+                            if (_pre_dp_h[j] - gap_oe1 == _pre_dp_e1[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;  \
+                            else cur_op = ABPOA_E1_OP; /* otherwise keep following E1 */                    \
+                            hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);   \
+                            i = pre_i; id = abpoa_graph_index_to_node_id(graph, i+beg_index);               \
+                            dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h;                        \
+                            break;                                                                          \
+                        }                                                                                   \
+                    } else { /* in E1: does E1[i][j] extend E1 of pre_i? */                                 \
+                        _dp_e1 = (score_t*)(dp_h + dp_sn); /* E1 plane of row i */                          \
+                        if (_dp_e1[j] == _pre_dp_e1[j] - gap_ext1) {                                        \
+                            _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); /* E1 opened from H? */   \
+                            if (_pre_dp_h[j] - gap_oe1 == _pre_dp_e1[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;  \
+                            else cur_op = ABPOA_E1_OP; /* otherwise keep following E1 */                    \
+                            hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);   \
+                            i = pre_i; id = abpoa_graph_index_to_node_id(graph, i+beg_index);               \
+                            dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h;                        \
+                            break;                                                                          \
+                        }                                                                                   \
+                    }                                                                                       \
+                }                                                                                           \
+                if (cur_op & ABPOA_E2_OP) { /* second gap penalty: E2 */                                    \
+                    _pre_dp_e2 = (score_t*)(DP_H2E2F + dp_sn * (pre_i * 5 + 2)); /* E2 plane of pre_i */    \
+                    if (cur_op & ABPOA_M_OP) { /* in H: does H[i][j] equal E2 of pre_i? */                  \
+                        if (_dp_h[j] == _pre_dp_e2[j]) {                                                    \
+                            _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); /* E2 opened from H? */   \
+                            if (_pre_dp_h[j] - gap_oe2 == _pre_dp_e2[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;  \
+                            else cur_op = ABPOA_E2_OP; /* otherwise keep following E2 */                    \
+                            hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);   \
+                            i = pre_i; id = abpoa_graph_index_to_node_id(graph, i+beg_index);               \
+                            dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h;                        \
+                            break;                                                                          \
+                        }                                                                                   \
+                    } else { /* in E2: does E2[i][j] extend E2 of pre_i? */                                 \
+                        _dp_e2 = (score_t*)(dp_h + dp_sn * 2); /* E2 plane of row i */                      \
+                        if (_dp_e2[j] == _pre_dp_e2[j] - gap_ext2) {                                        \
+                            _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5); /* E2 opened from H? */   \
+                            if (_pre_dp_h[j] - gap_oe2 == _pre_dp_e2[j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;  \
+                            else cur_op = ABPOA_E2_OP; /* otherwise keep following E2 */                    \
+                            hit = 1; cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, j-1);   \
+                            i = pre_i; id = abpoa_graph_index_to_node_id(graph, i+beg_index);               \
+                            dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h;                        \
+                            break;                                                                          \
+                        }                                                                                   \
+                    }                                                                                       \
+                }                                                                                           \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0 && cur_op & ABPOA_F_OP) { /* insertion: stay in row i, move to column j-1 */           \
+            if (cur_op & ABPOA_F1_OP) { /* first gap penalty: F1 */                                         \
+                _dp_f1 = (score_t*)(dp_h + dp_sn * 3); /* F1 plane of row i */                              \
+                if (cur_op & ABPOA_M_OP) { /* in H: H[i][j] must equal F1[i][j] */                          \
+                    if (_dp_h[j] == _dp_f1[j]) {                                                            \
+                        if (_dp_h[j-1] - gap_oe1 == _dp_f1[j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;   \
+                        else if (_dp_f1[j-1] - gap_ext1 == _dp_f1[j]) cur_op = ABPOA_F1_OP, hit = 1;        \
+                    }                                                                                       \
+                } else { /* in F1: opened from H or extended from F1 */                                     \
+                    if (_dp_h[j-1] - gap_oe1 == _dp_f1[j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;       \
+                    else if (_dp_f1[j-1] - gap_ext1 == _dp_f1[j]) cur_op = ABPOA_F1_OP, hit = 1;            \
+                }                                                                                           \
+            }                                                                                               \
+            if (hit == 0 && cur_op & ABPOA_F2_OP) { /* second gap penalty: F2 */                            \
+                _dp_f2 = (score_t*)(dp_h + dp_sn * 4); /* F2 plane of row i */                              \
+                if (cur_op & ABPOA_M_OP) { /* in H: H[i][j] must equal F2[i][j] */                          \
+                    if (_dp_h[j] == _dp_f2[j]) {                                                            \
+                        if (_dp_h[j-1] - gap_oe2 == _dp_f2[j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;   \
+                        else if (_dp_f2[j-1] - gap_ext2 == _dp_f2[j]) cur_op = ABPOA_F2_OP, hit = 1;        \
+                    }                                                                                       \
+                } else { /* in F2: opened from H or extended from F2 */                                     \
+                    if (_dp_h[j-1] - gap_oe2 == _dp_f2[j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;       \
+                    else if (_dp_f2[j-1] - gap_ext2 == _dp_f2[j]) cur_op = ABPOA_F2_OP, hit = 1;            \
+                }                                                                                           \
+            }                                                                                               \
+            if (hit == 1) { /* query[j-1] becomes an inserted base */                                       \
+                cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, 1, id, j-1); --j;                   \
+                ++res->n_aln_bases;                                                                         \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0 && cur_op & ABPOA_M_OP && indel_first == 1) { /* match/mismatch, tried after indels */ \
+            for (k = 0; k < pre_n[i]; ++k) {                                                                \
+                pre_i = pre_index_i[k];                                                                     \
+                if (j-1 < dp_beg[pre_i] || j-1 > dp_end[pre_i]) continue; /* outside pre_i's range */       \
+                _pre_dp_h = (score_t*)(DP_H2E2F + dp_sn * pre_i * 5);                                       \
+                if (_pre_dp_h[j-1] + s == _dp_h[j]) { /* H[i][j] came from H[pre_i][j-1] + s */             \
+                    cur_op = ABPOA_ALL_OP; hit = 1;                                                         \
+                    cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, j-1);                  \
+                    i = pre_i; --j; id = abpoa_graph_index_to_node_id(graph, i+beg_index); hit = 1;         \
+                    dp_h = DP_H2E2F + dp_sn * i * 5; _dp_h = (score_t*)dp_h;                                \
+                    ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;                           \
+                    indel_first = 0; /* from now on matches are tried before indels */                      \
+                    break;                                                                                  \
+                }                                                                                           \
+            }                                                                                               \
+        }                                                                                                   \
+        if (hit == 0) err_fatal_simple("Error in cg_backtrack."); /* no move reproduces the score */        \
+     /* fprintf(stderr, "%d, %d, %d\n", i, j, cur_op); */                                                   \
+    }                                                                                                       \
+    if (j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, j, -1, j-1); /* leftover query */    \
+    /* cigar was pushed end-to-start; reverse it unless abpt->rev_cigar is set */                           \
+    res->graph_cigar = abpt->rev_cigar ? cigar : abpoa_reverse_cigar(n_c, cigar);                           \
+    res->n_cigar = n_c; res->m_cigar = m_c;                                                                 \
+    res->node_e = abpoa_graph_index_to_node_id(graph, best_i+beg_index), res->query_e=best_j-1; /*0-based*/ \
+    res->node_s = abpoa_graph_index_to_node_id(graph, _start_i+beg_index), res->query_s=_start_j-1;         \
+    /*abpoa_print_cigar(n_c, *graph_cigar, graph);*/                                                        \
+}
+
+// simd_abpoa_var
+// simd_abpoa_ag_only_var
+// simd_abpoa_init_var
+#define simd_abpoa_var(score_t, sp, SIMDSetOne, SIMDShiftOneN)                                      \
+    /* shared DP declarations and lane masks; expands unbraced into the calling scope */            \
+    abpoa_graph_t *graph = ab->abg; abpoa_simd_matrix_t *abm = ab->abm;                             \
+    int matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1;                              \
+    int **pre_index, *pre_n, _pre_index, _pre_n, pre_i;                                             \
+    int i, j, k, *dp_beg, *dp_beg_sn, *dp_end, *dp_end_sn, node_id, index_i, dp_i;                  \
+    int beg, end, beg_sn, end_sn, _beg_sn, _end_sn, pre_beg_sn, pre_end, sn_i;                      \
+    int pn, log_n, size, qp_sn, dp_sn; /* pn: values per SIMDi; qp_sn/dp_sn: SIMDi per row */       \
+    SIMDi *dp_h, *pre_dp_h, *qp, *qi=NULL;                                                          \
+    score_t *_dp_h=NULL, *_qi, best_score = sp.inf_min, inf_min = sp.inf_min;                       \
+    int *mat = abpt->mat, m = abpt->m; score_t gap_ext1 = abpt->gap_ext1;                           \
+    int w = abpt->wb < 0 ? qlen : abpt->wb+(int)(abpt->wf*qlen); /* wb < 0: use w = qlen */         \
+    int best_i = 0, best_j = 0, best_id = 0, max, max_i=-1;                                         \
+    SIMDi zero = SIMDSetZeroi(), SIMD_INF_MIN = SIMDSetOne(inf_min);                                \
+    pn = sp.num_of_value; qp_sn = dp_sn = (matrix_col_n + pn - 1) / pn;                             \
+    log_n = sp.log_num, size = sp.size; qp = abm->s_mem;                                            \
+    int set_num; SIMDi *PRE_MASK, *SUF_MIN, *PRE_MIN;                                               \
+    PRE_MASK = (SIMDi*)SIMDMalloc((pn+1) * size, size); /* [i]: lanes 0..i = -1, rest 0 */          \
+    SUF_MIN = (SIMDi*)SIMDMalloc((pn+1) * size, size); /* [i]: INF_MIN shifted left i+1 times */    \
+    PRE_MIN = (SIMDi*)SIMDMalloc(pn * size, size); /* [i]: lanes 0..i-1 = inf_min, rest 0 */        \
+    for (i = 0; i < pn; ++i) {                                                                      \
+        score_t *pre_mask = (score_t*)(PRE_MASK+i);                                                 \
+        for (j = 0; j <= i; ++j) pre_mask[j] = -1;                                                  \
+        for (j = i+1; j < pn; ++j) pre_mask[j] = 0;                                                 \
+    } PRE_MASK[pn] = PRE_MASK[pn-1];                                                                \
+    SUF_MIN[0] = SIMDShiftLeft(SIMD_INF_MIN, SIMDShiftOneN);                                        \
+    for (i = 1; i < pn; ++i) SUF_MIN[i] = SIMDShiftLeft(SUF_MIN[i-1], SIMDShiftOneN);               \
+    SUF_MIN[pn] = SUF_MIN[pn-1];                                                                    \
+    for (i = 0; i < pn; ++i) { /* from 0 so that PRE_MIN[0] (all lanes 0) is initialized too */     \
+        score_t *pre_min = (score_t*)(PRE_MIN + i);                                                 \
+        for (j = 0; j < i; ++j) pre_min[j] = inf_min;                                               \
+        for (j = i; j < pn; ++j) pre_min[j] = 0;                                                    \
+    }
+
+#define simd_abpoa_lg_only_var(score_t, SIMDSetOne, SIMDAdd)                                    \
+    /* DP_H starts right after the m query-profile rows; qi starts right after DP_H */          \
+    SIMDi *DP_H = qp + qp_sn * abpt->m;                                                         \
+    SIMDi GAP_E1 = SIMDSetOne(gap_ext1), *GAP_E1S = (SIMDi*)SIMDMalloc(log_n * size, size);     \
+    qi = DP_H + dp_sn * matrix_row_n;                                                           \
+    /* GAP_E1S[0] is GAP_E1; each later entry is SIMDAdd of the previous entry with itself */   \
+    GAP_E1S[0] = GAP_E1;                                                                        \
+    for (i = 1; i < log_n; ++i) GAP_E1S[i] = SIMDAdd(GAP_E1S[i-1], GAP_E1S[i-1]);
+
+#define simd_abpoa_ag_only_var(score_t, SIMDSetOne, SIMDAdd)                                            \
+    score_t *_dp_e1, *_dp_f1, gap_open1 = abpt->gap_open1, gap_oe1 = abpt->gap_open1 + abpt->gap_ext1;  \
+    SIMDi *DP_HEF, *dp_e1, *pre_dp_e1, *dp_f1; int pre_end_sn;                                          \
+    DP_HEF = qp + qp_sn * abpt->m; qi = DP_HEF + dp_sn * matrix_row_n * 3;                              \
+    SIMDi GAP_O1 = SIMDSetOne(gap_open1), GAP_E1 = SIMDSetOne(gap_ext1), GAP_OE1 = SIMDSetOne(gap_oe1); \
+    SIMDi *GAP_E1S =  (SIMDi*)SIMDMalloc(log_n * size, size);  GAP_E1S[0] = GAP_E1;                     \
+    for (i = 1; i < log_n; ++i) {                                                                       \
+        GAP_E1S[i] = SIMDAdd(GAP_E1S[i-1], GAP_E1S[i-1]);                                               \
+    }
+
+#define simd_abpoa_cg_only_var(score_t, SIMDSetOne, SIMDAdd) /* extra state for the two-piece gap penalty */      \
+    score_t *_dp_e1, *_dp_e2, *_dp_f1, *_dp_f2, gap_open1 = abpt->gap_open1, gap_oe1 = gap_open1 + gap_ext1;      \
+    score_t gap_open2 = abpt->gap_open2, gap_ext2 = abpt->gap_ext2, gap_oe2 = gap_open2 + gap_ext2;               \
+    SIMDi *DP_H2E2F, *dp_e1, *dp_e2, *dp_f1, *dp_f2, *pre_dp_e1, *pre_dp_e2; int pre_end_sn;                      \
+    SIMDi GAP_O1 = SIMDSetOne(gap_open1), GAP_O2 = SIMDSetOne(gap_open2);                                         \
+    SIMDi GAP_E1 = SIMDSetOne(gap_ext1), GAP_E2 = SIMDSetOne(gap_ext2);                                           \
+    SIMDi GAP_OE1 = SIMDSetOne(gap_oe1), GAP_OE2 = SIMDSetOne(gap_oe2);                                           \
+    DP_H2E2F = qp + qp_sn * abpt->m; qi = DP_H2E2F + dp_sn * matrix_row_n * 5; /* 5: H,E1,E2,F1,F2 per row */     \
+    SIMDi *GAP_E1S =  (SIMDi*)SIMDMalloc(log_n * size, size), *GAP_E2S =  (SIMDi*)SIMDMalloc(log_n * size, size); \
+    GAP_E1S[0] = GAP_E1; GAP_E2S[0] = GAP_E2; /* both tables are not freed by simd_abpoa_free_var */              \
+    for (i = 1; i < log_n; ++i) { /* entry i = SIMDAdd(entry i-1, itself): presumably 2^i * gap_ext */            \
+        GAP_E1S[i] = SIMDAdd(GAP_E1S[i-1], GAP_E1S[i-1]);                                                         \
+        GAP_E2S[i] = SIMDAdd(GAP_E2S[i-1], GAP_E2S[i-1]);                                                         \
+    }
+
+#define simd_abpoa_init_var(score_t) {                                                                                      \
+    /* query profile: reset every vector, then fill one row per k in [0, abpt->m) */                                        \
+    for (i = 0; i < abpt->m * qp_sn; ++i) qp[i] = SIMD_INF_MIN;                                                             \
+    for (k = 0; k < abpt->m; ++k) {                                                                                         \
+        int *sub_row = &mat[k * abpt->m];                                                                                   \
+        score_t *prof = (score_t*)(qp + k * qp_sn);                                                                         \
+        prof[0] = 0; /* lane 0 is the extra leading column */                                                               \
+        for (j = 1; j <= qlen; ++j) prof[j] = (score_t)sub_row[query[j-1]];                                                 \
+        for (j = qlen + 1; j < qp_sn * pn; ++j) prof[j] = 0; /* padding lanes */                                            \
+    }                                                                                                                       \
+    if (abpt->wb >= 0 || abpt->align_mode == ABPOA_LOCAL_MODE || abpt->align_mode == ABPOA_EXTEND_MODE) {                   \
+        _qi = (score_t*)qi; /* query index per lane, -1 marks padding */                                                    \
+        for (i = 0; i <= qlen; ++i) _qi[i] = i;                                                                             \
+        for (i = qlen + 1; i < (qlen / pn + 1) * pn; ++i) _qi[i] = -1;                                                      \
+    }                                                                                                                       \
+    dp_beg = abm->dp_beg; dp_end = abm->dp_end; dp_beg_sn = abm->dp_beg_sn; dp_end_sn = abm->dp_end_sn; /* for backtrack */ \
+    pre_index = (int**)_err_calloc(matrix_row_n, sizeof(int*)); /* DP-row indices of each row's predecessors */             \
+    pre_n = (int*)_err_calloc(matrix_row_n, sizeof(int)); /* number of kept predecessors per row */                         \
+    for (dp_i = 1, index_i = beg_index + 1; index_i <= end_index; ++dp_i, ++index_i) {                                      \
+        node_id = abpoa_graph_index_to_node_id(graph, index_i);                                                             \
+        pre_n[dp_i] = graph->node[node_id].in_edge_n; /* all in-edges for now; trimmed below */                             \
+        pre_index[dp_i] = (int*)_err_malloc(pre_n[dp_i] * sizeof(int));                                                     \
+        for (j = 0, _pre_n = 0; j < pre_n[dp_i]; ++j) {                                                                     \
+            _pre_index = abpoa_graph_node_id_to_index(graph, graph->node[node_id].in_id[j]);                                \
+            if (!index_map[_pre_index]) continue; /* not flagged in index_map: skip */                                      \
+            pre_index[dp_i][_pre_n++] = _pre_index - beg_index;                                                             \
+        }                                                                                                                   \
+        pre_n[dp_i] = _pre_n;                                                                                               \
+    }                                                                                                                       \
+}
+
+#define simd_abpoa_free_var { /* frees pre_index (rows + table), pre_n, PRE_MASK, SUF_MIN, PRE_MIN */ \
+    for (i = 0; i < matrix_row_n; ++i) { free(pre_index[i]); }                                        \
+    free(pre_index); free(pre_n); SIMDFree(PRE_MASK); SIMDFree(SUF_MIN); SIMDFree(PRE_MIN);           \
+}
+
+#define simd_abpoa_lg_var(score_t, sp, SIMDSetOne, SIMDShiftOneN, SIMDAdd) /* variable set-up, lg variant */ \
+    simd_abpoa_var(score_t, sp, SIMDSetOne, SIMDShiftOneN); /* presumably the shared declarations */         \
+    simd_abpoa_lg_only_var(score_t, SIMDSetOne, SIMDAdd); /* presumably the lg-specific ones */              \
+    simd_abpoa_init_var(score_t); /* query profile, query index, pre-node tables */
+
+#define simd_abpoa_ag_var(score_t, sp, SIMDSetOne, SIMDShiftOneN, SIMDAdd) /* variable set-up, ag variant */ \
+    simd_abpoa_var(score_t, sp, SIMDSetOne, SIMDShiftOneN); /* presumably the shared declarations */         \
+    simd_abpoa_ag_only_var(score_t, SIMDSetOne, SIMDAdd); /* ag-specific declarations, GAP_E1S table */      \
+    simd_abpoa_init_var(score_t); /* query profile, query index, pre-node tables */
+
+#define simd_abpoa_cg_var(score_t, sp, SIMDSetOne, SIMDShiftOneN, SIMDAdd) /* variable set-up, cg variant */ \
+    simd_abpoa_var(score_t, sp, SIMDSetOne, SIMDShiftOneN); /* presumably the shared declarations */         \
+    simd_abpoa_cg_only_var(score_t, SIMDSetOne, SIMDAdd); /* cg-specific declarations, GAP_E1S/GAP_E2S */    \
+    simd_abpoa_init_var(score_t); /* query profile, query index, pre-node tables */
+
+#define simd_abpoa_lg_first_row {                                                                       \
+    /* first row: pick its column range, widen it to whole vectors, set the row pointer */              \
+    if (abpt->wb < 0) { /* whole query */                                                               \
+        dp_beg[0] = 0; dp_end[0] = qlen;                                                                \
+    } else {                                                                                            \
+        graph->node_id_to_max_pos_left[beg_node_id] = graph->node_id_to_max_pos_right[beg_node_id] = 0; \
+        for (i = 0; i < graph->node[beg_node_id].out_edge_n; ++i) { /* successors of beg_node_id */     \
+            int next_id = graph->node[beg_node_id].out_id[i];                                           \
+            if (index_map[abpoa_graph_node_id_to_index(graph, next_id)])                                \
+                graph->node_id_to_max_pos_left[next_id] = graph->node_id_to_max_pos_right[next_id] = 1; \
+        }                                                                                               \
+        dp_beg[0] = 0; dp_end[0] = GET_AD_DP_END(graph, w, beg_node_id, end_node_id, qlen);             \
+    }                                                                                                   \
+    dp_beg_sn[0] = dp_beg[0] / pn; dp_beg[0] = dp_beg_sn[0] * pn;                                       \
+    dp_end_sn[0] = dp_end[0] / pn; dp_end[0] = (dp_end_sn[0] + 1) * pn - 1;                             \
+    _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1); dp_h = DP_H;                                         \
+}
+
+#define simd_abpoa_ag_first_row {                                                                       \
+    /* first row: pick its column range, widen it to whole vectors, set the row pointers */             \
+    if (abpt->wb < 0) { /* whole query */                                                               \
+        dp_beg[0] = 0; dp_end[0] = qlen;                                                                \
+    } else {                                                                                            \
+        graph->node_id_to_max_pos_left[beg_node_id] = graph->node_id_to_max_pos_right[beg_node_id] = 0; \
+        for (i = 0; i < graph->node[beg_node_id].out_edge_n; ++i) { /* successors of beg_node_id */     \
+            int next_id = graph->node[beg_node_id].out_id[i];                                           \
+            if (index_map[abpoa_graph_node_id_to_index(graph, next_id)])                                \
+                graph->node_id_to_max_pos_left[next_id] = graph->node_id_to_max_pos_right[next_id] = 1; \
+        }                                                                                               \
+        dp_beg[0] = 0; dp_end[0] = GET_AD_DP_END(graph, w, beg_node_id, end_node_id, qlen);             \
+    }                                                                                                   \
+    dp_beg_sn[0] = dp_beg[0] / pn; dp_beg[0] = dp_beg_sn[0] * pn;                                       \
+    dp_end_sn[0] = dp_end[0] / pn; dp_end[0] = (dp_end_sn[0] + 1) * pn - 1;                             \
+    _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1);                                                      \
+    dp_h = DP_HEF; dp_e1 = &dp_h[dp_sn]; dp_f1 = &dp_e1[dp_sn]; /* H, E1, F1: dp_sn apart */            \
+}
+
+#define simd_abpoa_cg_first_row {                                                                               \
+    /* first row: pick its column range, widen it to whole vectors, set the row pointers */                     \
+    if (abpt->wb < 0) { /* whole query */                                                                       \
+        dp_beg[0] = 0; dp_end[0] = qlen;                                                                        \
+    } else {                                                                                                    \
+        graph->node_id_to_max_pos_left[beg_node_id] = graph->node_id_to_max_pos_right[beg_node_id] = 0;         \
+        for (i = 0; i < graph->node[beg_node_id].out_edge_n; ++i) { /* successors of beg_node_id */             \
+            int next_id = graph->node[beg_node_id].out_id[i];                                                   \
+            if (index_map[abpoa_graph_node_id_to_index(graph, next_id)])                                        \
+                graph->node_id_to_max_pos_left[next_id] = graph->node_id_to_max_pos_right[next_id] = 1;         \
+        }                                                                                                       \
+        dp_beg[0] = 0; dp_end[0] = GET_AD_DP_END(graph, w, beg_node_id, end_node_id, qlen);                     \
+    }                                                                                                           \
+    dp_beg_sn[0] = dp_beg[0] / pn; dp_beg[0] = dp_beg_sn[0] * pn;                                               \
+    dp_end_sn[0] = dp_end[0] / pn; dp_end[0] = (dp_end_sn[0] + 1) * pn - 1;                                     \
+    _end_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1);                                                              \
+    dp_h = DP_H2E2F; dp_e1 = &dp_h[dp_sn]; dp_e2 = &dp_e1[dp_sn]; dp_f1 = &dp_e2[dp_sn]; dp_f2 = &dp_f1[dp_sn]; \
+}
+
+#define simd_abpoa_lg_first_dp(score_t) { /* initialise DP row 0 (H only) */                     \
+    simd_abpoa_lg_first_row;                                                                     \
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                  \
+        /* _end_sn is an inclusive vector index (capped at dp_sn-1): zero that vector as well */ \
+        for (i = 0; i <= _end_sn; ++i) dp_h[i] = zero;                                           \
+    } else {                                                                                     \
+        for (i = 0; i <= _end_sn; ++i) {                                                         \
+            dp_h[i] = SIMD_INF_MIN;                                                              \
+        }                                                                                        \
+        _dp_h = (score_t*)dp_h;                                                                  \
+        for (i = 0; i <= dp_end[0]; ++i) { /* no SIMD parallelization */                         \
+            _dp_h[i] = -gap_ext1 * i; /* row 0: i gap extensions */                              \
+        }                                                                                        \
+    }                                                                                            \
+}
+
+#define simd_abpoa_ag_first_dp(score_t) { /* initialise DP row 0 (H, E1, F1) */                  \
+    simd_abpoa_ag_first_row;                                                                     \
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                  \
+        /* _end_sn is an inclusive vector index (capped at dp_sn-1): zero that vector as well */ \
+        for (i = 0; i <= _end_sn; ++i) dp_h[i] = dp_e1[i] = dp_f1[i] = zero;                     \
+    } else {                                                                                     \
+        for (i = 0; i <= _end_sn; ++i) {                                                         \
+            dp_h[i] = SIMD_INF_MIN; dp_e1[i] = SIMD_INF_MIN;                                     \
+        }                                                                                        \
+        _dp_h=(score_t*)dp_h,_dp_e1=(score_t*)dp_e1,_dp_f1=(score_t*)dp_f1;                      \
+        _dp_h[0] = 0; _dp_e1[0] = -(gap_oe1), _dp_f1[0] = inf_min;                               \
+        for (i = 1; i <= dp_end[0]; ++i) { /* no SIMD parallelization */                         \
+            _dp_f1[i] = -gap_open1 - gap_ext1 * i;                                               \
+            _dp_h[i] = -gap_open1 - gap_ext1 * i;                                                \
+        }                                                                                        \
+    }                                                                                            \
+}
+
+#define simd_abpoa_cg_first_dp(score_t) { /* initialise DP row 0 (H, E1, E2, F1, F2) */            \
+    simd_abpoa_cg_first_row;                                                                       \
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                    \
+        /* _end_sn is an inclusive vector index (capped at dp_sn-1): zero that vector as well */   \
+        for (i = 0; i <= _end_sn; ++i) dp_h[i] = dp_e1[i] = dp_e2[i] = dp_f1[i] = dp_f2[i] = zero; \
+    } else {                                                                                       \
+        for (i = 0; i <= _end_sn; ++i) {                                                           \
+            dp_h[i] = SIMD_INF_MIN; dp_e1[i] = SIMD_INF_MIN; dp_e2[i] = SIMD_INF_MIN;              \
+        }                                                                                          \
+        _dp_h = (score_t*)dp_h, _dp_e1 = (score_t*)dp_e1, _dp_e2 = (score_t*)dp_e2;                \
+        _dp_f1 = (score_t*)dp_f1, _dp_f2 = (score_t*)dp_f2;                                        \
+        _dp_h[0] = 0; _dp_e1[0] = -(gap_oe1); _dp_e2[0] = -(gap_oe2);                              \
+        _dp_f1[0] = _dp_f2[0] = inf_min;                                                           \
+        for (i = 1; i <= dp_end[0]; ++i) { /* no SIMD parallelization */                           \
+            _dp_f1[i] = -gap_open1 - gap_ext1 * i;                                                 \
+            _dp_f2[i] = -gap_open2 - gap_ext2 * i;                                                 \
+            _dp_h[i] = MAX_OF_TWO(_dp_f1[i], _dp_f2[i]);                                           \
+        }                                                                                          \
+    }                                                                                              \
+}
+
+// SIMD_SET_F: updates F in place in up to 6 rounds (one per value below log_n) of max(F, shifted(F - GAP_E1S[k])). Tables: mask[pn], suf_min[pn]; PRE_MIN is indexed by 1, 2, 4, ... The SIMDAdd argument is unused.
+#define SIMD_SET_F(F, log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E1S, SIMDMax, SIMDAdd, SIMDSub, SIMDShiftOneN) { \
+    if (set_num == pn) { /* unmasked variant: each round ORs PRE_MIN[2^k] into the shifted copy */                     \
+        F = SIMDMax(F, SIMDOri(SIMDShiftLeft(SIMDSub(F, GAP_E1S[0]), SIMDShiftOneN), PRE_MIN[1]));                     \
+        if (log_n > 1) { /* round k subtracts GAP_E1S[k] and shifts by SIMDShiftOneN<<k; rounds >= log_n skipped */    \
+            F = SIMDMax(F, SIMDOri(SIMDShiftLeft(SIMDSub(F, GAP_E1S[1]), SIMDShiftOneN<<1), PRE_MIN[2]));              \
+        } if (log_n > 2) {                                                                                             \
+            F = SIMDMax(F, SIMDOri(SIMDShiftLeft(SIMDSub(F, GAP_E1S[2]), SIMDShiftOneN<<2), PRE_MIN[4]));              \
+        } if (log_n > 3) {                                                                                             \
+            F = SIMDMax(F, SIMDOri(SIMDShiftLeft(SIMDSub(F, GAP_E1S[3]), SIMDShiftOneN<<3), PRE_MIN[8]));              \
+        } if (log_n > 4) {                                                                                             \
+            F = SIMDMax(F, SIMDOri(SIMDShiftLeft(SIMDSub(F, GAP_E1S[4]), SIMDShiftOneN<<4), PRE_MIN[16]));             \
+        } if (log_n > 5) {                                                                                             \
+            F = SIMDMax(F, SIMDOri(SIMDShiftLeft(SIMDSub(F, GAP_E1S[5]), SIMDShiftOneN<<5), PRE_MIN[32]));             \
+        }                                                                                                              \
+    } else { /* masked variant (original note: suffix MIN_INF): also applies PRE_MASK[cov_bit] and SUF_MIN[cov_bit] */                                             \
+        int cov_bit = set_num; /* table index; advanced by 2, 4, 8, ... before the later rounds */                                                                 \
+        F = SIMDMax(F, SIMDOri(SIMDAndi(SIMDShiftLeft(SIMDSub(F, GAP_E1S[0]), SIMDShiftOneN), PRE_MASK[cov_bit]), SIMDOri(SUF_MIN[cov_bit], PRE_MIN[1])));         \
+        if (log_n > 1) {                                                                                                                                           \
+            cov_bit += 2;                                                                                                                                          \
+            F = SIMDMax(F, SIMDOri(SIMDAndi(SIMDShiftLeft(SIMDSub(F, GAP_E1S[1]), SIMDShiftOneN<<1), PRE_MASK[cov_bit]), SIMDOri(SUF_MIN[cov_bit], PRE_MIN[2])));  \
+        } if (log_n > 2) {                                                                                                                                         \
+            cov_bit += 4;                                                                                                                                          \
+            F = SIMDMax(F, SIMDOri(SIMDAndi(SIMDShiftLeft(SIMDSub(F, GAP_E1S[2]), SIMDShiftOneN<<2), PRE_MASK[cov_bit]), SIMDOri(SUF_MIN[cov_bit], PRE_MIN[4])));  \
+        } if (log_n > 3) {                                                                                                                                         \
+            cov_bit += 8;                                                                                                                                          \
+            F = SIMDMax(F, SIMDOri(SIMDAndi(SIMDShiftLeft(SIMDSub(F, GAP_E1S[3]), SIMDShiftOneN<<3), PRE_MASK[cov_bit]), SIMDOri(SUF_MIN[cov_bit], PRE_MIN[8])));  \
+        } if (log_n > 4) {                                                                                                                                         \
+            cov_bit += 16;                                                                                                                                         \
+            F = SIMDMax(F, SIMDOri(SIMDAndi(SIMDShiftLeft(SIMDSub(F, GAP_E1S[4]), SIMDShiftOneN<<4), PRE_MASK[cov_bit]), SIMDOri(SUF_MIN[cov_bit], PRE_MIN[16]))); \
+        } if (log_n > 5) {                                                                                                                                         \
+            cov_bit += 32;                                                                                                                                         \
+            F = SIMDMax(F, SIMDOri(SIMDAndi(SIMDShiftLeft(SIMDSub(F, GAP_E1S[5]), SIMDShiftOneN<<5), PRE_MASK[cov_bit]), SIMDOri(SUF_MIN[cov_bit], PRE_MIN[32]))); \
+        }                                                                                                                                                          \
+    }                                                                                                                                                              \
+}
+
+#define simd_abpoa_lg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub) { /* fill row dp_i */   \
+    node_id = abpoa_graph_index_to_node_id(graph, index_i);                                         \
+    SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain;                               \
+    dp_h = &DP_H[dp_i * dp_sn]; _dp_h = (score_t*)dp_h;                                             \
+    int min_pre_beg_sn, max_pre_end_sn; /* predecessor vector range (0..end_sn when wb < 0) */      \
+    if (abpt->wb < 0) { /* wb < 0: the row covers columns 0..qlen */                                \
+        beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen;                                          \
+        beg_sn = dp_beg_sn[dp_i] = (dp_beg[dp_i])/pn; end_sn = dp_end_sn[dp_i] = (dp_end[dp_i])/pn; \
+        min_pre_beg_sn = 0, max_pre_end_sn = end_sn;                                                \
+    } else { /* range from GET_AD_DP_BEGIN/END; beg_sn raised to min_pre_beg_sn */                  \
+        beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen); \
+        beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;                         \
+        for (i = 0; i < pre_n[dp_i]; ++i) {                                                         \
+            pre_i = pre_index[dp_i][i];                                                             \
+            if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i];               \
+            if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i];               \
+        } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn;                                     \
+        dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn; /* vector-aligned */   \
+        end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1;           \
+    }                                                                                               \
+    /* loop query */                                                                                                         \
+    /* first predecessor: it overwrites dp_h, the remaining ones are max-merged below */                                     \
+    pre_i = pre_index[dp_i][0];                                                                                              \
+    pre_dp_h = DP_H + pre_i * dp_sn;                                                                                         \
+    pre_end = dp_end[pre_i];                                                                                                 \
+    pre_beg_sn = dp_beg_sn[pre_i];                                                                                           \
+    /* set M from (pre_i, q_i-1), E from (pre_i, q_i) */                                                                     \
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) { /* local: vectors 0..end_sn; first is derived from zero */                   \
+        _beg_sn = 0, _end_sn = end_sn; first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN);                           \
+    } else {                                                                                                                 \
+        if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN); \
+        else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN);                       \
+        _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);                                                             \
+        for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN;                                                           \
+        for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN;                                 \
+    }                                                                                                                        \
+    for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization: one vector per iteration */                     \
+        remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN); /* first|remain: presumably the pre row at q_i-1 */           \
+        dp_h[sn_i] = SIMDMax(SIMDAdd(SIMDOri(first, remain), q[sn_i]), SIMDSub(pre_dp_h[sn_i], GAP_E1)); /* max(M, E) */     \
+        first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN);                                                \
+    }                                                                                                                        \
+    /* get max m and e over the remaining predecessors */                                                                         \
+    for (i = 1; i < pre_n[dp_i]; ++i) {                                                                                           \
+        pre_i = pre_index[dp_i][i];                                                                                               \
+        pre_dp_h = DP_H + pre_i * dp_sn;                                                                                          \
+        pre_end = dp_end[pre_i];                                                                                                  \
+        pre_beg_sn = dp_beg_sn[pre_i];                                                                                            \
+        /* set M from (pre_i, q_i-1), E from (pre_i, q_i) */                                                                      \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) { /* _beg_sn/_end_sn keep the values set for the first predecessor */           \
+            first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN);                                                           \
+        } else {                                                                                                                  \
+            if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN);  \
+            else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN);                        \
+            _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);                                                              \
+        }                                                                                                                         \
+        for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */                                                \
+            remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN);                                                                \
+            dp_h[sn_i] = SIMDMax(SIMDAdd(SIMDOri(first, remain), q[sn_i]), SIMDMax(SIMDSub(pre_dp_h[sn_i], GAP_E1), dp_h[sn_i])); \
+            first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN);                                                 \
+        } /* now we have max(h,e) stored at dp_h */                                                                               \
+    }                                                                                                                             \
+    /* new F start: SIMD_SET_F per vector; first presumably carries the last value minus GAP_E1 to the next one */                     \
+    first = SIMDOri(SIMDAndi(dp_h[beg_sn], PRE_MASK[0]), SUF_MIN[0]);                                                                  \
+    for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) {                                                                                      \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                                    \
+            set_num = pn;                                                                                                              \
+        } else {                                                                                                                       \
+            if (sn_i < min_pre_beg_sn) {                                                                                               \
+                _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n"); /* presumably does not return; set_num is not assigned here */ \
+            } else if (sn_i > max_pre_end_sn) { /* past every predecessor's range: set_num becomes 1 or 0 */                           \
+                set_num = sn_i == max_pre_end_sn+1 ? 1 : 0;                                                                            \
+            } else set_num = pn;                                                                                                       \
+        }                                                                                                                              \
+        dp_h[sn_i] = SIMDMax(dp_h[sn_i], first);                                                                                       \
+        SIMD_SET_F(dp_h[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E1S, SIMDMax, SIMDAdd, SIMDSub, SIMDShiftOneN);         \
+        first = SIMDOri(SIMDAndi(SIMDShiftRight(SIMDSub(dp_h[sn_i], GAP_E1), SIMDTotalBytes-SIMDShiftOneN), PRE_MASK[0]), SUF_MIN[0]); \
+    }                                                                                                                                  \
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) for (sn_i = 0; sn_i <= end_sn; ++sn_i) dp_h[sn_i] = SIMDMax(zero, dp_h[sn_i]);           \
+}
+
+#define simd_abpoa_ag_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual) { \
+    node_id = abpoa_graph_index_to_node_id(graph, index_i);                                                                       \
+    SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain;                                                             \
+    dp_h = DP_HEF + dp_i * 3 * dp_sn; dp_e1 = dp_h + dp_sn; dp_f1 = dp_e1 + dp_sn;                                                \
+    _dp_h = (score_t*)dp_h, _dp_e1 = (score_t*)dp_e1, _dp_f1 = (score_t*)dp_f1;                                                   \
+    int min_pre_beg_sn, max_pre_end_sn;                                                                                           \
+    if (abpt->wb < 0) {                                                                                                           \
+        beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen;                                                                        \
+        beg_sn = dp_beg_sn[dp_i] = (dp_beg[dp_i])/pn; end_sn = dp_end_sn[dp_i] = (dp_end[dp_i])/pn;                               \
+        min_pre_beg_sn = 0, max_pre_end_sn = end_sn;                                                                              \
+    } else {                                                                                                                      \
+        beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen);   \
+        beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;                                                       \
+        for (i = 0; i < pre_n[dp_i]; ++i) {                                                                                       \
+            pre_i = pre_index[dp_i][i];                                                                                           \
+            if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i];                                             \
+            if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i];                                             \
+        } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn;                                                                   \
+        dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn;                                                      \
+        end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1;                                         \
+    }                                                                                                                             \
+    /* loop query */                                                                                                              \
+    /* first pre_node */                                                                                                          \
+    pre_i = pre_index[dp_i][0];                                                                                                   \
+    pre_dp_h = DP_HEF + pre_i * 3 * dp_sn; pre_dp_e1 = pre_dp_h + dp_sn;                                                          \
+    pre_end = dp_end[pre_i]; pre_beg_sn = dp_beg_sn[pre_i]; pre_end_sn = dp_end_sn[pre_i];                                        \
+    /* set M from (pre_i, q_i-1) */                                                                                               \
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                                   \
+        _beg_sn = 0, _end_sn = end_sn; first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN);                                \
+    } else {                                                                                                                      \
+        if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN);      \
+        else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN);                            \
+        _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);                                                                  \
+        for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN;                                                                \
+        for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN;                                      \
+    }                                                                                                                             \
+    for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */                                                    \
+        remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN);                                                                    \
+        dp_h[sn_i] = SIMDOri(first, remain);                                                                                      \
+        first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN);                                                     \
+    }                                                                                                                             \
+    /* set E from (pre_i, q_i) */                                                                                                 \
+    if (abpt->align_mode != ABPOA_LOCAL_MODE) {                                                                                   \
+        _end_sn = MIN_OF_TWO(pre_end_sn, end_sn);                                                                                 \
+        for (i = beg_sn; i < _beg_sn; ++i) dp_e1[i] = SIMD_INF_MIN;                                                               \
+        for (i = _end_sn+1; i <= end_sn; ++i) dp_e1[i] = SIMD_INF_MIN;                                                            \
+    }                                                                                                                             \
+    for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i)   /* SIMD parallelization */                                                    \
+        dp_e1[sn_i] = pre_dp_e1[sn_i];                                                                                            \
+    /* get max m and e */                                                                                                         \
+    for (i = 1; i < pre_n[dp_i]; ++i) {                                                                                           \
+        pre_i = pre_index[dp_i][i];                                                                                               \
+        pre_dp_h = DP_HEF + pre_i * 3 * dp_sn; pre_dp_e1 = pre_dp_h + dp_sn;                                                      \
+        pre_end = dp_end[pre_i]; pre_beg_sn = dp_beg_sn[pre_i]; pre_end_sn = dp_end_sn[pre_i];                                    \
+        /* set M from (pre_i, q_i-1) */                                                                                           \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                               \
+            first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN);                                                           \
+        } else {                                                                                                                  \
+            if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN);  \
+            else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN);                        \
+            _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);                                                              \
+        }                                                                                                                         \
+        for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */                                                \
+            remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN);                                                                \
+            dp_h[sn_i] = SIMDMax(SIMDOri(first, remain), dp_h[sn_i]);                                                             \
+            first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN);                                                 \
+        }                                                                                                                         \
+        /* set E from (pre_i, q_i) */                                                                                             \
+        _end_sn = MIN_OF_TWO(pre_end_sn, end_sn);                                                                                 \
+        for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i)   /* SIMD parallelization */                                                \
+            dp_e1[sn_i] = SIMDMax(pre_dp_e1[sn_i], dp_e1[sn_i]);                                                                  \
+    }                                                                                                                             \
+    /* compare M, E, and F */                                                                                                     \
+    for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { /* SIMD parallelization */                                                      \
+        dp_h[sn_i] = SIMDAdd(dp_h[sn_i], q[sn_i]);                                                                                \
+    }                                                                                                                             \
+    /* new F start */                                                                                                             \
+    first = SIMDShiftRight(SIMDShiftLeft(dp_h[beg_sn], SIMDTotalBytes-SIMDShiftOneN), SIMDTotalBytes-SIMDShiftOneN);              \
+    for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) {                                                                                 \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                               \
+            set_num  = pn;                                                                                                        \
+        } else {                                                                                                                  \
+            if (sn_i < min_pre_beg_sn) {                                                                                          \
+                _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n");                                                           \
+            } else if (sn_i > max_pre_end_sn) {                                                                                   \
+                set_num = sn_i == max_pre_end_sn+1 ? 2 : 1;                                                                       \
+            } else set_num = pn;                                                                                                  \
+        }                                                                                                                         \
+        /* F = (H << 1 | x) - OE */                                                                                               \
+        dp_f1[sn_i] = SIMDSub(SIMDOri(SIMDShiftLeft(dp_h[sn_i], SIMDShiftOneN), first), GAP_OE1);                                 \
+        /* F = max{F, (F-e)<<1}, F = max{F, (F-2e)<<2} ... */                                                                     \
+        SIMD_SET_F(dp_f1[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E1S, SIMDMax, SIMDAdd, SIMDSub, SIMDShiftOneN);   \
+        /* x = max{H, F+o} */                                                                                                     \
+        first = SIMDShiftRight(SIMDMax(dp_h[sn_i], SIMDAdd(dp_f1[sn_i], GAP_O1)), SIMDTotalBytes-SIMDShiftOneN);                  \
+        /* H = max{H, F} */                                                                                                       \
+        dp_h[sn_i] = SIMDMax(dp_h[sn_i], dp_e1[sn_i]); SIMDi tmp = dp_h[sn_i];                                                    \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                               \
+            dp_h[sn_i] = SIMDMax(zero, SIMDMax(dp_h[sn_i], dp_f1[sn_i]));                                                         \
+            SIMDSetIfEqual(dp_e1[sn_i], dp_h[sn_i],tmp, SIMDMax(SIMDSub(dp_e1[sn_i],GAP_E1), SIMDSub(dp_h[sn_i],GAP_OE1)),zero);  \
+        } else {                                                                                                                  \
+            dp_h[sn_i] = SIMDMax(dp_h[sn_i], dp_f1[sn_i]);                                                                        \
+            SIMDSetIfEqual(dp_e1[sn_i], dp_h[sn_i],tmp, SIMDMax(SIMDSub(dp_e1[sn_i],GAP_E1), SIMDSub(dp_h[sn_i],GAP_OE1)),SIMD_INF_MIN); \
+        }                                                                                                                         \
+    }                                                                                                                             \
+}
+
+#define simd_abpoa_cg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual) { /* one DP row (index_i, dp_i) with two gap-penalty sets: fills H, E1, E2, F1, F2; the three SIMD*If* parameters are not referenced in this body */ \
+    node_id = abpoa_graph_index_to_node_id(graph, index_i); /* graph node handled by this row */                                  \
+    SIMDi *q = qp + graph->node[node_id].base * qp_sn, first, remain; /* q: the qp_sn vectors of qp selected by the base of this node */ \
+    dp_h = DP_H2E2F+dp_i*5*dp_sn; dp_e1 = dp_h+dp_sn; dp_e2 = dp_e1+dp_sn; dp_f1 = dp_e2+dp_sn; dp_f2 = dp_f1+dp_sn; /* row layout: H, E1, E2, F1, F2 (dp_sn vectors each) */ \
+    _dp_h=(score_t*)dp_h, _dp_e1=(score_t*)dp_e1, _dp_e2=(score_t*)dp_e2, _dp_f1=(score_t*)dp_f1, _dp_f2=(score_t*)dp_f2; /* score_t views of the same five rows */ \
+    int min_pre_beg_sn, max_pre_end_sn;                                                                                           \
+    if (abpt->wb < 0) { /* wb < 0: the row covers the whole range [0, qlen] */                                                    \
+        beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen;                                                                        \
+        beg_sn = dp_beg_sn[dp_i] = beg/pn; end_sn = dp_end_sn[dp_i] = end/pn;                                                     \
+        min_pre_beg_sn = 0, max_pre_end_sn = end_sn;                                                                              \
+    } else { /* otherwise the range comes from GET_AD_DP_BEGIN / GET_AD_DP_END */                                                 \
+        beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen);   \
+        beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1; /* next: min begin / max end vector index over all predecessors */ \
+        for (i = 0; i < pre_n[dp_i]; ++i) {                                                                                       \
+            pre_i = pre_index[dp_i][i];                                                                                           \
+            if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i];                                             \
+            if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i];                                             \
+        } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn; /* never start before the earliest predecessor vector */          \
+        dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn; /* beg becomes the first cell of vector beg_sn */    \
+        end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1; /* end becomes the last cell of vector end_sn */ \
+     /* fprintf(stderr, "index: %d, beg: %d, end: %d, beg_sn: %d, end_sn: %d\n", index_i, beg, end, beg_sn, end_sn); */           \
+    }                                                                                                                             \
+ /* fprintf(stderr, "%d: beg, end: %d, %d\n", index_i, beg, end); */                                                              \
+    /* tot_dp_sn += (end_sn - beg_sn + 1); */                                                                                     \
+    /* loop query */                                                                                                              \
+    /* first pre_node: its values initialise dp_h / dp_e1 / dp_e2 of this row */                                                  \
+    pre_i = pre_index[dp_i][0];                                                                                                   \
+    pre_dp_h = DP_H2E2F + pre_i * 5 * dp_sn; pre_dp_e1 = pre_dp_h + dp_sn; pre_dp_e2 = pre_dp_e1 + dp_sn;                         \
+    pre_end = dp_end[pre_i]; pre_beg_sn = dp_beg_sn[pre_i]; pre_end_sn = dp_end_sn[pre_i];                                        \
+    /* set M from (pre_i, q_i-1) */                                                                                               \
+    if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                                                   \
+        _beg_sn = 0, _end_sn = end_sn; first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN);                                \
+    } else {                                                                                                                      \
+        if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN);      \
+        else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN);                            \
+        _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);                                                                  \
+        for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN; /* vectors outside the predecessor range start at SIMD_INF_MIN */ \
+        for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN;                                      \
+    }                                                                                                                             \
+ /* fprintf(stderr, "1 index_i: %d, beg_sn: %d, end_sn: %d\n", index_i, _beg_sn, _end_sn); */                                     \
+    for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization: predecessor H shifted by SIMDShiftOneN; first carries what was shifted out of the previous vector */ \
+        remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN);                                                                    \
+        dp_h[sn_i] = SIMDOri(first, remain);                                                                                      \
+        first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN);                                                     \
+    }                                                                                                                             \
+    /* set E from (pre_i, q_i) */                                                                                                 \
+    if (abpt->align_mode != ABPOA_LOCAL_MODE) {                                                                                   \
+        _end_sn = MIN_OF_TWO(pre_end_sn, end_sn);                                                                                 \
+        for (i = beg_sn; i < _beg_sn; ++i) dp_e1[i] = SIMD_INF_MIN, dp_e2[i] = SIMD_INF_MIN;                                      \
+        for (i = _end_sn+1; i <= end_sn; ++i) dp_e1[i] = SIMD_INF_MIN, dp_e2[i] = SIMD_INF_MIN;                                   \
+    }                                                                                                                             \
+ /* fprintf(stderr, "2 index_i: %d, beg_sn: %d, end_sn: %d\n", index_i, _beg_sn, _end_sn); */                                     \
+    for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization: E1/E2 are copied unshifted */                        \
+        dp_e1[sn_i] = pre_dp_e1[sn_i];                                                                                            \
+        dp_e2[sn_i] = pre_dp_e2[sn_i];                                                                                            \
+    }                                                                                                                             \
+    /* get max m and e: every further predecessor is merged with SIMDMax into the values set above */                             \
+    for (i = 1; i < pre_n[dp_i]; ++i) {                                                                                           \
+        pre_i = pre_index[dp_i][i];                                                                                               \
+        pre_dp_h = DP_H2E2F + (pre_i * 5) * dp_sn; pre_dp_e1 = pre_dp_h + dp_sn; pre_dp_e2 = pre_dp_e1 + dp_sn;                   \
+        pre_end = dp_end[pre_i]; pre_beg_sn = dp_beg_sn[pre_i]; pre_end_sn = dp_end_sn[pre_i];                                    \
+        /* set M from (pre_i, q_i-1) */                                                                                           \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) { /* NOTE(review): _beg_sn/_end_sn are not reset in this branch, the values left by the previous step are reused - confirm intended */ \
+            first = SIMDShiftRight(zero, SIMDTotalBytes-SIMDShiftOneN);                                                           \
+        } else {                                                                                                                  \
+            if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneN);  \
+            else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneN);                        \
+            _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);                                                              \
+        }                                                                                                                         \
+     /* fprintf(stderr, "3 index_i: %d, beg_sn: %d, end_sn: %d\n", index_i, _beg_sn, _end_sn); */                                 \
+        for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */                                                \
+            remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneN);                                                                \
+            dp_h[sn_i] = SIMDMax(SIMDOri(first, remain), dp_h[sn_i]);                                                             \
+            first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneN);                                                 \
+        }                                                                                                                         \
+        /* set E from (pre_i, q_i) */                                                                                             \
+        _end_sn = MIN_OF_TWO(pre_end_sn, end_sn);                                                                                 \
+     /* fprintf(stderr, "4 index_i: %d, beg_sn: %d, end_sn: %d\n", index_i, _beg_sn, _end_sn); */                                 \
+        for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */                                                \
+            dp_e1[sn_i] = SIMDMax(pre_dp_e1[sn_i], dp_e1[sn_i]);                                                                  \
+            dp_e2[sn_i] = SIMDMax(pre_dp_e2[sn_i], dp_e2[sn_i]);                                                                  \
+        }                                                                                                                         \
+    }                                                                                                                             \
+    /* compare M, E, and F */                                                                                                     \
+ /* fprintf(stderr, "5 index_i: %d, beg_sn: %d, end_sn: %d\n", index_i, _beg_sn, _end_sn); */                                     \
+    for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { /* SIMD parallelization: add the q vector of this node to the merged M */       \
+        dp_h[sn_i] = SIMDAdd(dp_h[sn_i], q[sn_i]);                                                                                \
+    }                                                                                                                             \
+    /* new F start */                                                                                                             \
+    first = SIMDShiftRight(SIMDShiftLeft(dp_h[beg_sn], SIMDTotalBytes-SIMDShiftOneN), SIMDTotalBytes-SIMDShiftOneN);              \
+    SIMDi first2 = first; /* separate carry for the second gap set (F2) */                                                        \
+    for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) {                                                                                 \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) set_num = pn; /* set_num is handed to SIMD_SET_F below */                       \
+        else {                                                                                                                    \
+            if (sn_i < min_pre_beg_sn) {                                                                                          \
+                _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n");                                                           \
+            } else if (sn_i > max_pre_end_sn) {                                                                                   \
+                set_num = sn_i == max_pre_end_sn+1 ? 2 : 1;                                                                       \
+            } else set_num = pn;                                                                                                  \
+        }                                                                                                                         \
+        /* H = max{H, E} */                                                                                                       \
+        dp_h[sn_i] =  SIMDMax(SIMDMax(dp_h[sn_i], dp_e1[sn_i]), dp_e2[sn_i]);                                                     \
+        /* F = (H << 1 | x) - OE */                                                                                               \
+        dp_f1[sn_i] = SIMDSub(SIMDOri(SIMDShiftLeft(dp_h[sn_i], SIMDShiftOneN), first), GAP_OE1);                                 \
+        dp_f2[sn_i] = SIMDSub(SIMDOri(SIMDShiftLeft(dp_h[sn_i], SIMDShiftOneN), first2), GAP_OE2);                                \
+        /* F = max{F, (F-e)<<1}, F = max{F, (F-2e)<<2} ... */                                                                     \
+        SIMD_SET_F(dp_f1[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E1S, SIMDMax, SIMDAdd, SIMDSub, SIMDShiftOneN);   \
+        SIMD_SET_F(dp_f2[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E2S, SIMDMax, SIMDAdd, SIMDSub, SIMDShiftOneN);   \
+        /* x = max{H, F+o}: carries into the next vector, one per gap set */                                                      \
+        first = SIMDShiftRight(SIMDMax(dp_h[sn_i], SIMDAdd(dp_f1[sn_i], GAP_O1)), SIMDTotalBytes-SIMDShiftOneN);                  \
+        first2 = SIMDShiftRight(SIMDMax(dp_h[sn_i], SIMDAdd(dp_f2[sn_i], GAP_O2)), SIMDTotalBytes-SIMDShiftOneN);                 \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) { /* local mode: same updates as below, but H, E1 and E2 are floored at zero */ \
+            dp_h[sn_i] = SIMDMax(zero, SIMDMax(dp_h[sn_i], SIMDMax(dp_f1[sn_i], dp_f2[sn_i])));                                   \
+            dp_e1[sn_i] = SIMDMax(zero,SIMDMax(SIMDSub(dp_e1[sn_i],GAP_E1),SIMDSub(dp_h[sn_i],GAP_OE1)));                         \
+            dp_e2[sn_i] = SIMDMax(zero,SIMDMax(SIMDSub(dp_e2[sn_i],GAP_E2),SIMDSub(dp_h[sn_i],GAP_OE2)));                         \
+        } else {                                                                                                                  \
+            /* H = max{H, F}    */                                                                                                \
+            dp_h[sn_i] = SIMDMax(dp_h[sn_i], SIMDMax(dp_f1[sn_i], dp_f2[sn_i]));                                                  \
+            /* e for next cell */                                                                                                 \
+            dp_e1[sn_i] = SIMDMax(SIMDSub(dp_e1[sn_i],GAP_E1),SIMDSub(dp_h[sn_i],GAP_OE1));                                       \
+            dp_e2[sn_i] = SIMDMax(SIMDSub(dp_e2[sn_i],GAP_E2),SIMDSub(dp_h[sn_i],GAP_OE2));                                       \
+        }                                                                                                                         \
+    }                                                                                                                             \
+}
+
+#define set_global_max_score(score, i, j) { /* record score and its cell (i, j) in best_score / best_i / best_j */ \
+    /* strict comparison: on a tie the previously recorded cell is kept */                                         \
+    if (score > best_score)                                                                                        \
+        best_score = score, best_i = i, best_j = j;                                                                \
+}
+
+#define set_extend_max_score(score, i, j) { /* track the best cell and its node, or leave the enclosing loop when the score has dropped too far */ \
+    if (score > best_score) { /* strict improvement: remember score, cell (i, j) and the current node_id */ \
+        best_score = score; best_i = i; best_j = j; best_id = node_id;                                   \
+    } else if (abpt->zdrop > 0) { /* the drop test only runs when zdrop is positive */                   \
+        int delta_index = graph->node_id_to_max_remain[best_id] - graph->node_id_to_max_remain[node_id]; \
+        if (best_score - score > abpt->zdrop + gap_ext1 * abs(delta_index-(j-best_j))) /* allowed drop: zdrop plus gap_ext1 per unit of offset between the graph and query distances from the best cell */ \
+            break; /* no loop inside this macro: this exits the loop that surrounds the macro call */    \
+    }                                                                                                    \
+}
+
+#define simd_abpoa_global_get_max(score_t, DP_M, dp_sn) { /* offer the end cell of every in-edge row of end_node_id to set_global_max_score */ \
+    int end, in_id, in_index, in_dp_i;                                                       \
+    for (i = 0; i < graph->node[end_node_id].in_edge_n; ++i) {                               \
+        in_id = graph->node[end_node_id].in_id[i];                                           \
+        in_index = abpoa_graph_node_id_to_index(graph, in_id);                               \
+        if (index_map[in_index] != 0) { /* an index_map entry of 0 means the in-edge is ignored */ \
+            in_dp_i = in_index - beg_index;                                                  \
+            _dp_h = (score_t*)(dp_h = DP_M + in_dp_i * dp_sn);                               \
+            /* column to read: qlen, capped at dp_end of that row */                         \
+            end = qlen > dp_end[in_dp_i] ? dp_end[in_dp_i] : qlen;                           \
+            set_global_max_score(_dp_h[end], in_dp_i, end);                                  \
+        }                                                                                    \
+    }                                                                                        \
+}
+
+#define simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater) { /* sets max and max_i from dp_h[beg_sn..end_sn] of the current row and the parallel qi vectors */ \
+    /* select max dp_h; max_i stays -1 when no lane exceeds inf_min */       \
+    max = inf_min, max_i = -1;                                               \
+    SIMDi a = dp_h[end_sn], b = qi[end_sn]; /* accumulators start from the last vector of the row */ \
+    if (end_sn == qlen / pn) SIMDSetIfGreater(a, zero, b, SIMD_INF_MIN, a); /* NOTE(review): presumably replaces lanes of a whose qi value is below zero with SIMD_INF_MIN - confirm against the SIMDSetIfGreater definition */ \
+    for (i = beg_sn; i < end_sn; ++i) { /* fold in the remaining vectors */  \
+        SIMDGetIfGreater(b, a, dp_h[i], a, qi[i], b); /* NOTE(review): presumably keeps per lane the larger of dp_h[i] and a in a, with the matching qi value in b - confirm */ \
+    }                                                                        \
+    _dp_h = (score_t*)&a, _qi = (score_t*)&b; /* scalar views of the two accumulators */ \
+    for (i = 0; i < pn; ++i) { /* scan the pn lanes; strict > keeps the first lane on ties */ \
+        if (_dp_h[i] > max) {                                                \
+            max = _dp_h[i]; max_i = _qi[i];                                  \
+        }                                                                    \
+    }                                                                        \
+}
+
+#define simd_abpoa_ada_max_i   { /* push max_i + 1 into the max_pos_left / max_pos_right bounds of every out-edge node of node_id */ \
+    /* each bound only ever moves outwards: left can shrink, right can grow */                                          \
+    int out_i = max_i + 1;                                                                                              \
+    for (i = 0; i < graph->node[node_id].out_edge_n; ++i) {                                                             \
+        int out_node_id = graph->node[node_id].out_id[i];                                                               \
+        if (graph->node_id_to_max_pos_left[out_node_id] > out_i) graph->node_id_to_max_pos_left[out_node_id] = out_i;   \
+        if (graph->node_id_to_max_pos_right[out_node_id] < out_i) graph->node_id_to_max_pos_right[out_node_id] = out_i; \
+    }                                                                                                                   \
+}
+
+// TODO end_bonus for extension
+// linear gap penalty: gap_open1 == 0. Runs simd_abpoa_lg_var and simd_abpoa_lg_first_dp, then simd_abpoa_lg_dp on each row, and stores best_score in res.
+#define simd_abpoa_lg_align_sequence_to_graph_core(score_t, sp, SIMDSetOne, SIMDMax, SIMDAdd,   \
+        SIMDSub, SIMDShiftOneN, SIMDSetIfGreater, SIMDGetIfGreater) {                           \
+    simd_abpoa_lg_var(score_t, sp, SIMDSetOne, SIMDShiftOneN, SIMDAdd);                         \
+    simd_abpoa_lg_first_dp(score_t);                                                            \
+    for (index_i=beg_index+1, dp_i=1; index_i<end_index; ++index_i, ++dp_i) { /* dp_i advances with index_i, also for skipped rows */ \
+        if (index_map[index_i] == 0) continue; /* rows with a zero index_map entry are skipped */ \
+        simd_abpoa_lg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub);                    \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) { /* local: the best cell may be in any row */ \
+            simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                 \
+            set_global_max_score(max, dp_i, max_i);                                             \
+        }                                                                                       \
+        if (abpt->align_mode == ABPOA_EXTEND_MODE) { /* extend: set_extend_max_score may break out of this for loop */ \
+            simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                 \
+            set_extend_max_score(max, dp_i, max_i);                                             \
+        }                                                                                       \
+        if (abpt->wb >= 0) { /* simd_abpoa_ada_max_i reads max_i, which global mode has not computed yet at this point */ \
+            if (abpt->align_mode == ABPOA_GLOBAL_MODE) {                                        \
+                simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);             \
+            }                                                                                   \
+            simd_abpoa_ada_max_i;                                                               \
+        }                                                                                       \
+    }                                                                                           \
+    if (abpt->align_mode == ABPOA_GLOBAL_MODE) simd_abpoa_global_get_max(score_t, DP_H, dp_sn); /* global: best is chosen only after all rows are filled */ \
+    res->best_score = best_score;                                                               \
+ /* simd_abpoa_print_lg_matrix(score_t, beg_index, end_index); printf("best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); */                          \
+    if (abpt->ret_cigar) simd_abpoa_lg_backtrack(score_t); /* backtrack only when ret_cigar is set */ \
+    simd_abpoa_free_var; SIMDFree(GAP_E1S); /* GAP_E1S is freed separately from simd_abpoa_free_var */ \
+}
+
+// affine gap penalty (gap_open1 > 0): runs simd_abpoa_ag_var and simd_abpoa_ag_first_dp (both defined elsewhere), then simd_abpoa_ag_dp for index_i = beg_index+1 .. end_index-1; stores best_score in res and backtracks only when abpt->ret_cigar is set
+#define simd_abpoa_ag_align_sequence_to_graph_core(score_t, sp, SIMDSetOne, SIMDMax, SIMDAdd,       \
+        SIMDSub, SIMDShiftOneN, SIMDSetIfGreater, SIMDGetIfGreater, SIMDSetIfEqual) {               \
+    simd_abpoa_ag_var(score_t, sp, SIMDSetOne, SIMDShiftOneN, SIMDAdd);                             \
+    simd_abpoa_ag_first_dp(score_t);                                                                \
+    for (index_i=beg_index+1, dp_i=1; index_i<end_index; ++index_i, ++dp_i) {                       \
+        if (index_map[index_i] == 0) continue; /* skipped index still consumes a dp_i row */        \
+        simd_abpoa_ag_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual); \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                 \
+            simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                     \
+            set_global_max_score(max, dp_i, max_i);                                                 \
+        } else if (abpt->align_mode == ABPOA_EXTEND_MODE) {                                         \
+            simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                     \
+            set_extend_max_score(max, dp_i, max_i);                                                 \
+        }                                                                                           \
+        if (abpt->wb >= 0) {                                                                        \
+            if (abpt->align_mode == ABPOA_GLOBAL_MODE) { /* max_in_row not run yet if global */     \
+                simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                 \
+            }                                                                                       \
+            simd_abpoa_ada_max_i;                                                                   \
+        }                                                                                           \
+    }                                                                                               \
+    if (abpt->align_mode == ABPOA_GLOBAL_MODE) simd_abpoa_global_get_max(score_t, DP_HEF, 3*dp_sn); \
+    res->best_score = best_score;                                                                   \
+ /* simd_abpoa_print_ag_matrix(score_t, beg_index, end_index); fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score); */ \
+    if (abpt->ret_cigar) simd_abpoa_ag_backtrack(score_t);                                          \
+    simd_abpoa_free_var; SIMDFree(GAP_E1S);                                                         \
+}
+
+// convex gap penalty (gap_open1 > 0 && gap_open2 > 0): runs simd_abpoa_cg_var and simd_abpoa_cg_first_dp (both defined elsewhere), then simd_abpoa_cg_dp for index_i = beg_index+1 .. end_index-1; stores best_score in res and backtracks only when abpt->ret_cigar is set
+#define simd_abpoa_cg_align_sequence_to_graph_core(score_t, sp, SIMDSetOne, SIMDMax, SIMDAdd,         \
+        SIMDSub, SIMDShiftOneN, SIMDSetIfGreater, SIMDGetIfGreater, SIMDSetIfEqual) {                 \
+    simd_abpoa_cg_var(score_t, sp, SIMDSetOne, SIMDShiftOneN, SIMDAdd);                               \
+    simd_abpoa_cg_first_dp(score_t);                                                                  \
+    for (index_i=beg_index+1, dp_i=1; index_i<end_index; ++index_i, ++dp_i) {                         \
+        if (index_map[index_i] == 0) continue; /* skipped index still consumes a dp_i row */          \
+        simd_abpoa_cg_dp(score_t, SIMDShiftOneN, SIMDMax, SIMDAdd, SIMDSub, SIMDGetIfGreater, SIMDSetIfGreater, SIMDSetIfEqual); \
+        if (abpt->align_mode == ABPOA_LOCAL_MODE) {                                                   \
+            simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                       \
+            set_global_max_score(max, dp_i, max_i);                                                   \
+        } else if (abpt->align_mode == ABPOA_EXTEND_MODE) {                                           \
+            simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                       \
+            set_extend_max_score(max, dp_i, max_i);                                                   \
+        }                                                                                             \
+        if (abpt->wb >= 0) {                                                                          \
+            if (abpt->align_mode == ABPOA_GLOBAL_MODE) { /* max_in_row not run yet if global */       \
+                simd_abpoa_max_in_row(score_t, SIMDSetIfGreater, SIMDGetIfGreater);                   \
+            }                                                                                         \
+            simd_abpoa_ada_max_i;                                                                     \
+        }                                                                                             \
+    }                                                                                                 \
+ /* printf("dp_sn: %d\n", tot_dp_sn); */                                                              \
+    if (abpt->align_mode == ABPOA_GLOBAL_MODE) simd_abpoa_global_get_max(score_t, DP_H2E2F, 5*dp_sn); \
+    res->best_score = best_score;                                                                     \
+/* simd_abpoa_print_cg_matrix(score_t, beg_index, end_index); fprintf(stderr,"best_score: (%d, %d) -> %d\n",best_i,best_j,best_score); */ \
+    if (abpt->ret_cigar) simd_abpoa_cg_backtrack(score_t);                                            \
+    simd_abpoa_free_var; SIMDFree(GAP_E1S); SIMDFree(GAP_E2S);                                        \
+}
+
+abpoa_simd_matrix_t *abpoa_init_simd_matrix(void) { /* new holder with no buffers attached yet */
+    abpoa_simd_matrix_t *mtx = (abpoa_simd_matrix_t*)_err_malloc(sizeof(*mtx));
+    mtx->dp_beg = mtx->dp_end = mtx->dp_beg_sn = mtx->dp_end_sn = NULL; /* per-row range arrays */
+    mtx->s_mem = NULL; mtx->s_msize = 0; mtx->rang_m = 0; /* SIMD block and both capacities start empty */
+    return mtx;
+}
+
+void abpoa_free_simd_matrix(abpoa_simd_matrix_t *abm) {
+    if (abm == NULL) return; /* like free(): releasing nothing is a no-op */
+    if (abm->s_mem) SIMDFree(abm->s_mem); /* block obtained from SIMDMalloc in simd_abpoa_realloc */
+    /* the four range arrays are always (re)allocated together, and free(NULL) is harmless */
+    free(abm->dp_beg); free(abm->dp_end); free(abm->dp_beg_sn); free(abm->dp_end_sn); free(abm);
+}
+
+// (Re)size the buffers in ab->abm for gn graph nodes and a query of length qlen; called every time the graph is updated.
+// * s_mem, in bytes: the query profile qp (sn * m * size), the DP rows (sn * gn * size, times
+//   1 / 3 / 5 for linear / affine / convex gaps) and, if wb >= 0 or in extend mode, qi (sn * size)
+// * s_mem only grows, via free + malloc, so its previous contents are not preserved
+// * dp_beg/dp_end/dp_beg_sn/dp_end_sn: rang_m ints each, grown with realloc. Always returns 0.
+int simd_abpoa_realloc(abpoa_t *ab, int gn, int qlen, abpoa_para_t *abpt, SIMD_para_t sp) {
+    abpoa_simd_matrix_t *abm = ab->abm;
+    uint64_t pn = sp.num_of_value, size = sp.size, sn = (qlen + sp.num_of_value) / pn;
+    uint64_t planes, need;
+
+    // DP planes stored per node: DP_H (linear), DP_HEF (affine), DP_H2E2F (convex)
+    if (abpt->gap_mode == ABPOA_LINEAR_GAP) planes = 1;
+    else planes = abpt->gap_mode == ABPOA_AFFINE_GAP ? 3 : 5;
+
+    need = sn * abpt->m * size + sn * gn * planes * size; // qp + DP rows
+    if (abpt->wb >= 0 || abpt->align_mode == ABPOA_EXTEND_MODE) need += sn * size; // qi
+
+    // NOTE: no upper bound is enforced on the request (an earlier UINT32_MAX check is disabled)
+    if (need > abm->s_msize) {
+        if (abm->s_mem) SIMDFree(abm->s_mem);
+        kroundup64(need);
+        abm->s_msize = need;
+        abm->s_mem = (SIMDi*)SIMDMalloc(abm->s_msize, size);
+    }
+
+    if (gn > abm->rang_m) {
+        abm->rang_m = gn;
+        kroundup32(abm->rang_m);
+        size_t n_bytes = abm->rang_m * sizeof(int);
+        abm->dp_beg = (int*)_err_realloc(abm->dp_beg, n_bytes);
+        abm->dp_end = (int*)_err_realloc(abm->dp_end, n_bytes);
+        abm->dp_beg_sn = (int*)_err_realloc(abm->dp_beg_sn, n_bytes);
+        abm->dp_end_sn = (int*)_err_realloc(abm->dp_end_sn, n_bytes);
+    }
+    return 0;
+}
+
+void abpoa_init_var(abpoa_para_t *abpt, uint8_t *query, int qlen, SIMDi *qp, SIMDi *qi, int *mat, int qp_sn, int pn, SIMDi SIMD_INF_MIN) {
+    int k, j; const int n_lane = qp_sn * pn;
+    /* query profile: row k (qp_sn vectors) gets 0 in slot 0, mat[k*m + query[j]] in slot j+1, and 0 in the padding slots */
+    for (j = 0; j < qp_sn * abpt->m; ++j) qp[j] = SIMD_INF_MIN;
+    for (k = 0; k < abpt->m; ++k) {
+        const int *score = mat + k * abpt->m; int32_t *row = (int32_t*)(qp + k * qp_sn);
+        row[0] = 0;
+        for (j = 1; j <= qlen; ++j) row[j] = (int32_t)score[query[j-1]];
+        for (; j < n_lane; ++j) row[j] = 0;
+    }
+    /* query index, written only when wb >= 0 or in extend mode: slot i holds i for i <= qlen, padding slots hold -1 */
+    if (abpt->wb >= 0 || abpt->align_mode == ABPOA_EXTEND_MODE) {
+        int32_t *idx = (int32_t*)qi; const int n_idx = (qlen/pn+1) * pn;
+        for (j = 0; j < n_idx; ++j) idx[j] = j <= qlen ? j : -1;
+    }
+}
+
+void abpoa_cg_first_dp(abpoa_para_t *abpt, abpoa_graph_t *graph, uint8_t *index_map, int beg_node_id, int end_node_id, int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int pn, int qlen, int w, int dp_sn, SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, int32_t inf_min, int gap_open1, int gap_ext1, int gap_open2, int gap_ext2, int gap_oe1, int gap_oe2) {
+    int k, last_sn;
+    if (abpt->wb < 0) { /* wb < 0: row 0 spans the whole query */
+        dp_beg[0] = 0; dp_end[0] = qlen;
+    } else {
+        /* wb >= 0: the start node is pinned to position 0 and its successors flagged in index_map to position 1 */
+        graph->node_id_to_max_pos_right[beg_node_id] = 0; graph->node_id_to_max_pos_left[beg_node_id] = 0;
+        for (k = 0; k < graph->node[beg_node_id].out_edge_n; ++k) {
+            int next_id = graph->node[beg_node_id].out_id[k];
+            if (index_map[abpoa_graph_node_id_to_index(graph, next_id)] == 0) continue;
+            graph->node_id_to_max_pos_right[next_id] = 1; graph->node_id_to_max_pos_left[next_id] = 1;
+        }
+        dp_beg[0] = GET_AD_DP_BEGIN(graph, w, beg_node_id, end_node_id, qlen);
+        dp_end[0] = GET_AD_DP_END(graph, w, beg_node_id, end_node_id, qlen);
+    }
+    /* widen the range to whole SIMD vectors of pn values */
+    dp_beg_sn[0] = (dp_beg[0])/pn; dp_beg[0] = dp_beg_sn[0] * pn;
+    dp_end_sn[0] = (dp_end[0])/pn; dp_end[0] = (dp_end_sn[0]+1)*pn-1;
+    SIMDi *row_h = DP_H2E2F, *row_e1 = row_h + dp_sn, *row_e2 = row_e1 + dp_sn, *row_f1 = row_e2 + dp_sn, *row_f2 = row_f1 + dp_sn;
+    last_sn = MIN_OF_TWO(dp_end_sn[0]+1, dp_sn-1);
+    for (k = 0; k <= last_sn; ++k) { row_h[k] = SIMD_INF_MIN; row_e1[k] = SIMD_INF_MIN; row_e2[k] = SIMD_INF_MIN; }
+    /* scalar (int32) view of row 0: column 0 scores 0, column k is the cheaper of the two gap costs of length k */
+    int32_t *h = (int32_t*)row_h, *e1 = (int32_t*)row_e1, *e2 = (int32_t*)row_e2, *f1 = (int32_t*)row_f1, *f2 = (int32_t*)row_f2;
+    h[0] = 0; e1[0] = -(gap_oe1); e2[0] = -(gap_oe2); f2[0] = inf_min; f1[0] = inf_min;
+    for (k = 1; k <= dp_end[0]; ++k) { /* no SIMD parallelization */
+        int32_t g1 = -(gap_open1 + gap_ext1 * k), g2 = -(gap_open2 + gap_ext2 * k);
+        f1[k] = g1; f2[k] = g2; h[k] = MAX_OF_TWO(g1, g2);
+    }
+}
+
+int abpoa_max(SIMDi SIMD_INF_MIN, SIMDi zero, int inf_min, SIMDi *dp_h, SIMDi *qi, int qlen, int pn, int beg_sn, int end_sn) {
+    /* scan vectors beg_sn..end_sn of dp_h and return the qi value paired with the largest score (-1 if nothing exceeds inf_min) */
+    int best = inf_min, best_q = -1, k;
+    SIMDi a = dp_h[end_sn], b = qi[end_sn];
+    if (end_sn == qlen / pn) SIMDSetIfGreateri32(a, zero, b, SIMD_INF_MIN, a); /* vector holding column qlen; presumably masks the padding lanes (qi == -1) — confirm against the macro */
+    for (k = beg_sn; k < end_sn; ++k) {
+        SIMDGetIfGreateri32(b, a, dp_h[k], a, qi[k], b);
+    }
+    /* reduce the pn lanes of the running pair (a, b); the first maximum wins */
+    const int32_t *score = (const int32_t*)&a, *q_idx = (const int32_t*)&b;
+    for (k = 0; k < pn; ++k) {
+        if (score[k] <= best) continue;
+        best = score[k]; best_q = q_idx[k];
+    }
+    return best_q;
+}
+
+void abpoa_ada_max_i(int max_i, abpoa_graph_t *graph, int node_id) {
+    /* stretch [max_pos_left, max_pos_right] of every successor of node_id so that it contains max_i + 1 */
+    const int next_pos = max_i + 1; int e;
+    for (e = 0; e < graph->node[node_id].out_edge_n; ++e) {
+        int succ = graph->node[node_id].out_id[e];
+        if (graph->node_id_to_max_pos_right[succ] < next_pos) graph->node_id_to_max_pos_right[succ] = next_pos;
+        if (graph->node_id_to_max_pos_left[succ] > next_pos) graph->node_id_to_max_pos_left[succ] = next_pos;
+    }
+}
+
+void abpoa_global_get_max(abpoa_graph_t *graph, int beg_index, int end_node_id, uint8_t *index_map, SIMDi *DP_H_HE, int dp_sn, int qlen, int *dp_end, int32_t *best_score, int *best_i, int *best_j) {
+    /* Scan the in-neighbours of end_node_id. For each one flagged in index_map, read its H value
+     * at column min(qlen, dp_end[row]); whenever it beats *best_score, store it there together
+     * with its row (*best_i) and column (*best_j). dp_sn is the distance, in SIMD vectors,
+     * between consecutive rows of DP_H_HE. */
+    int k;
+    for (k = 0; k < graph->node[end_node_id].in_edge_n; ++k) {
+        int pre_index = abpoa_graph_node_id_to_index(graph, graph->node[end_node_id].in_id[k]);
+        if (index_map[pre_index] == 0) continue;
+        int row = pre_index - beg_index;
+        int col = qlen > dp_end[row] ? dp_end[row] : qlen;
+        const int32_t *h = (const int32_t*)(DP_H_HE + row * dp_sn);
+        if (h[col] <= *best_score) continue;
+        *best_score = h[col];
+        *best_i = row; *best_j = col;
+    }
+}
+// Fill DP row dp_i (graph index index_i) for convex gaps: pick the row's query range, merge H (shifted by one query position) and E1/E2 from every predecessor row, add the query-profile scores q, then derive F1/F2 and the E values. Returns end_sn - beg_sn + 1. NOTE(review): dp_h/dp_e1/dp_e2/dp_f1/dp_f2 presumably point at row dp_i inside DP_H2E2F — confirm against the caller.
+int abpoa_cg_dp(SIMDi *q, SIMDi *dp_h, SIMDi *dp_e1, SIMDi *dp_e2, SIMDi *dp_f1, SIMDi *dp_f2, int **pre_index, int *pre_n, int index_i, int dp_i, abpoa_graph_t *graph, abpoa_para_t *abpt, int dp_sn, int pn, int qlen, int w, SIMDi *DP_H2E2F, SIMDi SIMD_INF_MIN, SIMDi GAP_O1, SIMDi GAP_O2, SIMDi GAP_E1, SIMDi GAP_E2, SIMDi GAP_OE1, SIMDi GAP_OE2, SIMDi* GAP_E1S, SIMDi* GAP_E2S, SIMDi *PRE_MIN, SIMDi *PRE_MASK, SIMDi *SUF_MIN, int log_n, int *dp_beg, int *dp_end, int *dp_beg_sn, int *dp_end_sn, int end_node_id) {
+    int tot_dp_sn = 0, i, pre_i, node_id = abpoa_graph_index_to_node_id(graph, index_i);
+    int min_pre_beg_sn, max_pre_end_sn, beg, end, beg_sn, end_sn, pre_end, pre_end_sn, pre_beg_sn, sn_i;
+    if (abpt->wb < 0) { /* wb < 0: the row covers columns 0..qlen */
+        beg = dp_beg[dp_i] = 0, end = dp_end[dp_i] = qlen;
+        beg_sn = dp_beg_sn[dp_i] = beg/pn; end_sn = dp_end_sn[dp_i] = end/pn;
+        min_pre_beg_sn = 0, max_pre_end_sn = end_sn;
+    } else {
+        beg = GET_AD_DP_BEGIN(graph, w, node_id, end_node_id, qlen), end = GET_AD_DP_END(graph, w, node_id, end_node_id, qlen);
+        beg_sn = beg / pn; min_pre_beg_sn = INT32_MAX, max_pre_end_sn = -1;
+        for (i = 0; i < pre_n[dp_i]; ++i) {
+            pre_i = pre_index[dp_i][i];
+            if (min_pre_beg_sn > dp_beg_sn[pre_i]) min_pre_beg_sn = dp_beg_sn[pre_i];
+            if (max_pre_end_sn < dp_end_sn[pre_i]) max_pre_end_sn = dp_end_sn[pre_i];
+        } if (beg_sn < min_pre_beg_sn) beg_sn = min_pre_beg_sn; /* never start left of every predecessor row */
+        dp_beg_sn[dp_i] = beg_sn; beg = dp_beg[dp_i] = dp_beg_sn[dp_i] * pn;
+        end_sn = dp_end_sn[dp_i] = end/pn; end = dp_end[dp_i] = (dp_end_sn[dp_i]+1)*pn-1;
+#ifdef __DEBUG__
+    fprintf(stderr, "index: %d (node: %d): beg: %d, end: %d, beg_sn: %d, end_sn: %d\n", index_i, node_id, beg, end, beg_sn, end_sn);
+#endif
+    }
+    tot_dp_sn += (end_sn - beg_sn + 1); /* vectors covered by this row; this is the return value */
+    /* Step 1: initialise H/E1/E2 of this row from the first predecessor row */
+    // new init start
+    int _beg_sn, _end_sn;
+    // first pre_node
+    pre_i = pre_index[dp_i][0]; /* NOTE(review): assumes pre_n[dp_i] >= 1 — confirm against the caller */
+    SIMDi *pre_dp_h = DP_H2E2F + (pre_i * 5) * dp_sn; SIMDi *pre_dp_e1 = pre_dp_h + dp_sn; SIMDi *pre_dp_e2 = pre_dp_e1 + dp_sn;
+    pre_end = dp_end[pre_i]; pre_beg_sn = dp_beg_sn[pre_i]; pre_end_sn = dp_end_sn[pre_i];
+    SIMDi first, remain;
+    /* set M from (pre_i, q_i-1) */
+    if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneNi32);
+    else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneNi32);
+    _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);
+    for (i = beg_sn; i < _beg_sn; ++i) dp_h[i] = SIMD_INF_MIN;
+    for (i = _end_sn+1; i <= MIN_OF_TWO(end_sn+1, dp_sn-1); ++i) dp_h[i] = SIMD_INF_MIN;
+    for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */
+        remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneNi32);
+        dp_h[sn_i] = SIMDOri(first, remain);
+        first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneNi32);
+    }
+    /* set E from (pre_i, q_i) */
+    _end_sn = MIN_OF_TWO(pre_end_sn, end_sn);
+    for (i = beg_sn; i < _beg_sn; ++i) dp_e1[i] = SIMD_INF_MIN, dp_e2[i] = SIMD_INF_MIN;
+    for (i = _end_sn+1; i <= end_sn; ++i) dp_e1[i] = SIMD_INF_MIN, dp_e2[i] = SIMD_INF_MIN;
+    for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */
+        dp_e1[sn_i] = pre_dp_e1[sn_i];
+        dp_e2[sn_i] = pre_dp_e2[sn_i];
+    }
+    // if (index_i == 13095) debug_simd_abpoa_print_cg_matrix_row("1", int32_t, index_i);
+    // new init end
+    /* Step 2: fold in the remaining predecessors with an element-wise max */
+    for (i = 1; i < pre_n[dp_i]; ++i) {
+        pre_i = pre_index[dp_i][i];
+        pre_dp_h = DP_H2E2F + (pre_i * 5) * dp_sn; pre_dp_e1 = pre_dp_h + dp_sn; pre_dp_e2 = pre_dp_e1 + dp_sn;
+        pre_end = dp_end[pre_i]; pre_beg_sn = dp_beg_sn[pre_i]; pre_end_sn = dp_end_sn[pre_i];
+        /* set M from (pre_i, q_i-1) */
+        if (pre_beg_sn < beg_sn) _beg_sn = beg_sn, first = SIMDShiftRight(pre_dp_h[beg_sn-1], SIMDTotalBytes-SIMDShiftOneNi32);
+        else _beg_sn = pre_beg_sn, first = SIMDShiftRight(SIMD_INF_MIN, SIMDTotalBytes-SIMDShiftOneNi32);
+        _end_sn = MIN_OF_THREE((pre_end+1)/pn, end_sn, dp_sn-1);
+        for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */
+            remain = SIMDShiftLeft(pre_dp_h[sn_i], SIMDShiftOneNi32);
+            dp_h[sn_i] = SIMDMaxi32(SIMDOri(first, remain), dp_h[sn_i]);
+            first = SIMDShiftRight(pre_dp_h[sn_i], SIMDTotalBytes-SIMDShiftOneNi32);
+        }
+        /* set E from (pre_i, q_i) */
+        _end_sn = MIN_OF_TWO(pre_end_sn, end_sn);
+        for (sn_i = _beg_sn; sn_i <= _end_sn; ++sn_i) { /* SIMD parallelization */
+            dp_e1[sn_i] = SIMDMaxi32(pre_dp_e1[sn_i], dp_e1[sn_i]);
+            dp_e2[sn_i] = SIMDMaxi32(pre_dp_e2[sn_i], dp_e2[sn_i]);
+        }
+    }
+    // debug_simd_abpoa_print_cg_matrix_row("2", int32_t, index_i);
+    /* Step 3: add the query-profile scores q to the match candidates in H */
+    for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) { /* SIMD parallelization */
+        dp_h[sn_i] =  SIMDAddi32(dp_h[sn_i], q[sn_i]);
+    }
+    // debug_simd_abpoa_print_cg_matrix_row("3", int32_t, index_i);
+    /* Step 4: per vector, H = max(H, E1, E2); derive F1/F2 using the carries first/first2 from the previous vector; then H = max(H, F1, F2) and the E values are updated */
+    first = SIMDShiftRight(SIMDShiftLeft(dp_h[beg_sn], SIMDTotalBytes-SIMDShiftOneNi32), SIMDTotalBytes-SIMDShiftOneNi32);
+    int set_num; SIMDi first2 = first;//, tmp;
+    for (sn_i = beg_sn; sn_i <= end_sn; ++sn_i) {
+        if (sn_i < min_pre_beg_sn) {
+            _err_fatal_simple(__func__, "sn_i < min_pre_beg_sn\n");
+        } else if (sn_i > max_pre_end_sn) {
+            set_num = sn_i == max_pre_end_sn+1 ? 2 : 1;
+        } else set_num = pn;
+        /* H = max{H, E} */
+        dp_h[sn_i] = SIMDMaxi32(SIMDMaxi32(dp_h[sn_i], dp_e1[sn_i]), dp_e2[sn_i]); // tmp = dp_h[sn_i];
+        /* F = (H << 1 | x) - OE */
+        // if (sn_i==beg_sn) debug_simd_abpoa_print_cg_matrix_row("4.1", int32_t, index_i);
+        dp_f1[sn_i] = SIMDSubi32(SIMDOri(SIMDShiftLeft(dp_h[sn_i], SIMDShiftOneNi32), first), GAP_OE1);
+        dp_f2[sn_i] = SIMDSubi32(SIMDOri(SIMDShiftLeft(dp_h[sn_i], SIMDShiftOneNi32), first2), GAP_OE2);
+        /* F = max{F, (F-e)<<1}, F = max{F, (F-2e)<<2} ... */
+        // if (sn_i==beg_sn) debug_simd_abpoa_print_cg_matrix_row("4.2", int32_t, index_i);
+        SIMD_SET_F(dp_f1[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E1S, SIMDMaxi32, SIMDAddi32, SIMDSubi32, SIMDShiftOneNi32);
+        SIMD_SET_F(dp_f2[sn_i], log_n, set_num, PRE_MIN, PRE_MASK, SUF_MIN, GAP_E2S, SIMDMaxi32, SIMDAddi32, SIMDSubi32, SIMDShiftOneNi32);
+        /* x = max{H, F+o} */
+        // if (sn_i==beg_sn) debug_simd_abpoa_print_cg_matrix_row("4.3", int32_t, index_i);
+        first = SIMDShiftRight(SIMDMaxi32(dp_h[sn_i], SIMDAddi32(dp_f1[sn_i], GAP_O1)), SIMDTotalBytes-SIMDShiftOneNi32);
+        first2 = SIMDShiftRight(SIMDMaxi32(dp_h[sn_i], SIMDAddi32(dp_f2[sn_i], GAP_O2)), SIMDTotalBytes-SIMDShiftOneNi32);
+        /* H = max{H, F}    */
+        dp_h[sn_i] = SIMDMaxi32(SIMDMaxi32(dp_h[sn_i], dp_f1[sn_i]), dp_f2[sn_i]);
+        // if (sn_i==beg_sn) debug_simd_abpoa_print_cg_matrix_row("4.4", int32_t, index_i);
+        /* e for next cell */
+        // SIMDSetIfEquali32(dp_e1[sn_i], dp_h[sn_i], tmp, SIMDMaxi32(SIMDSubi32(dp_e1[sn_i], GAP_E1), SIMDSubi32(dp_h[sn_i], GAP_OE1)), SIMD_INF_MIN);
+        dp_e1[sn_i] = SIMDMaxi32(SIMDSubi32(dp_e1[sn_i], GAP_E1), SIMDSubi32(dp_h[sn_i], GAP_OE1));
+        // SIMDSetIfEquali32(dp_e2[sn_i], dp_h[sn_i], tmp, SIMDMaxi32(SIMDSubi32(dp_e2[sn_i], GAP_E2), SIMDSubi32(dp_h[sn_i], GAP_OE2)), SIMD_INF_MIN);
+        dp_e2[sn_i] = SIMDMaxi32(SIMDSubi32(dp_e2[sn_i], GAP_E2), SIMDSubi32(dp_h[sn_i], GAP_OE2));
+    }
+    return tot_dp_sn;
+}
+// Trace back from cell (best_dp_i, best_dp_j) until row 0 or column 0 is reached, building the graph CIGAR (match / deletion / insertion) and filling res: graph_cigar, n_cigar, m_cigar, node_s/node_e, query_s/query_e, and incrementing n_aln_bases / n_matched_bases. cur_op limits which moves may be taken next. Aborts via _err_fatal_simple when no move reproduces the current cell.
+void abpoa_cg_backtrack(SIMDi *DP_H2E2F, int **pre_index, int *pre_n, int *dp_beg, int *dp_end, int dp_sn, int m, int *mat, int gap_ext1, int gap_ext2, int gap_oe1, int gap_oe2, int beg_index, int best_dp_i, int best_dp_j, int qlen, abpoa_graph_t *graph, abpoa_para_t *abpt, uint8_t *query, abpoa_res_t *res) {
+    int dp_i, dp_j, k, pre_i, n_c = 0, m_c = 0, id, hit, cur_op = ABPOA_ALL_OP, _start_i, _start_j;
+    SIMDi *dp_h;
+    int32_t s, is_match, *_dp_h=NULL, *_dp_e1, *_dp_e2, *_pre_dp_h, *_pre_dp_e1, *_pre_dp_e2, *_dp_f1, *_dp_f2; abpoa_cigar_t *cigar = 0;
+    dp_i = best_dp_i, dp_j = best_dp_j, _start_i = best_dp_i, _start_j = best_dp_j;
+    id = abpoa_graph_index_to_node_id(graph, dp_i+beg_index);
+    if (best_dp_j < qlen) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, qlen-best_dp_j, -1, qlen-1); /* query tail past best_dp_j becomes one insertion */
+    dp_h = DP_H2E2F + dp_sn * (dp_i * 5); _dp_h = (int32_t*)dp_h;
+    int indel_first = 1; /* prefer to keep gaps at the end */
+    while (dp_i > 0 && dp_j > 0) {
+        _start_i = dp_i, _start_j = dp_j;
+        int *pre_index_i = pre_index[dp_i];
+        s = mat[m * graph->node[id].base + query[dp_j-1]]; hit = 0;
+        is_match = graph->node[id].base == query[dp_j-1];
+        if ((cur_op & ABPOA_M_OP) && (indel_first == 0)) { /* once a match has been taken, matches are tried before gaps */
+            for (k = 0; k < pre_n[dp_i]; ++k) {
+                pre_i = pre_index_i[k];
+                if (dp_j-1 < dp_beg[pre_i] || dp_j-1 > dp_end[pre_i]) continue;
+                _pre_dp_h = (int32_t*)(DP_H2E2F + dp_sn * (pre_i * 5));
+                if (_pre_dp_h[dp_j-1] + s == _dp_h[dp_j]) { /* match/mismatch */
+                    cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, dp_j-1);
+                    dp_i = pre_i; --dp_j; id = abpoa_graph_index_to_node_id(graph, dp_i+beg_index); hit = 1;
+                    dp_h = DP_H2E2F + dp_sn * (dp_i * 5); _dp_h = (int32_t*)dp_h;
+                    cur_op = ABPOA_ALL_OP;
+                    ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;
+                    break;
+                }
+            }
+        }
+        if (hit == 0 && cur_op & ABPOA_E_OP) { /* deletion: move to a predecessor row, same query column */
+            _dp_e1 = (int32_t*)(dp_h+dp_sn), _dp_e2 = (int32_t*)(dp_h+dp_sn*2);
+            for (k = 0; k < pre_n[dp_i]; ++k) {
+                pre_i = pre_index_i[k];
+                if (dp_j < dp_beg[pre_i] || dp_j > dp_end[pre_i]) continue;
+                _pre_dp_h = (int32_t*)(DP_H2E2F + dp_sn * (pre_i * 5));
+                if (cur_op & ABPOA_E1_OP) {
+                    _pre_dp_e1 = (int32_t*)(DP_H2E2F + dp_sn * ((pre_i * 5) + 1));
+                    if (cur_op & ABPOA_M_OP) {
+                        if (_dp_h[dp_j] == _pre_dp_e1[dp_j]) {
+                            if (_pre_dp_h[dp_j] - gap_oe1 == _pre_dp_e1[dp_j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;
+                            else cur_op = ABPOA_E1_OP;
+                            cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, dp_j-1);
+                            dp_i = pre_i; id = abpoa_graph_index_to_node_id(graph, dp_i+beg_index); hit = 1;
+                            dp_h = DP_H2E2F + dp_sn * (dp_i * 5); _dp_h = (int32_t*)dp_h;
+                            break;
+                        }
+                    } else {
+                        if (_dp_e1[dp_j] == _pre_dp_e1[dp_j] - gap_ext1) {
+                            if (_pre_dp_h[dp_j] - gap_oe1 == _pre_dp_e1[dp_j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;
+                            else cur_op = ABPOA_E1_OP;
+                            cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, dp_j-1);
+                            dp_i = pre_i; id = abpoa_graph_index_to_node_id(graph, dp_i+beg_index); hit = 1;
+                            dp_h = DP_H2E2F + dp_sn * (dp_i * 5); _dp_h = (int32_t*)dp_h;
+                            break;
+                        }
+                    }
+                }
+                if (cur_op & ABPOA_E2_OP) {
+                    _pre_dp_e2 = (int32_t*)(DP_H2E2F + dp_sn * ((pre_i * 5) + 2));
+                    if (cur_op & ABPOA_M_OP) {
+                        if (_dp_h[dp_j] == _pre_dp_e2[dp_j]) {
+                            if (_pre_dp_h[dp_j] - gap_oe2 == _pre_dp_e2[dp_j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;
+                            else cur_op = ABPOA_E2_OP;
+                            cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, dp_j-1);
+                            dp_i = pre_i; id = abpoa_graph_index_to_node_id(graph, dp_i+beg_index); hit = 1;
+                            dp_h = DP_H2E2F + dp_sn * (dp_i * 5); _dp_h = (int32_t*)dp_h;
+                            break;
+                        }
+                    } else {
+                        if (_dp_e2[dp_j] == _pre_dp_e2[dp_j] - gap_ext2) {
+                            if (_pre_dp_h[dp_j] - gap_oe2 == _pre_dp_e2[dp_j]) cur_op = ABPOA_M_OP | ABPOA_F_OP;
+                            else cur_op = ABPOA_E2_OP;
+                            cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CDEL, 1, id, dp_j-1);
+                            dp_i = pre_i; id = abpoa_graph_index_to_node_id(graph, dp_i+beg_index); hit = 1;
+                            dp_h = DP_H2E2F + dp_sn * (dp_i * 5); _dp_h = (int32_t*)dp_h;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+        if (hit == 0 && cur_op & ABPOA_F_OP) { /* insertion: stay in this row, move one query column left */
+            if (cur_op & ABPOA_F1_OP) {
+                _dp_f1 = (int32_t*)(dp_h + dp_sn * 3);
+                if (cur_op & ABPOA_M_OP) {
+                    if (_dp_h[dp_j] == _dp_f1[dp_j]) {
+                        if (_dp_h[dp_j-1] - gap_oe1 == _dp_f1[dp_j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;
+                        else if (_dp_f1[dp_j-1] - gap_ext1 == _dp_f1[dp_j]) cur_op = ABPOA_F1_OP, hit = 1;
+                    }
+                } else {
+                    if (_dp_h[dp_j-1] - gap_oe1 == _dp_f1[dp_j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;
+                    else if (_dp_f1[dp_j-1] - gap_ext1 == _dp_f1[dp_j]) cur_op = ABPOA_F1_OP, hit = 1;
+                }
+            }
+            if (hit == 0 && cur_op & ABPOA_F2_OP) {
+                _dp_f2 = (int32_t*)(dp_h + dp_sn * 4);
+                if (cur_op & ABPOA_M_OP) {
+                    if (_dp_h[dp_j] == _dp_f2[dp_j]) {
+                        if (_dp_h[dp_j-1] - gap_oe2 == _dp_f2[dp_j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;
+                        else if (_dp_f2[dp_j-1] - gap_ext2 == _dp_f2[dp_j]) cur_op = ABPOA_F2_OP, hit = 1;
+                    }
+                } else {
+                    if (_dp_h[dp_j-1] - gap_oe2 == _dp_f2[dp_j]) cur_op = ABPOA_M_OP | ABPOA_E_OP, hit = 1;
+                    else if (_dp_f2[dp_j-1] - gap_ext2 == _dp_f2[dp_j]) cur_op = ABPOA_F2_OP, hit = 1;
+                }
+            }
+            if (hit == 1) {
+                cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, 1, id, dp_j-1); --dp_j;
+                ++res->n_aln_bases;
+            }
+        }
+        if (hit == 0 && (cur_op & ABPOA_M_OP) && (indel_first == 1)) { /* until the first match is taken, matches are tried last */
+            for (k = 0; k < pre_n[dp_i]; ++k) {
+                pre_i = pre_index_i[k];
+                if (dp_j-1 < dp_beg[pre_i] || dp_j-1 > dp_end[pre_i]) continue;
+                _pre_dp_h = (int32_t*)(DP_H2E2F + dp_sn * (pre_i * 5));
+                if (_pre_dp_h[dp_j-1] + s == _dp_h[dp_j]) { /* match/mismatch */
+                    cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CMATCH, 1, id, dp_j-1);
+                    dp_i = pre_i; --dp_j; id = abpoa_graph_index_to_node_id(graph, dp_i+beg_index); hit = 1;
+                    dp_h = DP_H2E2F + dp_sn * (dp_i * 5); _dp_h = (int32_t*)dp_h;
+                    cur_op = ABPOA_ALL_OP;
+                    ++res->n_aln_bases; res->n_matched_bases += is_match ? 1 : 0;
+                    indel_first = 0;
+                    break;
+                }
+            }
+        }
+        if (hit == 0) _err_fatal_simple(__func__, "Error in cg_backtrack: no move reproduces the current cell.\n"); /* report instead of exiting silently */
+#ifdef __DEBUG__
+        fprintf(stderr, "%d, %d, %d\n", dp_i, dp_j, cur_op);
+#endif
+    }
+    if (dp_j > 0) cigar = abpoa_push_cigar(&n_c, &m_c, cigar, ABPOA_CINS, dp_j, -1, dp_j-1); /* leftover query prefix becomes one insertion */
+    /* reverse cigar */
+    res->graph_cigar = abpt->rev_cigar ? cigar : abpoa_reverse_cigar(n_c, cigar);
+    res->n_cigar = n_c; res->m_cigar = m_c;
+    res->node_e = abpoa_graph_index_to_node_id(graph, best_dp_i+beg_index), res->query_e = best_dp_j-1; /* 0-based */
+    res->node_s = abpoa_graph_index_to_node_id(graph, _start_i+beg_index), res->query_s = _start_j-1;
+    /*abpoa_print_cigar(n_c, *graph_cigar, graph);*/
+}
+
+int abpoa_cg_global_align_sequence_to_graph_core(abpoa_t *ab, int beg_node_id, int beg_index, int end_node_id, int end_index, uint8_t *index_map, int qlen, uint8_t *query, abpoa_para_t *abpt, SIMD_para_t sp, abpoa_res_t *res) { /* convex-gap global alignment of query against graph rows beg_index..end_index using 32-bit SIMD lanes; fills res via abpoa_cg_backtrack() and returns the best score */
+    int tot_dp_sn = 0; /* running sum of abpoa_cg_dp() return values; only read by the commented-out printf calls below */
+    abpoa_graph_t *graph = ab->abg; abpoa_simd_matrix_t *abm = ab->abm;
+    int matrix_row_n = end_index-beg_index+1, matrix_col_n = qlen + 1; /* one DP row per graph index in [beg_index, end_index]; qlen + 1 columns */
+    int **pre_index, *pre_n, _pre_index, _pre_n;
+    int i, j, *dp_beg, *dp_beg_sn, *dp_end, *dp_end_sn, node_id, index_i, dp_i;
+    int beg_sn, end_sn;
+    int pn, log_n, size, qp_sn, dp_sn; /* pn: value per SIMDi, qp_sn/dp_sn/d_sn: segmented length*/
+    SIMDi *dp_h, *qp, *qi;
+    int32_t best_score = sp.inf_min, inf_min = sp.inf_min;
+    int *mat = abpt->mat, best_i = 0, best_j = 0; int32_t gap_ext1 = abpt->gap_ext1;
+    int w = abpt->wb < 0 ? qlen : abpt->wb + (int)(abpt->wf * qlen); /* when w < 0, do whole global */
+    SIMDi zero = SIMDSetZeroi(), SIMD_INF_MIN = SIMDSetOnei32(inf_min);
+    pn = sp.num_of_value; qp_sn = dp_sn = (matrix_col_n + pn - 1) / pn, log_n = sp.log_num, size = sp.size; /* vectors per row = ceil(matrix_col_n / pn) */
+    qp = abm->s_mem; /* qp, DP_H2E2F and qi below are all carved out of abm->s_mem */
+
+    int32_t gap_open1 = abpt->gap_open1, gap_oe1 = gap_open1 + gap_ext1;
+    int32_t gap_open2 = abpt->gap_open2, gap_ext2 = abpt->gap_ext2, gap_oe2 = gap_open2 + gap_ext2;
+    SIMDi *DP_H2E2F, *dp_e1, *dp_e2, *dp_f2, *dp_f1;
+    SIMDi GAP_O1 = SIMDSetOnei32(gap_open1), GAP_O2 = SIMDSetOnei32(gap_open2), GAP_E1 = SIMDSetOnei32(gap_ext1), GAP_E2 = SIMDSetOnei32(gap_ext2), GAP_OE1 = SIMDSetOnei32(gap_oe1), GAP_OE2 = SIMDSetOnei32(gap_oe2);
+    DP_H2E2F = qp + qp_sn * abpt->m; qi = DP_H2E2F + dp_sn * matrix_row_n * 5; /* layout: qp (abpt->m * qp_sn vectors) | DP_H2E2F (5 * dp_sn vectors per row) | qi */
+
+    // for SET_F mask[pn], suf_min[pn], pre_min[logN]
+    SIMDi *PRE_MASK, *SUF_MIN, *PRE_MIN, *GAP_E1S, *GAP_E2S;
+    PRE_MASK = (SIMDi*)SIMDMalloc((pn+1) * size, size), SUF_MIN = (SIMDi*)SIMDMalloc((pn+1) * size, size), PRE_MIN = (SIMDi*)SIMDMalloc(pn * size, size), GAP_E1S =  (SIMDi*)SIMDMalloc(log_n * size, size), GAP_E2S =  (SIMDi*)SIMDMalloc(log_n * size, size);
+    for (i = 0; i < pn; ++i) { /* PRE_MASK[i]: lanes 0..i = -1 (all bits set), lanes i+1..pn-1 = 0 */
+        int32_t *pre_mask = (int32_t*)(PRE_MASK + i);
+        for (j = 0; j <= i; ++j) pre_mask[j] = -1;
+        for (j = i+1; j < pn; ++j) pre_mask[j] = 0;
+    } PRE_MASK[pn] = PRE_MASK[pn-1];
+    SUF_MIN[0] = SIMDShiftLeft(SIMD_INF_MIN, SIMDShiftOneNi32); /* SUF_MIN[i]: SIMD_INF_MIN shifted left i+1 times by SIMDShiftOneNi32 */
+    for (i = 1; i < pn; ++i) SUF_MIN[i] = SIMDShiftLeft(SUF_MIN[i-1], SIMDShiftOneNi32); SUF_MIN[pn] = SUF_MIN[pn-1];
+    for (i = 1; i < pn; ++i) { /* PRE_MIN[i]: lanes 0..i-1 = inf_min, lanes i..pn-1 = 0; NOTE(review): PRE_MIN[0] is never written here -- confirm it is never read */
+        int32_t *pre_min = (int32_t*)(PRE_MIN + i);
+        for (j = 0; j < i; ++j) pre_min[j] = inf_min;
+        for (j = i; j < pn; ++j) pre_min[j] = 0;
+    }
+    GAP_E1S[0] = GAP_E1; GAP_E2S[0] = GAP_E2; /* GAP_E?S[i] holds gap_ext? * 2^i in every lane (doubled on each step below) */
+    for (i = 1; i < log_n; ++i) {
+        GAP_E1S[i] = SIMDAddi32(GAP_E1S[i-1], GAP_E1S[i-1]); GAP_E2S[i] = SIMDAddi32(GAP_E2S[i-1], GAP_E2S[i-1]);
+    }
+    abpoa_init_var(abpt, query, qlen, qp, qi, mat, qp_sn, pn, SIMD_INF_MIN);
+    dp_beg = abm->dp_beg, dp_end = abm->dp_end, dp_beg_sn = abm->dp_beg_sn, dp_end_sn = abm->dp_end_sn;
+    /* index of pre-node */
+    pre_index = (int**)_err_calloc(matrix_row_n, sizeof(int*)); /* entry 0 stays NULL: the loop below starts at dp_i = 1 */
+    pre_n = (int*)_err_calloc(matrix_row_n, sizeof(int));
+    for (index_i = beg_index+1, dp_i = 1; index_i <= end_index; ++index_i, ++dp_i) {
+        node_id = abpoa_graph_index_to_node_id(graph, index_i);
+        pre_n[dp_i] = graph->node[node_id].in_edge_n;
+        pre_index[dp_i] = (int*)_err_malloc(pre_n[dp_i] * sizeof(int)); /* sized for all in-edges; only the kept ones are filled */
+        _pre_n = 0;
+        for (j = 0; j < pre_n[dp_i]; ++j) {
+            _pre_index = abpoa_graph_node_id_to_index(graph, graph->node[node_id].in_id[j]);
+            if (index_map[_pre_index]) /* keep only predecessors whose index_map entry is non-zero */
+                pre_index[dp_i][_pre_n++] = _pre_index-beg_index; /* stored relative to beg_index, i.e. as a DP row number */
+        }
+        pre_n[dp_i] = _pre_n; /* shrink to the number of predecessors actually kept */
+    }
+    abpoa_cg_first_dp(abpt, graph, index_map, beg_node_id, end_node_id, dp_beg, dp_end, dp_beg_sn, dp_end_sn, pn, qlen, w, dp_sn, DP_H2E2F, SIMD_INF_MIN, inf_min, gap_open1, gap_ext1, gap_open2, gap_ext2, gap_oe1, gap_oe2);
+
+    for (index_i=beg_index+1, dp_i=1; index_i<end_index; ++index_i, ++dp_i) { /* rows strictly between beg_index and end_index */
+        if (index_map[index_i] == 0) continue; /* row not flagged in index_map: skip it */
+        node_id = abpoa_graph_index_to_node_id(graph, index_i);
+        SIMDi *q = qp + graph->node[node_id].base * qp_sn; /* query-profile row selected by this node's base */
+        dp_h = DP_H2E2F + (dp_i*5) * dp_sn; dp_e1 = dp_h + dp_sn; dp_e2 = dp_e1 + dp_sn; dp_f1 = dp_e2 + dp_sn; dp_f2 = dp_f1 + dp_sn; /* the five consecutive planes of row dp_i: H, E1, E2, F1, F2 */
+        tot_dp_sn += abpoa_cg_dp(q, dp_h, dp_e1, dp_e2, dp_f1, dp_f2, pre_index, pre_n, index_i, dp_i, graph, abpt, dp_sn, pn, qlen, w, DP_H2E2F, SIMD_INF_MIN, GAP_O1, GAP_O2, GAP_E1, GAP_E2, GAP_OE1, GAP_OE2, GAP_E1S, GAP_E2S, PRE_MIN, PRE_MASK, SUF_MIN, log_n, dp_beg, dp_end, dp_beg_sn, dp_end_sn, end_node_id);
+        if (abpt->wb >= 0) { /* banded mode only: hand this row's abpoa_max() result to abpoa_ada_max_i() */
+            beg_sn = dp_beg_sn[dp_i], end_sn = dp_end_sn[dp_i];
+            int max_i = abpoa_max(SIMD_INF_MIN, zero, inf_min, dp_h, qi, qlen, pn, beg_sn, end_sn);
+            abpoa_ada_max_i(max_i, graph, node_id);
+        }
+    }
+    // printf("dp_sn: %d\n", tot_dp_sn);
+    // printf("dp_sn: %d, node_n: %d, seq_n: %d\n", tot_dp_sn, graph->node_n, qlen);
+    abpoa_global_get_max(graph, beg_index, end_node_id, index_map, DP_H2E2F, 5*dp_sn, qlen, dp_end, &best_score, &best_i, &best_j);
+#ifdef __DEBUG__
+    simd_abpoa_print_cg_matrix(int32_t, beg_index, end_index); fprintf(stderr, "best_score: (%d, %d) -> %d\n", best_i, best_j, best_score);
+#endif
+    res->best_score = best_score;
+    abpoa_cg_backtrack(DP_H2E2F, pre_index, pre_n, dp_beg, dp_end, dp_sn, abpt->m, mat, gap_ext1, gap_ext2, gap_oe1, gap_oe2, beg_index, best_i, best_j, qlen, graph, abpt, query, res);
+    for (i = 0; i < matrix_row_n; ++i) free(pre_index[i]); /* pre_index[0] is still NULL from calloc; free(NULL) is a no-op */
+    free(pre_index); free(pre_n);
+    SIMDFree(PRE_MASK); SIMDFree(SUF_MIN); SIMDFree(PRE_MIN);
+    SIMDFree(GAP_E1S); SIMDFree(GAP_E2S);
+    return best_score;
+}
+
+// Align query (length qlen) to the subgraph between beg_node_id and end_node_id (both are excluded; generally the SRC/SINK nodes) and store the alignment in res.
+// Release builds pick 16- or 32-bit SIMD lanes from a score bound and dispatch on abpt->gap_mode; __DEBUG__ builds run the 32-bit convex-gap core only. Always returns 0, also when simd_abpoa_realloc() fails.
+int simd_abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int qlen, abpoa_res_t *res) {
+    // index_map[i] != 0 flags graph index i as part of the subgraph; it is heap-allocated here and must be freed on every return path.
+
+    int i, j, beg_index = ab->abg->node_id_to_index[beg_node_id], end_index = ab->abg->node_id_to_index[end_node_id];
+    int gn = end_index - beg_index + 1;
+    uint8_t *index_map = (uint8_t*)_err_calloc(ab->abg->node_n, sizeof(uint8_t));
+    index_map[beg_index] = index_map[end_index] = 1;
+    for (i = beg_index; i < end_index-1; ++i) { // sweep in index order, flagging every out-neighbour of an already flagged node
+        if (index_map[i] == 0) continue;
+        int node_id = abpoa_graph_index_to_node_id(ab->abg, i);
+        int out_n = ab->abg->node[node_id].out_edge_n;
+        for (j = 0; j < out_n; ++j) {
+            int out_id = ab->abg->node[node_id].out_id[j];
+            index_map[abpoa_graph_node_id_to_index(ab->abg, out_id)] = 1;
+        }
+    }
+
+#ifdef __DEBUG__
+    _simd_p32.inf_min = MAX_OF_TWO(abpt->gap_ext1, abpt->gap_ext2) * 31 +MAX_OF_THREE(INT32_MIN + abpt->min_mis, INT32_MIN + abpt->gap_open1 + abpt->gap_ext1, INT32_MIN + abpt->gap_open2 + abpt->gap_ext2);
+    if (simd_abpoa_realloc(ab, gn, qlen, abpt, _simd_p32)) { free(index_map); return 0; } // do not leak index_map when the DP buffers cannot be grown
+    if (abpt->gap_mode == ABPOA_CONVEX_GAP) abpoa_cg_global_align_sequence_to_graph_core(ab, beg_node_id, beg_index, end_node_id, end_index, index_map, qlen, query, abpt, _simd_p32, res);
+#else
+    int32_t max_score, bits, mem_ret=0, gap_ext1 = abpt->gap_ext1, gap_ext2 = abpt->gap_ext2;
+    int32_t gap_oe1 = abpt->gap_open1+gap_ext1, gap_oe2 = abpt->gap_open2+gap_ext2;
+    // if (abpt->simd_flag & SIMD_AVX512F && !(abpt->simd_flag & SIMD_AVX512BW))
+    //    max_score = INT16_MAX + 1; // AVX512F has no 8/16 bits operations
+    // else {
+    int len = qlen > gn ? qlen : gn;
+    max_score = MAX_OF_TWO(qlen * abpt->max_mat, len * abpt->gap_ext1 + abpt->gap_open1);
+    // }
+    if (max_score <= INT16_MAX - abpt->min_mis - gap_oe1 - gap_oe2) { // 16-bit lanes suffice: the bound leaves headroom for one mismatch and both gap open+extend costs
+        _simd_p16.inf_min = MAX_OF_THREE(INT16_MIN + abpt->min_mis, INT16_MIN + gap_oe1, INT16_MIN + gap_oe2) + 31 * MAX_OF_TWO(gap_ext1, gap_ext2);
+        mem_ret = simd_abpoa_realloc(ab, gn, qlen, abpt, _simd_p16);
+        bits = 16;
+    } else {
+        _simd_p32.inf_min = MAX_OF_THREE(INT32_MIN + abpt->min_mis, INT32_MIN + gap_oe1, INT32_MIN + gap_oe2) + 31 * MAX_OF_TWO(gap_ext1, gap_ext2);
+        mem_ret = simd_abpoa_realloc(ab, gn, qlen, abpt, _simd_p32);
+        bits = 32;
+    }
+    if (mem_ret) { free(index_map); return 0; } // do not leak index_map when the DP buffers cannot be grown
+
+    if (bits == 16) {
+        if (abpt->gap_mode == ABPOA_LINEAR_GAP) {
+            simd_abpoa_lg_align_sequence_to_graph_core(int16_t, _simd_p16, SIMDSetOnei16, SIMDMaxi16, \
+                    SIMDAddi16, SIMDSubi16, SIMDShiftOneNi16, SIMDSetIfGreateri16, SIMDGetIfGreateri16);
+        } else if (abpt->gap_mode == ABPOA_AFFINE_GAP) {
+            simd_abpoa_ag_align_sequence_to_graph_core(int16_t, _simd_p16, SIMDSetOnei16, SIMDMaxi16, \
+                    SIMDAddi16, SIMDSubi16, SIMDShiftOneNi16, SIMDSetIfGreateri16, SIMDGetIfGreateri16, SIMDSetIfEquali16);
+        } else if (abpt->gap_mode == ABPOA_CONVEX_GAP) {
+            simd_abpoa_cg_align_sequence_to_graph_core(int16_t, _simd_p16, SIMDSetOnei16, SIMDMaxi16, \
+                    SIMDAddi16, SIMDSubi16, SIMDShiftOneNi16, SIMDSetIfGreateri16, SIMDGetIfGreateri16, SIMDSetIfEquali16);
+        }
+    } else { // 2147483647, DP_H/E/F: 32 bits
+        if (abpt->gap_mode == ABPOA_LINEAR_GAP) {
+            simd_abpoa_lg_align_sequence_to_graph_core(int32_t, _simd_p32, SIMDSetOnei32, SIMDMaxi32, \
+                    SIMDAddi32, SIMDSubi32, SIMDShiftOneNi32, SIMDSetIfGreateri32, SIMDGetIfGreateri32);
+        } else if (abpt->gap_mode == ABPOA_AFFINE_GAP) {
+            simd_abpoa_ag_align_sequence_to_graph_core(int32_t, _simd_p32, SIMDSetOnei32, SIMDMaxi32, \
+                    SIMDAddi32, SIMDSubi32, SIMDShiftOneNi32, SIMDSetIfGreateri32, SIMDGetIfGreateri32, SIMDSetIfEquali32);
+        } else if (abpt->gap_mode == ABPOA_CONVEX_GAP) {
+            simd_abpoa_cg_align_sequence_to_graph_core(int32_t, _simd_p32, SIMDSetOnei32, SIMDMaxi32, \
+                    SIMDAddi32, SIMDSubi32, SIMDShiftOneNi32, SIMDSetIfGreateri32, SIMDGetIfGreateri32, SIMDSetIfEquali32);
+        }
+    }
+#endif
+    free(index_map);
+    return 0;
+}
+
+int simd_abpoa_align_sequence_to_graph(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int qlen, abpoa_res_t *res) { /* whole-graph convenience wrapper around simd_abpoa_align_sequence_to_subgraph() */
+    return simd_abpoa_align_sequence_to_subgraph(ab, abpt, ABPOA_SRC_NODE_ID, ABPOA_SINK_NODE_ID, query, qlen, res); /* the subgraph is delimited by ABPOA_SRC_NODE_ID and ABPOA_SINK_NODE_ID; the callee's return value is passed through */
+}
diff --git a/src/simd_abpoa_align.h b/src/simd_abpoa_align.h
new file mode 100644
index 0000000..cae2af7
--- /dev/null
+++ b/src/simd_abpoa_align.h
@@ -0,0 +1,20 @@
+#ifndef SIMD_ABPOA_ALIGN_H
+#define SIMD_ABPOA_ALIGN_H
+
+#include "abpoa.h"
+#include "abpoa_graph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int simd_abpoa_align_sequence_to_graph(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *query, int qlen, abpoa_res_t *res);
+int simd_abpoa_align_sequence_to_subgraph(abpoa_t *ab, abpoa_para_t *abpt, int beg_node_id, int end_node_id, uint8_t *query, int qlen, abpoa_res_t *res);
+abpoa_simd_matrix_t *abpoa_init_simd_matrix(void);
+void abpoa_free_simd_matrix(abpoa_simd_matrix_t *abm);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/simd_check.c b/src/simd_check.c
new file mode 100644
index 0000000..91a96a4
--- /dev/null
+++ b/src/simd_check.c
@@ -0,0 +1,89 @@
+#include "simd_instruction.h"
+
+
+//https://stackoverflow.com/questions/152016/detecting-cpu-architecture-compile-time                                                         
+#if MSVC
+#ifdef _M_X86
+#define ARCH_X86
+#endif
+#endif
+
+#if GCC
+#ifdef __i386__
+#define ARCH_X86
+#endif
+#endif
+
+#ifndef ARCH_X86
+
+int simd_check(void) { /* fallback compiled when ARCH_X86 is not defined: no CPUID probing is done */
+  return SIMD_AVX2; /* unconditionally reports the AVX2 flag, and only that flag */
+}
+#else
+
+#ifndef _MSC_VER
+// adapted from https://github.com/01org/linux-sgx/blob/master/common/inc/internal/linux/cpuid_gnu.h
+void __cpuidex(int cpuid[4], int func_id, int subfunc_id) /* run CPUID with EAX=func_id and ECX=subfunc_id; cpuid[0..3] receive EAX, EBX, ECX, EDX */
+{
+#if defined(__x86_64__)
+	__asm__ volatile ("cpuid"
+			: "=a" (cpuid[0]), "=b" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3])
+			: "0" (func_id), "2" (subfunc_id)); /* "0"/"2" place the inputs in the registers of output operands 0 (EAX) and 2 (ECX) */
+#else // on 32bit, ebx can NOT be used as PIC code
+	__asm__ volatile ("xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1" /* swap EBX with operand 1 around cpuid: EBX is restored and operand 1 ends up holding CPUID's EBX result */
+			: "=a" (cpuid[0]), "=r" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3]) /* cpuid[1] uses a general "r" register instead of "b" */
+			: "0" (func_id), "2" (subfunc_id));
+#endif
+}
+#endif
+
+int simd_check(void) {
+	int regs[4], top_leaf, found = 0; /* regs: EAX, EBX, ECX, EDX as filled by __cpuidex() */
+	__cpuidex(regs, 0, 0);
+	/* leaf 0: EAX is the highest basic CPUID leaf this CPU answers; */
+	/* when it is 0 no feature leaf can be queried, so report nothing */
+	top_leaf = regs[0];
+	if (top_leaf == 0) return 0;
+	__cpuidex(regs, 1, 0);
+	/* leaf 1: feature bits in EDX (regs[3]) and ECX (regs[2]) */
+	if ((regs[3] >> 25) & 1) found |= SIMD_SSE;
+	if ((regs[3] >> 26) & 1) found |= SIMD_SSE2;
+	if ((regs[2] >> 0) & 1) found |= SIMD_SSE3;
+	if ((regs[2] >> 9) & 1) found |= SIMD_SSSE3;
+	if ((regs[2] >> 19) & 1) found |= SIMD_SSE41;
+	if ((regs[2] >> 20) & 1) found |= SIMD_SSE42;
+	if ((regs[2] >> 28) & 1) found |= SIMD_AVX;
+	/* stop here when the extended-features leaf is unavailable */
+	if (top_leaf < 7) return found;
+	/* leaf 7, sub-leaf 0: feature bits in EBX (regs[1]) */
+	__cpuidex(regs, 7, 0);
+	if ((regs[1] >> 5) & 1) found |= SIMD_AVX2;
+	if ((regs[1] >> 16) & 1) found |= SIMD_AVX512F;
+	if ((regs[1] >> 30) & 1) found |= SIMD_AVX512BW;
+
+	return found;
+}
+
+#ifdef __CHECK_SIMD_MAIN__
+int main(void) { /* stdout: compiler macro name of the best SIMD level; stderr: framed banner; exit status: the raw simd_check() flags */
+    char names[6][20] = {"No SIMD", "SSE2 (128 bits)", "SSE4.1 (128 bits)", "AVX2 (256 bits)", "AVX512F (512 bits)", "AVX512BW (512 bits)"}, banner[100];
+    int detected = simd_check(), level, col, pass;
+    if (detected & SIMD_AVX512BW) { printf("__AVX512BW__\n"); level = 5; }
+    else if (detected & SIMD_AVX512F) { printf("__AVX512F__\n"); level = 4; }
+    else if (detected & SIMD_AVX2) { printf("__AVX2__\n"); level = 3; }
+    else if (detected & SIMD_SSE41) { printf("__SSE4_1__\n"); level = 2; }
+    else if (detected & SIMD_SSE2) { printf("__SSE2__\n"); level = 1; }
+    else { printf("NO SIMD\n"); level = 0; }
+
+    sprintf(banner, "==== %s will be used. ====", names[level]);
+    fprintf(stderr, "\n");
+    for (pass = 0; pass < 2; ++pass) { /* rule, message, rule: each rule is as wide as the message */
+        for (col = 0; banner[col]; ++col) fputc('=', stderr);
+        fputc('\n', stderr); if (pass == 0) fprintf(stderr, "%s\n", banner);
+    }
+    fprintf(stderr, "\n");
+    return detected;
+}
+#endif
+
+#endif
diff --git a/src/simd_instruction.h b/src/simd_instruction.h
new file mode 100644
index 0000000..41deb16
--- /dev/null
+++ b/src/simd_instruction.h
@@ -0,0 +1,633 @@
+// A header file to get you going with Intel SIMD intrinsic programming.
+// <immintrin.h> is included for SSE2, SSE4.1, AVX2, AVX512F and AVX512BW
+// SSE4.1: floor and blend are available
+// AVX2: double speed
+
+// do not support AVX512F/AVX512BW 12/20/2021 - Yan Gao
+// AVX512F: quadruple speed
+// AVX512BW: byte and word operation
+
+#include <stdlib.h>
+#include <errno.h>
+
+#pragma once
+#ifndef SIMD_INSTRUCTION_H
+#define SIMD_INSTRUCTION_H
+
+#undef __AVX512F__
+#undef __AVX512BW__
+
+#ifndef USE_SIMDE
+#include <immintrin.h>
+#else // use SIMDE
+#ifdef __AVX512F__
+#include "simde/simde/x86/avx512.h"
+#else
+#ifdef __AVX2__
+#include "simde/simde/x86/avx2.h"
+#else
+#ifdef __SSE4_1__
+#include "simde/simde/x86/sse4.1.h"
+#else
+#include "simde/simde/x86/sse2.h"
+#endif // end of sse41
+#endif // end of AVX2
+#endif // end of 512F
+#endif // end of USE_SIMDE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#define SIMD_SSE      0x1
+#define SIMD_SSE2     0x2
+#define SIMD_SSE3     0x4
+#define SIMD_SSSE3    0x8
+#define SIMD_SSE41    0x10
+#define SIMD_SSE42    0x20
+#define SIMD_AVX      0x40
+#define SIMD_AVX2     0x80
+#define SIMD_AVX512F  0x100
+#define SIMD_AVX512BW 0x200
+
+// #define SIMDFree(x) _mm_free(x)
+// posix_memalign and free
+#define SIMDFree(x) free(x)
+
+// Shift, Blend, ... for 8/16 and 32/64
+#ifdef __AVX512BW__
+// start of AVX512BW
+
+typedef __m512 SIMDf;
+typedef __m512i SIMDi;
+
+#define SIMDStore(x,y) _mm512_store_ps(x,y)
+#define SIMDStorei(x,y) _mm512_store_si512(x,y)
+#define SIMDLoad(x) _mm512_load_ps(x)
+#define SIMDLoadi(x) _mm512_load_si512(x)
+#define SIMDZero _mm512_setzero_si512()
+#define SIMDSetZero() _mm512_setzero_ps()
+#define SIMDSetZeroi() _mm512_setzero_si512()
+#define SIMDSetOne(x) _mm512_set1_ps(x)
+#define SIMDSetOnei8(x) _mm512_set1_epi8(x)
+#define SIMDSetOnei16(x) _mm512_set1_epi16(x)
+#define SIMDSetOnei32(x) _mm512_set1_epi32(x)
+#define SIMDSetOnei64(x) _mm512_set1_epi64(x)
+#define SIMDAdd(x,y) _mm512_add_ps(x,y)
+#define SIMDAddi8(x,y) _mm512_add_epi8(x,y)
+#define SIMDAddi16(x,y) _mm512_add_epi16(x,y)
+#define SIMDAddi32(x,y) _mm512_add_epi32(x,y)
+#define SIMDAddi64(x,y) _mm512_add_epi64(x,y)
+#define SIMDSub(x,y) _mm512_sub_ps(x,y)
+#define SIMDSubi8(x,y) _mm512_sub_epi8(x,y)
+#define SIMDSubi16(x,y) _mm512_sub_epi16(x,y)
+#define SIMDSubi32(x,y) _mm512_sub_epi32(x,y)
+#define SIMDSubi64(x,y) _mm512_sub_epi64(x,y)
+#define SIMDMul(x,y) _mm512_mul_ps(x,y)
+#define SIMDMuli32(x,y) _mm512_mul_epi32(x,y)
+#define SIMDAnd(x,y) _mm512_and_ps(x,y)
+#define SIMDAndi(x,y) _mm512_and_si512(x,y)
+#define SIMDAndNot(x,y) _mm512_andnot_ps(x,y)
+#define SIMDAndNoti(x,y) _mm512_andnot_si512(x,y)
+#define SIMDOr(x,y) _mm512_or_ps(x,y)
+#define SIMDOri(x,y) _mm512_or_si512(x,y)
+#define SIMDShiftLeft(x,n) \
+    (n) < 16 ? \
+    _mm512_alignr_epi8(x, _mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)), x, _MM_SHUFFLE(2,1,0,2)), (16-(n))) : \
+    ((n) < 32 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)), x, _MM_SHUFFLE(2,1,0,2)), _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), (32-(n))) : \
+    ((n) < 48 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _mm512_shuffle_i64x2(SIMDZero, _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _MM_SHUFFLE(2,0,0,0)), (48-(n))) : \
+    _mm512_bslli_epi128(_mm512_shuffle_i64x2(SIMDZero,  _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _MM_SHUFFLE(2,0,0,0)), ((n)-48))))
+/*
+static inline SIMDi SIMDShiftLeft(SIMDi x, const int n) { // x=a|b|c|d
+    SIMDi tmp1,tmp2;
+    if (n < 16) {
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)); // tmp1=0|0|c|d
+        tmp2 = _mm512_shuffle_i64x2(tmp1, x, _MM_SHUFFLE(2,1,0,2)); // tmp2=b|c|d|0
+        return _mm512_alignr_epi8(x, tmp2, 16 - n);
+    } else if (n < 32) {
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)); // tmp1=0|0|c|d
+        tmp2 = _mm512_shuffle_i64x2(tmp1, x, _MM_SHUFFLE(2,1,0,2)); // tmp2=b|c|d|0
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)); // tmp1=c|d|0|0
+        return _mm512_alignr_epi8(tmp2, tmp1, 32 - n);
+    } else if (n < 48) {
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0));    // tmp1=c|d|0|0
+        tmp2 = _mm512_shuffle_i64x2(SIMDZero, tmp1, _MM_SHUFFLE(2,0,0,0)); // tmp2=d|0|0|0
+        return _mm512_alignr_epi8(tmp1, tmp2, 48 - n);
+    } else {
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0));    // tmp1=c|d|0|0
+        tmp2 = _mm512_shuffle_i64x2(SIMDZero, tmp1, _MM_SHUFFLE(2,0,0,0)); // tmp2=d|0|0|0
+        return _mm512_bslli_epi128(tmp2, n - 48);
+    }
+}*/
+#define SIMDShiftRight(x,n) \
+    (n) < 16 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2( _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)), x, _MM_SHUFFLE(0,3,2,1)), x, (n)) : \
+    ((n) < 32 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), _mm512_shuffle_i64x2(_mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)), x, _MM_SHUFFLE(0,3,2,1)), ((n)-16)) : \
+    ((n) < 48 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), SIMDZero, _MM_SHUFFLE(0,0,2,1)), _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), ((n)-32)) : \
+    _mm512_bsrli_epi128(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), SIMDZero, _MM_SHUFFLE(0,0,2,1)), ((n)-48))))
+/*
+static inline SIMDi SIMDShiftRight(SIMDi x, int n) { // x=a|b|c|d
+    SIMDi tmp1, tmp2;
+    if (n < 16) {
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)); // tmp1=a|b|0|0
+        tmp2 = _mm512_shuffle_i64x2(tmp1, x, _MM_SHUFFLE(0,3,2,1)); // tmp2=0|a|b|c
+        return _mm512_alignr_epi8(tmp2, x, n);
+    } else if (n < 32) {
+        tmp1 = _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)); // tmp1=a|b|0|0
+        tmp2 = _mm512_shuffle_i64x2(tmp1, x, _MM_SHUFFLE(0,3,2,1)); // tmp2=0|a|b|c
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)); // tmp1=0|0|a|b
+        return _mm512_alignr_epi8(tmp1, tmp2, n-16);
+    } else if (n < 48) {
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2));    // tmp1=0|0|a|b
+        tmp2 = _mm512_shuffle_i64x2(tmp1, SIMDZero, _MM_SHUFFLE(0,0,2,1)); // tmp2=0|0|0|a
+        return _mm512_alignr_epi8(tmp2, tmp1, n-32);
+    } else {
+        tmp1 = _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2));    // tmp1=0|0|a|b
+        tmp2 = _mm512_shuffle_i64x2(tmp1, SIMDZero, _MM_SHUFFLE(0,0,2,1)); // tmp2=0|0|0|a
+        return _mm512_bsrli_epi128(tmp2, n - 48);
+    }
+}*/
+#define SIMDShiftLeftOnei16(x,y) _mm512_slli_epi16(x,y)
+#define SIMDShiftLeftOnei32(x,y) _mm512_slli_epi32(x,y)
+#define SIMDShiftLeftOnei64(x,y) _mm512_slli_epi64(x,y)
+#define SIMDShiftRightOnei16(x,y) _mm512_srli_epi16(x,y)
+#define SIMDShiftRightOnei32(x,y) _mm512_srli_epi32(x,y)
+#define SIMDShiftRightOnei64(x,y) _mm512_srli_epi64(x,y)
+#define SIMDEqualM(x,y) _mm512_cmpeq_ps_mask(x,y)
+#define SIMDEquali8M(x,y) _mm512_cmpeq_epi8_mask(x,y)
+#define SIMDEquali16M(x,y) _mm512_cmpeq_epi16_mask(x,y)
+#define SIMDEquali32M(x,y) _mm512_cmpeq_epi32_mask(x,y)
+#define SIMDEquali64M(x,y) _mm512_cmpeq_epi64_mask(x,y)
+#define SIMDNotEqualM(x,y) _mm512_cmpneq_ps_mask(x,y)
+#define SIMDNotEquali8M(x,y) _mm512_cmpneq_epi8_mask(x,y)
+#define SIMDNotEquali16M(x,y) _mm512_cmpneq_epi16_mask(x,y)
+#define SIMDNotEquali32M(x,y) _mm512_cmpneq_epi32_mask(x,y)
+#define SIMDNotEquali64M(x,y) _mm512_cmpneq_epi64_mask(x,y)
+#define SIMDGreaterThani8M(x,y) _mm512_cmpgt_epi8_mask(x,y)
+#define SIMDGreaterThani16M(x,y) _mm512_cmpgt_epi16_mask(x,y)
+#define SIMDGreaterThani32M(x,y) _mm512_cmpgt_epi32_mask(x,y)
+#define SIMDGreaterThani64M(x,y) _mm512_cmpgt_epi64_mask(x,y)
+#define SIMDGreaterThanOrEquali8M(x,y) _mm512_cmpge_epi8_mask(x,y)
+#define SIMDGreaterThanOrEquali16M(x,y) _mm512_cmpge_epi16_mask(x,y)
+#define SIMDGreaterThanOrEquali32M(x,y) _mm512_cmpge_epi32_mask(x,y)
+#define SIMDGreaterThanOrEquali64M(x,y) _mm512_cmpge_epi64_mask(x,y)
+#define SIMDLessThanM(x,y) _mm512_cmplt_ps_mask(x,y)
+#define SIMDLessThani8M(x,y) _mm512_cmplt_epi8_mask(x,y)
+#define SIMDLessThani16M(x,y) _mm512_cmplt_epi16_mask(x,y)
+#define SIMDLessThani32M(x,y) _mm512_cmplt_epi32_mask(x,y)
+#define SIMDLessThani64M(x,y) _mm512_cmplt_epi64_mask(x,y)
+#define SIMDLessThanOrEqualM(x,y) _mm512_cmple_ps_mask(x,y)
+#define SIMDLessThanOrEquali8M(x,y) _mm512_cmple_epi8_mask(x,y)
+#define SIMDLessThanOrEquali16M(x,y) _mm512_cmple_epi16_mask(x,y)
+#define SIMDLessThanOrEquali32M(x,y) _mm512_cmple_epi32_mask(x,y)
+#define SIMDLessThanOrEquali64M(x,y) _mm512_cmple_epi64_mask(x,y)
+#define SIMDMax(x,y) _mm512_max_ps(x,y)
+#define SIMDMaxi8(x,y) _mm512_max_epi8(x,y)
+#define SIMDMaxi16(x,y) _mm512_max_epi16(x,y)
+#define SIMDMaxi32(x,y) _mm512_max_epi32(x,y)
+#define SIMDMaxi64(x,y) _mm512_max_epi64(x,y)
+#define SIMDMin(x,y) _mm512_min_ps(x,y)
+#define SIMDMini8(x,y) _mm512_min_epi8(x,y)
+#define SIMDMini16(x,y) _mm512_min_epi16(x,y)
+#define SIMDMini32(x,y) _mm512_min_epi32(x,y)
+#define SIMDMini64(x,y) _mm512_min_epi64(x,y)
+
+#define SIMDBlend(x,y,z) _mm512_mask_blend_ps(z, x, y)
+#define SIMDBlendi8(x,y,z) _mm512_mask_blend_epi8(z, x, y)
+#define SIMDBlendi16(x,y,z) _mm512_mask_blend_epi16(z, x, y)
+#define SIMDBlendi32(x,y,z) _mm512_mask_blend_epi32(z, x, y)
+#define SIMDBlendi64(x,y,z) _mm512_mask_blend_epi64(z, x, y)
+
+// with AVX512BW
+#define Maski8 __mmask64
+#define Maski16 __mmask32
+#define Maski32 __mmask16
+#define Maski64 __mmask8
+/* x = a == b ? c : d */ 
+#define SIMDSetIfEquali8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDEquali8M(a,b)); } 
+#define SIMDSetIfEquali16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDEquali16M(a,b)); } 
+#define SIMDSetIfEquali32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDEquali32M(a,b)); } 
+#define SIMDSetIfEquali64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDEquali64M(a,b)); } 
+/* x = a > b ? c : d, per lane: the compare mask selects c where set and d elsewhere; the blend width must match the compare width */
+#define SIMDSetIfGreateri8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDGreaterThani8M(a,b)); }
+#define SIMDSetIfGreateri16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16M(a,b)); }
+#define SIMDSetIfGreateri32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32M(a,b)); }
+#define SIMDSetIfGreateri64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64M(a,b)); }
+/* x = a < b ? c : d, per lane: implemented as "b > a", so the compare mask selects c where set and d elsewhere */
+#define SIMDSetIfLessi8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDGreaterThani8M(b,a)); }
+#define SIMDSetIfLessi16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16M(b,a)); }
+#define SIMDSetIfLessi32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32M(b,a)); }
+#define SIMDSetIfLessi64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64M(b,a)); }
+
+/* x = a > b ? c : d, y = a > b ? a : b */
+#define SIMDGetIfGreateri8(x,y,a,b,c,d)  { Maski8  cmp = SIMDGreaterThani8M(a,b);  x = SIMDBlendi8(d, c, cmp);  y = SIMDBlendi8(b, a, cmp); } 
+#define SIMDGetIfGreateri16(x,y,a,b,c,d) { Maski16 cmp = SIMDGreaterThani16M(a,b); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfGreateri32(x,y,a,b,c,d) { Maski32 cmp = SIMDGreaterThani32M(a,b); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfGreateri64(x,y,a,b,c,d) { Maski64 cmp = SIMDGreaterThani64M(a,b); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+/* x = a < b ? c : d, y = a < b ? a : b */
+#define SIMDGetIfLessi8(x,y,a,b,c,d)  { Maski8  cmp = SIMDGreaterThani8M(b,a);  x = SIMDBlendi8(d, c, cmp);  y = SIMDBlendi8(b, a, cmp); } 
+#define SIMDGetIfLessi16(x,y,a,b,c,d) { Maski16 cmp = SIMDGreaterThani16M(b,a); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfLessi32(x,y,a,b,c,d) { Maski32 cmp = SIMDGreaterThani32M(b,a); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfLessi64(x,y,a,b,c,d) { Maski64 cmp = SIMDGreaterThani64M(b,a); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+
+// end of AVX512BW
+#else
+#ifdef __AVX512F__
+
+// start of AVX512F
+
+// XXX AVX512F lacks the following instructions (AVX512BW has them), so AVX512F does not work for 8/16-bit tasks:
+// addi8/16, subi8/16, alignr_epi8, bslli_epi128, bsrli_epi128,
+// cmpeqi8/16, cmpneqi8/16, cmpgti8/16, cmpgei8/16, cmplti8/16, cmplei8/16,
+// maxi8/16, mini8/16, blendi8/16, slli_epi16, srli_epi16
+typedef __m512 SIMDf;
+typedef __m512i SIMDi;
+
+#define SIMDStore(x,y) _mm512_store_ps(x,y)
+#define SIMDStorei(x,y) _mm512_store_si512(x,y)
+#define SIMDLoad(x) _mm512_load_ps(x)
+#define SIMDLoadi(x) _mm512_load_si512(x)
+#define SIMDZero _mm512_setzero_si512()
+#define SIMDSetZero() _mm512_setzero_ps()
+#define SIMDSetZeroi() _mm512_setzero_si512()
+#define SIMDSetOne(x) _mm512_set1_ps(x)
+#define SIMDSetOnei8(x) _mm512_set1_epi8(x)
+#define SIMDSetOnei16(x) _mm512_set1_epi16(x)
+#define SIMDSetOnei32(x) _mm512_set1_epi32(x)
+#define SIMDSetOnei64(x) _mm512_set1_epi64(x)
+#define SIMDAdd(x,y) _mm512_add_ps(x,y)
+//#define SIMDAddi8(x,y) _mm512_add_epi8(x,y)
+//#define SIMDAddi16(x,y) _mm512_add_epi16(x,y)
+#define SIMDAddi32(x,y) _mm512_add_epi32(x,y)
+#define SIMDAddi64(x,y) _mm512_add_epi64(x,y)
+#define SIMDSub(x,y) _mm512_sub_ps(x,y)
+//#define SIMDSubi8(x,y) _mm512_sub_epi8(x,y)
+//#define SIMDSubi16(x,y) _mm512_sub_epi16(x,y)
+#define SIMDSubi32(x,y) _mm512_sub_epi32(x,y)
+#define SIMDSubi64(x,y) _mm512_sub_epi64(x,y)
+#define SIMDMul(x,y) _mm512_mul_ps(x,y)
+#define SIMDMuli32(x,y) _mm512_mul_epi32(x,y)
+#define SIMDAnd(x,y) _mm512_and_ps(x,y)
+#define SIMDAndi(x,y) _mm512_and_si512(x,y)
+#define SIMDAndNot(x,y) _mm512_andnot_ps(x,y)
+#define SIMDAndNoti(x,y) _mm512_andnot_si512(x,y)
+#define SIMDOr(x,y) _mm512_or_ps(x,y)
+#define SIMDOri(x,y) _mm512_or_si512(x,y)
+/*#define SIMDShiftLeft(x,n) \
+    (n) < 16 ? \
+    _mm512_alignr_epi8(x, _mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)), x, _MM_SHUFFLE(2,1,0,2)), (16-(n))) : \
+    ((n) < 32 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,1,0)), x, _MM_SHUFFLE(2,1,0,2)), _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), (32-(n))) : \
+    ((n) < 48 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _mm512_shuffle_i64x2(SIMDZero, _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _MM_SHUFFLE(2,0,0,0)), (48-(n))) : \
+    _mm512_bslli_epi128(_mm512_shuffle_i64x2(SIMDZero,  _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(1,0,0,0)), _MM_SHUFFLE(2,0,0,0)), ((n)-48))))
+#define SIMDShiftRight(x,n) \
+    (n) < 16 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2( _mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)), x, _MM_SHUFFLE(0,3,2,1)), x, (n)) : \
+    ((n) < 32 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), _mm512_shuffle_i64x2(_mm512_shuffle_i64x2(SIMDZero, x, _MM_SHUFFLE(3,2,0,0)), x, _MM_SHUFFLE(0,3,2,1)), ((n)-16)) : \
+    ((n) < 48 ? \
+    _mm512_alignr_epi8(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), SIMDZero, _MM_SHUFFLE(0,0,2,1)), _mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), ((n)-32)) : \
+    _mm512_bsrli_epi128(_mm512_shuffle_i64x2(_mm512_shuffle_i64x2(x, SIMDZero, _MM_SHUFFLE(0,0,3,2)), SIMDZero, _MM_SHUFFLE(0,0,2,1)), ((n)-48))))*/
+//#define SIMDShiftLeftOnei16(x,y) _mm512_slli_epi16(x,y)
+#define SIMDShiftLeftOnei32(x,y) _mm512_slli_epi32(x,y)
+#define SIMDShiftLeftOnei64(x,y) _mm512_slli_epi64(x,y)
+//#define SIMDShiftRightOnei16(x,y) _mm512_srli_epi16(x,y)
+#define SIMDShiftRightOnei32(x,y) _mm512_srli_epi32(x,y)
+#define SIMDShiftRightOnei64(x,y) _mm512_srli_epi64(x,y)
+#define SIMDEqualM(x,y) _mm512_cmpeq_ps_mask(x,y)
+//#define SIMDEquali8M(x,y) _mm512_cmpeq_epi8_mask(x,y)
+//#define SIMDEquali16M(x,y) _mm512_cmpeq_epi16_mask(x,y)
+#define SIMDEquali32M(x,y) _mm512_cmpeq_epi32_mask(x,y)
+#define SIMDEquali64M(x,y) _mm512_cmpeq_epi64_mask(x,y)
+#define SIMDNotEqualM(x,y) _mm512_cmpneq_ps_mask(x,y)
+//#define SIMDNotEquali8M(x,y) _mm512_cmpneq_epi8_mask(x,y)
+//#define SIMDNotEquali16M(x,y) _mm512_cmpneq_epi16_mask(x,y)
+#define SIMDNotEquali32M(x,y) _mm512_cmpneq_epi32_mask(x,y)
+#define SIMDNotEquali64M(x,y) _mm512_cmpneq_epi64_mask(x,y)
+//#define SIMDGreaterThani8M(x,y) _mm512_cmpgt_epi8_mask(x,y)
+//#define SIMDGreaterThani16M(x,y) _mm512_cmpgt_epi16_mask(x,y)
+#define SIMDGreaterThani32M(x,y) _mm512_cmpgt_epi32_mask(x,y)
+#define SIMDGreaterThani64M(x,y) _mm512_cmpgt_epi64_mask(x,y)
+//#define SIMDGreaterThanOrEquali8M(x,y) _mm512_cmpge_epi8_mask(x,y)
+//#define SIMDGreaterThanOrEquali16M(x,y) _mm512_cmpge_epi16_mask(x,y)
+#define SIMDGreaterThanOrEquali32M(x,y) _mm512_cmpge_epi32_mask(x,y)
+#define SIMDGreaterThanOrEquali64M(x,y) _mm512_cmpge_epi64_mask(x,y)
+#define SIMDLessThanM(x,y) _mm512_cmplt_ps_mask(x,y)
+//#define SIMDLessThani8M(x,y) _mm512_cmplt_epi8_mask(x,y)
+//#define SIMDLessThani16M(x,y) _mm512_cmplt_epi16_mask(x,y)
+#define SIMDLessThani32M(x,y) _mm512_cmplt_epi32_mask(x,y)
+#define SIMDLessThani64M(x,y) _mm512_cmplt_epi64_mask(x,y)
+#define SIMDLessThanOrEqualM(x,y) _mm512_cmple_ps_mask(x,y)
+//#define SIMDLessThanOrEquali8M(x,y) _mm512_cmple_epi8_mask(x,y)
+//#define SIMDLessThanOrEquali16M(x,y) _mm512_cmple_epi16_mask(x,y)
+#define SIMDLessThanOrEquali32M(x,y) _mm512_cmple_epi32_mask(x,y)
+#define SIMDLessThanOrEquali64M(x,y) _mm512_cmple_epi64_mask(x,y)
+#define SIMDMax(x,y) _mm512_max_ps(x,y)
+//#define SIMDMaxi8(x,y) _mm512_max_epi8(x,y)
+//#define SIMDMaxi16(x,y) _mm512_max_epi16(x,y)
+#define SIMDMaxi32(x,y) _mm512_max_epi32(x,y)
+#define SIMDMaxi64(x,y) _mm512_max_epi64(x,y)
+#define SIMDMin(x,y) _mm512_min_ps(x,y)
+//#define SIMDMini8(x,y) _mm512_min_epi8(x,y)
+//#define SIMDMini16(x,y) _mm512_min_epi16(x,y)
+#define SIMDMini32(x,y) _mm512_min_epi32(x,y)
+#define SIMDMini64(x,y) _mm512_min_epi64(x,y)
+
+#define SIMDBlend(x,y,z) _mm512_mask_blend_ps(z, x, y)
+//#define SIMDBlendi8(x,y,z) _mm512_mask_blend_epi8(z, x, y)
+//#define SIMDBlendi16(x,y,z) _mm512_mask_blend_epi16(z, x, y)
+#define SIMDBlendi32(x,y,z) _mm512_mask_blend_epi32(z, x, y)
+#define SIMDBlendi64(x,y,z) _mm512_mask_blend_epi64(z, x, y)
+
+// with AVX512F
+//#define Maski8 __mmask64
+//#define Maski16 __mmask32
+#define Maski32 __mmask16
+#define Maski64 __mmask8
+/* x = a == b ? c : d */ 
+//#define SIMDSetIfEquali8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDEquali8M(a,b)); } 
+//#define SIMDSetIfEquali16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDEquali16M(a,b)); } 
+#define SIMDSetIfEquali32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDEquali32M(a,b)); } 
+#define SIMDSetIfEquali64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDEquali64M(a,b)); } 
+/* x = a > b ? c : d */
+//#define SIMDSetIfGreateri8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDGreaterThani8M(a,b)); } 
+//#define SIMDSetIfGreateri16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16M(a,b)); } 
+#define SIMDSetIfGreateri32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32M(a,b)); } 
+#define SIMDSetIfGreateri64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64M(a,b)); } 
+/* x = a < b ? c : d  (evaluated as b > a with the mask compare of the matching lane width) */
+//#define SIMDSetIfLessi8(x,a,b,c,d)  { x = SIMDBlendi8(d, c, SIMDGreaterThani8M(b,a)); } 
+//#define SIMDSetIfLessi16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16M(b,a)); } 
+#define SIMDSetIfLessi32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32M(b,a)); } 
+#define SIMDSetIfLessi64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64M(b,a)); } 
+
+/* x = a > b ? c : d, y = a > b ? a : b */
+//#define SIMDGetIfGreateri8(x,y,a,b,c,d)  { Maski8  cmp = SIMDGreaterThani8M(a,b);  x = SIMDBlendi8(d, c, cmp);  y = SIMDBlendi8(b, a, cmp); } 
+//#define SIMDGetIfGreateri16(x,y,a,b,c,d) { Maski16 cmp = SIMDGreaterThani16M(a,b); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfGreateri32(x,y,a,b,c,d) { Maski32 cmp = SIMDGreaterThani32M(a,b); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfGreateri64(x,y,a,b,c,d) { Maski64 cmp = SIMDGreaterThani64M(a,b); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+/* x = a < b ? c : d, y = a < b ? a : b */
+//#define SIMDGetIfLessi8(x,y,a,b,c,d)  { Maski8  cmp = SIMDGreaterThani8M(b,a);  x = SIMDBlendi8(d, c, cmp);  y = SIMDBlendi8(b, a, cmp); } 
+//#define SIMDGetIfLessi16(x,y,a,b,c,d) { Maski16 cmp = SIMDGreaterThani16M(b,a); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfLessi32(x,y,a,b,c,d) { Maski32 cmp = SIMDGreaterThani32M(b,a); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfLessi64(x,y,a,b,c,d) { Maski64 cmp = SIMDGreaterThani64M(b,a); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+
+// end of AVX512F
+#else  // AVX2 SSE4.1 SSE2
+#ifdef __AVX2__
+
+// start of AVX2
+// m256 will be our base type
+typedef __m256 SIMDf;  //for floats
+typedef __m256i SIMDi; //for integers
+
+//intrinsic functions
+#define SIMDStore(x,y) _mm256_store_ps(x,y)
+#define SIMDLoad(x) _mm256_load_ps(x)
+#define SIMDStorei(x,y) _mm256_store_si256(x,y)
+#define SIMDLoadi(x) _mm256_load_si256(x)
+#define SIMDSet(x,y,z,w,a,b,c,d) _mm256_set_ps(x,y,z,w,a,b,c,d)
+#define SIMDSeti8(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32) _mm256_set_epi8(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32)
+#define SIMDSeti16(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16) _mm256_set_epi16(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16)
+#define SIMDSeti32(x1,x2,x3,x4,x5,x6,x7,x8) _mm256_set_epi32(x1,x2,x3,x4,x5,x6,x7,x8)
+#define SIMDSeti64(x1,x2,x3,x4) _mm256_set_epi64x(x1,x2,x3,x4)
+#define SIMDSeti128(x,y) _mm256_set_m128i(x,y) // x = high __m128i half, y = low half (integer variant, result is a SIMDi)
+#define SIMDSetZero() _mm256_setzero_ps()
+#define SIMDSetZeroi() _mm256_setzero_si256()
+#define SIMDSetOne(x) _mm256_set1_ps(x)
+#define SIMDSetOnei8(x) _mm256_set1_epi8(x)
+#define SIMDSetOnei16(x) _mm256_set1_epi16(x)
+#define SIMDSetOnei32(x) _mm256_set1_epi32(x)
+#define SIMDSetOnei64(x) _mm256_set1_epi64x(x)
+#define SIMDAdd(x,y) _mm256_add_ps(x,y)
+#define SIMDAddi8(x,y) _mm256_add_epi8(x,y)
+#define SIMDAddi16(x,y) _mm256_add_epi16(x,y)
+#define SIMDAddi32(x,y) _mm256_add_epi32(x,y)
+#define SIMDAddi64(x,y) _mm256_add_epi64(x,y)
+#define SIMDSub(x,y) _mm256_sub_ps(x,y)
+#define SIMDSubi8(x,y) _mm256_sub_epi8(x,y)
+#define SIMDSubi16(x,y) _mm256_sub_epi16(x,y)
+#define SIMDSubi32(x,y) _mm256_sub_epi32(x,y)
+#define SIMDSubi64(x,y) _mm256_sub_epi64(x,y)
+#define SIMDMul(x,y) _mm256_mul_ps(x,y)
+#define SIMDMuli(x,y) _mm256_mul_epi32(x,y)
+#define SIMDAnd(x,y) _mm256_and_ps(x,y)
+#define SIMDAndi(x,y) _mm256_and_si256(x,y)
+#define SIMDAndNot(x,y) _mm256_andnot_ps(x,y)
+#define SIMDAndNoti(x,y) _mm256_andnot_si256(x,y)
+#define SIMDOr(x,y) _mm256_or_ps(x,y)
+#define SIMDOri(x,y) _mm256_or_si256(x,y)
+#define SIMDShiftLeft(a, n) ((n) < 16 ? /* byte shift of the whole 256-bit value towards the high end; n must be a constant */ \
+    _mm256_alignr_epi8(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0)), (16-(n))) : \
+    _mm256_slli_si256(_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0)), ((n)-16)))
+
+#define SIMDShiftRight(a, n) ((n) < 16 ? /* byte shift of the whole 256-bit value towards the low end; n must be a constant */ \
+    _mm256_alignr_epi8(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), (n)) : \
+    _mm256_srli_si256(_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), ((n)-16)))
+#define SIMDShiftLeftOnei16(x,y) _mm256_slli_epi16(x,y)
+#define SIMDShiftLeftOnei32(x,y) _mm256_slli_epi32(x,y)
+#define SIMDShiftLeftOnei64(x,y) _mm256_slli_epi64(x,y)
+#define SIMDShiftRightOnei16(x,y) _mm256_srli_epi16(x,y)
+#define SIMDShiftRightOnei32(x,y) _mm256_srli_epi32(x,y)
+#define SIMDShiftRightOnei64(x,y) _mm256_srli_epi64(x,y)
+#define SIMDEqual(x,y)  _mm256_cmp_ps(x,y,_CMP_EQ_OQ) 
+#define SIMDEquali16(x,y) _mm256_cmpeq_epi16(x,y)
+#define SIMDEquali8(x,y) _mm256_cmpeq_epi8(x,y)
+#define SIMDEquali32(x,y) _mm256_cmpeq_epi32(x,y)
+#define SIMDEquali64(x,y) _mm256_cmpeq_epi64(x,y)
+#define SIMDGreaterThan(x,y) _mm256_cmp_ps(x,y,_CMP_GT_OQ)
+#define SIMDGreaterThani16(x,y) _mm256_cmpgt_epi16(x,y)
+#define SIMDGreaterThani8(x,y) _mm256_cmpgt_epi8(x,y)
+#define SIMDGreaterThani32(x,y) _mm256_cmpgt_epi32(x,y)
+#define SIMDGreaterThani64(x,y) _mm256_cmpgt_epi64(x,y) 
+#define SIMDFloor(x) _mm256_floor_ps(x)
+#define SIMDMax(x,y) _mm256_max_ps(x,y)
+#define SIMDMaxi8(x,y) _mm256_max_epi8(x,y)
+#define SIMDMaxi16(x,y) _mm256_max_epi16(x,y)
+#define SIMDMaxi32(x,y) _mm256_max_epi32(x,y)
+#define SIMDMaxi64(x,y) _mm256_max_epi64(x,y)
+#define SIMDMin(x,y) _mm256_min_ps(x,y)
+#define SIMDMini8(x,y) _mm256_min_epi8(x,y)
+#define SIMDMini16(x,y) _mm256_min_epi16(x,y)
+#define SIMDMini32(x,y) _mm256_min_epi32(x,y)
+
+#define SIMDBlendV(x,y,z) _mm256_blendv_ps(x,y,z)
+#define SIMDBlendVi8(x,y,z) _mm256_blendv_epi8(x,y,z)
+
+// end of AVX2 only
+ 
+#else // SSE4.1 SSE2
+
+// start of SSE4.1 and SSE2
+// m128 will be our base type
+typedef __m128 SIMDf;   //for floats
+typedef __m128i SIMDi; //for integers
+
+#define SIMDStore(x,y) _mm_store_ps(x,y)
+#define SIMDLoad(x) _mm_load_ps(x)
+#define SIMDStorei(x,y) _mm_store_si128(x,y)
+#define SIMDLoadi(x) _mm_load_si128(x)
+#define SIMDSeti8(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16) _mm_set_epi8(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16)
+#define SIMDSeti16(x1,x2,x3,x4,x5,x6,x7,x8) _mm_set_epi16(x1,x2,x3,x4,x5,x6,x7,x8)
+#define SIMDSeti32(x1,x2,x3,x4) _mm_set_epi32(x1,x2,x3,x4)
+#define SIMDSeti64(x,y) _mm_set_epi64x(x,y) // integer arguments, matching the AVX2 definition (_mm_set_epi64 would expect __m64)
+#define SIMDSetOne(x) _mm_set1_ps(x)
+#define SIMDSetZero() _mm_setzero_ps()
+#define SIMDSetOnei8(x) _mm_set1_epi8(x)
+#define SIMDSetOnei16(x) _mm_set1_epi16(x)
+#define SIMDSetOnei32(x) _mm_set1_epi32(x)
+#define SIMDSetOnei64(x) _mm_set1_epi64x(x) // broadcast a 64-bit integer (_mm_set1_epi64 would expect an __m64)
+#define SIMDSetZeroi() _mm_setzero_si128()
+#define SIMDAdd(x,y) _mm_add_ps(x,y)
+#define SIMDAddi8(x,y) _mm_add_epi8(x,y)
+#define SIMDAddi16(x,y) _mm_add_epi16(x,y)
+#define SIMDAddi32(x,y) _mm_add_epi32(x,y)
+#define SIMDAddi64(x,y) _mm_add_epi64(x,y)
+#define SIMDSub(x,y) _mm_sub_ps(x,y)
+#define SIMDSubi8(x,y) _mm_sub_epi8(x,y)
+#define SIMDSubi16(x,y) _mm_sub_epi16(x,y)
+#define SIMDSubi32(x,y) _mm_sub_epi32(x,y)
+#define SIMDSubi64(x,y) _mm_sub_epi64(x,y)
+#define SIMDMul(x,y) _mm_mul_ps(x,y)
+#define SIMDMuli(x,y) _mm_mul_epi32(x,y)
+#define SIMDAnd(x,y) _mm_and_ps(x,y)
+#define SIMDAndi(x,y) _mm_and_si128(x,y)
+#define SIMDAndNot(x,y) _mm_andnot_ps(x,y)
+#define SIMDAndNoti(x,y) _mm_andnot_si128(x,y)
+#define SIMDOr(x,y) _mm_or_ps(x,y)
+#define SIMDOri(x,y) _mm_or_si128(x,y)
+#define SIMDShiftLeft(x,y) _mm_slli_si128(x,y) // shift whole x left by y bytes (not bits), zero-filling
+#define SIMDShiftRight(x,y) _mm_srli_si128(x,y) // shift whole x right by y bytes (not bits), zero-filling
+#define SIMDShiftLeftOnei16(x,y) _mm_slli_epi16(x,y)
+#define SIMDShiftLeftOnei32(x,y) _mm_slli_epi32(x,y)
+#define SIMDShiftLeftOnei64(x,y) _mm_slli_epi64(x,y)
+#define SIMDShiftRightOnei16(x,y) _mm_srli_epi16(x,y)
+#define SIMDShiftRightOnei32(x,y) _mm_srli_epi32(x,y)
+#define SIMDShiftRightOnei64(x,y) _mm_srli_epi64(x,y)
+#define SIMDEqual(x,y)  _mm_cmpeq_ps(x,y)
+#define SIMDEquali8(x,y) _mm_cmpeq_epi8(x,y)
+#define SIMDEquali16(x,y) _mm_cmpeq_epi16(x,y)
+#define SIMDEquali32(x,y) _mm_cmpeq_epi32(x,y)
+#define SIMDGreaterThan(x,y) _mm_cmpgt_ps(x,y)
+#define SIMDGreaterThani8(x,y) _mm_cmpgt_epi8(x,y)
+#define SIMDGreaterThani16(x,y) _mm_cmpgt_epi16(x,y)
+#define SIMDGreaterThani32(x,y) _mm_cmpgt_epi32(x,y)
+#define SIMDLessThan(x,y) _mm_cmplt_ps(x,y)
+#define SIMDLessThani8(x,y) _mm_cmplt_epi8(x,y) 
+#define SIMDLessThani16(x,y) _mm_cmplt_epi16(x,y) 
+#define SIMDLessThani32(x,y) _mm_cmplt_epi32(x,y) 
+#define SIMDMax(x,y) _mm_max_ps(x,y)
+#define SIMDMaxi16(x,y) _mm_max_epi16(x,y)
+#define SIMDMin(x,y) _mm_min_ps(x,y)
+#define SIMDMini16(x,y) _mm_min_epi16(x,y)
+
+#define Maski16 __mmask8
+#define Maski32 __mmask8
+
+#ifdef __SSE4_1__
+
+// start of SSE4.1 only
+#define SIMDBlendV(x,y,z) _mm_blendv_ps(x,y,z)	    // z is __mask
+#define SIMDBlendVi8(x,y,z) _mm_blendv_epi8(x,y,z)	
+#define SIMDEquali64(x,y) _mm_cmpeq_epi64(x,y)
+#define SIMDFloor(x) _mm_floor_ps(x)
+#define SIMDMaxi8(x,y) _mm_max_epi8(x,y)
+#define SIMDMini8(x,y) _mm_min_epi8(x,y)
+#define SIMDMaxi32(x,y) _mm_max_epi32(x,y)
+#define SIMDMini32(x,y) _mm_min_epi32(x,y)
+// end of SSE4.1 only
+
+#else  // SSE2
+
+// start of SSE2 only
+#define SIMDBlendV(x,y,z) SIMDOr(SIMDAndNot(z,x), SIMDAnd(z,y))   //if we don't have sse4
+#define SIMDBlendVi8(x,y,z) SIMDOri(SIMDAndNoti(z,x), SIMDAndi(z,y))    //if we don't have sse4
+#define SIMDMaxi8(x,y) SIMDBlendVi8(y, x, SIMDGreaterThani8(x,y))
+#define SIMDMini8(x,y) SIMDBlendVi8(x, y, SIMDGreaterThani8(x,y))
+#define SIMDMaxi32(x,y) SIMDBlendi32(y, x, SIMDGreaterThani32(x,y))  // x > y ? x : y, per 32-bit lane
+#define SIMDMini32(x,y) SIMDBlendi32(x, y, SIMDGreaterThani32(x,y))  // x > y ? y : x, per 32-bit lane
+// end of SSE2 only
+// end of SSE4.1 and SSE2
+
+#endif // SSE4.1
+
+#endif // AVX2
+
+// start of no AVX512F (AVX2/SSE4.1/SSE2)
+#define SIMDBlendi16(x,y,z) SIMDOri(SIMDAndNoti(z,x), SIMDAndi(z,y))
+#define SIMDBlendi32(x,y,z) SIMDOri(SIMDAndNoti(z,x), SIMDAndi(z,y))
+#define SIMDBlendi64(x,y,z) SIMDOri(SIMDAndNoti(z,x), SIMDAndi(z,y))
+
+/* x = a == b ? c : d */ 
+#define SIMDSetIfEquali8(x,a,b,c,d)  { x = SIMDBlendVi8(d, c, SIMDEquali8(a,b)); } 
+#define SIMDSetIfEquali16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDEquali16(a,b)); } 
+#define SIMDSetIfEquali32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDEquali32(a,b)); } 
+#define SIMDSetIfEquali64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDEquali64(a,b)); } 
+/* x = a > b ? c : d */
+#define SIMDSetIfGreateri8(x,a,b,c,d)  { x = SIMDBlendVi8(d, c, SIMDGreaterThani8(a,b)); } 
+#define SIMDSetIfGreateri16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16(a,b)); } 
+#define SIMDSetIfGreateri32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32(a,b)); } 
+#define SIMDSetIfGreateri64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64(a,b)); } 
+/* x = a < b ? c : d  (evaluated as b > a; uses the same SIMDBlend* helpers as the macros above) */
+#define SIMDSetIfLessi8(x,a,b,c,d)  { x = SIMDBlendVi8(d, c, SIMDGreaterThani8(b,a)); } 
+#define SIMDSetIfLessi16(x,a,b,c,d) { x = SIMDBlendi16(d, c, SIMDGreaterThani16(b,a)); } 
+#define SIMDSetIfLessi32(x,a,b,c,d) { x = SIMDBlendi32(d, c, SIMDGreaterThani32(b,a)); } 
+#define SIMDSetIfLessi64(x,a,b,c,d) { x = SIMDBlendi64(d, c, SIMDGreaterThani64(b,a)); } 
+
+/* x = a > b ? c : d, y = a > b ? a : b */
+#define SIMDGetIfGreateri8(x,y,a,b,c,d)  { SIMDi cmp = SIMDGreaterThani8(a,b);  x = SIMDBlendVi8(d, c, cmp); y = SIMDBlendVi8(b, a, cmp); } 
+#define SIMDGetIfGreateri16(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani16(a,b); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfGreateri32(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani32(a,b); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfGreateri64(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani64(a,b); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+/* x = a < b ? c : d, y = a < b ? a : b */
+#define SIMDGetIfLessi8(x,y,a,b,c,d)  { SIMDi cmp = SIMDGreaterThani8(b,a);  x = SIMDBlendVi8(d, c, cmp); y = SIMDBlendVi8(b, a, cmp); } 
+#define SIMDGetIfLessi16(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani16(b,a); x = SIMDBlendi16(d, c, cmp); y = SIMDBlendi16(b, a, cmp); } 
+#define SIMDGetIfLessi32(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani32(b,a); x = SIMDBlendi32(d, c, cmp); y = SIMDBlendi32(b, a, cmp); } 
+#define SIMDGetIfLessi64(x,y,a,b,c,d) { SIMDi cmp = SIMDGreaterThani64(b,a); x = SIMDBlendi64(d, c, cmp); y = SIMDBlendi64(b, a, cmp); } 
+// end of no AVX512F (AVX2/SSE4.1/SSE2)
+
+#endif // AVX512F
+#endif // AVX512BW
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// int simd_check(void);
+
+/*
+static void *SIMDMalloc(size_t size, size_t align) {
+    void *ret = (void*)_mm_malloc(size, align);
+    if (ret == NULL) {
+        fprintf(stderr, "[%s] mm_Malloc fail!\nSize: %ld\n", __func__, size);
+        exit(1);
+    }
+    else return ret;
+}*/
+
+// Allocate `size` bytes aligned to `align` with posix_memalign; on failure print the reason to stderr and exit(1).
+static void *SIMDMalloc(size_t size, size_t align) {
+    void *ret = NULL;
+    int res = posix_memalign(&ret, align, size);
+    if (res != 0) {
+        // posix_memalign reports failure through its return value, not through errno
+        const char *error;
+        if (res == EINVAL) error = "EINVAL";
+        else if (res == ENOMEM) error = "ENOMEM";
+        else error = "Unknown";
+        // size is a size_t, so it needs %zu rather than %ld
+        fprintf(stderr, "[%s] posix_memalign fail!\nSize: %zu, Error: %s\n", __func__, size, error);
+        exit(1);
+    }
+    return ret;
+}
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif // SIMD_INSTRUCTION_H
diff --git a/src/utils.c b/src/utils.c
new file mode 100644
index 0000000..a5d7f42
--- /dev/null
+++ b/src/utils.c
@@ -0,0 +1,407 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+#define FSYNC_ON_FLUSH
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <errno.h>
+#ifdef FSYNC_ON_FLUSH
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <time.h>
+#include "utils.h"
+
+#include "ksort.h"
+#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
+KSORT_INIT(128, pair64_t, pair64_lt)
+KSORT_INIT(64,  uint64_t, ks_lt_generic)
+
+#include "kseq.h"
+KSEQ_INIT2(, gzFile, err_gzread)
+
+/********************
+ * System utilities *
+ ********************/
+
+FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
+{
+	/* Open `fn` with fopen(); a failure is fatal and reported under `func`. */
+	FILE *stream;
+	/* "-" selects a standard stream: stdin if mode contains "r", else stdout */
+	if (strcmp(fn, "-") == 0) return (strstr(mode, "r") != NULL) ? stdin : stdout;
+	stream = fopen(fn, mode);
+	if (stream == NULL) err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
+	return stream;
+}
+
+FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
+{
+	/* Re-bind `fp` to `fn` via freopen(); a failure is fatal and reported under `func`. */
+	FILE *reopened = freopen(fn, mode, fp);
+	if (reopened == NULL) err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
+	return fp;
+}
+
+gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
+{
+	gzFile gz;
+	if (strcmp(fn, "-") != 0) {
+		gz = gzopen(fn, mode);
+		if (gz == 0)
+			err_fatal(func, "fail to open file '%s' : %s", fn, errno ? strerror(errno) : "Out of memory");
+		return gz;
+	}
+	/* "-": wrap stdin (mode contains "r") or stdout; per zlib.h gzdopen only fails when out of memory */
+	gz = gzdopen(fileno((strstr(mode, "r") != NULL) ? stdin : stdout), mode);
+	if (!gz) err_fatal(func, "Out of memory");
+	return gz;
+}
+
+void err_fatal(const char *header, const char *fmt, ...)
+{
+	va_list ap; /* writes "[header] <formatted message>\n" to stderr, then exits with failure */
+	fprintf(stderr, "[%s] ", header);
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	fputs("\n", stderr);
+	exit(EXIT_FAILURE);
+}
+
+void err_fatal_core(const char *header, const char *fmt, ...)
+{
+	va_list ap; /* writes "[header] <formatted message> Abort!\n" to stderr, then calls abort() */
+	fprintf(stderr, "[%s] ", header);
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	fputs(" Abort!\n", stderr);
+	abort();
+}
+
+void _err_fatal_simple(const char *func, const char *msg)
+{
+	fprintf(stderr, "[%s] %s\n", func, msg); /* report as "[func] msg" on stderr */
+	exit(EXIT_FAILURE); /* terminate with a failure status; never returns */
+}
+
+void _err_fatal_simple_core(const char *func, const char *msg)
+{
+	fprintf(stderr, "[%s] %s Abort!\n", func, msg); /* report as "[func] msg Abort!" on stderr */
+	abort(); /* terminate via abort() instead of exit(); never returns */
+}
+
+size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+	/* fwrite() wrapper: writing fewer than nmemb items is fatal */
+	const size_t written = fwrite(ptr, size, nmemb, stream);
+	if (written != nmemb) _err_fatal_simple("fwrite", strerror(errno));
+	return written;
+}
+
+size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+	/* fread() wrapper: a short read is fatal (errno text on stream error, else unexpected EOF) */
+	const size_t got = fread(ptr, size, nmemb, stream);
+	if (got == nmemb) return got;
+	const char *why = ferror(stream) ? strerror(errno) : "Unexpected end of file";
+	_err_fatal_simple("fread", why);
+	return got;
+}
+
+int err_gzread(gzFile file, void *ptr, unsigned int len)
+{
+	/* gzread() wrapper: a negative return is fatal */
+	const int n_read = gzread(file, ptr, len);
+	if (n_read >= 0) return n_read;
+
+	/* When gzerror() reports Z_ERRNO the reason is taken from errno;
+	   otherwise the message returned by gzerror() itself is used. */
+	int zerr = 0;
+	const char *zmsg = gzerror(file, &zerr);
+	_err_fatal_simple("gzread", zerr == Z_ERRNO ? strerror(errno) : zmsg);
+	return n_read;
+}
+
+int err_fseek(FILE *stream, long offset, int whence)
+{
+	/* fseek() wrapper: a nonzero status is fatal, so a normal return
+	   always yields 0 */
+	const int status = fseek(stream, offset, whence);
+
+	if (status != 0) _err_fatal_simple("fseek", strerror(errno));
+	return status;
+}
+
+long err_ftell(FILE *stream)
+{
+	/* ftell() wrapper: the -1 failure value is fatal, so a normal return
+	   never yields -1 */
+	const long offset = ftell(stream);
+
+	if (offset == -1) _err_fatal_simple("ftell", strerror(errno));
+	return offset;
+}
+
+int err_func_printf(const char *func, const char *format, ...)
+{
+	/* Print "[func] <formatted message>\n" to stderr; returns vfprintf()'s result, negative is fatal. */
+	va_list arg; int done, saveErrno;
+	fprintf(stderr, "[%s] ", func);
+	va_start(arg, format);
+	done = vfprintf(stderr, format, arg);
+	saveErrno = errno; /* capture before the next stdio call can overwrite it */
+	va_end(arg);
+	fprintf(stderr, "\n");
+	if (done < 0) _err_fatal_simple("vfprintf(stderr)", strerror(saveErrno));
+	return done;
+}
+
+int err_printf(const char *format, ...)
+{
+	/* printf-style output to stderr; a negative vfprintf() result is fatal */
+	va_list ap;
+	va_start(ap, format);
+	const int n_out = vfprintf(stderr, format, ap);
+	const int err_at_write = errno; /* snapshot taken right after the write */
+	va_end(ap);
+	if (n_out < 0) _err_fatal_simple("vfprintf(stderr)", strerror(err_at_write));
+	return n_out;
+}
+
+int stdout_printf(const char *format, ...)
+{
+	/* printf-style output to stdout; a negative vfprintf() result is fatal */
+	va_list ap;
+	va_start(ap, format);
+	const int n_out = vfprintf(stdout, format, ap);
+	const int err_at_write = errno; /* snapshot taken right after the write */
+	va_end(ap);
+	if (n_out < 0) _err_fatal_simple("vfprintf(stdout)", strerror(err_at_write));
+	return n_out;
+}
+
+int err_fprintf(FILE *stream, const char *format, ...)
+{
+	/* fprintf-style output to `stream`; a negative vfprintf() result is fatal */
+	va_list ap;
+	va_start(ap, format);
+	const int n_out = vfprintf(stream, format, ap);
+	const int err_at_write = errno; /* snapshot taken right after the write */
+	va_end(ap);
+	if (n_out < 0) _err_fatal_simple("vfprintf", strerror(err_at_write));
+	return n_out;
+}
+
+int err_fputc(int c, FILE *stream)
+{
+	/* putc() wrapper: an EOF result is fatal, otherwise putc()'s return
+	   value is passed back */
+	const int written = putc(c, stream);
+
+	if (written == EOF) _err_fatal_simple("fputc", strerror(errno));
+
+	return written;
+}
+
+int err_fputs(const char *s, FILE *stream)
+{
+	/* fputs() wrapper: an EOF result is fatal, otherwise fputs()'s return
+	   value is passed back */
+	const int status = fputs(s, stream);
+
+	if (status == EOF) _err_fatal_simple("fputs", strerror(errno));
+
+	return status;
+}
+
+void err_fgets(char *buff, size_t s, FILE *fp)
+{
+    /* fgets() wrapper: a NULL result (read error, or EOF before any data) is fatal */
+    if (fgets(buff, s, fp) != NULL) return;
+    err_fatal_simple("fgets error.\n");
+}
+
+int err_puts(const char *s)
+{
+	/* puts() wrapper: an EOF result is fatal, otherwise puts()'s return
+	   value is passed back */
+	const int status = puts(s);
+
+	if (status == EOF) _err_fatal_simple("puts", strerror(errno));
+
+	return status;
+}
+
+int err_fflush(FILE *stream)
+{
+	/* Flush the stdio buffer of `stream`; every failure on the way is fatal. */
+	const int status = fflush(stream);
+	if (status != 0) _err_fatal_simple("fflush", strerror(errno));
+
+#ifdef FSYNC_ON_FLUSH
+	/* fflush() only guarantees that the data reached the kernel buffers.
+	   On remote filesystems (e.g. NFS, lustre) an error can still occur
+	   while the kernel copies the buffered data to the file server.  To
+	   catch those errors as well, fsync() is called on the underlying
+	   descriptor, but only when it refers to a regular file. */
+	{
+		const int fd = fileno(stream);
+		struct stat info;
+
+		if (fstat(fd, &info) != 0)
+			_err_fatal_simple("fstat", strerror(errno));
+
+		if (S_ISREG(info.st_mode) && fsync(fd) != 0)
+			_err_fatal_simple("fsync", strerror(errno));
+
+	}
+#endif
+	return status;
+}
+
+int err_fclose(FILE *stream)
+{
+	const int status = fclose(stream); /* fclose() wrapper: a nonzero status is fatal */
+	if (status != 0) _err_fatal_simple("fclose", strerror(errno));
+	return status;
+}
+
+int err_gzclose(gzFile file)
+{
+	/* gzclose() wrapper: anything but Z_OK is fatal.  For Z_ERRNO the
+	   reason is taken from errno; other codes are described by zError(). */
+	const int status = gzclose(file);
+	if (status == Z_OK) return status;
+	const char *why = (status == Z_ERRNO) ? strerror(errno) : zError(status);
+	_err_fatal_simple("gzclose", why);
+	return status;
+}
+
+/*********
+ * alloc *
+ *********/
+void *err_malloc(const char *func, size_t s)
+{
+    void *ret = malloc(s); /* malloc() wrapper: a NULL result is fatal (reported under `func`, then abort) */
+    if (ret == NULL) err_fatal_core(func, "Malloc fail!\nSize: %zu\n", s); /* %zu matches size_t */
+    return ret;
+}
+
+void *err_calloc(const char *func, size_t n, size_t s)
+{
+    void *ret = calloc(n, s); /* calloc() wrapper: a NULL result is fatal (reported under `func`, then abort) */
+    if (ret == NULL) err_fatal_core(func, "Calloc fail!\nN: %zu\tSize: %zu\n", n, s); /* both arguments are size_t */
+    return ret;
+}
+
+void *err_realloc(const char *func, void *p, size_t s)
+{
+    void *ret = realloc(p, s); /* realloc() wrapper: a NULL result is fatal (reported under `func`, then abort) */
+    if (ret == NULL) err_fatal_core(func, "Realloc fail!\nSize: %zu\n", s); /* %zu matches size_t */
+    return ret;
+}
+
+/*********
+ * Timer *
+ *********/
+void usr_sys_cputime(double *usr_t, double *sys_t)
+{
+	struct rusage usage; /* stores this process's user and system CPU time, in seconds */
+	getrusage(RUSAGE_SELF, &usage);
+	*usr_t = usage.ru_utime.tv_sec + 1e-6 * usage.ru_utime.tv_usec;
+	*sys_t = usage.ru_stime.tv_sec + 1e-6 * usage.ru_stime.tv_usec;
+}
+
+double cputime()
+{
+	struct rusage usage; /* returns user + system CPU time of this process, in seconds */
+	getrusage(RUSAGE_SELF, &usage);
+	return usage.ru_utime.tv_sec + usage.ru_stime.tv_sec + 1e-6 * (usage.ru_utime.tv_usec + usage.ru_stime.tv_usec);
+}
+
+double realtime()
+{
+	/* Wall-clock time from gettimeofday(), in seconds with microsecond resolution. */
+	struct timeval tp;
+	gettimeofday(&tp, NULL); /* timezone argument is obsolete; POSIX leaves a non-NULL value unspecified */
+	return tp.tv_sec + tp.tv_usec * 1e-6;
+}
+
+long peakrss(void)
+{
+	struct rusage usage;
+	long scale = 1; /* multiplier applied to ru_maxrss */
+#ifdef __linux__
+	scale = 1024; /* on Linux ru_maxrss is in kilobytes */
+#endif
+	getrusage(RUSAGE_SELF, &usage);
+	return usage.ru_maxrss * scale;
+}
+
+void get_cur_time(const char *prefix)
+{
+    /* Print "[prefix] <timestamp>" to stderr (the timestamp used to be formatted and then discarded). */
+    time_t now = time(0); char buf[1024];
+    struct tm *ts = localtime(&now);
+    if (ts == NULL || strftime(buf, sizeof(buf), "%Y-%m-%d-%s", ts) == 0) buf[0] = '\0'; /* no timestamp available */
+    err_printf("[%s] %s", prefix, buf);
+}
+
+void print_format_time(FILE *out)
+{
+    /* Write "== <stamp> == " to `out`, where <stamp> is the current local
+       time formatted as "%m-%d-%Y %X". */
+    char stamp[80];
+    time_t now;
+    struct tm *local;
+    time(&now);
+    local = localtime(&now);
+    strftime(stamp, 80, "%m-%d-%Y %X", local);
+    fprintf(out, "== %s == ", stamp);
+}
+
+int err_func_format_printf(const char *func, const char *format, ...)
+{
+	/* Print "== <time> == [func] <formatted message>\n" to stderr; returns vfprintf()'s result, negative is fatal. */
+	va_list arg; int done, saveErrno;
+	print_format_time(stderr);
+	fprintf(stderr, "[%s] ", func);
+	va_start(arg, format);
+	done = vfprintf(stderr, format, arg);
+	saveErrno = errno; /* capture before the next stdio call can overwrite it */
+	va_end(arg);
+	fprintf(stderr, "\n");
+	if (done < 0) _err_fatal_simple("vfprintf(stderr)", strerror(saveErrno));
+	return done;
+}
diff --git a/src/utils.h b/src/utils.h
new file mode 100644
index 0000000..69d3698
--- /dev/null
+++ b/src/utils.h
@@ -0,0 +1,277 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <zlib.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kroundup64
+#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
+#endif
+ 
+#ifdef __GNUC__
+// Tell GCC to validate printf format string and args
+#define ATTRIBUTE(list) __attribute__ (list)
+#else
+#define ATTRIBUTE(list)
+#endif
+
+#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg)
+#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg)
+
+#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
+#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp)
+#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode)
+
+#define xassert(cond, msg) ((cond) ? (void)0 : _err_fatal_simple_core(__func__, msg)) // expression form: safe inside if/else without braces
+
+#define _err_simple_func_printf(msg) err_func_printf(__func__, "%s", msg) // msg is printed verbatim, never interpreted as a format string
+
+typedef struct {
+	uint64_t x, y;
+} pair64_t;
+
+typedef struct { size_t n, m; uint64_t *a; } uint64_v;
+typedef struct { size_t n, m; pair64_t *a; } pair64_v;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	void err_fatal(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn));
+	void err_fatal_core(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn));
+	void _err_fatal_simple(const char *func, const char *msg) ATTRIBUTE((noreturn));
+	void _err_fatal_simple_core(const char *func, const char *msg) ATTRIBUTE((noreturn));
+	FILE *err_xopen_core(const char *func, const char *fn, const char *mode);
+	FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp);
+	gzFile err_xzopen_core(const char *func, const char *fn, const char *mode);
+    size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
+	size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream);
+
+	int err_gzread(gzFile file, void *ptr, unsigned int len);
+	int err_fseek(FILE *stream, long offset, int whence);
+#define err_rewind(FP) err_fseek((FP), 0, SEEK_SET)
+	long err_ftell(FILE *stream);
+	int err_fprintf(FILE *stream, const char *format, ...)
+        ATTRIBUTE((format(printf, 2, 3)));
+	int err_printf(const char *format, ...)
+        ATTRIBUTE((format(printf, 1, 2)));
+	int err_func_printf(const char *func, const char *format, ...)
+        ATTRIBUTE((format(printf, 2, 3)));
+	int stdout_printf(const char *format, ...)
+        ATTRIBUTE((format(printf, 1, 2)));
+	int err_fputc(int c, FILE *stream);
+#define err_putchar(C) err_fputc((C), stdout)
+	int err_fputs(const char *s, FILE *stream);
+	int err_puts(const char *s);
+    void err_fgets(char *buff, size_t s, FILE *fp);
+	int err_fflush(FILE *stream);
+	int err_fclose(FILE *stream);
+	int err_gzclose(gzFile file);
+
+#define _err_malloc(s) err_malloc(__func__, s)
+#define _err_calloc(n, s) err_calloc(__func__, n, s)
+#define _err_realloc(p, s) err_realloc(__func__, p, s)
+    void *err_malloc(const char* func, size_t s);
+    void *err_calloc(const char* func, size_t n, size_t s);
+    void *err_realloc(const char* func, void *p, size_t s);
+
+    void usr_sys_cputime(double *usr_t, double *sys_t);
+	double cputime();
+	double realtime();
+    long peakrss(void);
+    void print_format_time(FILE *out);
+    int err_func_format_printf(const char *func, const char *format, ...) ATTRIBUTE((format(printf, 2, 3))); // format-checked like err_func_printf
+
+	void ks_introsort_64 (size_t n, uint64_t *a);
+	void ks_introsort_128(size_t n, pair64_t *a);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#define _uni_realloc(p, n, m, type) { /* size array p (capacity m, element `type`) for n items */ \
+    if (m <= 0) { /* no capacity yet: fresh malloc of max(n, 1) elements */ \
+        m = 1;                                          \
+        m = MAX_OF_TWO(n, m);                           \
+        p = (type*)_err_malloc((m) * sizeof(type));     \
+    } else if (n >= m) { /* too small: capacity becomes n+1 rounded up by kroundup32 */ \
+        m = n + 1; kroundup32(m);                       \
+        p = (type*)_err_realloc(p, (m) * sizeof(type)); \
+    }                                                   \
+}
+
+#define _realloc(p, m, type) {(m) <<= 1; p = (type*)_err_realloc(p, (m) * sizeof(type));} // doubles capacity m, then reallocates p to m elements (m == 0 stays 0)
+
+#define _sim_insert_abpoa_utils(v, p, n, m, type) { /* append v to p[0..n), doubling capacity m when full */ \
+    if (n == m) {               \
+        _realloc(p, m, type)    \
+    }                           \
+    p[n++] = v;                 \
+}
+
+#define _insert_abpoa_utils(v, p, n, m, type) { /* append v to p[0..n) unless a linear scan finds it already present */ \
+    int _i, _flag=0;                  \
+    for (_i = 0; _i < n; ++_i) {       \
+        if (p[_i] == v) {            \
+            _flag = 1;               \
+            break;                  \
+        }                           \
+    }                               \
+    if (_flag == 0) { /* not found: grow if full, then append */ \
+        if (n == m) {               \
+            _realloc(p, m, type)    \
+        }                           \
+        p[n++] = v;                 \
+    }                               \
+}
+
+#define _bin_insert_abpoa_utils_idx(v, p, n, m, type, flag, k_i) { /* binary search for v in p[0..n); assumes p is sorted ascending - TODO confirm with callers */ \
+    flag=0, k_i=-1;   \
+    int _left=0,_right=n-1,_mid;    \
+    type _mid_v, _tmp_v;                 \
+    if (_right == -1) k_i = 0; /* empty array: insert at index 0 */ \
+    else {                      \
+        while (_left <= _right) { \
+            _mid = (_left+_right) >> 1;    \
+            _mid_v = p[_mid];             \
+            if (_mid_v == v) { /* hit: flag=1 and k_i is the index of v */ \
+                k_i = _mid; \
+                flag = 1; break;        \
+            } else if (_mid_v > v) {     \
+                if (_mid != 0) {         \
+                    _tmp_v = p[_mid-1];   \
+                }                       \
+                if (_mid == 0 || v > _tmp_v) { /* v falls between p[_mid-1] and p[_mid]: insertion point found */ \
+                    k_i = _mid;          \
+                    break;              \
+                }                       \
+                else _right = _mid-1;     \
+            } else _left = _mid+1;        \
+        }                               \
+    }                                   \
+    if (k_i == -1) k_i = n; /* loop ended without a position: insert at the end */ \
+}
+     
+#define _bin_insert_abpoa_utils(v, p, n, m, type) { /* insert v at the position found by _bin_insert_abpoa_utils_idx unless already present */ \
+    int _k_i, _flag;    \
+    _bin_insert_abpoa_utils_idx(v, p, n, m, type, _flag, _k_i)   \
+    if (_flag == 0) { /* v not found: make room at _k_i and store it */ \
+        if (n == m) {               \
+            _realloc(p, m, type)    \
+        }                           \
+        if (_k_i <= n-1) /* shift the tail up by one slot */ \
+            memmove(p+_k_i+1, p+_k_i, (n-_k_i)*sizeof(type));  \
+        (p)[_k_i] = v;               \
+        (n)++;                      \
+    }                               \
+}
+
+#define _bin_search(v, p, n, type, hit, i) { /* binary search for v in p[0..n); assumes p is sorted ascending - TODO confirm with callers */ \
+    int _left =0,_right=n-1,_mid; \
+    type _mid_v;    \
+    hit = 0; /* stays 0 on a miss; i is only written on a hit */ \
+    if (_right == -1) hit=0;   \
+    else {  \
+        while (_left <= _right) {   \
+            _mid = (_left+_right) >> 1; \
+            _mid_v = p[_mid];       \
+            if (_mid_v == v) { /* found: report the index through i */ \
+                i = _mid;   \
+                hit = 1;   \
+                break;      \
+            } else if (_mid_v > v) {   \
+                _right = _mid-1;    \
+            } else {    \
+                _left = _mid+1; \
+            }   \
+        }   \
+    }   \
+} 
+
+#define MIN_OF_TWO(a, b) ((a) < (b) ? (a) : (b)) // arguments may be evaluated twice: avoid side effects
+#define MAX_OF_TWO(a, b) ((a) > (b) ? (a) : (b)) // arguments may be evaluated twice: avoid side effects
+#define MIN_OF_THREE(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) // smallest of a, b, c
+#define MAX_OF_THREE(a, b, c) ((a) > (b) ? ((a) > (c) ? (a) : (c)) : ((b) > (c) ? (b) : (c))) // largest of a, b, c
+#define AVG_OF_TWO(a, b) (((a)&(b)) + (((a)^(b)) >> 1)) // integer operands only: common bits plus half of the differing bits, never forms a+b
+
+static inline uint64_t hash_64(uint64_t key)
+{
+	/* 64-bit integer mixer.  Each row below performs one additive step
+	   (add a left-shifted copy, complemented except in the third row)
+	   followed by one xor with a right-shifted copy. */
+	uint64_t h = key;
+	h = h + ~(h << 32); h = h ^ (h >> 22);
+	h = h + ~(h << 13); h = h ^ (h >> 8);
+	h = h +  (h << 3);  h = h ^ (h >> 15);
+	h = h + ~(h << 27); h = h ^ (h >> 31);
+	return h;
+}
+#ifndef _PRINT_FORMAT_H_
+#define _PRINT_FORMAT_H_
+
+#define NONE "\033[0m" //remove color/font ("\033" is the portable spelling of ESC; "\e" is a compiler extension)
+#define BLACK "\033[0;30m" // black
+#define B_BLACK "\033[1;30m" // bold black
+#define RED "\033[0;31m" // red
+#define B_RED "\033[1;31m" // bold red
+#define GREEN "\033[0;32m" // green
+#define B_GREEN "\033[1;32m" // bold green
+#define BROWN "\033[0;33m" // brown
+#define YELLOW "\033[1;33m" // yellow
+#define BLUE "\033[0;34m" // blue
+#define B_BLUE "\033[1;34m" // bold blue
+#define PURPLE "\033[0;35m" // purple
+#define B_PURPLE "\033[1;35m" // bold purple
+#define CYAN "\033[0;36m" // cyan
+#define B_CYAN "\033[1;36m" // bold cyan
+#define GRAY "\033[0;37m" // gray
+#define WHITE "\033[1;37m" // white, bold
+#define BOLD "\033[1m" // bold
+#define UNDERLINE "\033[4m" // underline
+#define BLINK "\033[5m" // blink
+#define REVERSE "\033[7m" // reverse background and foreground
+#define HIDE "\033[8m" // hide
+#define STRIKE "\033[9m" // strikethrough
+#define CLEAR "\033[2J" // clear
+#define CLRLINE "\r\033[K" // clear line
+
+// from https://blog.csdn.net/MoDa_Li/java/article/details/82156888
+
+#endif
+
+#endif
diff --git a/sub_example.c b/sub_example.c
new file mode 100644
index 0000000..0794694
--- /dev/null
+++ b/sub_example.c
@@ -0,0 +1,128 @@
+/* sub_example.c libabpoa usage example
+   To compile:
+gcc -g sub_example.c -I ./include -L ./lib -labpoa -lz -lm -o sub_example
+or:
+gcc -g sub_example.c -I ./include ./lib/libabpoa.a -lz -lm -o sub_example
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "include/abpoa.h"
+
+// AaCcGgTtNn ... ==> 0,1,2,3,4 ...
+// BbDdEeFf   ... ==> 5,6,7,8 ...
+unsigned char _char26_table[256] = {
+	 0,  1,  2,  3,   4,  5,  6,  7,   8,  9, 10, 11,  12, 13, 14, 15, 
+	16, 17, 18, 19,  20, 21, 22, 23,  24, 25, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26,  0,  5,  1,   6,  7,  8,  2,   9, 10, 11, 12,  13, 14,  4, 15, 
+	16, 17, 18, 19,   3, 20, 21, 22,  23, 24, 25, 26,  26, 26, 26, 26, 
+	26,  0,  5,  1,   6,  7,  8,  2,   9, 10, 11, 12,  13, 14,  4, 15, 
+	16, 17, 18, 19,   3, 20, 21, 22,  23, 24, 25, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26, 
+	26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26,  26, 26, 26, 26
+};
+
+int main(void) { // example driver: aligns each read against a sub-range of the growing graph, then prints the result
+    int i, j, n_seqs = 6; // only the first n_seqs entries of seqs[] and beg_end_id[] are used
+    char seqs[100][1000] = {
+         // 0       1         2         3
+         // 23456789012345678901234567890123
+           "CGTCAATCTATCGAAGCATACGCGGGCAGAGC",
+        "CCACGTCAATCTATCGAAGCATACGCGGCAGC",
+               "AATCTATCGAAGCATACG",
+              "CAATGCTAGTCGAAGCAGCTGCGGCAG",
+           "CGTCAATCTATCGAAGCATTCTACGCGGCAGAGC",
+        "CGTCAATCTAGAAGCATACGCGGCAAGAGC",
+        "CGTCAATCTATCGGTAAAGCATACGCTCTGTAGC",
+        "CGTCAATCTATCTTCAAGCATACGCGGCAGAGC",
+        "CGTCAATGGATCGAGTACGCGGCAGAGC",
+        "CGTCAATCTAATCGAAGCATACGCGGCAGAGC"
+        };
+
+    int beg_end_id[100][2] = { // per-read pair handed to abpoa_subgraph_nodes(); entry 0 is never read (see loop below)
+        {0, 1},
+        {2, 33},
+        {6, 23},
+        {5, 30},
+        {0, 1},
+        {0, 1},
+        {0, 1},
+        {0, 1},
+        {0, 1},
+        {0, 1},
+        //{2, 52},
+        //{2, 52},
+        //{2, 52},
+        //{2, 52},
+        //{2, 52},
+        //{2, 52},
+        //{2, 52},
+        //{2, 52},
+        //{2, 52}
+    };
+
+    // initialize variables
+    abpoa_t *ab = abpoa_init();
+    abpoa_para_t *abpt = abpoa_init_para();
+
+    // alignment parameters
+    // abpt->align_mode = 0; // 0:global 1:local, 2:extension
+    // abpt->match = 2;      // match score
+    // abpt->mismatch = 4;   // mismatch penalty
+    // abpt->gap_mode = ABPOA_CONVEX_GAP; // gap penalty mode
+    // abpt->gap_open1 = 4;  // gap open penalty #1
+    // abpt->gap_ext1 = 2;   // gap extension penalty #1
+    // abpt->gap_open2 = 24; // gap open penalty #2
+    // abpt->gap_ext2 = 1;   // gap extension penalty #2
+                             // gap_penalty = min{gap_open1 + gap_len * gap_ext1, gap_open2 + gap_len * gap_ext2}
+    // abpt->bw = 10;        // extra band used in adaptive banded DP
+    // abpt->bf = 0.01;
+
+    // output options
+    abpt->out_msa = 1; // generate Row-Column multiple sequence alignment(RC-MSA), set 0 to disable
+    abpt->out_cons = 1; // generate consensus sequence, set 0 to disable
+
+    abpoa_post_set_para(abpt);
+
+    // collect sequence length, transform ACGT to 0123
+    int *seq_lens = (int*)malloc(sizeof(int) * n_seqs);
+    uint8_t **bseqs = (uint8_t**)malloc(sizeof(uint8_t*) * n_seqs);
+    for (i = 0; i < n_seqs; ++i) {
+        seq_lens[i] = strlen(seqs[i]);
+        bseqs[i] = (uint8_t*)malloc(sizeof(uint8_t) * seq_lens[i]);
+        for (j = 0; j < seq_lens[i]; ++j) // index as unsigned char: plain char may be signed, and a negative index would read before the table
+            bseqs[i][j] = _char26_table[(unsigned char)seqs[i][j]];
+    }
+
+    // perform abpoa-msa
+    ab->abs->n_seq = n_seqs;
+    abpoa_res_t res;
+    for (i = 0; i < n_seqs; ++i) {
+        res.graph_cigar = 0, res.n_cigar = 0; // reset so the free() below only runs when this iteration produced a cigar
+        int exc_beg, exc_end; // sub-graph bounds passed to the two alignment calls below
+        if (i != 0) abpoa_subgraph_nodes(ab, abpt, beg_end_id[i][0], beg_end_id[i][1], &exc_beg, &exc_end);
+        else exc_beg = 0, exc_end = 1; // first read: fixed bounds 0/1 -- presumably the graph's source/sink node ids (TODO confirm)
+        fprintf(stderr, "i: %d, beg: %d, end: %d\n", i, exc_beg, exc_end);
+        abpoa_align_sequence_to_subgraph(ab, abpt, exc_beg, exc_end, bseqs[i], seq_lens[i], &res);
+        abpoa_add_subgraph_alignment(ab, abpt, exc_beg, exc_end, bseqs[i], NULL, seq_lens[i], NULL, res, i, n_seqs, 0);
+        if (res.n_cigar) free(res.graph_cigar);
+    }
+
+    abpoa_output(ab, abpt, stdout);
+
+    /* generate DOT partial order graph plot */
+    abpt->out_pog = strdup("sub_example.png"); // dump partial order graph to file
+    if (abpt->out_pog != NULL) abpoa_dump_pog(ab, abpt);
+    for (i = 0; i < n_seqs; ++i) free(bseqs[i]); free(bseqs); free(seq_lens);
+    abpoa_free(ab); abpoa_free_para(abpt);
+    return 0;
+}
diff --git a/test_data/heter.fa b/test_data/heter.fa
new file mode 100644
index 0000000..fc29729
--- /dev/null
+++ b/test_data/heter.fa
@@ -0,0 +1,30 @@
+>m64062_200517_230654/46596725/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200517_230654/122620624/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200604_115437/178193244/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200517_230654/73204047/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200604_115437/157813105/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200604_115437/141952744/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCAGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCCTCCACCAACATCCCCACCATCCCCACCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200604_115437/19531191/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTAACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACACCATTCTCACCATCTCCACCAACATCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCAATCCCCACCATCC
+>m64062_200604_115437/120652681/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200604_115437/28773203/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200604_115437/75628767/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTCCATCCATTCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCACCACCATTTCCACCATCCCACCATCATCCCCACCACCATCCCCAGTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200517_230654/180226763/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200517_230654/29755012/ccs
+CCATTCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200517_230654/154468686/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATGCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCACCACCATCCTCACTACCATCCCACCACCATTCCACCATTCCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCACCACCATCTCCATTACCATCCCCACCACCATCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCATCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCATCCCCACCGCCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCACCCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATC
+>m64062_200604_115437/146211230/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
+>m64062_200517_230654/85983296/ccs
+CCATTCCCACCATCCTTACCATCAACATCACCATCCCCACCATCCCCAACACCATTCCCACCATCCCTACCATCACCATCACCATCCCCACCAACATCCCCACCACCATCCTCACTACCATCCCCACCACCATTTCCACCATTCCCACCACAGTCACCATCACCCCCACCATCCCCATCATCATCCGCACCATCCCCACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCTCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATCCCCATTACCATCCCCACCACCATTTCCACCATTCCCACCATCATCCCCACCACCATCCTCGTTACCATCCCCACCACCTTTTCCACCATTCCCACCATCTCCAACACCTCCCCCACCATCATCCCCACCATCCCCACCACCTTCTCCACCATCATTCTCACCATCCCCACCACCATCTCCACCACCATTCTCACCATCTCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCAACATCCCCACCATCCCCACCCCCATGCCCACCATCATCCCCACCATCC
diff --git a/test_data/seq.fa b/test_data/seq.fa
new file mode 100644
index 0000000..3037eb9
--- /dev/null
+++ b/test_data/seq.fa
@@ -0,0 +1,20 @@
+>1
+CGTCAATCTATCGAAGCATACGCGGGCAGAGCCGAAGACCTCGGCAATCCA
+>2
+CCACGTCAATCTATCGAAGCATACGCGGCAGCCGAACTCGACCTCGGCAATCAC
+>3
+CGTCAATCTATCGAAGCATACGCGGCAGAGCCCGGAAGACCTCGGCAATCAC
+>4
+CGTCAATGCTAGTCGAAGCAGCTGCGGCAGAGCCGAAGACCTCGGCAATCAC
+>5
+CGTCAATCTATCGAAGCATTCTACGCGGCAGAGCCGACCTCGGCAATCAC
+>6
+CGTCAATCTAGAAGCATACGCGGCAAGAGCCGAAGACCTCGGCCAATCAC
+>7
+CGTCAATCTATCGGTAAAGCATACGCTCTGTAGCCGAAGACCTCGGCAATCAC
+>8
+CGTCAATCTATCTTCAAGCATACGCGGCAGAGCCGAAGACCTCGGCAATC
+>9
+CGTCAATGGATCGAGTACGCGGCAGAGCCGAAGACCTCGGCAATCAC
+>10
+CGTCAATCTAATCGAAGCATACGCGGCAGAGCCGTCTACCTCGGCAATCACGT
diff --git a/test_data/test.fa b/test_data/test.fa
new file mode 100644
index 0000000..d867b72
--- /dev/null
+++ b/test_data/test.fa
@@ -0,0 +1,8 @@
+>1
+ACGTGTACAGTTGAC
+>2
+AGGTACACGTTAC
+>3
+AGTGTCACGTTGAC
+>4
+ACGTGTACATTGAC
-- 
cgit v1.2.3


From 7f89ebccca075e4ece6f5cf5975d4fcdc3911f12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Sat, 23 Jul 2022 21:23:15 +0200
Subject: Import abpoa_1.4.1-2.debian.tar.xz

[dgit import tarball abpoa 1.4.1-2 abpoa_1.4.1-2.debian.tar.xz]
---
 abpoa-dispatch.sh                 |  19 ++++++
 changelog                         |  13 ++++
 control                           |  61 +++++++++++++++++++
 copyright                         |  42 +++++++++++++
 examples                          |   3 +
 install                           |   1 +
 manpages                          |   1 +
 manuals/abpoa.1                   | 124 ++++++++++++++++++++++++++++++++++++++
 manuals/abpoa.avx.1               |   1 +
 manuals/abpoa.avx2.1              |   1 +
 manuals/abpoa.generic.1           |   1 +
 manuals/abpoa.sse3.1              |   1 +
 manuals/abpoa.sse4.1.1            |   1 +
 manuals/abpoa.ssse3.1             |   1 +
 patches/baseline.patch            |  63 +++++++++++++++++++
 patches/buildflags.patch          |  28 +++++++++
 patches/debian-simde.patch        |  50 +++++++++++++++
 patches/series                    |   4 ++
 patches/typos.patch               |  37 ++++++++++++
 rules                             |  49 +++++++++++++++
 salsa-ci.yml                      |   7 +++
 source/format                     |   1 +
 tests/autopkgtest-pkg-python.conf |   1 +
 tests/control                     |   3 +
 tests/run-unit-test               |  43 +++++++++++++
 upstream/metadata                 |  17 ++++++
 watch                             |   4 ++
 27 files changed, 577 insertions(+)
 create mode 100755 abpoa-dispatch.sh
 create mode 100644 changelog
 create mode 100644 control
 create mode 100644 copyright
 create mode 100644 examples
 create mode 100644 install
 create mode 100644 manpages
 create mode 100644 manuals/abpoa.1
 create mode 120000 manuals/abpoa.avx.1
 create mode 120000 manuals/abpoa.avx2.1
 create mode 120000 manuals/abpoa.generic.1
 create mode 120000 manuals/abpoa.sse3.1
 create mode 120000 manuals/abpoa.sse4.1.1
 create mode 120000 manuals/abpoa.ssse3.1
 create mode 100644 patches/baseline.patch
 create mode 100644 patches/buildflags.patch
 create mode 100644 patches/debian-simde.patch
 create mode 100644 patches/series
 create mode 100644 patches/typos.patch
 create mode 100755 rules
 create mode 100644 salsa-ci.yml
 create mode 100644 source/format
 create mode 100644 tests/autopkgtest-pkg-python.conf
 create mode 100644 tests/control
 create mode 100644 tests/run-unit-test
 create mode 100644 upstream/metadata
 create mode 100644 watch

diff --git a/abpoa-dispatch.sh b/abpoa-dispatch.sh
new file mode 100755
index 0000000..d51948b
--- /dev/null
+++ b/abpoa-dispatch.sh
@@ -0,0 +1,19 @@
+#! /bin/sh
+set -e
+BIN="$0"
+
+test_and_run () {	# exec "$BIN.<flag>" when the CPU advertises <flag> and that variant is installed; otherwise return
+	local flag="$1" cpuflag="$1"
+	[ "$flag" != sse3 ] || cpuflag=pni	# Linux lists SSE3 as "pni" in /proc/cpuinfo; there is no "sse3" token
+	if grep -qw "$cpuflag" /proc/cpuinfo 2>/dev/null && [ -x "$BIN.$flag" ]; then	# regex on purpose: the "." in sse4.1 matches cpuinfo's sse4_1
+		shift	# drop the flag so that "$@" holds only the caller's arguments
+		exec "$BIN.$flag" "$@"
+	fi
+}
+
+for SIMDE in avx2 avx sse4.1 ssse3 sse3 sse2 sse	# candidates are tried left to right
+do test_and_run "$SIMDE" "$@"	# on a match the function exec's the variant, so the loop never resumes
+done
+
+# fallback to plain option
+exec "$BIN.generic" "$@"
diff --git a/changelog b/changelog
new file mode 100644
index 0000000..6bd0f11
--- /dev/null
+++ b/changelog
@@ -0,0 +1,13 @@
+abpoa (1.4.1-2) unstable; urgency=medium
+
+  * Source-only upload.
+  * d/t/run-unit-test: fix dispatch to enable test of generic executable.
+  * typos.patch: forwarded upstream.
+
+ -- Étienne Mollier <emollier@debian.org>  Sat, 23 Jul 2022 21:23:15 +0200
+
+abpoa (1.4.1-1) unstable; urgency=medium
+
+  * Initial release. (Closes: #1014647)
+
+ -- Étienne Mollier <emollier@debian.org>  Sat, 09 Jul 2022 19:26:20 +0200
diff --git a/control b/control
new file mode 100644
index 0000000..bdd092f
--- /dev/null
+++ b/control
@@ -0,0 +1,61 @@
+Source: abpoa
+Section: science
+Priority: optional
+Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
+Uploaders: Étienne Mollier <emollier@debian.org>
+Build-Depends: debhelper-compat (= 13),
+               dh-python,
+               cython3,
+               graphviz <!nocheck>,
+               libsimde-dev,
+               python3-all-dev,
+               zlib1g-dev
+Standards-Version: 4.6.1
+Vcs-Browser: https://salsa.debian.org/med-team/abpoa
+Vcs-Git: https://salsa.debian.org/med-team/abpoa.git
+Homepage: https://github.com/yangao07/abPOA
+Testsuite: autopkgtest-pkg-python
+Rules-Requires-Root: no
+
+Package: abpoa
+Architecture: any
+Depends: ${shlibs:Depends}, ${misc:Depends},
+         graphviz
+Built-Using: ${simde:Built-Using}
+Description: adaptive banded Partial Order Alignment
+ abPOA is an extended version of Partial Order Alignment (POA) that performs
+ adaptive banded dynamic programming (DP) with an SIMD implementation. abPOA
+ can perform multiple sequence alignment (MSA) on a set of input sequences and
+ generate a consensus sequence by applying the heaviest bundling algorithm to
+ the final alignment graph.
+ .
+ abPOA can generate high-quality consensus sequences from error-prone long
+ reads and offer significant speed improvement over existing tools.
+ .
+ abPOA supports three alignment modes (global, local, extension) and flexible
+ scoring schemes that allow linear, affine and convex gap penalties. It
+ currently supports SSE2/SSE4.1/AVX2 vectorization.
+ .
+ For more information please refer to the paper[1] published in Bioinformatics.
+ .
+ [1]: https://dx.doi.org/10.1093/bioinformatics/btaa963
+
+Package: python3-pyabpoa
+Architecture: any
+Section: python
+Depends: ${shlibs:Depends}, ${misc:Depends}, ${python3:Depends}
+Description: adaptive banded Partial Order Alignment - python3 module
+ abPOA is an extended version of Partial Order Alignment (POA) that performs
+ adaptive banded dynamic programming (DP) with an SIMD implementation. abPOA
+ can perform multiple sequence alignment (MSA) on a set of input sequences and
+ generate a consensus sequence by applying the heaviest bundling algorithm to
+ the final alignment graph.
+ .
+ abPOA can generate high-quality consensus sequences from error-prone long
+ reads and offer significant speed improvement over existing tools.
+ .
+ abPOA supports three alignment modes (global, local, extension) and flexible
+ scoring schemes that allow linear, affine and convex gap penalties. It
+ currently supports SSE2/SSE4.1/AVX2 vectorization.
+ .
+ This package provides the python3 module of abPOA.
diff --git a/copyright b/copyright
new file mode 100644
index 0000000..18dbfa2
--- /dev/null
+++ b/copyright
@@ -0,0 +1,42 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: abPOA
+Source: https://github.com/yangao07/abPOA
+
+Files: *
+Copyright: 2020, Yan Gao <gaoy1@chop.edu>
+License: Expat
+
+Files: src/khash.h
+       src/kseq.h
+       src/kstring.h
+       src/kvec.h
+Copyright: 2008-2011, Attractive Chaos <attractor@live.co.uk>
+License: Expat
+
+Files: src/utils.c
+       src/utils.h
+Copyright: 2008, Genome Research Ltd (GRL).
+License: Expat
+
+Files: debian/*
+Copyright: 2022, Étienne Mollier <emollier@debian.org>
+License: Expat
+
+License: Expat
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ .
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+ .
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
diff --git a/examples b/examples
new file mode 100644
index 0000000..eb5f738
--- /dev/null
+++ b/examples
@@ -0,0 +1,3 @@
+example.c
+sub_example.c
+test_data/
diff --git a/install b/install
new file mode 100644
index 0000000..e2dc6c9
--- /dev/null
+++ b/install
@@ -0,0 +1 @@
+bin/abpoa*
diff --git a/manpages b/manpages
new file mode 100644
index 0000000..d7ed10d
--- /dev/null
+++ b/manpages
@@ -0,0 +1 @@
+debian/manuals/abpoa*.1
diff --git a/manuals/abpoa.1 b/manuals/abpoa.1
new file mode 100644
index 0000000..b786179
--- /dev/null
+++ b/manuals/abpoa.1
@@ -0,0 +1,124 @@
+.TH ABPOA "1" "July 2022" "abpoa 1.4.1" "User Commands"
+.SH NAME
+abpoa, abpoa.avx2, abpoa.avx, abpoa.sse4.1, abpoa.ssse3, abpoa.sse3, abpoa.generic \- adaptive banded Partial Order Alignment
+.SH SYNOPSIS
+.B abpoa
+[\fI\,options\/\fR] \fI\,<in.fa/fq> > cons.fa/msa.out/abpoa.gfa\/\fR
+.SH DESCRIPTION
+.PP
+abPOA is an extended version of Partial Order Alignment (POA) that performs adaptive banded dynamic programming (DP) with an SIMD implementation.
+abPOA can perform multiple sequence alignment (MSA) on a set of input sequences and generate a consensus sequence by applying the heaviest bundling algorithm to the final alignment graph.
+.PP
+abPOA can generate high-quality consensus sequences from error-prone long reads and offer significant speed improvement over existing tools.
+.PP
+abPOA supports three alignment modes (global, local, extension) and flexible scoring schemes that allow linear, affine and convex gap penalties.
+It currently supports SSE2/SSE4.1/AVX2 vectorization.
+.SH OPTIONS
+.PP
+Alignment:
+.TP
+\fB\-m\fR \fB\-\-aln\-mode\fR
+INT     alignment mode [0]
+0: global, 1: local, 2: extension
+.TP
+\fB\-M\fR \fB\-\-match\fR
+INT     match score [2]
+.TP
+\fB\-X\fR \fB\-\-mismatch\fR
+INT     mismatch penalty [4]
+.TP
+\fB\-t\fR \fB\-\-matrix\fR
+FILE     scoring matrix file, '\-M' and '\-X' are not used when '\-t' is used [Null]
+e.g., 'HOXD70.mtx, BLOSUM62.mtx'
+.HP
+\fB\-O\fR \fB\-\-gap\-open\fR INT(,INT) gap opening penalty (O1,O2) [4,24]
+.TP
+\fB\-E\fR \fB\-\-gap\-ext\fR
+INT(,INT) gap extension penalty (E1,E2) [2,1]
+abPOA provides three gap penalty modes, cost of a g\-long gap:
+\- convex (default): min{O1+g*E1, O2+g*E2}
+\- affine (set O2 as 0): O1+g*E1
+\- linear (set O1 as 0): g*E1
+.TP
+\fB\-s\fR \fB\-\-amb\-strand\fR
+ambiguous strand mode [False]
+for each input sequence, try the reverse complement if the current
+alignment score is too low, and pick the strand with a higher score
+.IP
+Adaptive banded DP:
+.TP
+\fB\-b\fR \fB\-\-extra\-b\fR
+INT     first adaptive banding parameter [10]
+set b as < 0 to disable adaptive banded DP
+.TP
+\fB\-f\fR \fB\-\-extra\-f\fR
+FLOAT     second adaptive banding parameter [0.01]
+the number of extra bases added on both sides of the band is
+b+f*L, where L is the length of the aligned sequence
+.IP
+Minimizer\-based seeding and partition (only effective in global alignment mode):
+.TP
+\fB\-S\fR \fB\-\-seeding\fR
+enable minimizer\-based seeding and anchoring [False]
+.TP
+\fB\-k\fR \fB\-\-k\-mer\fR
+INT    minimizer k\-mer size [19]
+.TP
+\fB\-w\fR \fB\-\-window\fR
+INT    minimizer window size [10]
+.TP
+\fB\-n\fR \fB\-\-min\-poa\-win\fR INT
+min. size of window to perform POA [500]
+.TP
+\fB\-p\fR \fB\-\-progressive\fR
+build guide tree and perform progressive partial order alignment [False]
+.IP
+Input/Output:
+.TP
+\fB\-Q\fR \fB\-\-use\-qual\-weight\fR
+take base quality score from FASTQ input file as graph edge weight [False]
+.TP
+\fB\-c\fR \fB\-\-amino\-acid\fR
+input sequences are amino acid (default is nucleotide) [False]
+.TP
+\fB\-l\fR \fB\-\-in\-list\fR
+input file is a list of sequence file names [False]
+each line is one sequence file containing a set of sequences
+which will be aligned by abPOA to generate a consensus sequence
+.TP
+\fB\-i\fR \fB\-\-incrmnt\fR
+FILE    incrementally align sequences to an existing graph/MSA [Null]
+graph could be in GFA or MSA format generated by abPOA
+.TP
+\fB\-o\fR \fB\-\-output\fR
+FILE    output to FILE [stdout]
+.TP
+\fB\-r\fR \fB\-\-result\fR
+INT    output result mode [0]
+\- 0: consensus in FASTA format
+\- 1: MSA in PIR format
+\- 2: both 0 & 1
+\- 3: graph in GFA format
+\- 4: graph with consensus path in GFA format
+\- 5: consensus in FASTQ format
+.TP
+\fB\-d\fR \fB\-\-maxnum\-cons\fR INT
+max. number of consensus sequences to generate [1]
+.TP
+\fB\-q\fR \fB\-\-min\-freq\fR
+FLOAT    min. frequency of each consensus sequence (only effective when \fB\-d\fR/\fB\-\-maxnum\-cons\fR > 1) [0.25]
+.TP
+\fB\-g\fR \fB\-\-out\-pog\fR
+FILE    dump final alignment graph to FILE (.pdf/.png) [Null]
+.TP
+\fB\-h\fR \fB\-\-help\fR
+print this help usage information
+.TP
+\fB\-v\fR \fB\-\-version\fR
+show version number
+.SH SEE ALSO
+.PP
+For more information please refer to the paper published in Bioinformatics:
+.PP
+.UR https://dx.doi.org/10.1093/bioinformatics/btaa963
+.UE
diff --git a/manuals/abpoa.avx.1 b/manuals/abpoa.avx.1
new file mode 120000
index 0000000..198e1a9
--- /dev/null
+++ b/manuals/abpoa.avx.1
@@ -0,0 +1 @@
+abpoa.1
\ No newline at end of file
diff --git a/manuals/abpoa.avx2.1 b/manuals/abpoa.avx2.1
new file mode 120000
index 0000000..198e1a9
--- /dev/null
+++ b/manuals/abpoa.avx2.1
@@ -0,0 +1 @@
+abpoa.1
\ No newline at end of file
diff --git a/manuals/abpoa.generic.1 b/manuals/abpoa.generic.1
new file mode 120000
index 0000000..198e1a9
--- /dev/null
+++ b/manuals/abpoa.generic.1
@@ -0,0 +1 @@
+abpoa.1
\ No newline at end of file
diff --git a/manuals/abpoa.sse3.1 b/manuals/abpoa.sse3.1
new file mode 120000
index 0000000..198e1a9
--- /dev/null
+++ b/manuals/abpoa.sse3.1
@@ -0,0 +1 @@
+abpoa.1
\ No newline at end of file
diff --git a/manuals/abpoa.sse4.1.1 b/manuals/abpoa.sse4.1.1
new file mode 120000
index 0000000..198e1a9
--- /dev/null
+++ b/manuals/abpoa.sse4.1.1
@@ -0,0 +1 @@
+abpoa.1
\ No newline at end of file
diff --git a/manuals/abpoa.ssse3.1 b/manuals/abpoa.ssse3.1
new file mode 120000
index 0000000..198e1a9
--- /dev/null
+++ b/manuals/abpoa.ssse3.1
@@ -0,0 +1 @@
+abpoa.1
\ No newline at end of file
diff --git a/patches/baseline.patch b/patches/baseline.patch
new file mode 100644
index 0000000..62d085d
--- /dev/null
+++ b/patches/baseline.patch
@@ -0,0 +1,63 @@
+Description: respect the CPU baseline.
+ FIXME: implement multiple executables using SIMDe as distributed by Debian.
+Author: Étienne Mollier <emollier@debian.org>
+Forwarded: not-needed
+Last-Update: 2022-07-09
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- abpoa.orig/CMakeLists.txt
++++ abpoa/CMakeLists.txt
+@@ -10,7 +10,7 @@
+ 
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+ 
+-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") # SIMD
++#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") # SIMD
+ 
+ # build abPOA as a static library by default
+ set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared")
+--- abpoa.orig/Makefile
++++ abpoa/Makefile
+@@ -2,19 +2,19 @@
+ EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
+ CFLAGS      += $(CPPFLAGS) -Wall -O3 $(EXTRA_FLAGS) $(LDFLAGS)
+ 
+-SIMD_FLAG   = -march=native
++#SIMD_FLAG   = -march=native
+ 
+-ifneq ($(armv7),) # for ARMv7
+-	SIMD_FLAG   =  -march=armv7-a -mfpu=neon -D__AVX2__
+-else
+-ifneq ($(armv8),) # for ARMv8
+-ifneq ($(aarch64),) # for Aarch64 
+-	SIMD_FLAG   =  -march=armv8-a+simd -D__AVX2__
+-else # for Aarch32
+-	SIMD_FLAG   =  -march=armv8-a+simd -mfpu=auto -D__AVX2__
+-endif
+-endif
+-endif
++#ifneq ($(armv7),) # for ARMv7
++#	SIMD_FLAG   =  -march=armv7-a -mfpu=neon -D__AVX2__
++#else
++#ifneq ($(armv8),) # for ARMv8
++#ifneq ($(aarch64),) # for Aarch64 
++#	SIMD_FLAG   =  -march=armv8-a+simd -D__AVX2__
++#else # for Aarch32
++#	SIMD_FLAG   =  -march=armv8-a+simd -mfpu=auto -D__AVX2__
++#endif
++#endif
++#endif
+ 
+ # for debug
+ ifneq ($(debug),)
+--- abpoa.orig/setup.py
++++ abpoa/setup.py
+@@ -64,7 +64,7 @@
+                     include_dirs=[inc_dir],
+                     depends=[src_dir+'abpoa.h', src_dir+'abpoa_align.h', src_dir+'abpoa_graph.h', src_dir+'abpoa_output.h', src_dir+'abpoa_seed.h', src_dir+'abpoa_seq.h', src_dir+'kalloc.h', src_dir+'khash.h', src_dir+'kdq.h', src_dir+'kseq.h', src_dir+'ksort.h', src_dir+'kstring.h', src_dir+'kvec.h', src_dir+'simd_abpoa_align.h', src_dir+'simd_instruction.h', src_dir+'utils.h', 'python/cabpoa.pxd'],
+                     libraries = ['z', 'm', 'pthread'],
+-                    extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde, simd_flag])],
++                    extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde])],
+     install_requires=['cython'],
+     cmdclass = cmdclass
+ )
diff --git a/patches/buildflags.patch b/patches/buildflags.patch
new file mode 100644
index 0000000..3d46b60
--- /dev/null
+++ b/patches/buildflags.patch
@@ -0,0 +1,28 @@
+Description: inject standard debian build flags.
+Author: Étienne Mollier <emollier@debian.org>
+Forwarded: not-needed
+Last-Update: 2022-07-09
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- abpoa.orig/Makefile
++++ abpoa/Makefile
+@@ -1,6 +1,6 @@
+ #CC          = gcc
+ EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
+-CFLAGS      = -Wall -O3 $(EXTRA_FLAGS)
++CFLAGS      += $(CPPFLAGS) -Wall -O3 $(EXTRA_FLAGS) $(LDFLAGS)
+ 
+ SIMD_FLAG   = -march=native
+ 
+@@ -22,9 +22,9 @@
+ endif
+ # for gdb
+ ifneq ($(gdb),)
+-	CFLAGS   = -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
++	CFLAGS   += -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
+ else
+-	CFLAGS   = -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
++	CFLAGS   += -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
+ endif
+ 
+ # for gprof
diff --git a/patches/debian-simde.patch b/patches/debian-simde.patch
new file mode 100644
index 0000000..f5d3374
--- /dev/null
+++ b/patches/debian-simde.patch
@@ -0,0 +1,50 @@
+Description: refer to debian's simde headers.
+Author: Étienne Mollier <emollier@debian.org>
+Forwarded: not-needed
+Last-Update: 2022-07-07
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- abpoa.orig/src/simd_instruction.h
++++ abpoa/src/simd_instruction.h
+@@ -21,15 +21,15 @@
+ #include <immintrin.h>
+ #else // use SIMDE
+ #ifdef __AVX512F__
+-#include "simde/simde/x86/avx512.h"
++#include <simde/x86/avx512.h>
+ #else
+ #ifdef __AVX2__
+-#include "simde/simde/x86/avx2.h"
++#include <simde/x86/avx2.h>
+ #else
+ #ifdef __SSE4_1__
+-#include "simde/simde/x86/sse4.1.h"
++#include <simde/x86/sse4.1.h>
+ #else
+-#include "simde/simde/x86/sse2.h"
++#include <simde/x86/sse2.h>
+ #endif // end of sse41
+ #endif // end of AVX2
+ #endif // end of 512F
+--- abpoa.orig/include/simd_instruction.h
++++ abpoa/include/simd_instruction.h
+@@ -21,15 +21,15 @@
+ #include <immintrin.h>
+ #else // use SIMDE
+ #ifdef __AVX512F__
+-#include "simde/simde/x86/avx512.h"
++#include <simde/x86/avx512.h>
+ #else
+ #ifdef __AVX2__
+-#include "simde/simde/x86/avx2.h"
++#include <simde/x86/avx2.h>
+ #else
+ #ifdef __SSE4_1__
+-#include "simde/simde/x86/sse4.1.h"
++#include <simde/x86/sse4.1.h>
+ #else
+-#include "simde/simde/x86/sse2.h"
++#include <simde/x86/sse2.h>
+ #endif // end of sse41
+ #endif // end of AVX2
+ #endif // end of 512F
diff --git a/patches/series b/patches/series
new file mode 100644
index 0000000..728d10b
--- /dev/null
+++ b/patches/series
@@ -0,0 +1,4 @@
+debian-simde.patch
+buildflags.patch
+typos.patch
+baseline.patch
diff --git a/patches/typos.patch b/patches/typos.patch
new file mode 100644
index 0000000..cdaa72f
--- /dev/null
+++ b/patches/typos.patch
@@ -0,0 +1,37 @@
+Description: fix typos caught by lintian.
+Author: Étienne Mollier <emollier@debian.org>
+Forwarded: https://github.com/yangao07/abPOA/pull/40
+Last-Update: 2022-07-09
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- abpoa.orig/src/abpoa.c
++++ abpoa/src/abpoa.c
+@@ -31,7 +31,7 @@
+     { "extra-b", 1, NULL, 'b' },
+     { "extra-f", 1, NULL, 'f' },
+     { "zdrop", 1, NULL, 'z' },
+-    { "bouns", 1, NULL, 'e' },
++    { "bonus", 1, NULL, 'e' },
+ 
+     { "seeding", 0, NULL, 'S'},
+     { "k-mer", 1, NULL, 'k' },
+@@ -108,7 +108,7 @@
+     err_printf("                            which will be aligned by abPOA to generate a consensus sequence\n");
+     err_printf("    -i --incrmnt    FILE    incrementally align sequences to an existing graph/MSA [Null]\n");
+     err_printf("                            graph could be in GFA or MSA format generated by abPOA\n");
+-    err_printf("    -o --output     FILE    ouput to FILE [stdout]\n");
++    err_printf("    -o --output     FILE    output to FILE [stdout]\n");
+     err_printf("    -r --result      INT    output result mode [%d]\n", ABPOA_OUT_CONS);
+     err_printf("                            - %d: consensus in FASTA format\n", ABPOA_OUT_CONS);
+     err_printf("                            - %d: MSA in PIR format\n", ABPOA_OUT_MSA);
+--- abpoa.orig/src/abpoa_align.c
++++ abpoa/src/abpoa_align.c
+@@ -95,7 +95,7 @@
+     abpt->align_mode = ABPOA_GLOBAL_MODE;
+     abpt->gap_mode = ABPOA_CONVEX_GAP;
+     abpt->zdrop = -1;     // disable zdrop
+-    abpt->end_bonus = -1; // disable end bouns
++    abpt->end_bonus = -1; // disable end bonus
+     abpt->wb = ABPOA_EXTRA_B; // extra bandwidth
+     abpt->wf = ABPOA_EXTRA_F; // extra bandwidth
+ 
diff --git a/rules b/rules
new file mode 100755
index 0000000..78da02a
--- /dev/null
+++ b/rules
@@ -0,0 +1,49 @@
+#!/usr/bin/make -f
+# DH_VERBOSE := 1
+export LC_ALL=C.UTF-8
+
+include /usr/share/dpkg/default.mk
+export DEB_BUILD_MAINT_OPTIONS=hardening=+all
+export DEB_CFLAGS_MAINT_APPEND+=-DSIMDE_ENABLE_OPENMP -fopenmp-simd -O3
+export DEB_CXXFLAGS_MAINT_APPEND+=-DSIMDE_ENABLE_OPENMP -fopenmp-simd -O3
+export PYBUILD_NAME=pyabpoa
+
+%:
+	dh $@ --with=python3 --buildsystem=pybuild
+
+execute_after_dh_auto_build:
+ifeq ($(DEB_HOST_ARCH),amd64)
+	set -e \
+	; for SIMDE in avx2 avx sse4.1 ssse3 sse3 \
+	; do	CFLAGS="$(CFLAGS) -m$${SIMDE}" \
+		CXXFLAGS="$(CXXFLAGS) -m$${SIMDE}" \
+		dh_auto_build --buildsystem=makefile -- all \
+	; 	mv -v bin/abpoa bin/abpoa.$${SIMDE} \
+	; 	dh_auto_clean --buildsystem=makefile -- clean \
+	; done
+endif
+	dh_auto_build --buildsystem=makefile -- all
+	mv -v bin/abpoa bin/abpoa.generic
+	# copy the dispatch script
+	cp debian/abpoa-dispatch.sh bin/abpoa
+
+override_dh_auto_test:
+ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
+	# Tests from the README.md document
+	./bin/abpoa ./test_data/seq.fa > cons.fa
+	./bin/abpoa ./test_data/heter.fa > 2cons.fa
+	./bin/abpoa -r1 ./test_data/seq.fa > out.msa
+	./bin/abpoa -r2 ./test_data/seq.fa > out_cons.msa
+	./bin/abpoa -r3 ./test_data/seq.fa > out.gfa
+	./bin/abpoa -r4 ./test_data/seq.fa > out4.gfa
+	cp out.gfa in.gfa
+	cp out.msa in.msa
+	./bin/abpoa -i in.gfa ./test_data/seq.fa -r3 > out.gfa
+	./bin/abpoa -i in.msa ./test_data/seq.fa -r1 > out.msa
+	./bin/abpoa ./test_data/seq.fa -g poa.png > cons.fa
+	./bin/abpoa ./test_data/heter.fa -d2
+endif
+
+override_dh_gencontrol:
+	dh_gencontrol -- \
+		-Vsimde:Built-Using="$(shell dpkg-query -f '$${source:Package} (= $${source:Version}), ' -W "libsimde-dev")"
diff --git a/salsa-ci.yml b/salsa-ci.yml
new file mode 100644
index 0000000..b463b38
--- /dev/null
+++ b/salsa-ci.yml
@@ -0,0 +1,7 @@
+---
+include:
+  - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml
+  - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml
+
+variables:
+  SALSA_CI_DISABLE_BUILD_PACKAGE_I386: "true"
diff --git a/source/format b/source/format
new file mode 100644
index 0000000..163aaf8
--- /dev/null
+++ b/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
diff --git a/tests/autopkgtest-pkg-python.conf b/tests/autopkgtest-pkg-python.conf
new file mode 100644
index 0000000..e9b85d5
--- /dev/null
+++ b/tests/autopkgtest-pkg-python.conf
@@ -0,0 +1 @@
+import_name = pyabpoa
diff --git a/tests/control b/tests/control
new file mode 100644
index 0000000..d2aa55a
--- /dev/null
+++ b/tests/control
@@ -0,0 +1,3 @@
+Tests: run-unit-test
+Depends: @
+Restrictions: allow-stderr
diff --git a/tests/run-unit-test b/tests/run-unit-test
new file mode 100644
index 0000000..43ad9b9
--- /dev/null
+++ b/tests/run-unit-test
@@ -0,0 +1,43 @@
+#!/bin/bash
+set -e
+
+pkg=abpoa
+
+export LC_ALL=C.UTF-8
+if [ "${AUTOPKGTEST_TMP}" = "" ] ; then
+  AUTOPKGTEST_TMP=$(mktemp -d /tmp/${pkg}-test.XXXXXX)
+  # Double quote below to expand the temporary directory variable now versus
+  # later is on purpose.
+  # shellcheck disable=SC2064
+  trap "rm -rf ${AUTOPKGTEST_TMP}" 0 INT QUIT ABRT PIPE TERM
+fi
+
+cp -a "/usr/share/doc/${pkg}/examples"/* "${AUTOPKGTEST_TMP}"
+cd "${AUTOPKGTEST_TMP}"
+
+set -x
+
+# Test as many variants of the program as the test node permits
+for SIMDE in avx2 avx sse4.1 ssse3 sse3 generic dispatch
+do
+	if [ "$SIMDE" = "dispatch" ]
+	then BINARY="abpoa"
+	elif grep -q "$SIMDE" /proc/cpuinfo || [ "$SIMDE" = "generic" ]
+	then BINARY="abpoa.$SIMDE"
+	else continue
+	fi
+
+	# Tests inspired by the README.md document
+	"$BINARY" ./test_data/seq.fa > cons.fa
+	"$BINARY" ./test_data/heter.fa > 2cons.fa
+	"$BINARY" -r1 ./test_data/seq.fa > out.msa
+	"$BINARY" -r2 ./test_data/seq.fa > out_cons.msa
+	"$BINARY" -r3 ./test_data/seq.fa > out.gfa
+	"$BINARY" -r4 ./test_data/seq.fa > out4.gfa
+	cp out.gfa in.gfa
+	cp out.msa in.msa
+	"$BINARY" -i in.gfa ./test_data/seq.fa -r3 > out.gfa
+	"$BINARY" -i in.msa ./test_data/seq.fa -r1 > out.msa
+	"$BINARY" ./test_data/seq.fa -g poa.png > cons.fa
+	"$BINARY" ./test_data/heter.fa -d2
+done
diff --git a/upstream/metadata b/upstream/metadata
new file mode 100644
index 0000000..6c1c0eb
--- /dev/null
+++ b/upstream/metadata
@@ -0,0 +1,17 @@
+Bug-Database: https://github.com/yangao07/abPOA/issues
+Bug-Submit: https://github.com/yangao07/abPOA/issues/new
+Repository: https://github.com/yangao07/abPOA.git
+Repository-Browse: https://github.com/yangao07/abPOA
+Reference:
+ - Author: >
+    Yan Gao, Yongzhuang Liu, Yanmei Ma, Bo Liu, Yadong Wang, Yi Xing
+   Title: >
+    abPOA: an SIMD-based C library for fast partial order alignment using
+    adaptive band
+   Journal: Bioinformatics
+   Year: 2021
+   Volume: 37
+   Number: 15
+   Pages: 2209–2211
+   DOI: 10.1093/bioinformatics/btaa963
+   URL: https://doi.org/10.1093/bioinformatics/btaa963
diff --git a/watch b/watch
new file mode 100644
index 0000000..f4ad60f
--- /dev/null
+++ b/watch
@@ -0,0 +1,4 @@
+version=4
+opts="filenamemangle=s%(?:.*?)?v?(\d[\d.]*@ARCHIVE_EXT@)%@PACKAGE@-$1%" \
+https://github.com/yangao07/abPOA/tags \
+(?:.*?/)?v?@ANY_VERSION@@ARCHIVE_EXT@
-- 
cgit v1.2.3


From 0b3db8337509fb74d9b0daffb950d06818b20cb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Sat, 23 Jul 2022 21:23:15 +0200
Subject: refer to debian's simde headers.

Forwarded: not-needed
Last-Update: 2022-07-07

Last-Update: 2022-07-07
Gbp-Pq: Name debian-simde.patch
---
 include/simd_instruction.h | 8 ++++----
 src/simd_instruction.h     | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/simd_instruction.h b/include/simd_instruction.h
index 41deb16..064eb8c 100644
--- a/include/simd_instruction.h
+++ b/include/simd_instruction.h
@@ -21,15 +21,15 @@
 #include <immintrin.h>
 #else // use SIMDE
 #ifdef __AVX512F__
-#include "simde/simde/x86/avx512.h"
+#include <simde/x86/avx512.h>
 #else
 #ifdef __AVX2__
-#include "simde/simde/x86/avx2.h"
+#include <simde/x86/avx2.h>
 #else
 #ifdef __SSE4_1__
-#include "simde/simde/x86/sse4.1.h"
+#include <simde/x86/sse4.1.h>
 #else
-#include "simde/simde/x86/sse2.h"
+#include <simde/x86/sse2.h>
 #endif // end of sse41
 #endif // end of AVX2
 #endif // end of 512F
diff --git a/src/simd_instruction.h b/src/simd_instruction.h
index 41deb16..e70215a 100644
--- a/src/simd_instruction.h
+++ b/src/simd_instruction.h
@@ -21,15 +21,15 @@
 #include <immintrin.h>
 #else // use SIMDE
 #ifdef __AVX512F__
-#include "simde/simde/x86/avx512.h"
+#include <simde/x86/avx512.h>
 #else
 #ifdef __AVX2__
-#include "simde/simde/x86/avx2.h"
+#include <simde/x86/avx2.h>
 #else
 #ifdef __SSE4_1__
-#include "simde/simde/x86/sse4.1.h"
+#include <simde/x86/sse4.1.h>
 #else
-#include "simde/simde/x86/sse2.h"
+#include <simde/x86/sse2.h>
 #endif // end of sse41
 #endif // end of AVX2
 #endif // end of 512F
-- 
cgit v1.2.3


From fd8d97e18d9bb446fe3bc17200695cbccf8cf94d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Sat, 23 Jul 2022 21:23:15 +0200
Subject: inject standard debian build flags.

Forwarded: not-needed
Last-Update: 2022-07-09

Last-Update: 2022-07-09
Gbp-Pq: Name buildflags.patch
---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 1284a68..af68387 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 #CC          = gcc
 EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
-CFLAGS      = -Wall -O3 $(EXTRA_FLAGS)
+CFLAGS      += $(CPPFLAGS) -Wall -O3 $(EXTRA_FLAGS) $(LDFLAGS)
 
 SIMD_FLAG   = -march=native
 
@@ -22,9 +22,9 @@ ifneq ($(debug),)
 endif
 # for gdb
 ifneq ($(gdb),)
-	CFLAGS   = -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
+	CFLAGS   += -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
 else
-	CFLAGS   = -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
+	CFLAGS   += -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
 endif
 
 # for gprof
-- 
cgit v1.2.3


From 1dfd633dbb0ece7bbf632d8e0783d3a099d3608b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Sat, 23 Jul 2022 21:23:15 +0200
Subject: fix typos caught by lintian.

Forwarded: https://github.com/yangao07/abPOA/pull/40
Last-Update: 2022-07-09

Last-Update: 2022-07-09
Gbp-Pq: Name typos.patch
---
 src/abpoa.c       | 4 ++--
 src/abpoa_align.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/abpoa.c b/src/abpoa.c
index e9eb1d4..69a0afc 100644
--- a/src/abpoa.c
+++ b/src/abpoa.c
@@ -31,7 +31,7 @@ const struct option abpoa_long_opt [] = {
     { "extra-b", 1, NULL, 'b' },
     { "extra-f", 1, NULL, 'f' },
     { "zdrop", 1, NULL, 'z' },
-    { "bouns", 1, NULL, 'e' },
+    { "bonus", 1, NULL, 'e' },
 
     { "seeding", 0, NULL, 'S'},
     { "k-mer", 1, NULL, 'k' },
@@ -108,7 +108,7 @@ int abpoa_usage(void)
     err_printf("                            which will be aligned by abPOA to generate a consensus sequence\n");
     err_printf("    -i --incrmnt    FILE    incrementally align sequences to an existing graph/MSA [Null]\n");
     err_printf("                            graph could be in GFA or MSA format generated by abPOA\n");
-    err_printf("    -o --output     FILE    ouput to FILE [stdout]\n");
+    err_printf("    -o --output     FILE    output to FILE [stdout]\n");
     err_printf("    -r --result      INT    output result mode [%d]\n", ABPOA_OUT_CONS);
     err_printf("                            - %d: consensus in FASTA format\n", ABPOA_OUT_CONS);
     err_printf("                            - %d: MSA in PIR format\n", ABPOA_OUT_MSA);
diff --git a/src/abpoa_align.c b/src/abpoa_align.c
index 00bf94e..7eca502 100644
--- a/src/abpoa_align.c
+++ b/src/abpoa_align.c
@@ -95,7 +95,7 @@ abpoa_para_t *abpoa_init_para(void) {
     abpt->align_mode = ABPOA_GLOBAL_MODE;
     abpt->gap_mode = ABPOA_CONVEX_GAP;
     abpt->zdrop = -1;     // disable zdrop
-    abpt->end_bonus = -1; // disable end bouns
+    abpt->end_bonus = -1; // disable end bonus
     abpt->wb = ABPOA_EXTRA_B; // extra bandwidth
     abpt->wf = ABPOA_EXTRA_F; // extra bandwidth
 
-- 
cgit v1.2.3


From 277dfb6becda549550d8074a4cc15ba732d5a2cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Sat, 23 Jul 2022 21:23:15 +0200
Subject: respect the CPU baseline.

Forwarded: not-needed
Last-Update: 2022-07-09

FIXME: implement multiple executables using SIMDe as distributed by Debian.
Last-Update: 2022-07-09
Gbp-Pq: Name baseline.patch
---
 CMakeLists.txt |  2 +-
 Makefile       | 26 +++++++++++++-------------
 setup.py       |  2 +-
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e4b642..3015c4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") # SIMD
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") # SIMD
 
 # build abPOA as a static library by default
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared")
diff --git a/Makefile b/Makefile
index af68387..a77b8d7 100644
--- a/Makefile
+++ b/Makefile
@@ -2,19 +2,19 @@
 EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
 CFLAGS      += $(CPPFLAGS) -Wall -O3 $(EXTRA_FLAGS) $(LDFLAGS)
 
-SIMD_FLAG   = -march=native
-
-ifneq ($(armv7),) # for ARMv7
-	SIMD_FLAG   =  -march=armv7-a -mfpu=neon -D__AVX2__
-else
-ifneq ($(armv8),) # for ARMv8
-ifneq ($(aarch64),) # for Aarch64 
-	SIMD_FLAG   =  -march=armv8-a+simd -D__AVX2__
-else # for Aarch32
-	SIMD_FLAG   =  -march=armv8-a+simd -mfpu=auto -D__AVX2__
-endif
-endif
-endif
+#SIMD_FLAG   = -march=native
+
+#ifneq ($(armv7),) # for ARMv7
+#	SIMD_FLAG   =  -march=armv7-a -mfpu=neon -D__AVX2__
+#else
+#ifneq ($(armv8),) # for ARMv8
+#ifneq ($(aarch64),) # for Aarch64 
+#	SIMD_FLAG   =  -march=armv8-a+simd -D__AVX2__
+#else # for Aarch32
+#	SIMD_FLAG   =  -march=armv8-a+simd -mfpu=auto -D__AVX2__
+#endif
+#endif
+#endif
 
 # for debug
 ifneq ($(debug),)
diff --git a/setup.py b/setup.py
index a082250..030df24 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@ setup(
                     include_dirs=[inc_dir],
                     depends=[src_dir+'abpoa.h', src_dir+'abpoa_align.h', src_dir+'abpoa_graph.h', src_dir+'abpoa_output.h', src_dir+'abpoa_seed.h', src_dir+'abpoa_seq.h', src_dir+'kalloc.h', src_dir+'khash.h', src_dir+'kdq.h', src_dir+'kseq.h', src_dir+'ksort.h', src_dir+'kstring.h', src_dir+'kvec.h', src_dir+'simd_abpoa_align.h', src_dir+'simd_instruction.h', src_dir+'utils.h', 'python/cabpoa.pxd'],
                     libraries = ['z', 'm', 'pthread'],
-                    extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde, simd_flag])],
+                    extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde])],
     install_requires=['cython'],
     cmdclass = cmdclass
 )
-- 
cgit v1.2.3


From bf8fda999d165314b0e13a08dc5349a90ae8f7d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Wed, 3 Aug 2022 13:15:25 +0200
Subject: refer to debian's simde headers.

Forwarded: not-needed
Last-Update: 2022-07-07

Last-Update: 2022-07-07
Gbp-Pq: Name debian-simde.patch
---
 include/simd_instruction.h | 8 ++++----
 src/simd_instruction.h     | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/simd_instruction.h b/include/simd_instruction.h
index 41deb16..064eb8c 100644
--- a/include/simd_instruction.h
+++ b/include/simd_instruction.h
@@ -21,15 +21,15 @@
 #include <immintrin.h>
 #else // use SIMDE
 #ifdef __AVX512F__
-#include "simde/simde/x86/avx512.h"
+#include <simde/x86/avx512.h>
 #else
 #ifdef __AVX2__
-#include "simde/simde/x86/avx2.h"
+#include <simde/x86/avx2.h>
 #else
 #ifdef __SSE4_1__
-#include "simde/simde/x86/sse4.1.h"
+#include <simde/x86/sse4.1.h>
 #else
-#include "simde/simde/x86/sse2.h"
+#include <simde/x86/sse2.h>
 #endif // end of sse41
 #endif // end of AVX2
 #endif // end of 512F
diff --git a/src/simd_instruction.h b/src/simd_instruction.h
index 41deb16..e70215a 100644
--- a/src/simd_instruction.h
+++ b/src/simd_instruction.h
@@ -21,15 +21,15 @@
 #include <immintrin.h>
 #else // use SIMDE
 #ifdef __AVX512F__
-#include "simde/simde/x86/avx512.h"
+#include <simde/x86/avx512.h>
 #else
 #ifdef __AVX2__
-#include "simde/simde/x86/avx2.h"
+#include <simde/x86/avx2.h>
 #else
 #ifdef __SSE4_1__
-#include "simde/simde/x86/sse4.1.h"
+#include <simde/x86/sse4.1.h>
 #else
-#include "simde/simde/x86/sse2.h"
+#include <simde/x86/sse2.h>
 #endif // end of sse41
 #endif // end of AVX2
 #endif // end of 512F
-- 
cgit v1.2.3


From f3e32e27963aacdea550f5dfddc012e8575ec572 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Wed, 3 Aug 2022 13:15:25 +0200
Subject: inject standard debian build flags.

Forwarded: not-needed
Last-Update: 2022-07-09

Last-Update: 2022-07-09
Gbp-Pq: Name buildflags.patch
---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 1284a68..af68387 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 #CC          = gcc
 EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
-CFLAGS      = -Wall -O3 $(EXTRA_FLAGS)
+CFLAGS      += $(CPPFLAGS) -Wall -O3 $(EXTRA_FLAGS) $(LDFLAGS)
 
 SIMD_FLAG   = -march=native
 
@@ -22,9 +22,9 @@ ifneq ($(debug),)
 endif
 # for gdb
 ifneq ($(gdb),)
-	CFLAGS   = -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
+	CFLAGS   += -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
 else
-	CFLAGS   = -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
+	CFLAGS   += -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
 endif
 
 # for gprof
-- 
cgit v1.2.3


From fccd6165b1dd181e50b0e75aa75620439cf733f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Wed, 3 Aug 2022 13:15:25 +0200
Subject: fix typos caught by lintian.

Forwarded: https://github.com/yangao07/abPOA/pull/40
Last-Update: 2022-07-09

Last-Update: 2022-07-09
Gbp-Pq: Name typos.patch
---
 src/abpoa.c       | 4 ++--
 src/abpoa_align.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/abpoa.c b/src/abpoa.c
index e9eb1d4..69a0afc 100644
--- a/src/abpoa.c
+++ b/src/abpoa.c
@@ -31,7 +31,7 @@ const struct option abpoa_long_opt [] = {
     { "extra-b", 1, NULL, 'b' },
     { "extra-f", 1, NULL, 'f' },
     { "zdrop", 1, NULL, 'z' },
-    { "bouns", 1, NULL, 'e' },
+    { "bonus", 1, NULL, 'e' },
 
     { "seeding", 0, NULL, 'S'},
     { "k-mer", 1, NULL, 'k' },
@@ -108,7 +108,7 @@ int abpoa_usage(void)
     err_printf("                            which will be aligned by abPOA to generate a consensus sequence\n");
     err_printf("    -i --incrmnt    FILE    incrementally align sequences to an existing graph/MSA [Null]\n");
     err_printf("                            graph could be in GFA or MSA format generated by abPOA\n");
-    err_printf("    -o --output     FILE    ouput to FILE [stdout]\n");
+    err_printf("    -o --output     FILE    output to FILE [stdout]\n");
     err_printf("    -r --result      INT    output result mode [%d]\n", ABPOA_OUT_CONS);
     err_printf("                            - %d: consensus in FASTA format\n", ABPOA_OUT_CONS);
     err_printf("                            - %d: MSA in PIR format\n", ABPOA_OUT_MSA);
diff --git a/src/abpoa_align.c b/src/abpoa_align.c
index 00bf94e..7eca502 100644
--- a/src/abpoa_align.c
+++ b/src/abpoa_align.c
@@ -95,7 +95,7 @@ abpoa_para_t *abpoa_init_para(void) {
     abpt->align_mode = ABPOA_GLOBAL_MODE;
     abpt->gap_mode = ABPOA_CONVEX_GAP;
     abpt->zdrop = -1;     // disable zdrop
-    abpt->end_bonus = -1; // disable end bouns
+    abpt->end_bonus = -1; // disable end bonus
     abpt->wb = ABPOA_EXTRA_B; // extra bandwidth
     abpt->wf = ABPOA_EXTRA_F; // extra bandwidth
 
-- 
cgit v1.2.3


From 0f2488f91458079242d455f58bcf5550b2cf92ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Wed, 3 Aug 2022 13:15:25 +0200
Subject: respect the CPU baseline.

Forwarded: not-needed
Last-Update: 2022-07-09

FIXME: implement multiple executables using SIMDe as distributed by Debian.
Last-Update: 2022-07-09
Gbp-Pq: Name baseline.patch
---
 CMakeLists.txt |  2 +-
 Makefile       | 26 +++++++++++++-------------
 setup.py       |  2 +-
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e4b642..3015c4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") # SIMD
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") # SIMD
 
 # build abPOA as a static library by default
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared")
diff --git a/Makefile b/Makefile
index af68387..a77b8d7 100644
--- a/Makefile
+++ b/Makefile
@@ -2,19 +2,19 @@
 EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
 CFLAGS      += $(CPPFLAGS) -Wall -O3 $(EXTRA_FLAGS) $(LDFLAGS)
 
-SIMD_FLAG   = -march=native
-
-ifneq ($(armv7),) # for ARMv7
-	SIMD_FLAG   =  -march=armv7-a -mfpu=neon -D__AVX2__
-else
-ifneq ($(armv8),) # for ARMv8
-ifneq ($(aarch64),) # for Aarch64 
-	SIMD_FLAG   =  -march=armv8-a+simd -D__AVX2__
-else # for Aarch32
-	SIMD_FLAG   =  -march=armv8-a+simd -mfpu=auto -D__AVX2__
-endif
-endif
-endif
+#SIMD_FLAG   = -march=native
+
+#ifneq ($(armv7),) # for ARMv7
+#	SIMD_FLAG   =  -march=armv7-a -mfpu=neon -D__AVX2__
+#else
+#ifneq ($(armv8),) # for ARMv8
+#ifneq ($(aarch64),) # for Aarch64 
+#	SIMD_FLAG   =  -march=armv8-a+simd -D__AVX2__
+#else # for Aarch32
+#	SIMD_FLAG   =  -march=armv8-a+simd -mfpu=auto -D__AVX2__
+#endif
+#endif
+#endif
 
 # for debug
 ifneq ($(debug),)
diff --git a/setup.py b/setup.py
index a082250..030df24 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@ setup(
                     include_dirs=[inc_dir],
                     depends=[src_dir+'abpoa.h', src_dir+'abpoa_align.h', src_dir+'abpoa_graph.h', src_dir+'abpoa_output.h', src_dir+'abpoa_seed.h', src_dir+'abpoa_seq.h', src_dir+'kalloc.h', src_dir+'khash.h', src_dir+'kdq.h', src_dir+'kseq.h', src_dir+'ksort.h', src_dir+'kstring.h', src_dir+'kvec.h', src_dir+'simd_abpoa_align.h', src_dir+'simd_instruction.h', src_dir+'utils.h', 'python/cabpoa.pxd'],
                     libraries = ['z', 'm', 'pthread'],
-                    extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde, simd_flag])],
+                    extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde])],
     install_requires=['cython'],
     cmdclass = cmdclass
 )
-- 
cgit v1.2.3


From 9ab5c7147b5920c53f04be427b13fe6cb1c54d72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Thu, 17 Aug 2023 23:18:44 +0200
Subject: refer to debian's simde headers.

Forwarded: not-needed
Last-Update: 2022-07-07

Last-Update: 2022-07-07
Gbp-Pq: Name debian-simde.patch
---
 include/simd_instruction.h | 8 ++++----
 src/simd_instruction.h     | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/simd_instruction.h b/include/simd_instruction.h
index 41deb16..064eb8c 100644
--- a/include/simd_instruction.h
+++ b/include/simd_instruction.h
@@ -21,15 +21,15 @@
 #include <immintrin.h>
 #else // use SIMDE
 #ifdef __AVX512F__
-#include "simde/simde/x86/avx512.h"
+#include <simde/x86/avx512.h>
 #else
 #ifdef __AVX2__
-#include "simde/simde/x86/avx2.h"
+#include <simde/x86/avx2.h>
 #else
 #ifdef __SSE4_1__
-#include "simde/simde/x86/sse4.1.h"
+#include <simde/x86/sse4.1.h>
 #else
-#include "simde/simde/x86/sse2.h"
+#include <simde/x86/sse2.h>
 #endif // end of sse41
 #endif // end of AVX2
 #endif // end of 512F
diff --git a/src/simd_instruction.h b/src/simd_instruction.h
index 41deb16..e70215a 100644
--- a/src/simd_instruction.h
+++ b/src/simd_instruction.h
@@ -21,15 +21,15 @@
 #include <immintrin.h>
 #else // use SIMDE
 #ifdef __AVX512F__
-#include "simde/simde/x86/avx512.h"
+#include <simde/x86/avx512.h>
 #else
 #ifdef __AVX2__
-#include "simde/simde/x86/avx2.h"
+#include <simde/x86/avx2.h>
 #else
 #ifdef __SSE4_1__
-#include "simde/simde/x86/sse4.1.h"
+#include <simde/x86/sse4.1.h>
 #else
-#include "simde/simde/x86/sse2.h"
+#include <simde/x86/sse2.h>
 #endif // end of sse41
 #endif // end of AVX2
 #endif // end of 512F
-- 
cgit v1.2.3


From 71a92c8ad3b470f053f1d179bed5a44add3af12b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Thu, 17 Aug 2023 23:18:44 +0200
Subject: inject standard debian build flags.

Forwarded: not-needed
Last-Update: 2022-07-09

Last-Update: 2022-07-09
Gbp-Pq: Name buildflags.patch
---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 1284a68..af68387 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 #CC          = gcc
 EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
-CFLAGS      = -Wall -O3 $(EXTRA_FLAGS)
+CFLAGS      += $(CPPFLAGS) -Wall -O3 $(EXTRA_FLAGS) $(LDFLAGS)
 
 SIMD_FLAG   = -march=native
 
@@ -22,9 +22,9 @@ ifneq ($(debug),)
 endif
 # for gdb
 ifneq ($(gdb),)
-	CFLAGS   = -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
+	CFLAGS   += -Wall -g ${DFLAGS} $(EXTRA_FLAGS)
 else
-	CFLAGS   = -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
+	CFLAGS   += -Wall -O3 ${DFLAGS} $(EXTRA_FLAGS)
 endif
 
 # for gprof
-- 
cgit v1.2.3


From 43b995f162499e4de2454a175629e9bb45138c43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Thu, 17 Aug 2023 23:18:44 +0200
Subject: fix typos caught by lintian.

Forwarded: https://github.com/yangao07/abPOA/pull/40
Last-Update: 2022-07-09

Last-Update: 2022-07-09
Gbp-Pq: Name typos.patch
---
 src/abpoa.c       | 4 ++--
 src/abpoa_align.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/abpoa.c b/src/abpoa.c
index e9eb1d4..69a0afc 100644
--- a/src/abpoa.c
+++ b/src/abpoa.c
@@ -31,7 +31,7 @@ const struct option abpoa_long_opt [] = {
     { "extra-b", 1, NULL, 'b' },
     { "extra-f", 1, NULL, 'f' },
     { "zdrop", 1, NULL, 'z' },
-    { "bouns", 1, NULL, 'e' },
+    { "bonus", 1, NULL, 'e' },
 
     { "seeding", 0, NULL, 'S'},
     { "k-mer", 1, NULL, 'k' },
@@ -108,7 +108,7 @@ int abpoa_usage(void)
     err_printf("                            which will be aligned by abPOA to generate a consensus sequence\n");
     err_printf("    -i --incrmnt    FILE    incrementally align sequences to an existing graph/MSA [Null]\n");
     err_printf("                            graph could be in GFA or MSA format generated by abPOA\n");
-    err_printf("    -o --output     FILE    ouput to FILE [stdout]\n");
+    err_printf("    -o --output     FILE    output to FILE [stdout]\n");
     err_printf("    -r --result      INT    output result mode [%d]\n", ABPOA_OUT_CONS);
     err_printf("                            - %d: consensus in FASTA format\n", ABPOA_OUT_CONS);
     err_printf("                            - %d: MSA in PIR format\n", ABPOA_OUT_MSA);
diff --git a/src/abpoa_align.c b/src/abpoa_align.c
index 00bf94e..7eca502 100644
--- a/src/abpoa_align.c
+++ b/src/abpoa_align.c
@@ -95,7 +95,7 @@ abpoa_para_t *abpoa_init_para(void) {
     abpt->align_mode = ABPOA_GLOBAL_MODE;
     abpt->gap_mode = ABPOA_CONVEX_GAP;
     abpt->zdrop = -1;     // disable zdrop
-    abpt->end_bonus = -1; // disable end bouns
+    abpt->end_bonus = -1; // disable end bonus
     abpt->wb = ABPOA_EXTRA_B; // extra bandwidth
     abpt->wf = ABPOA_EXTRA_F; // extra bandwidth
 
-- 
cgit v1.2.3


From 8c26ed0aa50a417e61ed20fae5a6ce469fe4b1da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=89tienne=20Mollier?= <emollier@debian.org>
Date: Thu, 17 Aug 2023 23:18:44 +0200
Subject: respect the CPU baseline.

Forwarded: not-needed
Last-Update: 2022-07-09

FIXME: implement multiple executables using SIMDe as distributed by Debian.
Gbp-Pq: Name baseline.patch
---
 CMakeLists.txt |  2 +-
 Makefile       | 26 +++++++++++++-------------
 setup.py       |  2 +-
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e4b642..3015c4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") # SIMD
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") # SIMD
 
 # build abPOA as a static library by default
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared")
diff --git a/Makefile b/Makefile
index af68387..a77b8d7 100644
--- a/Makefile
+++ b/Makefile
@@ -2,19 +2,19 @@
 EXTRA_FLAGS = -Wno-unused-function -Wno-misleading-indentation -DUSE_SIMDE -DSIMDE_ENABLE_NATIVE_ALIASES
 CFLAGS      += $(CPPFLAGS) -Wall -O3 $(EXTRA_FLAGS) $(LDFLAGS)
 
-SIMD_FLAG   = -march=native
-
-ifneq ($(armv7),) # for ARMv7
-	SIMD_FLAG   =  -march=armv7-a -mfpu=neon -D__AVX2__
-else
-ifneq ($(armv8),) # for ARMv8
-ifneq ($(aarch64),) # for Aarch64 
-	SIMD_FLAG   =  -march=armv8-a+simd -D__AVX2__
-else # for Aarch32
-	SIMD_FLAG   =  -march=armv8-a+simd -mfpu=auto -D__AVX2__
-endif
-endif
-endif
+#SIMD_FLAG   = -march=native
+
+#ifneq ($(armv7),) # for ARMv7
+#	SIMD_FLAG   =  -march=armv7-a -mfpu=neon -D__AVX2__
+#else
+#ifneq ($(armv8),) # for ARMv8
+#ifneq ($(aarch64),) # for Aarch64 
+#	SIMD_FLAG   =  -march=armv8-a+simd -D__AVX2__
+#else # for Aarch32
+#	SIMD_FLAG   =  -march=armv8-a+simd -mfpu=auto -D__AVX2__
+#endif
+#endif
+#endif
 
 # for debug
 ifneq ($(debug),)
diff --git a/setup.py b/setup.py
index a082250..030df24 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@ setup(
                     include_dirs=[inc_dir],
                     depends=[src_dir+'abpoa.h', src_dir+'abpoa_align.h', src_dir+'abpoa_graph.h', src_dir+'abpoa_output.h', src_dir+'abpoa_seed.h', src_dir+'abpoa_seq.h', src_dir+'kalloc.h', src_dir+'khash.h', src_dir+'kdq.h', src_dir+'kseq.h', src_dir+'ksort.h', src_dir+'kstring.h', src_dir+'kvec.h', src_dir+'simd_abpoa_align.h', src_dir+'simd_instruction.h', src_dir+'utils.h', 'python/cabpoa.pxd'],
                     libraries = ['z', 'm', 'pthread'],
-                    extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde, simd_flag])],
+                    extra_compile_args=['-O3', '-Wno-error=declaration-after-statement', simde])],
     install_requires=['cython'],
     cmdclass = cmdclass
 )
-- 
cgit v1.2.3