// Copyright 2016 The Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

#define R_TMP0 AX
#define R_TMP1 BX
#define R_LEN CX
#define R_OFF DX
#define R_SRC SI
#define R_DST DI
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- R_TMP0	scratch
//	- R_TMP1	scratch
//	- R_LEN		length or x (shared)
//	- R_OFF		offset
//	- R_SRC		&src[s]
//	- R_DST		&dst[d]
//	+ R_DBASE	dst_base
//	+ R_DLEN	dst_len
//	+ R_DEND	dst_base + dst_len
//	+ R_SBASE	src_base
//	+ R_SLEN	src_len
//	+ R_SEND	src_base + src_len
//	- R_TMP2	used by doCopy
//	- R_TMP3	used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
//
// Note: the frame must hold the three memmove arguments (24 bytes) plus four
// 8-byte spill slots (R_DST, R_SRC, R_LEN, R_OFF), so it is 56 bytes.
TEXT ·s2Decode(SB), NOSPLIT, $56-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DST
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SRC
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND
	XORQ R_OFF, R_OFF

loop:
	// for s < len(src)
	CMPQ R_SRC, R_SEND
	JEQ  end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (R_SRC), R_LEN
	MOVL    R_LEN, R_TMP1
	ANDL    $3, R_TMP1
	CMPL    R_TMP1, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, R_LEN
	CMPL R_LEN, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	INCQ R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVQ R_DEND, R_TMP0
	SUBQ R_DST, R_TMP0
	MOVQ R_SEND, R_TMP1
	SUBQ R_SRC, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ R_LEN, $16
	JGT  callMemmove
	CMPQ R_TMP0, $16
	JLT  callMemmove
	CMPQ R_TMP1, $16
	JLT  callMemmove
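
	// A rough Go rendering of the fast path below (a sketch to read along
	// with; the names mirror decode_other.go, but this exact snippet is not
	// from that file):
	//
	// if length <= 16 && len(dst)-d >= 16 && len(src)-s >= 16 {
	//   copy(dst[d:d+16], src[s:s+16]) // may copy more than length bytes
	//   d += length
	//   s += length
	//   continue
	// }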

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R_SRC), X0
	MOVOU X0, 0(R_DST)

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ R_LEN, R_TMP0
	JGT  errCorrupt
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill
	// those three registers, plus R_OFF, to the stack, to save local variables
	// across the CALL.
	MOVQ R_DST, 0(SP)
	MOVQ R_SRC, 8(SP)
	MOVQ R_LEN, 16(SP)
	MOVQ R_DST, 24(SP)
	MOVQ R_SRC, 32(SP)
	MOVQ R_LEN, 40(SP)
	MOVQ R_OFF, 48(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVQ 24(SP), R_DST
	MOVQ 32(SP), R_SRC
	MOVQ 40(SP), R_LEN
	MOVQ 48(SP), R_OFF
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ R_LEN, R_SRC
	SUBQ $58, R_SRC
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// case x == 60:
	CMPL R_LEN, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(R_SRC), R_LEN
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(R_SRC), R_LEN
	JMP     doLit

tagLit62Plus:
	CMPL R_LEN, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	//
	// We load 4 bytes and shift out the low byte. Reading one byte back is
	// safe, since that byte is just the tag, which we have already consumed.
	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	JMP  doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(R_SRC), R_LEN
	JMP  doLit

// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(R_SRC), R_OFF
	JMP     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(R_SRC), R_OFF
	JMP     doCopy
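
// In Go terms, the dispatch at tagCopy below is roughly the following sketch
// (decode_other.go expresses it as cases of a switch on src[s]&0x03):
//
// switch src[s] & 0x03 {
// case tagCopy1: // 2-byte tag, 11-bit offset; also encodes repeats when the offset bits are zero
// case tagCopy2: // 3-byte tag, 16-bit offset
// case tagCopy4: // 5-byte tag, 32-bit offset
// }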

tagCopy:
	// We have a copy tag. We assume that:
	//	- R_TMP1 == src[s] & 0x03
	//	- R_LEN == src[s]
	CMPQ R_TMP1, $2
	JEQ  tagCopy2
	JA   tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	// length = 4 + int(src[s-2])>>2&0x7
	MOVBQZX -1(R_SRC), R_TMP1
	MOVQ    R_LEN, R_TMP0
	SHRQ    $2, R_LEN
	ANDQ    $0xe0, R_TMP0
	ANDQ    $7, R_LEN
	SHLQ    $3, R_TMP0
	ADDQ    $4, R_LEN
	ORQ     R_TMP1, R_TMP0

	// Check if this is a repeat code: offset == 0 (ZF set by the ORQ above).
	JZ repeatCode

	// This is a regular copy; transfer the temporary value to R_OFF (the offset).
	MOVQ R_TMP0, R_OFF
	JMP  doCopy

// This is a repeat code.
repeatCode:
	// If length < 9, reuse the last offset, with the length already calculated.
	CMPQ R_LEN, $9
	JL   doCopyRepeat

	// Read additional bytes for length.
	JE repeatLen1

	// Rare, so the extra branch shouldn't hurt too much.
	CMPQ R_LEN, $10
	JE   repeatLen2
	JMP  repeatLen3

// Read repeat lengths.
repeatLen1:
	// s++
	ADDQ $1, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = int(src[s-1]) + 8
	MOVBQZX -1(R_SRC), R_LEN
	ADDL    $8, R_LEN
	JMP     doCopyRepeat

repeatLen2:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) + 4
	MOVWQZX -2(R_SRC), R_LEN
	ADDL    $260, R_LEN
	JMP     doCopyRepeat

repeatLen3:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) + 4
	// Read one byte further back (just part of the tag, shifted out).
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	ADDL $65540, R_LEN
	JMP  doCopyRepeat

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- R_LEN == length && R_LEN > 0
	//	- R_OFF == offset

	// if d < offset { etc }
	MOVQ R_DST, R_TMP1
	SUBQ R_DBASE, R_TMP1
	CMPQ R_TMP1, R_OFF
	JLT  errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
	// if offset <= 0 { etc }
	CMPQ R_OFF, $0
	JLE  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R_DEND, R_TMP1
	SUBQ R_DST, R_TMP1
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R_TMP2 = len(dst)-d
	//	- R_TMP3 = &dst[d-offset]
	MOVQ R_DEND, R_TMP2
	SUBQ R_DST, R_TMP2
	MOVQ R_DST, R_TMP3
	SUBQ R_OFF, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ R_LEN, $16
	JGT  slowForwardCopy
	CMPQ R_OFF, $8
	JLT  slowForwardCopy
	CMPQ R_TMP2, $16
	JLT  slowForwardCopy
	MOVQ 0(R_TMP3), R_TMP0
	MOVQ R_TMP0, 0(R_DST)
	MOVQ 8(R_TMP3), R_TMP1
	MOVQ R_TMP1, 8(R_DST)
	ADDQ R_LEN, R_DST
	JMP  loop
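
	// A compact Go sketch of the slow path that follows (an approximation to
	// aid reading; the commentary and code below are authoritative):
	//
	// for offset < 8 { // expand a short repeating pattern in place
	//   copy(dst[d:d+8], dst[d-offset:]) // copy has memmove semantics
	//   length -= offset
	//   d += offset
	//   offset += offset // note that d-offset is unchanged
	// }
	// out, from := d, d-offset
	// d += length // final d; the writes below may run past it by up to 10 bytes
	// for length > 0 {
	//   copy(dst[out:out+8], dst[from:from+8])
	//   out, from, length = out+8, from+8, length-8
	// }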

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load/stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//    abxxxxxxxxxxxx
	//    [------]           d-offset
	//      [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//    ababxxxxxxxxxx
	//    [------]           d-offset
	//        [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R_TMP2
	CMPQ R_LEN, R_TMP2
	JGT  verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here on.
	MOVQ R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together mean that d-offset, and therefore
	//   // R_TMP3, is unchanged.
	// }
	CMPQ R_TMP2, $8
	JGE  fixUpSlowForwardCopy
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_DST)
	SUBQ R_TMP2, R_LEN
	ADDQ R_TMP2, R_DST
	ADDQ R_TMP2, R_TMP2
	JMP  makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so
	// that, if length is positive, copying the remaining length bytes will
	// write to the right place.
	MOVQ R_DST, R_TMP0
	ADDQ R_LEN, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ R_LEN, $0
	JLE  loop
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_TMP0)
	ADDQ $8, R_TMP3
	ADDQ $8, R_TMP0
	SUBQ $8, R_LEN
	JMP  finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	INCQ R_TMP3
	INCQ R_DST
	DECQ R_LEN
	JNZ  verySlowForwardCopy
	JMP  loop

// The code above handles copy tags.
// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ R_DST, R_DEND
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET
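
// For reference, the Go-side stub that this TEXT symbol implements looks
// roughly like the following (a sketch; the real declaration lives in the
// package's decode_amd64.go, and the //go:noescape directive here is an
// assumption, mirroring the upstream snappy stub):
//
// //go:noescape
// func s2Decode(dst, src []byte) int
//
// A return value of 0 means success; 1 is decodeErrCodeCorrupt.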