From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 18 Feb 2020 11:35:53 +0000
Subject: [PATCH] Added support for big endian platforms.
Forwarded: https://github.com/jkbonfield/htscodecs/commit/91323f5fa5142239116048f5ef640a81acc64fbb.patch

Mostly this is just via endian agnostic ways of doing things, but the
16-bit rans is slightly better if it can make endianness assumptions
so we also now have a new header file to detect endianness.

If the detection fails it uses the case code path as big endian, so
this is safe.

diff --git a/htscodecs/htscodecs_endian.h b/htscodecs/htscodecs_endian.h
new file mode 100644
index 0000000..e038aec
--- /dev/null
+++ b/htscodecs/htscodecs_endian.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2020 Genome Research Ltd.
+ * Author(s): James Bonfield
+ * 
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met:
+ * 
+ *    1. Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimer.
+ * 
+ *    2. Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials provided
+ *       with the distribution.
+ * 
+ *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
+ *    Institute nor the names of its contributors may be used to endorse
+ *    or promote products derived from this software without specific
+ *    prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
+ * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef HTSCODECS_ENDIAN_H
+#define HTSCODECS_ENDIAN_H
+
+// Endianness checking.
+
+// Sets HTSCODECS_ENDIAN_KNOWN if system type detected and either
+// HTSCODECS_LITTLE_ENDIAN or HTSCODECS_BIG_ENDIAN.
+
+/*
+ * In general our preferred route is to write code in an endian agnostic
+ * fashion, but our data formats are natively little endian.  Therefore
+ * in time critical code it's sometimes best to exploit that.
+ *
+ * Therefore we'll optimise code along the lines of:
+ *
+ * #ifdef HTSCODECS_LITTLE_ENDIAN
+ *     // do something little endian specific
+ * #else
+ *     // do something in an endian agnostic fashion
+ * #endif
+ *
+ * This means our code works even if we fail to recognise the
+ * specific machine type.
+ */
+
+#if (defined(__i386__)      \
+ ||  defined(__i386)        \
+ ||  defined(__amd64__)     \
+ ||  defined(__amd64)       \
+ ||  defined(__x86_64__)    \
+ ||  defined(__x86_64)      \
+ ||  defined(__i686__)      \
+ ||  defined(__i686))       \
+ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
+ ||  defined(__LITTLE_ENDIAN__)                                            \
+ ||  defined(__ARMEL__)     \
+ ||  defined(__THUMBEL__)   \
+ ||  defined(__AARCH64EL__) \
+ ||  defined(_MIPSEL)       \
+ ||  defined(__MIPSEL)      \
+ ||  defined(__MIPSEL__) 
+    // Little endian
+#   define HTSCODECS_LITTLE_ENDIAN
+#   define HTSCODECS_ENDIAN_KNOWN
+#elif (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) \
+   ||  defined(__BIG_ENDIAN__) \
+   || defined(__ARMEB__)       \
+   || defined(__THUMBEB__)     \
+   || defined(__AAARCHEB__)    \
+   || defined(_MIPSEB)         \
+   || defined(__MIPSEB)        \
+   || defined(__MIPSEB__)
+    // Big endian
+#   define HTSCODECS_BIG_ENDIAN
+#   define HTSCODECS_ENDIAN_KNOWN
+#else
+//    Unknown - code will need to check HTSCODES_ENDIAN_KNOWN and do endian agnostic
+#endif
+
+#endif /* HTSCODECS_ENDIAN_H */
diff --git a/htscodecs/rANS_word.h b/htscodecs/rANS_word.h
index f9c2d5b..0edfca4 100644
--- a/htscodecs/rANS_word.h
+++ b/htscodecs/rANS_word.h
@@ -19,6 +19,7 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <assert.h>
+#include "htscodecs_endian.h"
 
 #ifdef assert
 #define RansAssert assert
@@ -285,19 +286,22 @@ static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol
     uint32_t x = *r;
     uint32_t x_max = sym->x_max;
 
-//    uint32_t c = x < sym->x_max;
-//    uint16_t *p16 = (uint16_t *)(*pptr-2);
-//    uint32_t p1 = x, x1 = x >> 16;
-//    *p16  = c ? *p16 : p1;
-//    *pptr = c ? *pptr : (uint8_t *)p16;
-//    x     = c ? x : x1;
-
+#ifdef HTSCODECS_LITTLE_ENDIAN
+    if (x >= x_max) {
+	(*pptr) -= 2;
+        **(uint16_t **)pptr = x;
+	x >>= 16;
+    }
+#else
     if (x >= x_max) {
-	uint16_t* ptr = *(uint16_t **)pptr;
-	*--ptr = x;//(uint16_t) (x & 0xffff);
+	uint8_t* ptr = *pptr;
+        ptr -= 2;
+	ptr[0] = x & 0xff;
+	ptr[1] = (x >> 8) & 0xff;
 	x >>= 16;
-	*pptr = (uint8_t *)ptr;
+	*pptr = ptr;
     }
+#endif
 
     // x = C(s,x)
     // NOTE: written this way so we get a 32-bit "multiply high" when
@@ -342,10 +346,8 @@ static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* s
 
 // Renormalize.
 
-// FIXME: this is endian specific.  We need big endian versions of these
-// functions.
-
 #if defined(__x86_64) && !defined(__ILP32__)
+
 /*
  * Assembly variants of the RansDecRenorm code.
  * These are based on joint ideas from Rob Davies and from looking at
@@ -381,16 +383,35 @@ static inline void RansDecRenorm(RansState* r, uint8_t** pptr)
     // renormalize
     uint32_t x = *r;
 
-    //       q4        q40
-    // clang 801/540   750/438 (generated cmov; equiv to asm above)
-    // gcc8  896/624   352/286 (conditionals)
-    uint16_t* ptr = *(uint16_t **)pptr;
-    uint32_t y = *ptr;
+    // Generally this is faster than asm for highly predictable data
+    // (branch prediction is efficient), but slower for complex data.
+    //
+    // However we'll be using RLE and/or PACK for highly predictable
+    // data, turning into smaller but less predictable as far as
+    // entropy encoding goes.  So the latter is what we tune for.
+    // We do however still have to use this code on big endian systems.
+
+    // Times for asm code (above) / this code (LE) / this code (BE).
+    //
+    //       q4-0      q40-0
+    // clang 648/903/904   648/392/391  this faster q4, slower q40
+    // gcc7  650/851/851   650/394/393
+    //       q4-1      q40-1(OPT)
+    // clang 487/567/568   445/310/312  this faster q4, slower q40
+    // gcc7  413/497/498   381/282/282
+    //       q4-193(OPT)
+    // clang 990/907/903                this slower
+    // gcc7  952/875/868                this slower
+#ifdef HTSCODECS_LITTLE_ENDIAN
+    uint32_t y = **(uint16_t **)pptr; // 32-bit quicker here
+#else
+    uint16_t y = **(uint16_t **)pptr;
+    y = (y<<8) | (y>>8);
+#endif
     if (x < RANS_BYTE_L)
-	ptr++;
+	(*pptr)+=2;
     if (x < RANS_BYTE_L)
 	x = (x << 16) | y;
-    *pptr = (uint8_t *)ptr;
 
     *r = x;
 }
@@ -400,9 +421,15 @@ static inline void RansDecRenormSafe(RansState* r, uint8_t** pptr, uint8_t *ptr_
 {
     uint32_t x = *r;
     if (x >= RANS_BYTE_L || *pptr+1 >= ptr_end) return;
+#ifdef HTSCODECS_LITTLE_ENDIAN
     uint16_t* ptr = *(uint16_t **)pptr;
     x = (x << 16) | *ptr++;
     *pptr = (uint8_t *)ptr;
+#else
+    uint16_t y = (*pptr)[0] + ((*pptr)[1]<<8);
+    x = (x << 16) | y;
+    (*pptr) += 2;
+#endif
     *r = x;
 }
 
diff --git a/htscodecs/tokenise_name3.c b/htscodecs/tokenise_name3.c
index 4cb10d2..3e05e3f 100644
--- a/htscodecs/tokenise_name3.c
+++ b/htscodecs/tokenise_name3.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Genome Research Ltd.
+ * Copyright (c) 2016-2020 Genome Research Ltd.
  * Author(s): James Bonfield
  * 
  * Redistribution and use in source and binary forms, with or without 
@@ -362,8 +362,11 @@ static int encode_token_int(name_context *ctx, int ntok,
     if (encode_token_type(ctx, ntok, type) < 0) return -1;
     if (descriptor_grow(&ctx->desc[id], 4) < 0)	return -1;
 
-    // Assumes little endian and unalign access OK.
-    *(uint32_t *)(ctx->desc[id].buf + ctx->desc[id].buf_l) = val;
+    uint8_t *cp = &ctx->desc[id].buf[ctx->desc[id].buf_l];
+    cp[0] = (val >>  0) & 0xff;
+    cp[1] = (val >>  8) & 0xff;
+    cp[2] = (val >> 16) & 0xff;
+    cp[3] = (val >> 24) & 0xff;
     ctx->desc[id].buf_l += 4;
 
     return 0;
@@ -377,8 +380,8 @@ static int decode_token_int(name_context *ctx, int ntok,
     if (ctx->desc[id].buf_l + 4 > ctx->desc[id].buf_a)
 	return -1;
 
-    // Assumes little endian and unalign access OK.
-    *val = *(uint32_t *)(ctx->desc[id].buf + ctx->desc[id].buf_l);
+    uint8_t *cp = ctx->desc[id].buf + ctx->desc[id].buf_l;
+    *val = (cp[0]) + (cp[1]<<8) + (cp[2]<<16) + (cp[3]<<24);
     ctx->desc[id].buf_l += 4;
 
     return 0;
@@ -1478,12 +1481,18 @@ uint8_t *encode_names(char *blk, int len, int level, int use_arith,
     }
 
     uint8_t *cp = out;
-    //*out_len = tot_size+4;
-    //*(uint32_t *)cp = tot_size;   cp += 4;
 
     *out_len = tot_size;
-    *(uint32_t *)cp = last_start; cp += 4;
-    *(uint32_t *)cp = nreads;     cp += 4;
+//    *(uint32_t *)cp = last_start; cp += 4;
+//    *(uint32_t *)cp = nreads;     cp += 4;
+    *cp++ = (last_start >>  0) & 0xff;
+    *cp++ = (last_start >>  8) & 0xff;
+    *cp++ = (last_start >> 16) & 0xff;
+    *cp++ = (last_start >> 24) & 0xff;
+    *cp++ = (nreads     >>  0) & 0xff;
+    *cp++ = (nreads     >>  8) & 0xff;
+    *cp++ = (nreads     >> 16) & 0xff;
+    *cp++ = (nreads     >> 24) & 0xff;
     *cp++ = use_arith;
     //write(1, &nreads, 4);
     int last_tnum = -1;
@@ -1524,12 +1533,14 @@ uint8_t *decode_names(uint8_t *in, uint32_t sz, uint32_t *out_len) {
 	return NULL;
 
     int i, o = 9;
-    int ulen   = *(uint32_t *)in;
+    //int ulen   = *(uint32_t *)in;
+    int ulen   = (in[0]<<0) | (in[1]<<8) | (in[2]<<16) | (in[3]<<24);
 
     if (ulen < 0 || ulen >= INT_MAX-1024)
 	return NULL;
 
-    int nreads = *(uint32_t *)(in+4);
+    //int nreads = *(uint32_t *)(in+4);
+    int nreads = (in[4]<<0) | (in[5]<<8) | (in[6]<<16) | (in[7]<<24);
     int use_arith = in[8];
     name_context *ctx = create_context(nreads);
     if (!ctx)
