PNGdec: Sync with upstream.

* Fixed pixel conversion of 1-bpp grayscale to RGB565
* Improved performance on systems which allow unaligned memory accesses
* Corrected the optimized code so that it no longer goes past the end of the buffer
* General speed improvements
This commit is contained in:
Laurence Bank 2024-03-26 12:28:41 +00:00 committed by Phil Howard
parent 6eb0f90e53
commit fca0bb076a
4 changed files with 174 additions and 59 deletions

View File

@ -8,6 +8,20 @@
#include "inflate.h"
#include "inffast.h"
/* ALLOWS_UNALIGNED enables the code paths below that load/store multi-byte
   values through casted byte pointers. It is defined when INTPTR_MAX equals
   INT64_MAX, or when one of the listed platform macros is defined. */
/* NOTE(review): INTPTR_MAX and INT64_MAX are <stdint.h> macros. If that header
   has not been included before this point, both evaluate to 0 inside #if and
   the comparison is true -- confirm the include order. */
#if (INTPTR_MAX == INT64_MAX) || defined(HAL_ESP32_HAL_H_) || defined(TEENSYDUINO) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM7)
#define ALLOWS_UNALIGNED
#endif
/* Select the working integer types from the pointer width:
   REGISTER_WIDTH - 64 or 32
   BIGUINT        - unsigned type of REGISTER_WIDTH bits (used for the local bit accumulator `hold`)
   SMALLUINT      - unsigned type of half that width (the unit fetched from `in` in one access) */
#if INTPTR_MAX == INT64_MAX
#define REGISTER_WIDTH 64
typedef uint64_t BIGUINT;
typedef uint32_t SMALLUINT;
#else
#define REGISTER_WIDTH 32
typedef uint32_t BIGUINT;
typedef uint16_t SMALLUINT;
#endif // native register size
#ifdef ASMINF
# pragma message("Assembler code may have bugs -- use at your own risk")
#else
@ -64,7 +78,8 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
unsigned whave; /* valid bytes in the window */
unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
unsigned long hold; /* local strm->hold */
BIGUINT hold, tmpbits; /* local strm->hold */
// unsigned long hold; /* local strm->hold */
unsigned bits; /* local strm->bits */
code const FAR *lcode; /* local strm->lencode */
code const FAR *dcode; /* local strm->distcode */
@ -101,11 +116,18 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
/* decode literals and length/distances until end-of-block or not enough
input data or output space */
do {
if (bits < 15) {
if (bits < (REGISTER_WIDTH/2)) { // helps on 32 and 64-bit CPUs
#ifdef ALLOWS_UNALIGNED
tmpbits = *(SMALLUINT *)in;
hold |= (BIGUINT)(tmpbits << bits);
in += sizeof(SMALLUINT);
bits += (REGISTER_WIDTH / 2);
#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
#endif
}
here = lcode[hold & lmask];
dolen:
@ -123,20 +145,29 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
len = (unsigned)(here.val);
op &= 15; /* number of extra bits */
if (op) {
#if REGISTER_WIDTH == 32
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
hold += (uint32_t)(*in++) << bits;
bits += 8;
}
#endif
len += (unsigned)hold & ((1U << op) - 1);
hold >>= op;
bits -= op;
}
Tracevv((stderr, "inflate: length %u\n", len));
if (bits < 15) {
/* Refill the bit accumulator before the distance-code lookup: when fewer than
   half a register's worth of bits remain, append more input above the `bits`
   bits already held in `hold`. */
if (bits < (REGISTER_WIDTH/2)) { // helps on 32 and 64-bit CPUs
/* NOTE(review): this guard tests UNALIGNED_OK, whereas the equivalent refill
   before the length lookup and the copy loops in this file test
   ALLOWS_UNALIGNED. UNALIGNED_OK is not defined in the visible part of this
   file, so unless it is supplied elsewhere the #else branch is what gets
   compiled here -- confirm which macro is intended. */
#ifdef UNALIGNED_OK
// single SMALLUINT-sized load from the input pointer (may be unaligned)
tmpbits = *(SMALLUINT *)in;
hold |= (BIGUINT)(tmpbits << bits);
in += sizeof(SMALLUINT);
bits += (REGISTER_WIDTH / 2);
#else
// portable path: append two input bytes, one at a time (16 bits in total)
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
#endif
}
here = dcode[hold & dmask];
dodist:
@ -147,14 +178,22 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
if (op & 16) { /* distance base */
dist = (unsigned)(here.val);
op &= 15; /* number of extra bits */
#if REGISTER_WIDTH == 32
if (bits < op) {
#ifdef ALLOWS_UNALIGNED
hold |= (*(uint16_t *)in << bits);
bits += 16;
in += 2;
#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
if (bits < op) {
if (bits < op) { // this is NEVER true
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
#endif // ALLOWS_UNALIGNED
}
#endif // 32-bit CPU
dist += (unsigned)hold & ((1U << op) - 1);
#ifdef INFLATE_STRICT
if (dist > dmax) {
@ -236,12 +275,18 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
from = out - dist; /* rest from output */
}
}
// if (len > 50 && len < dist) {
// memmove(out, from, len);
// out += len;
// from += len;
// len = 0;
// } else {
#ifdef ALLOWS_UNALIGNED
{
uint8_t *pEnd = out+len;
while (out < pEnd) {
*(uint32_t *)out = *(uint32_t *)from;
out += 4;
from += 4;
}
// correct for possible overshoot of destination ptr
out = pEnd;
}
#else
while (len > 2) {
*out++ = *from++;
*out++ = *from++;
@ -253,22 +298,38 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
if (len > 1)
*out++ = *from++;
}
// }
#endif // ALLOWS_UNALIGNED
}
else {
from = out - dist; /* copy direct from output */
// Larry Bank added -
// For relatively large runs, it's faster to let memmove
// use whatever code is efficient on the target platform
// if (dist == 1) { // frequent case for images
// memset(out, *from, len);
// out += len;
// } else if (len > 50 && len < dist) {
// memmove(out, from, len);
// out += len;
// from += len;
// len = 0;
// } else {
#ifdef ALLOWS_UNALIGNED
{
uint8_t *pEnd = out+len;
int overlap = (int)(intptr_t)(out-from);
if (overlap >= 4) { // overlap of source/dest won't impede normal copy
while (out < pEnd) {
*(uint32_t *)out = *(uint32_t *)from;
out += 4;
from += 4;
}
// correct for possible overshoot of destination ptr
out = pEnd;
} else if (overlap == 1) { // copy 1-byte pattern
uint32_t pattern = *from;
pattern = pattern | (pattern << 8);
pattern = pattern | (pattern << 16);
while (out < pEnd) {
*(uint32_t *)out = pattern;
out += 4;
}
out = pEnd; // correct possible overshoot
} else { // overlap of 2 or 3
while (out < pEnd) {
*out++ = *from++;
}
}
}
#else
do { /* minimum length is three */
*out++ = *from++;
*out++ = *from++;
@ -280,7 +341,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
if (len > 1)
*out++ = *from++;
}
// }
#endif // ALLOWS_UNALIGNED
}
}
else if ((op & 64) == 0) { /* 2nd level distance code */
@ -310,10 +371,10 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
} while (in < last && out < end);
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
len = bits >> 3;
in -= len;
bits -= len << 3;
hold &= (1U << bits) - 1;
// len = bits >> 3;
// in -= len;
// bits -= len << 3;
// hold &= (1 << bits) - 1;
/* update state and return */
strm->next_in = in;

View File

@ -85,6 +85,10 @@
#include "inflate.h"
#include "inffast.h"
/* ALLOWS_UNALIGNED selects the 32-bit-at-a-time match copy further down in
   this file. It is defined when INTPTR_MAX equals INT64_MAX, or when one of
   the listed platform macros is defined. */
/* NOTE(review): INTPTR_MAX and INT64_MAX come from <stdint.h>; if they are not
   defined at this point both evaluate to 0 inside #if and the comparison is
   true -- confirm the include order. */
#if (INTPTR_MAX == INT64_MAX) || defined(HAL_ESP32_HAL_H_) || defined(TEENSYDUINO) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM7)
#define ALLOWS_UNALIGNED
#endif
#ifdef MAKEFIXED
# ifndef BUILDFIXED
# define BUILDFIXED
@ -262,7 +266,8 @@ int value;
state->bits = 0;
return Z_OK;
}
if (bits > 16 || state->bits + (uInt)bits > 32) return Z_STREAM_ERROR;
if (bits > 16 || state->bits + (uInt)bits > 32)
return Z_STREAM_ERROR;
value &= (1L << bits) - 1;
state->hold += (unsigned)value << state->bits;
state->bits += (uInt)bits;
@ -1191,9 +1196,39 @@ int check_crc;
if (copy > left) copy = left;
left -= copy;
state->length -= copy;
#ifdef ALLOWS_UNALIGNED
{
    /* Copy `copy` bytes of the match from `from` to `put`.
       32-bit accesses are used only where they produce the same bytes as a
       byte-by-byte copy, and no store ever lands at or beyond pEnd: `copy`
       was clamped to `left` just above, so pEnd can coincide with the end of
       the space the caller made available (presumably the output buffer). */
    uint8_t *pEnd = put+copy;
    int overlap = (int)(intptr_t)(put-from); // distance from source to destination
    if (overlap >= 4) { // overlap of source/dest won't impede normal copy
        while (pEnd - put >= 4) { // only whole words that fit inside [put, pEnd)
            *(uint32_t *)put = *(uint32_t *)from;
            put += 4;
            from += 4;
        }
        while (put < pEnd) { // tail end
            *put++ = *from++;
        }
    } else if (overlap == 1) { // copy 1-byte pattern
        uint8_t fill = *from;
        uint32_t pattern = fill;
        pattern = pattern | (pattern << 8);
        pattern = pattern | (pattern << 16);
        while (pEnd - put >= 4) { // whole words only; never store past pEnd
            *(uint32_t *)put = pattern;
            put += 4;
        }
        while (put < pEnd) { // tail end, one byte at a time
            *put++ = fill;
        }
    } else { // overlap of 2 or 3 (or a source that does not precede the destination)
        while (put < pEnd) {
            *put++ = *from++;
        }
    }
}
#else
do {
    *put++ = *from++;
} while (--copy);
#endif // ALLOWS_UNALIGNED
if (state->length == 0) state->mode = LEN;
break;
case LIT:

View File

@ -98,7 +98,8 @@ struct inflate_state {
unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if needed */
/* bit accumulator */
unsigned long hold; /* input bit accumulator */
uint64_t hold; /* input bit accumulator */
// unsigned long hold; /* input bit accumulator */
unsigned bits; /* number of bits in "in" */
/* for string and stored block copying */
unsigned length; /* literal or length of data to copy */

View File

@ -243,15 +243,33 @@ PNG_STATIC void PNGRGB565(PNGDRAW *pDraw, uint16_t *pPixels, int iEndiannes, uin
}
break;
case PNG_PIXEL_GRAYSCALE:
for (x=0; x<pDraw->iWidth; x++) {
c = *s++;
usPixel = (c >> 3); // blue
usPixel |= ((c >> 2) << 5); // green
usPixel |= ((c >> 3) << 11); // red
if (iEndiannes == PNG_RGB565_BIG_ENDIAN)
usPixel = __builtin_bswap16(usPixel);
*pDest++ = usPixel;
}
/* Grayscale source line: turn every gray sample into one 16-bit destination
   pixel, with a separate loop per source bit depth. */
/* NOTE(review): only depths 8 and 1 are handled and there is no default, so
   any other iBpp value leaves pDest untouched -- confirm whether other
   grayscale depths can reach this point. */
switch (pDraw->iBpp) {
case 8:
// one source byte per pixel; the same gray value feeds all three channels
for (x=0; x<pDraw->iWidth; x++) {
c = *s++;
usPixel = (c >> 3); // blue
usPixel |= ((c >> 2) << 5); // green
usPixel |= ((c >> 3) << 11); // red
if (iEndiannes == PNG_RGB565_BIG_ENDIAN)
usPixel = __builtin_bswap16(usPixel);
*pDest++ = usPixel;
}
break;
case 1:
// eight pixels per source byte, taken from the most significant bit down
for (x=0; x<pDraw->iWidth; x++) {
if ((x & 7) == 0) { // start of a new group of 8 pixels: fetch the next byte
c = *s++;
}
// set bit -> 0xffff, clear bit -> 0; no byte swap is applied in this loop
// (both values read the same with their bytes swapped)
if (c & 0x80) {
usPixel = 0xffff;
} else {
usPixel = 0;
}
*pDest++ = usPixel;
c <<= 1; // move the next pixel's bit up into bit 7
}
break;
} // switch on bpp
break;
case PNG_PIXEL_TRUECOLOR:
for (x=0; x<pDraw->iWidth; x++) {
@ -302,15 +320,15 @@ PNG_STATIC void PNGRGB565(PNGDRAW *pDraw, uint16_t *pPixels, int iEndiannes, uin
}
break;
case 1:
for (x=0; x<pDraw->iWidth; x+=4) {
c = *s++;
for (j=0; j<8; j++) { // work on pairs of bits
usPixel = pDraw->pFastPalette[c >> 7];
if (iEndiannes == PNG_RGB565_BIG_ENDIAN)
usPixel = __builtin_bswap16(usPixel);
*pDest++ = usPixel;
c <<= 1;
for (x=0; x<pDraw->iWidth; x++) {
if ((x & 7) == 0) {
c = *s++;
}
usPixel = pDraw->pFastPalette[c >> 7];
if (iEndiannes == PNG_RGB565_BIG_ENDIAN)
usPixel = __builtin_bswap16(usPixel);
*pDest++ = usPixel;
c <<= 1;
}
break;
} // switch on bpp
@ -379,18 +397,18 @@ PNG_STATIC void PNGRGB565(PNGDRAW *pDraw, uint16_t *pPixels, int iEndiannes, uin
}
break;
case 1:
for (x=0; x<pDraw->iWidth; x+=4) {
c = *s++;
for (j=0; j<8; j++) { // work on pairs of bits
pPal = &pDraw->pPalette[(c >> 7) * 3];
usPixel = (pPal[2] >> 3); // blue
usPixel |= ((pPal[1] >> 2) << 5); // green
usPixel |= ((pPal[0] >> 3) << 11); // red
if (iEndiannes == PNG_RGB565_BIG_ENDIAN)
usPixel = __builtin_bswap16(usPixel);
*pDest++ = usPixel;
c <<= 1;
for (x=0; x<pDraw->iWidth; x++) {
if ((x & 7) == 0) {
c = *s++;
}
pPal = &pDraw->pPalette[(c >> 7) * 3];
usPixel = (pPal[2] >> 3); // blue
usPixel |= ((pPal[1] >> 2) << 5); // green
usPixel |= ((pPal[0] >> 3) << 11); // red
if (iEndiannes == PNG_RGB565_BIG_ENDIAN)
usPixel = __builtin_bswap16(usPixel);
*pDest++ = usPixel;
c <<= 1;
}
break;
} // switch on bits per pixel