Updated Sensor API (markdown)

2019-06-25 23:36:38 +02:00 · 2019-06-25 23:36:38 +02:00 · af1657a4dc
parent 6aa8cce866
commit af1657a4dc
1 changed files with 439 additions and 0 deletions
--- a/Sensor-API.md
+++ b/Sensor-API.md
@ -311,3 +311,442 @@ This pre-processor directive saves RAM by storing strings in flash instead of RA
 This pre-processor directive saves RAM by storing strings in flash instead of RAM.

 You may then reference them directly (if the type matches the parameter required) or force it to 4 byte alignment by using the variable as `FPSTR(MyTextStaticVariable)`
+
+# Keeping ESP8266 code compact
+
+Below are various tips and tricks to keep ESP8266 code compact and save both Flash and Memory. Flash code is limited to 1024k but keep in mind that to allow OTA upgrade, you need Flash memory to contain two firmwares at the same time. To go beyond 512k, you typically use `sonoff-minimal` as an intermediate firmware. `sonoff-minimal` takes roughly 360k, so it's safe not to go beyond 620k of Flash. Memory is even more limited: 80k. With Arduino Core and basic Tasmota, there are 25k-30k left of heap space. Heap memory is very precious: running out of memory will generally cause a crash.
+
+## About ESP8266
+
+ESP8266 is based on [Xtensa instruction set](https://0x04.net/~mwk/doc/xtensa.pdf). Xtensa is a 32 bits RISC processor core, containing 16 x 32 bits registers. ESP8266 supports integer operations, including 32x32 multiplication. It does not contain an FPU for floating point operations, nor integer divisions.
+
+Contrary to classical RISC processors, all instructions are 24 bits wide instead of 32 bits. To increase code compactness, some instructions have a 16 bits version used whenever possible by gcc.
+
+If you want to see what assembly is generated by gcc, in file `platformio.ini`, at the section used to compile (ex: `[core_2_5_2]`) in section `build_flags` add:
+
+```-save-temps=obj -fverbose-asm```
+
+Gcc will store `<file>.s` in the same folder as the `.o` file, typically in `.pioenvs/`.
+
+### First example
+
+Let's take a basic function:
+
+```c++
+uint32_t Example(uint32_t a, uint32_t b) {
+  return a + b;
+}
+```
+
+Below is the generated assembly. Function names are mangled using standard C++ rules, i.e. the symbol is derived from the function name and its argument types (here `jj` stands for two `unsigned int` arguments):
+
+```asm
+_Z7Examplejj:
+	add.n	a2, a2, a3	#, a, b
+	ret.n
+```
+
+As you can see, this is the simplest function we can think of. Register A2 holds the first argument and is used for return value. A3 holds the second argument.
+
+### uint8_t or uint32_t ?
+
+```c++
+uint32_t Example(uint32_t a, uint32_t b) {
+  uint8_t c = a + b;
+  return c;
+}
+```
+
+Assembly:
+
+```asm
+_Z7Examplejj:
+	add.n	a2, a2, a3	# tmp52, a, b
+	extui	a2, a2, 0, 8	#, tmp52
+	ret.n
+```
+
+Whenever gcc needs to convert from `uint32_t` to `uint8_t`, it uses an extra instruction `extui  <reg>, <reg>, 0, 8`.
+
+Whenever you allocate a `uint8_t` as a local variable, it will still take 32 bits on the stack.
+
+In conclusion you can easily use `uint32_t` in many places in the code. The main reasons to force `uint8_t` are:
+
+* in structures, to save memory. This is the only place where `uint8_t` will take 1 byte and the compiler will try to pack as many as 4 `uint8_t` in 32 bits
+* when you want to ensure that the value can never exceed 255. Beware though that the compiler will just keep the lowest 8 bits of a 32 bits value and will not report any overflow.
+
+#### Loops
+
+Should you use `uint8_t` or `uint32_t` for loops?
+
+Let's try:
+
+```c++
+uint32_t Example(uint32_t a, uint32_t b) {
+  for (uint8_t i = 0; i < 10; i++) {
+    a += i;
+  }
+  for (uint32_t j = 0; j < 10; j++) {
+    a += j;
+  }
+  return a;
+}
+```
+
+Assembly:
+
+```asm
+_Z7Examplejj:
+	movi.n	a3, 0	# ivtmp$7334,				<- loop 1
+.L2031:
+	add.n	a2, a2, a3	# a, a, ivtmp$7334
+	addi.n	a3, a3, 1	# ivtmp$7334, ivtmp$7334,
+	bnei	a3, 10, .L2031	# ivtmp$7334,,
+	movi.n	a3, 0	# j,						<- loop 2
+.L2033:
+	add.n	a2, a2, a3	# a, a, j
+	addi.n	a3, a3, 1	# j, j,
+	bnei	a3, 10, .L2033	# j,,
+	ret.n
+```
+
+As you can see here, both loops generate the same assembly for fixed size loops.
+
+Let's now see for variable size loops.
+
+```c++
+uint32_t Example(uint32_t a, uint32_t b) {
+  for (uint8_t i = 0; i < b; i++) {
+    a += i;
+  }
+  for (uint32_t j = 0; j < b; j++) {
+    a += j;
+  }
+  return a;
+}
+```
+
+Assembly:
+
+```asm
+_Z7Examplejj:
+	movi.n	a4, 0	# i,					<- loop 1
+	j	.L2030	#
+.L2031:
+	add.n	a2, a2, a4	# a, a, i
+	addi.n	a4, a4, 1	# tmp48, i,
+	extui	a4, a4, 0, 8	# i, tmp48		<- extra 32 to 8 bits conversion
+.L2030:
+	bltu	a4, a3, .L2031	# i, b,
+	movi.n	a4, 0	# j,					<- loop 2
+	j	.L2032	#
+.L2033:
+	add.n	a2, a2, a4	# a, a, j
+	addi.n	a4, a4, 1	# j, j,
+.L2032:
+	bne	a4, a3, .L2033	# j, b,
+	ret.n
+```
+
+In the first loop, the register a4 needs to be converted from 32 bits to 8 bits in each iteration.
+
+Again, there is no definitive rule, but keep in mind that using `uint8_t` can sometimes increase code size compared to `uint32_t`.
+
+### Floats, not doubles!
+
+ESP8266 does not have an FPU (Floating Point Unit), all floating point operations are emulated in software and provided in `libm.a`. The linker removes any unused functions, so we need to limit the number of floating point function calls.
+
+**Rule 1**: use ints where you can, avoid floating point operations.
+
+**Rule 2**: if you really need floating point, always use `float`, never **ever** use `double`.
+
+Let's now see why.
+
+`float` fits in 32 bits: 1 sign bit, an exponent of 8 bits and a mantissa of 23 bits (24 bits of precision counting the implicit leading bit), which provides enough precision for most of our needs.
+
+`float` is 32 bits wide and fits in a single register, whereas `double` is 64 bits and requires 2 registers.
+
+```c++
+float Examplef(float a, float b) {
+  return sinf(a) * (b + 0.4f) - 3.5f;
+}
+```
+
+Assembly:
+
+```asm
+	.literal .LC1012, 0x3ecccccd		<- 0.4f
+	.literal .LC1013, 0x40600000		<- 3.5f
+_Z8Examplefff:
+	addi	sp, sp, -16	#,,				<- reserve 16 bytes on stack
+	s32i.n	a0, sp, 12	#,				<- save a0 (return address) on stack
+	s32i.n	a12, sp, 8	#,				<- save a12 on stack, to free for local var
+	s32i.n	a13, sp, 4	#,				<- save a13 on stack, to free for local var
+	mov.n	a13, a3	# b, b				<- a3 holds 'b', save to a13
+	call0	sinf	#					<- calc sin of a2 (a)
+	l32r	a3, .LC1012	#,				<- load 0.4f in a3
+	mov.n	a12, a2	# D.171139,			<- save result 'sin(a)' to a12
+	mov.n	a2, a13	#, b				<- move a13 (second arg: b) to a2
+	call0	__addsf3	#				<- add floats a2 and a3, result to a2
+	mov.n	a3, a2	# D.171139,			<- copy result to a3
+	mov.n	a2, a12	#, D.171139			<- load a2 with a12: sin(a)
+	call0	__mulsf3	#				<- multiply 'sin(a)*(b+0.4f)'
+	l32r	a3, .LC1013	#,				<- load a3 with 3.5f
+	call0	__subsf3	#				<- subtract
+	l32i.n	a0, sp, 12	#,				<- restore a0 (return address)
+	l32i.n	a12, sp, 8	#,				<- restore a12
+	l32i.n	a13, sp, 4	#,				<- restore a13
+	addi	sp, sp, 16	#,,				<- free stack
+	ret.n								<- return
+```
+
+Now with `double`:
+
+```c++
+double Exampled(double a, double b) {
+  return sin(a) * (b + 0.4) - 3.5;
+}
+```
+
+Assembly:
+
+```asm
+	.literal .LC1014, 0x9999999a, 0x3fd99999	<- 0.4
+	.literal .LC1015, 0x00000000, 0x400c0000	<- 3.5
+_Z8Exampleddd:
+	addi	sp, sp, -32	#,,
+	s32i.n	a0, sp, 28	#,
+	s32i.n	a12, sp, 24	#,
+	s32i.n	a13, sp, 20	#,
+	s32i.n	a14, sp, 16	#,
+	s32i.n	a15, sp, 12	#,
+	mov.n	a14, a4	#,
+	mov.n	a15, a5	#,
+	call0	sin	#
+	l32r	a4, .LC1014	#,
+	l32r	a5, .LC1014+4	#,
+	mov.n	a12, a2	#,
+	mov.n	a13, a3	#,
+	mov.n	a2, a14	#,
+	mov.n	a3, a15	#,
+	call0	__adddf3	#
+	mov.n	a4, a2	#,
+	mov.n	a5, a3	#,
+	mov.n	a2, a12	#,
+	mov.n	a3, a13	#,
+	call0	__muldf3	#
+	l32r	a4, .LC1015	#,
+	l32r	a5, .LC1015+4	#,
+	call0	__subdf3	#
+	l32i.n	a0, sp, 28	#,
+	l32i.n	a12, sp, 24	#,
+	l32i.n	a13, sp, 20	#,
+	l32i.n	a14, sp, 16	#,
+	l32i.n	a15, sp, 12	#,
+	addi	sp, sp, 32	#,,
+	ret.n
+```
+
+As you can see the `double` needs to move many more registers around. Examplef (float) is 84 bytes, Exampled (double) is 119 bytes (+42% code size). Actually it's even worse, `sin` is larger than float version `sinf`.
+
+Also, never forget to explicitly tag literals as float: always put `1.5f` and not `1.5`. Let's see the impact:
+
+```c++
+float Examplef2(float a, float b) {
+  return sinf(a) * (b + 0.4) - 3.5;    // same as above with double literals
+}
+```
+
+Assembly:
+
+```asm
+
+	.literal .LC1014, 0x9999999a, 0x3fd99999
+	.literal .LC1015, 0x00000000, 0x400c0000
+	.align	4
+	.global	_Z9Examplef2ff
+	.type	_Z9Examplef2ff, @function
+_Z9Examplef2ff:
+	addi	sp, sp, -16	#,,
+	s32i.n	a0, sp, 12	#,
+	s32i.n	a12, sp, 8	#,
+	s32i.n	a13, sp, 4	#,
+	s32i.n	a14, sp, 0	#,
+	mov.n	a14, a3	# b, b
+	call0	sinf	#
+	call0	__extendsfdf2	#		<- extend float to double
+	mov.n	a12, a2	#,
+	mov.n	a2, a14	#, b
+	mov.n	a13, a3	#,
+	call0	__extendsfdf2	#		<- extend float to double
+	l32r	a4, .LC1014	#,
+	l32r	a5, .LC1014+4	#,
+	call0	__adddf3	#			<- add double
+	mov.n	a4, a2	#,
+	mov.n	a5, a3	#,
+	mov.n	a2, a12	#,
+	mov.n	a3, a13	#,
+	call0	__muldf3	#			<- multiply double
+	l32r	a4, .LC1015	#,
+	l32r	a5, .LC1015+4	#,
+	call0	__subdf3	#			<- subtract double
+	call0	__truncdfsf2	#		<- truncate double to float
+	l32i.n	a0, sp, 12	#,
+	l32i.n	a12, sp, 8	#,
+	l32i.n	a13, sp, 4	#,
+	l32i.n	a14, sp, 0	#,
+	addi	sp, sp, 16	#,,
+	ret.n
+```
+
+The last example takes 143 bytes, which is even worse than the `double` version, because of conversions from `float` to `double` and back. Internally, if you don't force `float` literals, gcc will perform all intermediate computations in `double` and convert to `float` in the end. This is usually what is wanted: compute with maximum precision and truncate at the last moment. But for ESP8266 we want the opposite: most compact code.
+
+### String concatenation
+
+Let's start with an easy example:
+
+```c++
+void ExampleStringConcat(String &s) {
+  s += "suffix";
+}
+```
+
+Assembly (25 bytes):
+
+```asm
+.LC1024:
+	.string	"suffix"
+	.literal .LC1025, .LC1024
+_Z19ExampleStringConcatR6String:
+	l32r	a3, .LC1025	#,
+	addi	sp, sp, -16	#,,
+	s32i.n	a0, sp, 12	#,
+	call0	_ZN6String6concatEPKc	#
+	l32i.n	a0, sp, 12	#,
+	addi	sp, sp, 16	#,,
+	ret.n
+```
+
+If you need to add more complex strings, do not build them by chaining the C++ `+` operator, which creates temporary `String` objects:
+
+```c++
+void ExampleStringConcat2(String &s, uint8_t a, uint8_t b) {
+  s += "[" + String(a) + "," + String(b) + "]";
+}
+```
+
+Assembly (122 bytes!):
+
+```asm
+.LC231:
+	.string	","
+.LC1026:
+	.string	"["
+.LC1029:
+	.string	"]"
+	.literal .LC1027, .LC1026
+	.literal .LC1028, .LC231
+	.literal .LC1030, .LC1029
+_Z20ExampleStringConcat2R6Stringhh:
+	addi	sp, sp, -64	#,,
+	s32i.n	a13, sp, 52	#,
+	extui	a13, a3, 0, 8	# a, a
+	l32r	a3, .LC1027	#,
+	s32i.n	a12, sp, 56	#,
+	mov.n	a12, a2	# s, s
+	addi.n	a2, sp, 12	#,,
+	s32i.n	a0, sp, 60	#,
+	s32i.n	a14, sp, 48	#,
+	extui	a14, a4, 0, 8	# b, b
+	call0	_ZN6StringC2EPKc	#			<- allocate String
+	movi.n	a4, 0xa	#,
+	addi	a2, sp, 24	#,,
+	mov.n	a3, a13	#, a
+	call0	_ZN6StringC1Ehh	#			<- allocate String
+	addi	a3, sp, 24	#,,
+	addi.n	a2, sp, 12	#,,
+	call0	_ZplRK15StringSumHelperRK6String	#
+	l32r	a3, .LC1028	#,
+	call0	_ZplRK15StringSumHelperPKc	#
+	movi.n	a4, 0xa	#,
+	mov.n	a13, a2	# D.171315,
+	mov.n	a3, a14	#, b
+	mov.n	a2, sp	#,
+	call0	_ZN6StringC1Ehh	#			<- allocate String
+	mov.n	a3, sp	#,
+	mov.n	a2, a13	#, D.171315
+	call0	_ZplRK15StringSumHelperRK6String	#
+	l32r	a3, .LC1030	#,
+	call0	_ZplRK15StringSumHelperPKc	#
+	mov.n	a3, a2	# D.171315,
+	mov.n	a2, a12	#, s
+	call0	_ZN6String6concatERKS_	#
+	mov.n	a2, sp	#,
+	call0	_ZN6StringD1Ev	#			<- destructor
+	addi	a2, sp, 24	#,,
+	call0	_ZN6StringD1Ev	#			<- destructor
+	addi.n	a2, sp, 12	#,,
+	call0	_ZN6StringD2Ev	#			<- destructor
+	l32i.n	a0, sp, 60	#,
+	l32i.n	a12, sp, 56	#,
+	l32i.n	a13, sp, 52	#,
+	l32i.n	a14, sp, 48	#,
+	addi	sp, sp, 64	#,,
+	ret.n
+```
+
+Instead, append each piece directly to the `String` with `+=`, which calls the native `String::concat` methods:
+
+```c++
+void ExampleStringConcat3(String &s, uint8_t a, uint8_t b) {
+  s += "[";
+  s += a;
+  s += ",";
+  s += b;
+  s += "]";
+}
+```
+
+Assembly (69 bytes, -43%):
+
+```asm
+.LC231:
+	.string	","
+.LC1026:
+	.string	"["
+.LC1029:
+	.string	"]"
+	.literal .LC1031, .LC1026
+	.literal .LC1032, .LC231
+	.literal .LC1033, .LC1029
+_Z20ExampleStringConcat3R6Stringhh:
+	addi	sp, sp, -16	#,,
+	s32i.n	a13, sp, 4	#,
+	extui	a13, a3, 0, 8	# a, a
+	l32r	a3, .LC1031	#,
+	s32i.n	a0, sp, 12	#,
+	s32i.n	a12, sp, 8	#,
+	s32i.n	a14, sp, 0	#,
+	mov.n	a12, a2	# s, s
+	extui	a14, a4, 0, 8	# b, b
+	call0	_ZN6String6concatEPKc	#			<- native char* add
+	mov.n	a3, a13	#, a
+	mov.n	a2, a12	#, s
+	call0	_ZN6String6concatEh	#				<- native int add
+	l32r	a3, .LC1032	#,
+	mov.n	a2, a12	#, s
+	call0	_ZN6String6concatEPKc	#			<- native char* add
+	mov.n	a3, a14	#, b
+	mov.n	a2, a12	#, s
+	call0	_ZN6String6concatEh	#				<- native int add
+	l32r	a3, .LC1033	#,
+	mov.n	a2, a12	#, s
+	call0	_ZN6String6concatEPKc	#			<- native char* add
+	l32i.n	a0, sp, 12	#,
+	l32i.n	a12, sp, 8	#,
+	l32i.n	a13, sp, 4	#,
+	l32i.n	a14, sp, 0	#,
+	addi	sp, sp, 16	#,,
+	ret.n
+```
+