-
-
Notifications
You must be signed in to change notification settings - Fork 32.4k
Closed
Labels
build (The build process and cross-build), extension-modules (C modules in the Modules dir), performance (Performance or resource usage)
Description
Currently, `_asyncio` is built as a shared library module, which forces it to use slower function calls to get the thread state. If it is built as a static module instead, it can read the current thread state directly through a segment register, which is faster. This affects both free-threading and normal builds; however, critical sections use the thread state heavily on free-threading builds, so the impact is larger there.
After the change, the function calls to `_PyThreadState_GetCurrent` are completely eliminated, and the thread state is read directly via the `fs` register.
Normal build:
Before:
(gdb) disassemble _asyncio__get_running_loop
Dump of assembler code for function _asyncio__get_running_loop:
0x00007ffff73e1d80 <+0>: push %rax
0x00007ffff73e1d81 <+1>: call 0x7ffff73e1500 <_PyThreadState_GetCurrent@plt>
0x00007ffff73e1d86 <+6>: mov 0x358(%rax),%rax
0x00007ffff73e1d8d <+13>: test %rax,%rax
0x00007ffff73e1d90 <+16>: je 0x7ffff73e1da2 <_asyncio__get_running_loop+34>
0x00007ffff73e1d92 <+18>: mov (%rax),%ecx
0x00007ffff73e1d94 <+20>: cmp $0xbfffffff,%ecx
0x00007ffff73e1d9a <+26>: ja 0x7ffff73e1da0 <_asyncio__get_running_loop+32>
0x00007ffff73e1d9c <+28>: inc %ecx
0x00007ffff73e1d9e <+30>: mov %ecx,(%rax)
0x00007ffff73e1da0 <+32>: pop %rcx
0x00007ffff73e1da1 <+33>: ret
0x00007ffff73e1da2 <+34>: mov 0xb1ef(%rip),%rax # 0x7ffff73ecf98
0x00007ffff73e1da9 <+41>: pop %rcx
0x00007ffff73e1daa <+42>: ret
End of assembler dump.
(gdb)
After:
(gdb) disassemble _asyncio__get_running_loop
Dump of assembler code for function _asyncio__get_running_loop:
0x0000555555853c80 <+0>: mov $0xfffffffffffffff0,%rax
0x0000555555853c87 <+7>: mov %fs:(%rax),%rax
0x0000555555853c8b <+11>: mov 0x358(%rax),%rax
0x0000555555853c92 <+18>: test %rax,%rax
0x0000555555853c95 <+21>: je 0x555555853ca6 <_asyncio__get_running_loop+38>
0x0000555555853c97 <+23>: mov (%rax),%ecx
0x0000555555853c99 <+25>: cmp $0xbfffffff,%ecx
0x0000555555853c9f <+31>: ja 0x555555853ca5 <_asyncio__get_running_loop+37>
0x0000555555853ca1 <+33>: inc %ecx
0x0000555555853ca3 <+35>: mov %ecx,(%rax)
0x0000555555853ca5 <+37>: ret
0x0000555555853ca6 <+38>: lea 0x1f377b(%rip),%rax # 0x555555a47428 <_Py_NoneStruct>
0x0000555555853cad <+45>: ret
End of assembler dump.
Free-threading build:
Before:
(gdb) disassemble _asyncio_Future_done
Dump of assembler code for function _asyncio_Future_done:
0x00007ffff7412c30 <+0>: push %r14
0x00007ffff7412c32 <+2>: push %rbx
0x00007ffff7412c33 <+3>: sub $0x18,%rsp
0x00007ffff7412c37 <+7>: mov %rdi,%rbx
0x00007ffff7412c3a <+10>: lea 0xa(%rdi),%r14
0x00007ffff7412c3e <+14>: mov $0x1,%cl
0x00007ffff7412c40 <+16>: xor %eax,%eax
0x00007ffff7412c42 <+18>: lock cmpxchg %cl,0xa(%rdi)
0x00007ffff7412c47 <+23>: jne 0x7ffff7412c74 <_asyncio_Future_done+68>
0x00007ffff7412c49 <+25>: call 0x7ffff740c540 <_PyThreadState_GetCurrent@plt>
0x00007ffff7412c4e <+30>: mov %r14,0x10(%rsp)
0x00007ffff7412c53 <+35>: mov 0xb0(%rax),%rcx
0x00007ffff7412c5a <+42>: mov %rcx,0x8(%rsp)
0x00007ffff7412c5f <+47>: lea 0x8(%rsp),%rcx
0x00007ffff7412c64 <+52>: mov %rcx,0xb0(%rax)
0x00007ffff7412c6b <+59>: cmpq $0x0,0x20(%rbx)
0x00007ffff7412c70 <+64>: jne 0x7ffff7412c88 <_asyncio_Future_done+88>
0x00007ffff7412c72 <+66>: jmp 0x7ffff7412c8e <_asyncio_Future_done+94>
0x00007ffff7412c74 <+68>: lea 0x8(%rsp),%rdi
0x00007ffff7412c79 <+73>: mov %r14,%rsi
0x00007ffff7412c7c <+76>: call 0x7ffff740c130 <_PyCriticalSection_BeginSlow@plt>
0x00007ffff7412c81 <+81>: cmpq $0x0,0x20(%rbx)
0x00007ffff7412c86 <+86>: je 0x7ffff7412c8e <_asyncio_Future_done+94>
0x00007ffff7412c88 <+88>: cmpl $0x0,0x78(%rbx)
0x00007ffff7412c8c <+92>: jne 0x7ffff7412ca1 <_asyncio_Future_done+113>
0x00007ffff7412c8e <+94>: mov 0x92f3(%rip),%rbx # 0x7ffff741bf88
0x00007ffff7412c95 <+101>: mov 0x10(%rsp),%rdi
0x00007ffff7412c9a <+106>: test %rdi,%rdi
0x00007ffff7412c9d <+109>: jne 0x7ffff7412cb2 <_asyncio_Future_done+130>
0x00007ffff7412c9f <+111>: jmp 0x7ffff7412cdf <_asyncio_Future_done+175>
0x00007ffff7412ca1 <+113>: mov 0x92f8(%rip),%rbx # 0x7ffff741bfa0
0x00007ffff7412ca8 <+120>: mov 0x10(%rsp),%rdi
0x00007ffff7412cad <+125>: test %rdi,%rdi
0x00007ffff7412cb0 <+128>: je 0x7ffff7412cdf <_asyncio_Future_done+175>
0x00007ffff7412cb2 <+130>: xor %ecx,%ecx
0x00007ffff7412cb4 <+132>: mov $0x1,%al
0x00007ffff7412cb6 <+134>: lock cmpxchg %cl,(%rdi)
0x00007ffff7412cba <+138>: je 0x7ffff7412cc1 <_asyncio_Future_done+145>
0x00007ffff7412cbc <+140>: call 0x7ffff740c550 <PyMutex_Unlock@plt>
0x00007ffff7412cc1 <+145>: call 0x7ffff740c540 <_PyThreadState_GetCurrent@plt>
0x00007ffff7412cc6 <+150>: mov 0x8(%rsp),%rcx
0x00007ffff7412ccb <+155>: mov %rcx,0xb0(%rax)
0x00007ffff7412cd2 <+162>: test $0x1,%cl
0x00007ffff7412cd5 <+165>: je 0x7ffff7412cdf <_asyncio_Future_done+175>
0x00007ffff7412cd7 <+167>: mov %rax,%rdi
0x00007ffff7412cda <+170>: call 0x7ffff740c390 <_PyCriticalSection_Resume@plt>
0x00007ffff7412cdf <+175>: mov %rbx,%rax
0x00007ffff7412ce2 <+178>: add $0x18,%rsp
0x00007ffff7412ce6 <+182>: pop %rbx
0x00007ffff7412ce7 <+183>: pop %r14
0x00007ffff7412ce9 <+185>: ret
End of assembler dump.
After:
(gdb) disassemble _asyncio_Future_done
Dump of assembler code for function _asyncio_Future_done:
0x0000555555892fc0 <+0>: push %rbx
0x0000555555892fc1 <+1>: sub $0x10,%rsp
0x0000555555892fc5 <+5>: mov %rdi,%rbx
0x0000555555892fc8 <+8>: lea 0xa(%rdi),%rsi
0x0000555555892fcc <+12>: mov $0x1,%cl
0x0000555555892fce <+14>: xor %eax,%eax
0x0000555555892fd0 <+16>: lock cmpxchg %cl,0xa(%rdi)
0x0000555555892fd5 <+21>: jne 0x555555893005 <_asyncio_Future_done+69>
0x0000555555892fd7 <+23>: mov $0xfffffffffffffff0,%rax
0x0000555555892fde <+30>: mov %fs:(%rax),%rax
0x0000555555892fe2 <+34>: mov %rsi,0x8(%rsp)
0x0000555555892fe7 <+39>: mov 0xb0(%rax),%rcx
0x0000555555892fee <+46>: mov %rcx,(%rsp)
0x0000555555892ff2 <+50>: mov %rsp,%rcx
0x0000555555892ff5 <+53>: mov %rcx,0xb0(%rax)
0x0000555555892ffc <+60>: cmpq $0x0,0x20(%rbx)
0x0000555555893001 <+65>: jne 0x555555893014 <_asyncio_Future_done+84>
0x0000555555893003 <+67>: jmp 0x55555589301a <_asyncio_Future_done+90>
0x0000555555893005 <+69>: mov %rsp,%rdi
0x0000555555893008 <+72>: call 0x5555557a4970 <_PyCriticalSection_BeginSlow>
0x000055555589300d <+77>: cmpq $0x0,0x20(%rbx)
0x0000555555893012 <+82>: je 0x55555589301a <_asyncio_Future_done+90>
0x0000555555893014 <+84>: cmpl $0x0,0x78(%rbx)
0x0000555555893018 <+88>: jne 0x555555893034 <_asyncio_Future_done+116>
0x000055555589301a <+90>: lea 0x1e9f57(%rip),%rbx # 0x555555a7cf78 <_Py_FalseStruct>
0x0000555555893021 <+97>: mov 0x8(%rsp),%rdi
0x0000555555893026 <+102>: test %rdi,%rdi
0x0000555555893029 <+105>: jne 0x555555893045 <_asyncio_Future_done+133>
0x000055555589302b <+107>: mov %rbx,%rax
0x000055555589302e <+110>: add $0x10,%rsp
0x0000555555893032 <+114>: pop %rbx
0x0000555555893033 <+115>: ret
0x0000555555893034 <+116>: lea 0x1e9f0d(%rip),%rbx # 0x555555a7cf48 <_Py_TrueStruct>
0x000055555589303b <+123>: mov 0x8(%rsp),%rdi
0x0000555555893040 <+128>: test %rdi,%rdi
0x0000555555893043 <+131>: je 0x55555589302b <_asyncio_Future_done+107>
0x0000555555893045 <+133>: xor %ecx,%ecx
0x0000555555893047 <+135>: mov $0x1,%al
0x0000555555893049 <+137>: lock cmpxchg %cl,(%rdi)
0x000055555589304d <+141>: je 0x555555893054 <_asyncio_Future_done+148>
0x000055555589304f <+143>: call 0x5555557e2ec0 <PyMutex_Unlock>
0x0000555555893054 <+148>: mov (%rsp),%rax
0x0000555555893058 <+152>: mov $0xfffffffffffffff0,%rcx
0x000055555589305f <+159>: mov %fs:(%rcx),%rdi
0x0000555555893063 <+163>: mov %rax,0xb0(%rdi)
0x000055555589306a <+170>: test $0x1,%al
0x000055555589306c <+172>: je 0x55555589302b <_asyncio_Future_done+107>
0x000055555589306e <+174>: call 0x5555557a4ae0 <_PyCriticalSection_Resume>
0x0000555555893073 <+179>: mov %rbx,%rax
0x0000555555893076 <+182>: add $0x10,%rsp
0x000055555589307a <+186>: pop %rbx
0x000055555589307b <+187>: ret
End of assembler dump.
Linked PRs
Metadata
Metadata
Assignees
Labels
build (The build process and cross-build), extension-modules (C modules in the Modules dir), performance (Performance or resource usage)