Skip to content

build _asyncio module as static module #136669

@kumaraditya303

Description

@kumaraditya303

Currently, _asyncio is built as a shared library module, so it has to use slower function calls to get the current thread state. If it is built as a static module instead, it can read the current thread state directly through the faster segment register. This affects both free-threading and normal builds; however, on free-threading builds critical sections use the thread state heavily, so the impact there is larger.

After the change, the calls to _PyThreadState_GetCurrent are completely eliminated and the thread state is read directly via the fs segment register.

Normal build:

Before:

(gdb) disassemble  _asyncio__get_running_loop
Dump of assembler code for function _asyncio__get_running_loop:
   0x00007ffff73e1d80 <+0>:     push   %rax
   0x00007ffff73e1d81 <+1>:     call   0x7ffff73e1500 <_PyThreadState_GetCurrent@plt>
   0x00007ffff73e1d86 <+6>:     mov    0x358(%rax),%rax
   0x00007ffff73e1d8d <+13>:    test   %rax,%rax
   0x00007ffff73e1d90 <+16>:    je     0x7ffff73e1da2 <_asyncio__get_running_loop+34>
   0x00007ffff73e1d92 <+18>:    mov    (%rax),%ecx
   0x00007ffff73e1d94 <+20>:    cmp    $0xbfffffff,%ecx
   0x00007ffff73e1d9a <+26>:    ja     0x7ffff73e1da0 <_asyncio__get_running_loop+32>
   0x00007ffff73e1d9c <+28>:    inc    %ecx
   0x00007ffff73e1d9e <+30>:    mov    %ecx,(%rax)
   0x00007ffff73e1da0 <+32>:    pop    %rcx
   0x00007ffff73e1da1 <+33>:    ret
   0x00007ffff73e1da2 <+34>:    mov    0xb1ef(%rip),%rax        # 0x7ffff73ecf98
   0x00007ffff73e1da9 <+41>:    pop    %rcx
   0x00007ffff73e1daa <+42>:    ret
End of assembler dump.
(gdb) 

After:

(gdb) disassemble _asyncio__get_running_loop
Dump of assembler code for function _asyncio__get_running_loop:
   0x0000555555853c80 <+0>:     mov    $0xfffffffffffffff0,%rax
   0x0000555555853c87 <+7>:     mov    %fs:(%rax),%rax
   0x0000555555853c8b <+11>:    mov    0x358(%rax),%rax
   0x0000555555853c92 <+18>:    test   %rax,%rax
   0x0000555555853c95 <+21>:    je     0x555555853ca6 <_asyncio__get_running_loop+38>
   0x0000555555853c97 <+23>:    mov    (%rax),%ecx
   0x0000555555853c99 <+25>:    cmp    $0xbfffffff,%ecx
   0x0000555555853c9f <+31>:    ja     0x555555853ca5 <_asyncio__get_running_loop+37>
   0x0000555555853ca1 <+33>:    inc    %ecx
   0x0000555555853ca3 <+35>:    mov    %ecx,(%rax)
   0x0000555555853ca5 <+37>:    ret
   0x0000555555853ca6 <+38>:    lea    0x1f377b(%rip),%rax        # 0x555555a47428 <_Py_NoneStruct>
   0x0000555555853cad <+45>:    ret
End of assembler dump.

Free-threading build:

Before:

(gdb) disassemble _asyncio_Future_done
Dump of assembler code for function _asyncio_Future_done:
   0x00007ffff7412c30 <+0>:     push   %r14
   0x00007ffff7412c32 <+2>:     push   %rbx
   0x00007ffff7412c33 <+3>:     sub    $0x18,%rsp
   0x00007ffff7412c37 <+7>:     mov    %rdi,%rbx
   0x00007ffff7412c3a <+10>:    lea    0xa(%rdi),%r14
   0x00007ffff7412c3e <+14>:    mov    $0x1,%cl
   0x00007ffff7412c40 <+16>:    xor    %eax,%eax
   0x00007ffff7412c42 <+18>:    lock cmpxchg %cl,0xa(%rdi)
   0x00007ffff7412c47 <+23>:    jne    0x7ffff7412c74 <_asyncio_Future_done+68>
   0x00007ffff7412c49 <+25>:    call   0x7ffff740c540 <_PyThreadState_GetCurrent@plt>
   0x00007ffff7412c4e <+30>:    mov    %r14,0x10(%rsp)
   0x00007ffff7412c53 <+35>:    mov    0xb0(%rax),%rcx
   0x00007ffff7412c5a <+42>:    mov    %rcx,0x8(%rsp)
   0x00007ffff7412c5f <+47>:    lea    0x8(%rsp),%rcx
   0x00007ffff7412c64 <+52>:    mov    %rcx,0xb0(%rax)
   0x00007ffff7412c6b <+59>:    cmpq   $0x0,0x20(%rbx)
   0x00007ffff7412c70 <+64>:    jne    0x7ffff7412c88 <_asyncio_Future_done+88>
   0x00007ffff7412c72 <+66>:    jmp    0x7ffff7412c8e <_asyncio_Future_done+94>
   0x00007ffff7412c74 <+68>:    lea    0x8(%rsp),%rdi
   0x00007ffff7412c79 <+73>:    mov    %r14,%rsi
   0x00007ffff7412c7c <+76>:    call   0x7ffff740c130 <_PyCriticalSection_BeginSlow@plt>
   0x00007ffff7412c81 <+81>:    cmpq   $0x0,0x20(%rbx)
   0x00007ffff7412c86 <+86>:    je     0x7ffff7412c8e <_asyncio_Future_done+94>
   0x00007ffff7412c88 <+88>:    cmpl   $0x0,0x78(%rbx)
   0x00007ffff7412c8c <+92>:    jne    0x7ffff7412ca1 <_asyncio_Future_done+113>
   0x00007ffff7412c8e <+94>:    mov    0x92f3(%rip),%rbx        # 0x7ffff741bf88
   0x00007ffff7412c95 <+101>:   mov    0x10(%rsp),%rdi
   0x00007ffff7412c9a <+106>:   test   %rdi,%rdi
   0x00007ffff7412c9d <+109>:   jne    0x7ffff7412cb2 <_asyncio_Future_done+130>
   0x00007ffff7412c9f <+111>:   jmp    0x7ffff7412cdf <_asyncio_Future_done+175>
   0x00007ffff7412ca1 <+113>:   mov    0x92f8(%rip),%rbx        # 0x7ffff741bfa0
   0x00007ffff7412ca8 <+120>:   mov    0x10(%rsp),%rdi
   0x00007ffff7412cad <+125>:   test   %rdi,%rdi
   0x00007ffff7412cb0 <+128>:   je     0x7ffff7412cdf <_asyncio_Future_done+175>
   0x00007ffff7412cb2 <+130>:   xor    %ecx,%ecx
   0x00007ffff7412cb4 <+132>:   mov    $0x1,%al
   0x00007ffff7412cb6 <+134>:   lock cmpxchg %cl,(%rdi)
   0x00007ffff7412cba <+138>:   je     0x7ffff7412cc1 <_asyncio_Future_done+145>
   0x00007ffff7412cbc <+140>:   call   0x7ffff740c550 <PyMutex_Unlock@plt>
   0x00007ffff7412cc1 <+145>:   call   0x7ffff740c540 <_PyThreadState_GetCurrent@plt>
   0x00007ffff7412cc6 <+150>:   mov    0x8(%rsp),%rcx
   0x00007ffff7412ccb <+155>:   mov    %rcx,0xb0(%rax)
   0x00007ffff7412cd2 <+162>:   test   $0x1,%cl
   0x00007ffff7412cd5 <+165>:   je     0x7ffff7412cdf <_asyncio_Future_done+175>
   0x00007ffff7412cd7 <+167>:   mov    %rax,%rdi
   0x00007ffff7412cda <+170>:   call   0x7ffff740c390 <_PyCriticalSection_Resume@plt>
   0x00007ffff7412cdf <+175>:   mov    %rbx,%rax
   0x00007ffff7412ce2 <+178>:   add    $0x18,%rsp
   0x00007ffff7412ce6 <+182>:   pop    %rbx
   0x00007ffff7412ce7 <+183>:   pop    %r14
   0x00007ffff7412ce9 <+185>:   ret
End of assembler dump.

After:

(gdb) disassemble _asyncio_Future_done
Dump of assembler code for function _asyncio_Future_done:
   0x0000555555892fc0 <+0>:     push   %rbx
   0x0000555555892fc1 <+1>:     sub    $0x10,%rsp
   0x0000555555892fc5 <+5>:     mov    %rdi,%rbx
   0x0000555555892fc8 <+8>:     lea    0xa(%rdi),%rsi
   0x0000555555892fcc <+12>:    mov    $0x1,%cl
   0x0000555555892fce <+14>:    xor    %eax,%eax
   0x0000555555892fd0 <+16>:    lock cmpxchg %cl,0xa(%rdi)
   0x0000555555892fd5 <+21>:    jne    0x555555893005 <_asyncio_Future_done+69>
   0x0000555555892fd7 <+23>:    mov    $0xfffffffffffffff0,%rax
   0x0000555555892fde <+30>:    mov    %fs:(%rax),%rax
   0x0000555555892fe2 <+34>:    mov    %rsi,0x8(%rsp)
   0x0000555555892fe7 <+39>:    mov    0xb0(%rax),%rcx
   0x0000555555892fee <+46>:    mov    %rcx,(%rsp)
   0x0000555555892ff2 <+50>:    mov    %rsp,%rcx
   0x0000555555892ff5 <+53>:    mov    %rcx,0xb0(%rax)
   0x0000555555892ffc <+60>:    cmpq   $0x0,0x20(%rbx)
   0x0000555555893001 <+65>:    jne    0x555555893014 <_asyncio_Future_done+84>
   0x0000555555893003 <+67>:    jmp    0x55555589301a <_asyncio_Future_done+90>
   0x0000555555893005 <+69>:    mov    %rsp,%rdi
   0x0000555555893008 <+72>:    call   0x5555557a4970 <_PyCriticalSection_BeginSlow>
   0x000055555589300d <+77>:    cmpq   $0x0,0x20(%rbx)
   0x0000555555893012 <+82>:    je     0x55555589301a <_asyncio_Future_done+90>
   0x0000555555893014 <+84>:    cmpl   $0x0,0x78(%rbx)
   0x0000555555893018 <+88>:    jne    0x555555893034 <_asyncio_Future_done+116>
   0x000055555589301a <+90>:    lea    0x1e9f57(%rip),%rbx        # 0x555555a7cf78 <_Py_FalseStruct>
   0x0000555555893021 <+97>:    mov    0x8(%rsp),%rdi
   0x0000555555893026 <+102>:   test   %rdi,%rdi
   0x0000555555893029 <+105>:   jne    0x555555893045 <_asyncio_Future_done+133>
   0x000055555589302b <+107>:   mov    %rbx,%rax
   0x000055555589302e <+110>:   add    $0x10,%rsp
   0x0000555555893032 <+114>:   pop    %rbx
   0x0000555555893033 <+115>:   ret
   0x0000555555893034 <+116>:   lea    0x1e9f0d(%rip),%rbx        # 0x555555a7cf48 <_Py_TrueStruct>
   0x000055555589303b <+123>:   mov    0x8(%rsp),%rdi
   0x0000555555893040 <+128>:   test   %rdi,%rdi
   0x0000555555893043 <+131>:   je     0x55555589302b <_asyncio_Future_done+107>
   0x0000555555893045 <+133>:   xor    %ecx,%ecx
   0x0000555555893047 <+135>:   mov    $0x1,%al
   0x0000555555893049 <+137>:   lock cmpxchg %cl,(%rdi)
   0x000055555589304d <+141>:   je     0x555555893054 <_asyncio_Future_done+148>
   0x000055555589304f <+143>:   call   0x5555557e2ec0 <PyMutex_Unlock>
   0x0000555555893054 <+148>:   mov    (%rsp),%rax
   0x0000555555893058 <+152>:   mov    $0xfffffffffffffff0,%rcx
   0x000055555589305f <+159>:   mov    %fs:(%rcx),%rdi
   0x0000555555893063 <+163>:   mov    %rax,0xb0(%rdi)
   0x000055555589306a <+170>:   test   $0x1,%al
   0x000055555589306c <+172>:   je     0x55555589302b <_asyncio_Future_done+107>
   0x000055555589306e <+174>:   call   0x5555557a4ae0 <_PyCriticalSection_Resume>
   0x0000555555893073 <+179>:   mov    %rbx,%rax
   0x0000555555893076 <+182>:   add    $0x10,%rsp
   0x000055555589307a <+186>:   pop    %rbx
   0x000055555589307b <+187>:   ret
End of assembler dump.

Linked PRs

Metadata

Metadata

Assignees

No one assigned

    Labels

    build (The build process and cross-build); extension-modules (C modules in the Modules dir); performance (Performance or resource usage)

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions