| « The Simplest Stream Cipher on Earth | Patented or not patented? » |
I was just told by AkKort that his unrolled EnRUPT-128-128 in MSVC 2005 is 40% faster than AES-128, that a simple loop is 5% faster, and that a size-optimized 32-bit Intel assembly implementation occupies 66 bytes and is only 15% slower than AES-128.
I don’t know yet what processor it was measured on or what the actual numbers of clock cycles are, but it sounds exciting! I can’t wait to see the speeds of all kinds of implementations of EnRUPT vs. AES vs. RC4 on all kinds of processors and hardware platforms…
PS: The stream RUPT/aeRUPT/mcRUPT modes are supposed to be about 3 times faster than the block enRUPT/mdRUPT modes…
Trackback URL (right click and copy shortcut/link location)
/*
 * C4731 is the MSVC warning reported when inline assembly modifies the
 * frame-pointer register EBP. The enRUPTa macro below uses EBP as a scratch
 * register, so the warning is switched off here.
 */
#pragma warning (disable:4731)
/*
 * enRUPTa(a,b,c,r) - one round, emitted as a sequence of MSVC __asm blocks.
 *
 *   a, b, c  32-bit register names pasted directly into the instructions.
 *   r        round number; must be a compile-time constant because it is
 *            used both as an immediate operand and inside the address
 *            displacement (r%4)*4.
 *
 * Expects EDI to hold the base address of an array of 32-bit words (the
 * caller in this snippet loads it from "key").
 *
 * Net effect, with k = the 32-bit word at [edi + (r%4)*4]:
 *
 *   esi = rotate_right_32(c ^ r ^ k ^ (a*2), 8) * 9     (mod 2^32)
 *   b  ^= k ^ esi
 *
 * The multiply by 9 is done with lea esi,[esi*8+esi]; the doubling of a is
 * done with lea ebp,[a*2]. ESI and EBP are overwritten; a and c are only
 * read.
 */
#define enRUPTa(a,b,c,r) \
__asm { mov esi,c }\
__asm { xor esi,r }\
__asm { lea ebp,[a*2] }\
__asm { xor esi,[edi+(r%4)*4] }\
__asm { xor b,[edi+(r%4)*4] }\
__asm { xor esi,ebp }\
__asm { ror esi,8 }\
__asm { lea esi,[esi*8+esi] }\
__asm { xor b,esi }
/*
 * enRUPT - transform the four 32-bit words of x in place, using the four
 * 32-bit words of key, by running 48 fully unrolled enRUPTa rounds
 * (r = 1..48).
 *
 *   x    four-word block; read on entry and overwritten with the result.
 *   key  four words; only read (through EDI inside enRUPTa).
 *
 * Register use: EAX, EBX, ECX, EDX carry x[0], x[1], x[2], x[3] for the
 * whole computation; EDI holds key; ESI and EBP are scratch inside each
 * round.
 */
void enRUPT (u32 x[4], u32 key[4])
{
/* Fetch both pointers before EBP is touched. */
__asm { mov esi,[x] }
__asm { mov edi,[key] }
/*
 * enRUPTa overwrites EBP, so save it here and restore it after the rounds.
 * NOTE(review): presumably the compiler addresses x and key relative to
 * EBP, which is why they are loaded above and x is only re-read after the
 * pop below - confirm against the generated code.
 */
__asm { push ebp }
/* Load the four block words into registers. */
__asm { mov eax,[esi ] }
__asm { mov ebx,[esi+ 4] }
__asm { mov ecx,[esi+ 8] }
__asm { mov edx,[esi+12] }
/*
 * 48 rounds. The (a,b,c) operand triple rotates through the four registers
 * each round, so round r updates the register holding x[r % 4]:
 * r=1 -> EBX, r=2 -> ECX, r=3 -> EDX, r=4 -> EAX, and so on.
 */
enRUPTa(eax,ebx,ecx, 1); enRUPTa(ebx,ecx,edx, 2); enRUPTa(ecx,edx,eax, 3); enRUPTa(edx,eax,ebx, 4);
enRUPTa(eax,ebx,ecx, 5); enRUPTa(ebx,ecx,edx, 6); enRUPTa(ecx,edx,eax, 7); enRUPTa(edx,eax,ebx, 8);
enRUPTa(eax,ebx,ecx, 9); enRUPTa(ebx,ecx,edx,10); enRUPTa(ecx,edx,eax,11); enRUPTa(edx,eax,ebx,12);
enRUPTa(eax,ebx,ecx,13); enRUPTa(ebx,ecx,edx,14); enRUPTa(ecx,edx,eax,15); enRUPTa(edx,eax,ebx,16);
enRUPTa(eax,ebx,ecx,17); enRUPTa(ebx,ecx,edx,18); enRUPTa(ecx,edx,eax,19); enRUPTa(edx,eax,ebx,20);
enRUPTa(eax,ebx,ecx,21); enRUPTa(ebx,ecx,edx,22); enRUPTa(ecx,edx,eax,23); enRUPTa(edx,eax,ebx,24);
enRUPTa(eax,ebx,ecx,25); enRUPTa(ebx,ecx,edx,26); enRUPTa(ecx,edx,eax,27); enRUPTa(edx,eax,ebx,28);
enRUPTa(eax,ebx,ecx,29); enRUPTa(ebx,ecx,edx,30); enRUPTa(ecx,edx,eax,31); enRUPTa(edx,eax,ebx,32);
enRUPTa(eax,ebx,ecx,33); enRUPTa(ebx,ecx,edx,34); enRUPTa(ecx,edx,eax,35); enRUPTa(edx,eax,ebx,36);
enRUPTa(eax,ebx,ecx,37); enRUPTa(ebx,ecx,edx,38); enRUPTa(ecx,edx,eax,39); enRUPTa(edx,eax,ebx,40);
enRUPTa(eax,ebx,ecx,41); enRUPTa(ebx,ecx,edx,42); enRUPTa(ecx,edx,eax,43); enRUPTa(edx,eax,ebx,44);
enRUPTa(eax,ebx,ecx,45); enRUPTa(ebx,ecx,edx,46); enRUPTa(ecx,edx,eax,47); enRUPTa(edx,eax,ebx,48);
/* Restore EBP, then reload the x pointer (ESI was scratch during the rounds). */
__asm { pop ebp }
__asm { mov esi,[x] }
/* Write the four result words back over the input block. */
__asm { mov [esi ],eax }
__asm { mov [esi+ 4],ebx }
__asm { mov [esi+ 8],ecx }
__asm { mov [esi+12],edx }
}
It is 50% faster than the fastest C implementation of enRUPT-128-128 I have.