这被称为莫顿数 https://graphics.stanford.edu/%7Eseander/bithacks.html#InterleaveTableObvious,这是一个具体情况平行展开这反过来又是压缩右在以下问题中
- 将 32 0/1 值打包到单个 32 位变量的位中的最快方法是什么? https://stackoverflow.com/q/26200961/995714
- 将屏蔽位移至 lsb https://stackoverflow.com/q/28282869/995714
一种通用的解决方案可能是
uint64_t bit_expand(uint64_t x)
{
// Input: 00000000ABCDEFGH, each character is a nibble
x = ((x & 0xFFFF0000) << 32) | ((x & 0x0000FFFF) << 16);
// Result: ABCD0000EFGH0000
x = (x & 0xFF000000FF000000) | ((x & 0x00FF000000FF0000) >> 8);
// Result: AB00CD00EF00GH00
x = (x & 0xF000F000F000F000) | ((x & 0x0F000F000F000F00) >> 4);
// Result: A0B0C0D0E0F0G0H0. Each byte: abcd0000
x = (x & 0xC0C0C0C0C0C0C0C0) | ((x & 0x3030303030303030) >> 2);
// Result: Each byte: ab00cd00
x = (x & 0x8888888888888888) | ((x & 0x4444444444444444) >> 1);
// Result: Each byte: a0b0c0d0
return x;
}
然而,常量生成在 RISC 架构上可能效率低下,因为 64 位立即数无法像 x86 那样存储在单个指令中。即使在 x86 上输出组件很长 https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGe1wAyeAyYAHI%2BAEaYxBIArKQADqgKhE4MHt6%2BekkpjgJBIeEsUTFc8XaYDmlCBEzEBBk%2Bfly2mPZ5DDV1BAVhkdFxtrX1jVktCsM9wX3FA2UAlLaoXsTI7BzmAMzByN5YANQmm24T%2BIIAdAhH2CYaAIK3d17BBABskgD6BPsRhB%2BYqgShnQEGegneX32qnm%2BwA9LD9hokcikXcLG4ACLYABiAHEABKPEwAdis932FKhh02GP2UFUhzMr0RqmxbOxKJhRzc3P2mzMXOJbjpEAZ5mZGlUKPZ2K5x15XFe8yOFkparhCLRmJROIJKMearFNLpYqZLLZKKRFstguF9MZEqlGmt0o5yLlN022H2AA5lZtVfD9mikdqNDikXqNAbKUbaaKHebkW7ncn3YchSLTY7U1a03mNB7rvtJP7A5qNBYNG4NBjw6ncRp8dHybHqfHsyya93q72ezXbVnE5LNhpR%2BOx5OJ0WvfsBSqYxS4ybh6ofeuN5ut36M3aE%2BKWZIj8eT6fS9TPd6uGXF/tiJgCCsGFCF/cSRiiW%2BzJssDQQvtsAADWUO5QgxSQICYGEoCgKDVx3bleXPEk90grkzUlZD5WOOcuViHlW3VIj1RQkU0NXAVqR5HDr13Mi4IPSVr3mZU33uNAGAmfYwQIH1IQCOQABUTFiCxFRE2kjkk0lbyAkCwIgxFFgA4DQPAiB9mvUgVPk9TcO0uS1MUzZFlk1SFI00sDPMvTYmUwyLP2JVrN0xTiVMwidKMjS/Rc7z9gATnsmyIK4Qs/IsrgtLM1yIC4AUIvUrgTMS0KrK8yK7KJYkPwDT8nheCFvl%2BAh/kBYEPgEwTQUKz5vmhbKyTuNV70fYhnygHiivmKqRIsfcvWLMwEIw1kJKo3lYiVOjOtqr4eqEvqBsvOdkNG7FxsQnDJD9W8SMzWbwTqhbhNE5ahsLVcNtiSTsOFSRLtIw63mO3qzqNFbFXQx1rtu6jhX5VjmuImaaqO%2Ba3v6j7i3i77zU2u7VsHZ7ush87Z19OHJV%2BiaaKVPa1SesGXohxb3ovYsSyxsabtx4VMdBrrXrJqGKYxx71oR/6KULV8HjfHL8qZyEFCBNYAHkvF4kqFGJvjiqBkkmrVJgpdQfYAFk7lxABJYUpJZH0NAeswNDCjQjZNs25ACAI%2BZVtXNbuIQAGkKQNyUja9i2fe9o2bbtvLPOF74ASBBh0B9GX232D4PgiZ5aEcBh44UAB3JgEneGCtd1twACoIjhrXXZnb13PtykQ/2UWmDWdBJYICxCAUGOw%2BBKOW/zzCwskLhVzuO5LXuIfLUrilWqfOla/rxvm4IVunpnzAG6l%2BfF8RsxS2pmVh%2BHvn3yFubvmXxv3hl4miqhRWZM8yf2unsXMEbzuF/R70t7lbmdqBkGZtPqWr9ZbQ1nF9OmfJ5yeX2naAB0sW7vwpAhRGX0CaUiXk/F%2BF8QHeiUuApSB9BZsTuBwRYtBOCxF4H4DgWhSCoE4DySw1ga7LFWJgRkmweCkAIJoEhiwADWIBTbnC4D6YaPpiRIi4AFWI0jjb6E4JIShPDaGcF4AoEAGguE8MWHAWAMBEAoFQCwBIdBojkEoGgYxpiYh1BYFFU2fA6AEGiOoiAERlG/GYMQAAnpwThli2CCHFgwWgvjqG8CwCwQwwBxDhNIPge8VQABumB1FxIBJUKW6xOEvDaMo2geAIjEDqN4jwWBlEEGIHgFg2i%2BAGGAAoAAangTAadxYJEYH43g/BBAiDEOwKQMhBCKBUOoOJuhNj6GiSgaw1h9CFPUZARYqAEgdDSQAWnFpsfY6yqApOIBEZImB1lMAUCwNRbRKgdBcBHUYzRSCBGmEUEo2RkipAEHc15uQ0i9GeXMVo7RqiTE%2BeMS5VQBBdHqL8/opQhjdBBXCqFTyYUSEWAoFhaw9CVMwNkkh8iOAUNIFQmhdCOB3AAEoa32MAZAyBNJcHOGYOkBSGBeGhHSBhVhLAGXwEQYg7CWj7A8FY%2Bg/KtgmV4Nw8JOikCWJMaK8xEA5XWJAMAexjik4uMoO4uJniSldNIAExgBBgmhOUZE6JsSaEJKuXgFJaSaEZOQFkg1uSyFxIKUUkpZT1g0MqdU2pVB6lNJaW0jpVDOE9OEKIcQgyo0jLUMo3Q8QDBGBmYw7lnrFkQGWastIGytk7L2dEQ5SgTlnIuYC5wEBXAguJA8iO0LZgxFeIkN5HQ61tu%2BfkZFzaQCtoqOCzowLPBNAkPWwdHRIVTEKCigdI7MjNAnZMJtLznLopWJilo2LcWkPIUouJpK1yvHWe8altL6WMo5bM7lAFeUkAFdpYV8rojsLspK2pEBZVGJfWQCgSqf0qoSAkZA7xgABQ1c44gridU0L1T4g1RqgkhLCdazAUSjBWoiXgRJjh7XKKdS67gvA3X5MKcUnxPqKlVJqdKupTAGnNNae0zpxGhnRv6RIaQ8alCJvGSAaQqbjA3psFm%2BAua1mcE2ds3Z%2BzS3HNOecgFtrq21tHWMBt6BV3/JyO89I6n7m6Y6Np2Fk6gXwoM3oMzEKV29peeMBdY6HPdBM6ipYm6BlcPvLu/FhLiW8FJUkNO0Qz13EEslc4Gg6Q0uQEFa9GabB3sIA%2BrYgrn3WPYaWD90r%2BEgHXOcCRZtpGyM2DIre%2BLFGkBqbETR/mVEcDUYIrRdHdEGOVQq/97WBh4FpSIyDWq3EeOCPqtjSGTUofNehy1vrsO4btakgjqhMnONdYIPJHryPeowDNrzAa6NBoYyG5j4aDVRr6bG7jsgE1jJoboMwUy02mAS/MiI2aJP5qk1sytKm/A1tuZZlojzZ19paEZtICKwc9uB/Z5TQ7p0Ius8Olzdn/kTAs4uqztnodzDRRizzO6ul7oJQeklnBj2nskPsHrdKRGRci/FrliXcDJbFd%2BNLgHRXsIFNlrQMrDEirMZ1jnAxiBJPeOs3Y0Tkpm369B7VQ2vFhP8UYwJ42zVxItZhnbNrkkLfSUt51K22OkY216yj23qN7d5/RxjoaWMRu6bIc7AzLvDN4zdnQeghPpsZy9t7tC80CALZsb7Q6bnuAB5p1zoP23g8j5Dhg0fYdTscxpxH06k9o5GJHrPM6Zj2dxx5rF3nCe%2BZJwFzg5KdZCDcOsxpd4xeSBpfsSXRhNKbDp/3CAnK5lJb5Y%2BoVwvWccPmDz3hpAECYCYFgGIOaKu8Gq7V5RpLGvbu0bl4kAVzg%2Bi4MSexAUAqWw0CmzgIeiXL9Uc13nROzDl/q2PlipB9kpGcJIIAA%3D。这是另一种可能的实现,如上所述位摆弄黑客 https://graphics.stanford.edu/%7Eseander/bithacks.html#InterleaveBMN
static const unsigned int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};
static const unsigned int S[] = {1, 2, 4, 8};
unsigned int x; // Interleave lower 16 bits of x and y, so the bits of x
unsigned int y; // are in the even positions and bits from y in the odd;
unsigned int z; // z gets the resulting 32-bit Morton Number.
// x and y must initially be less than 65536.
x = (x | (x << S[3])) & B[3];
x = (x | (x << S[2])) & B[2];
x = (x | (x << S[1])) & B[1];
x = (x | (x << S[0])) & B[0];
y = (y | (y << S[3])) & B[3];
y = (y | (y << S[2])) & B[2];
y = (y | (y << S[1])) & B[1];
y = (y | (y << S[0])) & B[0];
z = x | (y << 1);
A 查找表 https://graphics.stanford.edu/%7Eseander/bithacks.html#InterleaveTableLookup也可以用
#define EXPAND4(a) ((((a) & 0x8) << 4) | (((a) & 0x4) << 2) \
| (((a) & 0x2) << 1) | (((a) & 0x1)))
const uint8_t LUT[16] = {
EXPAND4( 0), EXPAND4( 1), EXPAND4( 2), EXPAND4( 3),
EXPAND4( 4), EXPAND4( 5), EXPAND4( 6), EXPAND4( 7),
EXPAND4( 8), EXPAND4( 9), EXPAND4(10), EXPAND4(11),
EXPAND4(12), EXPAND4(13), EXPAND4(14), EXPAND4(15)
};
output = ((uint64_t)LUT[(x >> 28) & 0xF] << 56) | ((uint64_t)LUT[(x >> 24) & 0xF] << 48)
| ((uint64_t)LUT[(x >> 20) & 0xF] << 40) | ((uint64_t)LUT[(x >> 16) & 0xF] << 32)
| ((uint64_t)LUT[(x >> 12) & 0xF] << 24) | ((uint64_t)LUT[(x >> 8) & 0xF] << 16)
| ((uint64_t)LUT[(x >> 4) & 0xF] << 8) | ((uint64_t)LUT[(x >> 0) & 0xF] << 0);
如有必要,可以增加查找表的大小
在 x86 上BMI2 https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#Parallel_bit_deposit_and_extract有硬件支持PDEP http://www.felixcloutier.com/x86/PDEP.html可以通过以下内在函数访问指令
output = _pdep_u64(x, 0xaaaaaaaaaaaaaaaaULL);
另一种没有位存储/扩展指令但具有快速乘法器的架构解决方案
uint64_t spaceOut8bits(uint8_t b)
{
uint64_t MAGIC = 0x8040201008040201;
uint64_t MASK = 0x8080808080808080;
uint64_t expand8bits = htobe64(((MAGIC*b) & MASK) >> 7);
uint64_t spacedOutBits = expand8bits*0x41041 & 0xAA000000AA000000;
return (spacedOutBits | (spacedOutBits << 24)) & 0xFFFF000000000000;
}
uint64_t spaceOut64bits(uint64_t x)
{
return (spaceOut8bits(x >> 24) >> 0)
| (spaceOut8bits(x >> 16) >> 16)
| (spaceOut8bits(x >> 8) >> 32)
| (spaceOut8bits(x >> 0) >> 48);
}
它的工作方式是这样的
- 第一步扩展输入位 https://stackoverflow.com/q/8461126/995714 from
abcdefgh
to a0000000 b0000000 c0000000 d0000000 e0000000 f0000000 g0000000 h0000000并存储在expand8bits
- 然后,我们在下一步中通过乘法和屏蔽将这些间隔开的位移近。在那之后
spacedOutBits
将包含a0b0c0d0 00000000 00000000 00000000 e0f0g0h0 00000000 00000000 00000000。我们将结果中的两个字节合并在一起
使位更接近的幻数是这样计算的
a0000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000
× 1000001000001000001
────────────────────────────────────────────────────────────────
a0000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000
00b0000000c0000000d0000000e0000000f0000000g0000000h0000000
+ 0000c0000000d0000000e0000000f0000000g0000000h0000000
000000d0000000e0000000f0000000g0000000h0000000
────────────────────────────────────────────────────────────────
a0b0c0d0b0c0d0e0c0d0e0f0d0e0f0g0e0f0g0h0f0g0h000g0h00000h0000000
& 1010101000000000000000000000000010101010000000000000000000000000
────────────────────────────────────────────────────────────────
a0b0c0d0000000000000000000000000e0f0g0h0000000000000000000000000
可以看到输出组件here https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGe1wAyeAyYAHI%2BAEaYxBIArKQADqgKhE4MHt6%2BekkpjgJBIeEsUTFc8XaYDmlCBEzEBBk%2Bfly2mPZ5DDV1BAVhkdFxtrX1jVktCsM9wX3FA2UAlLaoXsTI7BzmAMzByN5YANQmm24T%2BIIAdAhH2CYaAIK3d17BBABskgD6BPsRhB%2BYqgShnQEGegneX32qnm%2BwA9LD9hokcikXcLG4ACLYABiAHEABKPEwAdis932FKhh02GP2UFUhzMr0RqmxbOxKJhRzc3P2mzMXOJbjpEAZ5mZGlUKPZ2K5x15XFe8yOFkparhCLRmJROIJKMearFNLpYqZLLZKKRFstguF9MZEqlGmt0o5yLlN022H2AA5lZtVfD9mikdqNDikXqNAbKUbaaKHebkW7ncn3YchSLTY7U1a03mNB7rvtJP7A5qNBYNG4NBjw6ncRp8dHybHqfHsyya93q72ezXbVnE5LNhpR%2BOx5OJ0WvfsBSqYxS4ybh6ofeuN5ut36M3aE%2BKWZIj8eT6fS9TPd6uGXF/tiJgCCsGFCF/cSRiiW%2BzJssDQQvtsAADWUO5QgxSQICYGEoCgKDVx3bleXPEk90grkzUlZD5WOOcuViHlW3VIj1RQkU0NXAVqR5HDr13Mi4IPSVr3mZU33uNAGAmfYwQIH1IQCOQABUTFiCxFRE2kjkk0lbyAkCwIgxFFgA4DQPAiB9mvUgVPk9TcO0uS1MUzZFlk1SFI00sDPMvTYmUwyLP2JVrN0xTiVMwidKMjS/Rc7z9gATnsmyIK4Qs/IsrgtLM1yIC4AUIvUrgTMS0KrK8yK7KJYkPwDT8nheCFvl%2BAh/kBYEPgEwTQUKz5vmhbKyTuNV70fYhnygHiivmKqRIsfcvWLMwEIw1kJKo3lYiVOjOtqr4eqEvqBsvOdkNG7FxsQnDJD9W8SMzWbwTqhbhNE5ahsLVcNtiSTsOFSRLtIw63mO3qzqNFbFXQx1rtu6jhX5VjmuImaaqO%2Ba3v6j7i3i77zU2u7VsHZ7ush87Z19OHJV%2BiaaKVPa1SesGXohxb3ovYsSyxsabtx4VMdBrrXrJqGKYxx71oR/6KULV8HjfHL8qZyEFCBNYAHkvF4kqFGJvjiqBkkmrVJgpdQfYAFk7lxABJYUpJZH0NAeswNDCjQjZNs25ACAI%2BZVtXNbuIQAGkKQNyUja9i2fe9o2bbtvLPOF74ASBBh0B9GX232D4PgiZ5aEcBh44UAB3JgEneGCtd1twACoIjhrXXZnb13PtykQ/2UWmDWdBJYICxCAUGOw%2BBKOW/zzCwskLhVzuO5LXuIfLUrilWqfOla/rxvm4IVunpnzAG6l%2BfF8RsxS2pmVh%2BHvn3yFubvmXxv3hl4miqhRWZM8yf2unsXMEbzuF/R70t7lbmdqBkGZtPqWr9ZbQ1nF9OmfJ5yeX2naAB0sW7vwpAhRGX0CaUiXk/F%2BF8QHeiUuApSB9BZsTuBwRYtBOCxF4H4DgWhSCoE4DySw1ga7LFWJgRkmweCkAIJoEhiwADWIBTbnC4D6YaPpiRIi4AFWI0jjb6E4JIShPDaGcF4AoEAGguE8MWHAWAMBEAoFQCwBIdBojkEoGgYxpiYh1BYFFU2fA6AEGiOoiAERlG/GYMQAAnpwThli2CCHFgwWgvjqG8CwCwQwwBxDhNIPge8VQABumB1FxIBJUKW6xOEvDaMo2geAIjEDqN4jwWBlEEGIHgFg2i%2BAGGAAoAAangTAadxYJEYH43g/BBAiDEOwKQMhBCKBUOoOJuhNj6GiSgaw1h9CFPUZARYqAEgdDSQAWnFpsfY6yqApOIBEZImB1lMAUCwNRbRKgdBcBHUYzRSCBGmEUEo2RkipAEHc15uQ0i9GeXMVo7RqiTE%2BeMS5VQBBdHqL8/opQhjdBBXCqFTyYUSEWAoFhaw9CVMwNkkh8iOAUNIFQmhdCOB3AAEoa32MAZAyBNJcHOGYOkBSGBeGhHSBhVhLAGXwEQYg7CWj7A8FY%2Bg/KtgmV4Nw8JOikCWJMaK8xEA5XWJAMAexjik4uMoO4uJniSldNIAExgBBgmhOUZE6JsSaEJKuXgFJaSaEZOQFkg1uSyFxIKUUkpZT1g0MqdU2pVB6lNJaW0jpVDOE9OEKIcQgyo0jLUMo3Q8QDBGBmYw7lnrFkQGWastIGytk7L2dEQ5SgTlnIuYC5wEBXAguJA8iO0LZgxFeIkN5HQ61tu%2BfkZFzaQCtoqOCzowLPBNAkPWwdHRIVTEKCigdI7MjNAnZMJtLznLopWJilo2LcWkPIUouJpK1yvHWe8altL6WMo5bM7lAFeUkAFdpYV8rojsLspK2pEBZVGJfWQCgSqf0qoSAkZA7xgABQ1c44gridU0L1T4g1RqgkhLCdazAUSjBWoiXgRJjh7XKKdS67gvA3X5MKcUnxPqKlVJqdKupTAGnNNae0zpxGhnRv6RIaQ8alCJvGSAaQqbjA3psFm%2BAua1mcE2ds3Z%2BzS3HNOecgFtrq21tHWMBt6BV3/JyO89I6n7m6Y6Np2Fk6gXwoM3oMzEKV29peeMBdY6HPdBM6ipYm6BlcPvLu/FhLiW8FJUkNO0Qz13EEslc4Gg6Q0uQEFa9GabB3sIA%2BrYgrn3WPYaWD90r%2BEgHXOcCRZtpGyM2DIre%2BLFGkBqbETR/mVEcDUYIrRdHdEGOVQq/97WBh4FpSIyDWq3EeOCPqtjSGTUofNehy1vrsO4btakgjqhMnONdYIPJHryPeowDNrzAa6NBoYyG5j4aDVRr6bG7jsgE1jJoboMwUy02mAS/MiI2aJP5qk1sytKm/A1tuZZlojzZ19paEZtICKwc9uB/Z5TQ7p0Ius8Olzdn/kTAs4uqztnodzDRRizzO6ul7oJQeklnBj2nskPsHrdKRGRci/FrliXcDJbFd%2BNLgHRXsIFNlrQMrDEirMZ1jnAxiBJPeOs3Y0Tkpm369B7VQ2vFhP8UYwJ42zVxItZhnbNrkkLfSUt51K22OkY216yj23qN7d5/RxjoaWMRu6bIc7AzLvDN4zdnQeghPpsZy9t7tC80CALZsb7Q6bnuAB5p1zoP23g8j5Dhg0fYdTscxpxH06k9o5GJHrPM6Zj2dxx5rF3nCe%2BZJwFzg5KdZCDcOsxpd4xeSBpfsSXRhNKbDp/3CAnK5lJb5Y%2BoVwvWccPmDz3hpAECYCYFgGIOaKu8Gq7V5RpLGvbu0bl4kAVzg%2Bi4MSexAUAqWw0CmzgIeiXL9Uc13nROzDl/q2PlipB9kpGcJIIAA%3D。您可以更改编译器以查看它在各种架构上的完成情况
还有一种替代方法位摆弄黑客 https://graphics.stanford.edu/%7Eseander/bithacks.html#Interleave64bitOps page
z = ((x * 0x0101010101010101ULL & 0x8040201008040201ULL) *
0x0102040810204081ULL >> 49) & 0x5555 |
((y * 0x0101010101010101ULL & 0x8040201008040201ULL) *
0x0102040810204081ULL >> 48) & 0xAAAA;
更多解决方案可以在不使用 BMI2 的 PDEP 便携式高效替代品? https://stackoverflow.com/q/38938911/995714
有关的:如何对像素数据进行位条纹? https://stackoverflow.com/q/36256537/995714
正如您所看到的,如果没有位存储指令,操作将非常复杂。如果您不进行像这样的位条带化操作,那么最好使用 SIMD 并行执行