M4RI 1.0.1 — packedmatrix.h
#ifndef M4RI_PACKEDMATRIX_H
#define M4RI_PACKEDMATRIX_H

/*******************************************************************
 *
 *                 M4RI: Linear Algebra over GF(2)
 *
 *    Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
 *    Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
 *    Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
 *
 *  Distributed under the terms of the GNU General Public License (GPL)
 *  version 2 or higher.
 *
 *    This code is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *    General Public License for more details.
 *
 *  The full text of the GPL is available at:
 *
 *                  http://www.gnu.org/licenses/
 *
 ********************************************************************/

#include "m4ri_config.h"

#include <math.h>
#include <assert.h>
#include <stdio.h>

#if __M4RI_HAVE_SSE2
#include <emmintrin.h>
#endif

#include "misc.h"
#include "debug_dump.h"

#if __M4RI_HAVE_SSE2
/**
 * Minimum row width (in words) before the SSE2 code paths are taken.
 */
#define __M4RI_SSE2_CUTOFF 10
#endif

/**
 * Maximum number of words allocated for one memory block
 * (see mzd_block_t below).
 */
#define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27)

/**
 * Blocking dimension for matrix multiplication, derived from the
 * configured L2 cache size and capped at 2048.
 */
#define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L2_CACHE))) / 2, 2048)

/**
 * One contiguous chunk of memory holding matrix words.
 */
typedef struct {
  size_t size;   /* size of the block in bytes */
  word* begin;   /* first word of the block */
  word* end;     /* one past the last word of the block */
} mzd_block_t;

/**
 * \brief Dense matrix over GF(2), packed 64 (m4ri_radix) columns per word.
 *
 * The struct is kept at 64 bytes (one cache line); see the #if 0 note
 * below.  A matrix may be a "window" into another matrix, in which case
 * it does not own its blocks (see mzd_is_windowed / mzd_owns_blocks).
 */
typedef struct mzd_t {
  rci_t nrows;          /**< Number of rows. */
  rci_t ncols;          /**< Number of columns. */
  wi_t width;           /**< Number of words per row holding matrix data. */
  wi_t rowstride;       /**< Distance in words between consecutive rows
                             (used as row * rowstride in mzd_row). */
  wi_t offset_vector;   /**< Word offset from blocks[0].begin to row 0;
                             mzd_first_row returns blocks[0].begin + offset_vector. */
  wi_t row_offset;      /**< Row index of this matrix's first row within its
                             containing block (nonzero for windows); see
                             mzd_row_to_block. */
  uint16_t offset;      /**< Bit offset of column 0 inside the first word;
                             bit accessors index with (col + offset). */
  uint8_t flags;        /**< Bitmask of the mzd_flag_* constants below. */
  uint8_t blockrows_log;/**< log2 of the number of rows per block
                             (shift count in mzd_row_to_block). */

#if 0 // Commented out in order to keep the size of mzd_t 64 bytes (one cache line).
      // This could be added back if rows was ever removed.
  int blockrows_mask;
#endif

  word high_bitmask;    /**< NOTE(review): presumably masks the valid bits of
                             the last word of each row — confirm at init site. */
  word low_bitmask;     /**< NOTE(review): presumably masks the valid bits of
                             the first word of each row — confirm at init site. */
  mzd_block_t *blocks;  /**< Memory blocks holding the matrix words. */
  word **rows;          /**< rows[i] points to the first word of row i. */
} mzd_t;

/**
 * Padding (in words) appended to rows; exact use is established in the
 * allocation code, not visible here.
 */
static wi_t const mzd_paddingwidth = 3;

/* Flag bits stored in mzd_t::flags. */
static uint8_t const mzd_flag_nonzero_offset = 0x1;
static uint8_t const mzd_flag_nonzero_excess = 0x2;
static uint8_t const mzd_flag_windowed_zerooffset = 0x4;
static uint8_t const mzd_flag_windowed_zeroexcess = 0x8;
static uint8_t const mzd_flag_windowed_ownsblocks = 0x10;
static uint8_t const mzd_flag_multiple_blocks = 0x20;

/**
 * \brief Test if this matrix is a window into another matrix.
 *
 * \param M Matrix.
 * \return Nonzero iff M has a nonzero offset or is a zero-offset window.
 */
static inline int mzd_is_windowed(mzd_t const *M) {
  return M->flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset);
}

/**
 * \brief Test if this matrix owns its memory blocks (and thus must free
 * them).
 *
 * \param M Matrix.
 * \return Nonzero iff M->blocks is non-NULL and M either is not a window
 *         or is a window that owns its blocks.
 */
static inline int mzd_owns_blocks(mzd_t const *M) {
  return M->blocks && (!mzd_is_windowed(M) || ((M->flags & mzd_flag_windowed_ownsblocks)));
}

/**
 * \brief Pointer to the first word of row 0.
 *
 * Computed from blocks[0] and offset_vector; the assert checks it agrees
 * with the rows[] table whenever the matrix has at least one row.
 */
static inline word* mzd_first_row(mzd_t const *M) {
  word* result = M->blocks[0].begin + M->offset_vector;
  assert(M->nrows == 0 || result == M->rows[0]);
  return result;
}

/**
 * \brief Pointer to the first row stored in block n (n > 0).
 *
 * The row_offset correction only applies to block 0, hence it is
 * subtracted back out here.
 */
static inline word* mzd_first_row_next_block(mzd_t const* M, int n) {
  assert(n > 0);
  return M->blocks[n].begin + M->offset_vector - M->row_offset * M->rowstride;
}

/**
 * \brief Index of the memory block containing the given row.
 */
static inline int mzd_row_to_block(mzd_t const* M, rci_t row) {
  return (M->row_offset + row) >> M->blockrows_log;
}

/**
 * \brief Number of rows of M stored in block n.
 *
 * For a single-block matrix this is nrows for block 0 and 0 otherwise.
 * For multi-block matrices, block 0 loses row_offset rows, interior
 * blocks are full, and the last block holds the remainder.
 */
static inline wi_t mzd_rows_in_block(mzd_t const* M, int n) {
  if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
    if (__M4RI_UNLIKELY(n == 0)) {
      return (1 << M->blockrows_log) - M->row_offset;
    } else {
      int const last_block = mzd_row_to_block(M, M->nrows - 1);
      if (n < last_block)
        return (1 << M->blockrows_log);
      return M->nrows + M->row_offset - (n << M->blockrows_log);
    }
  }
  return n ? 0 : M->nrows;
}

/**
 * \brief Pointer to the first word of the given row.
 *
 * Fast path assumes a single block; the multi-block path re-bases the
 * offset onto the correct block (all blocks share blocks[0].size — see
 * the division by sizeof(word)).  The assert cross-checks against rows[].
 */
static inline word* mzd_row(mzd_t const* M, rci_t row) {
  wi_t big_vector = M->offset_vector + row * M->rowstride;
  word* result = M->blocks[0].begin + big_vector;
  if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
    int const n = (M->row_offset + row) >> M->blockrows_log;
    result = M->blocks[n].begin + big_vector - n * (M->blocks[0].size / sizeof(word));
  }
  assert(result == M->rows[row]);
  return result;
}

/**
 * \brief Create a new r x c matrix.  Caller frees with mzd_free().
 */
mzd_t *mzd_init(rci_t const r, rci_t const c);

/**
 * \brief Free a matrix created with mzd_init().
 */
void mzd_free(mzd_t *A);

/**
 * \brief Create a window into M covering rows [lowr, highr) and columns
 * [lowc, highc).  The window shares memory with M.
 */
mzd_t *mzd_init_window(mzd_t *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);

/**
 * \brief Const variant of mzd_init_window(); the cast is safe because the
 * returned window is itself const.
 */
static inline mzd_t const *mzd_init_window_const(mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc)
{
  return mzd_init_window((mzd_t*)M, lowr, lowc, highr, highc);
}

/**
 * \brief Free a window; identical to mzd_free.
 */
#define mzd_free_window mzd_free

/**
 * \brief Swap rows rowa and rowb, starting at word index startblock.
 *
 * Requires M->offset == 0 (asserted); the last word is swapped under
 * mask_end so padding bits past ncols are left untouched.
 */
static inline void _mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb, wi_t const startblock) {
  if ((rowa == rowb) || (startblock >= M->width))
    return;

  /* This is the case since we're only called from _mzd_pls_mmpf,
   * which makes the same assumption. Therefore we don't need
   * to take a mask_begin into account. */
  assert(M->offset == 0);

  wi_t width = M->width - startblock - 1;
  word *a = M->rows[rowa] + startblock;
  word *b = M->rows[rowb] + startblock;
  word tmp;
  word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);

  /* Full-word swaps for all but the last word. */
  if (width != 0) {
    for(wi_t i = 0; i < width; ++i) {
      tmp = a[i];
      a[i] = b[i];
      b[i] = tmp;
    }
  }
  /* Masked XOR-swap of the final (partial) word. */
  tmp = (a[width] ^ b[width]) & mask_end;
  a[width] ^= tmp;
  b[width] ^= tmp;

  __M4RI_DD_ROW(M, rowa);
  __M4RI_DD_ROW(M, rowb);
}

/**
 * \brief Swap rows rowa and rowb.
 *
 * Unlike _mzd_row_swap(), this handles a nonzero bit offset: the first
 * word is swapped under mask_begin and the last under mask_end, so bits
 * outside the matrix are preserved.
 */
static inline void mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb) {
  if(rowa == rowb)
    return;

  wi_t width = M->width - 1;
  word *a = M->rows[rowa];
  word *b = M->rows[rowb];
  word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - M->offset);
  word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);

  word tmp = (a[0] ^ b[0]) & mask_begin;
  if (width != 0) {
    a[0] ^= tmp;
    b[0] ^= tmp;

    for(wi_t i = 1; i < width; ++i) {
      tmp = a[i];
      a[i] = b[i];
      b[i] = tmp;
    }
    tmp = (a[width] ^ b[width]) & mask_end;
    a[width] ^= tmp;
    b[width] ^= tmp;

  } else {
    /* Single-word row: both masks apply to the same word. */
    tmp &= mask_end;
    a[0] ^= tmp;
    b[0] ^= tmp;
  }

  __M4RI_DD_ROW(M, rowa);
  __M4RI_DD_ROW(M, rowb);
}

/**
 * \brief Copy row j of A to row i of B.
 */
void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j);

/**
 * \brief Swap columns cola and colb.
 */
void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb);

/**
 * \brief Swap columns cola and colb, restricted to rows
 * [start_row, stop_row).
 *
 * Works block by block; when both columns fall into the same word an
 * unrolled-by-4 XOR-swap of the two bit positions is used, otherwise the
 * words at a_word/b_word are combined via the lower of the two bits.
 */
static inline void mzd_col_swap_in_rows(mzd_t *M, rci_t const cola, rci_t const colb, rci_t const start_row, rci_t const stop_row) {
  if (cola == colb)
    return;

  rci_t const _cola = cola + M->offset;
  rci_t const _colb = colb + M->offset;

  wi_t const a_word = _cola / m4ri_radix;
  wi_t const b_word = _colb / m4ri_radix;

  int const a_bit = _cola % m4ri_radix;
  int const b_bit = _colb % m4ri_radix;

  word* RESTRICT ptr = mzd_row(M, start_row);
  int max_bit = MAX(a_bit, b_bit);
  int count_remaining = stop_row - start_row;
  int min_bit = a_bit + b_bit - max_bit;
  int block = mzd_row_to_block(M, start_row);
  int offset = max_bit - min_bit;
  word mask = m4ri_one << min_bit;
  int count = MIN(mzd_rows_in_block(M, 0), count_remaining);
  // Apparently we're calling with start_row == stop_row sometimes (seems a bug to me).
  if (count <= 0)
    return;

  if (a_word == b_word) {
    while(1) {
      count_remaining -= count;
      ptr += a_word;
      int fast_count = count / 4;
      int rest_count = count - 4 * fast_count;
      word xor_v[4];
      wi_t const rowstride = M->rowstride;
      /* 4-row unrolled loop: swap the two bits inside one word per row. */
      while (fast_count--) {
        xor_v[0] = ptr[0];
        xor_v[1] = ptr[rowstride];
        xor_v[2] = ptr[2 * rowstride];
        xor_v[3] = ptr[3 * rowstride];
        xor_v[0] ^= xor_v[0] >> offset;
        xor_v[1] ^= xor_v[1] >> offset;
        xor_v[2] ^= xor_v[2] >> offset;
        xor_v[3] ^= xor_v[3] >> offset;
        xor_v[0] &= mask;
        xor_v[1] &= mask;
        xor_v[2] &= mask;
        xor_v[3] &= mask;
        xor_v[0] |= xor_v[0] << offset;
        xor_v[1] |= xor_v[1] << offset;
        xor_v[2] |= xor_v[2] << offset;
        xor_v[3] |= xor_v[3] << offset;
        ptr[0] ^= xor_v[0];
        ptr[rowstride] ^= xor_v[1];
        ptr[2 * rowstride] ^= xor_v[2];
        ptr[3 * rowstride] ^= xor_v[3];
        ptr += 4 * rowstride;
      }
      while (rest_count--) {
        word xor_v = *ptr;
        xor_v ^= xor_v >> offset;
        xor_v &= mask;
        *ptr ^= xor_v | (xor_v << offset);
        ptr += rowstride;
      }
      if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
    }
  } else {
    /* Bits live in different words: address via the word holding the
     * lower bit and reach the other word through max_offset. */
    word* RESTRICT min_ptr;
    wi_t max_offset;
    if (min_bit == a_bit) {
      min_ptr = ptr + a_word;
      max_offset = b_word - a_word;
    } else {
      min_ptr = ptr + b_word;
      max_offset = a_word - b_word;
    }
    while(1) {
      count_remaining -= count;
      wi_t const rowstride = M->rowstride;
      while(count--) {
        word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
        min_ptr[0] ^= xor_v;
        min_ptr[max_offset] ^= xor_v << offset;
        min_ptr += rowstride;
      }
      if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
      if (min_bit == a_bit)
        min_ptr = ptr + a_word;
      else
        min_ptr = ptr + b_word;
    }
  }

  __M4RI_DD_MZD(M);
}

/**
 * \brief Read the bit at (row, col).  No bounds checking.
 */
static inline BIT mzd_read_bit(mzd_t const *M, rci_t const row, rci_t const col ) {
  return __M4RI_GET_BIT(M->rows[row][(col+M->offset)/m4ri_radix], (col+M->offset) % m4ri_radix);
}

/**
 * \brief Write value to the bit at (row, col).  No bounds checking.
 */
static inline void mzd_write_bit(mzd_t *M, rci_t const row, rci_t const col, BIT const value) {
  __M4RI_WRITE_BIT(M->rows[row][(col + M->offset) / m4ri_radix], (col + M->offset) % m4ri_radix, value);
}

/**
 * \brief XOR n bits of values into row x starting at column y.
 *
 * Bits may spill into the following word when they cross a word boundary.
 */
static inline void mzd_xor_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  M->rows[x][block] ^= values << spot;
  int const space = m4ri_radix - spot;
  if (n > space)
    M->rows[x][block + 1] ^= values >> space;
}

/**
 * \brief AND n bits of values into row x starting at column y.
 *
 * values is expected left-aligned (top n bits); it is shifted down first.
 */
static inline void mzd_and_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
  /* This is the best way, since this will drop out once we inverse the bits in values: */
  values >>= (m4ri_radix - n); /* Move the bits to the lowest columns */

  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  M->rows[x][block] &= values << spot;
  int const space = m4ri_radix - spot;
  if (n > space)
    M->rows[x][block + 1] &= values >> space;
}

/**
 * \brief Clear n bits in row x starting at column y.
 */
static inline void mzd_clear_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
  word values = m4ri_ffff >> (m4ri_radix - n);
  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  M->rows[x][block] &= ~(values << spot);
  int const space = m4ri_radix - spot;
  if (n > space)
    M->rows[x][block + 1] &= ~(values >> space);
}

/**
 * \brief Print the matrix to stdout.
 */
void mzd_print(mzd_t const *M);

/**
 * \brief Print the matrix to stdout without column separators.
 */
void mzd_print_tight(mzd_t const *M);

/**
 * \brief Add row srcrow to row dstrow, restricted to columns >= coloffset.
 *
 * The first word is added under mask_begin; any stray change to the
 * excess bits of the last word is reverted afterwards via ~mask_end.
 * Uses 16-byte aligned SSE2 XOR for the bulk when available.
 */
static inline void mzd_row_add_offset(mzd_t *M, rci_t dstrow, rci_t srcrow, rci_t coloffset) {
  assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols);
  coloffset += M->offset;
  wi_t const startblock= coloffset/m4ri_radix;
  wi_t wide = M->width - startblock;
  word *src = M->rows[srcrow] + startblock;
  word *dst = M->rows[dstrow] + startblock;
  word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset % m4ri_radix);
  word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);

  *dst++ ^= *src++ & mask_begin;
  --wide;

#if __M4RI_HAVE_SSE2
  int not_aligned = __M4RI_ALIGNMENT(src,16) != 0; /* 0: Aligned, 1: Not aligned */
  if (wide > not_aligned + 1)                      /* Speed up for small matrices */
  {
    if (not_aligned) {
      *dst++ ^= *src++;
      --wide;
    }
    /* Now wide > 1 */
    __m128i* __src = (__m128i*)src;
    __m128i* __dst = (__m128i*)dst;
    __m128i* const eof = (__m128i*)((unsigned long)(src + wide) & ~0xFUL);
    do
    {
      __m128i xmm1 = _mm_xor_si128(*__dst, *__src);
      *__dst++ = xmm1;
    }
    while(++__src < eof);
    src = (word*)__src;
    dst = (word*)__dst;
    wide = ((sizeof(word)*wide)%16)/sizeof(word);
  }
#endif
  wi_t i = -1;
  while(++i < wide)
    dst[i] ^= src[i];
  /*
   * Revert possibly non-zero excess bits.
   * Note that i == wide here, and wide can be 0.
   * But really, src[wide - 1] is M->rows[srcrow][M->width - 1] ;)
   * We use i - 1 here to let the compiler know these are the same addresses
   * that we last accessed, in the previous loop.
   */
  dst[i - 1] ^= src[i - 1] & ~mask_end;

  __M4RI_DD_ROW(M, dstrow);
}

/**
 * \brief Add sourcerow to destrow (full rows).
 */
void mzd_row_add(mzd_t *M, rci_t const sourcerow, rci_t const destrow);

/**
 * \brief Transpose A into DST (DST may be NULL, in which case a new
 * matrix is created — convention per the other constructors; confirm in
 * the implementation).
 */
mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A);

/**
 * \brief Naive cubic matrix product C = A * B.
 */
mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);

/**
 * \brief Naive cubic C = C + A * B.
 */
mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);

/**
 * \brief Internal naive product; clears C first iff clear is nonzero.
 */
mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, int const clear);

/**
 * \brief Internal vector-times-matrix product C = v * A.
 */
mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear);

/**
 * \brief Fill M with uniformly random bits.
 */
void mzd_randomize(mzd_t *M);

/**
 * \brief Set M to value times the identity pattern (value on the
 * diagonal, zero elsewhere).
 */
void mzd_set_ui(mzd_t *M, unsigned int const value);

/**
 * \brief Gaussian elimination starting at column startcol; full echelon
 * form iff full is nonzero.  Returns the rank.
 */
rci_t mzd_gauss_delayed(mzd_t *M, rci_t const startcol, int const full);

/**
 * \brief Naive Gaussian elimination; full echelon form iff full.
 * Returns the rank.
 */
rci_t mzd_echelonize_naive(mzd_t *M, int const full);

/**
 * \brief Return nonzero iff A == B.
 */
int mzd_equal(mzd_t const *A, mzd_t const *B);

/**
 * \brief Three-way comparison of A and B.
 */
int mzd_cmp(mzd_t const *A, mzd_t const *B);

/**
 * \brief Copy A into DST (DST may be NULL to allocate).
 */
mzd_t *mzd_copy(mzd_t *DST, mzd_t const *A);

/**
 * \brief C = [A | B] — concatenate columns.
 */
mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B);

/**
 * \brief C = [A ; B] — stack rows.
 */
mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B);

/**
 * \brief Copy the submatrix of M with rows [lowr, highr) and columns
 * [lowc, highc) into S.
 */
mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);

/**
 * \brief Naive inversion of A, given an identity matrix I.
 */
mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I);

/**
 * \brief C = A + B (XOR over GF(2)).
 */
mzd_t *mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);

/**
 * \brief Internal addition without parameter checking.
 */
mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);

/**
 * \brief Subtraction equals addition over GF(2).
 */
#define mzd_sub mzd_add

/**
 * \brief Internal subtraction equals internal addition over GF(2).
 */
#define _mzd_sub _mzd_add
/**
 * \brief Read n (1..m4ri_radix) consecutive bits from row x starting at
 * column y, returned in the low n bits of a word.
 *
 * When the range crosses a word boundary (spill > 0), bits are gathered
 * from both words; the final shift right-aligns the result.
 */
static inline word mzd_read_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  int const spill = spot + n - m4ri_radix;
  word temp = (spill <= 0) ? M->rows[x][block] << -spill : (M->rows[x][block + 1] << (m4ri_radix - spill)) | (M->rows[x][block] >> spill);
  return temp >> (m4ri_radix - n);
}

/**
 * \brief row3[startblock3:] = row1[startblock1:] + row2[startblock2:]
 * across three (possibly distinct) matrices.
 */
void mzd_combine(mzd_t *DST, rci_t const row3, wi_t const startblock3,
                 mzd_t const *SC1, rci_t const row1, wi_t const startblock1,
                 mzd_t const *SC2, rci_t const row2, wi_t const startblock2);

/**
 * \brief Bit-by-word fallback for mzd_combine when offsets/alignments
 * differ: C[c_row] = A[a_row] + B[b_row], done via the generic bit
 * accessors one radix-sized chunk at a time.
 *
 * NOTE(review): the a_startblock/b_startblock/c_startblock parameters are
 * unused here — iteration always starts at column 0; confirm callers
 * expect that.
 */
static inline void mzd_combine_weird(mzd_t *C, rci_t const c_row, wi_t const c_startblock,
                                     mzd_t const *A, rci_t const a_row, wi_t const a_startblock,
                                     mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
  word tmp;
  rci_t i = 0;

  /* Full m4ri_radix-wide chunks. */
  for(; i + m4ri_radix <= A->ncols; i += m4ri_radix) {
    tmp = mzd_read_bits(A, a_row, i, m4ri_radix) ^ mzd_read_bits(B, b_row, i, m4ri_radix);
    mzd_clear_bits(C, c_row, i, m4ri_radix);
    mzd_xor_bits(C, c_row, i, m4ri_radix, tmp);
  }
  /* Trailing partial chunk, if the column count is not a radix multiple. */
  if(A->ncols - i) {
    tmp = mzd_read_bits(A, a_row, i, (A->ncols - i)) ^ mzd_read_bits(B, b_row, i, (B->ncols - i));
    mzd_clear_bits(C, c_row, i, (C->ncols - i));
    mzd_xor_bits(C, c_row, i, (C->ncols - i), tmp);
  }

  __M4RI_DD_MZD(C);
}

/**
 * \brief A[a_row] ^= B[b_row], word-aligned case (zero bit offsets).
 *
 * Bulk XOR via SSE2 when wide enough; remainder handled with a Duff's
 * device (the switch intentionally falls through all cases).  The last
 * word is XORed under a mask so padding bits of A stay clean.
 */
static inline void mzd_combine_even_in_place(mzd_t *A, rci_t const a_row, wi_t const a_startblock,
                                             mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {

  wi_t wide = A->width - a_startblock - 1;

  word *a = A->rows[a_row] + a_startblock;
  word *b = B->rows[b_row] + b_startblock;

#if __M4RI_HAVE_SSE2
  if(wide > __M4RI_SSE2_CUTOFF) {
    /* Align a to 16 bytes first. */
    if (__M4RI_ALIGNMENT(a,16)) {
      *a++ ^= *b++;
      wide--;
    }

    if (__M4RI_ALIGNMENT(a, 16) == 0 && __M4RI_ALIGNMENT(b, 16) == 0) {
      __m128i *a128 = (__m128i*)a;
      __m128i *b128 = (__m128i*)b;
      const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);

      do {
        *a128 = _mm_xor_si128(*a128, *b128);
        ++b128;
        ++a128;
      } while(a128 < eof);

      a = (word*)a128;
      b = (word*)b128;
      wide = ((sizeof(word) * wide) % 16) / sizeof(word);
    }
  }
#endif // __M4RI_HAVE_SSE2

  if (wide > 0) {
    wi_t n = (wide + 7) / 8;
    /* Duff's device: deliberate fallthrough through all cases. */
    switch (wide % 8) {
    case 0: do { *(a++) ^= *(b++);
    case 7:      *(a++) ^= *(b++);
    case 6:      *(a++) ^= *(b++);
    case 5:      *(a++) ^= *(b++);
    case 4:      *(a++) ^= *(b++);
    case 3:      *(a++) ^= *(b++);
    case 2:      *(a++) ^= *(b++);
    case 1:      *(a++) ^= *(b++);
            } while (--n > 0);
    }
  }

  /* Last word: only touch bits inside A's column range. */
  *a ^= *b & __M4RI_LEFT_BITMASK(A->ncols%m4ri_radix);

  __M4RI_DD_MZD(A);
}

/**
 * \brief C[c_row] = A[a_row] ^ B[b_row], word-aligned case.
 *
 * Same structure as mzd_combine_even_in_place() but writes into a third
 * row.  The final statement folds the masked last words of a, b, c so
 * that only bits inside C's column range change.
 */
static inline void mzd_combine_even(mzd_t *C, rci_t const c_row, wi_t const c_startblock,
                                    mzd_t const *A, rci_t const a_row, wi_t const a_startblock,
                                    mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {

  wi_t wide = A->width - a_startblock - 1;
  word *a = A->rows[a_row] + a_startblock;
  word *b = B->rows[b_row] + b_startblock;
  word *c = C->rows[c_row] + c_startblock;

  /* /\* this is a corner case triggered by Strassen multiplication */
  /*  * which assumes certain (virtual) matrix sizes */
  /*  * 2011/03/07: I don't think this was ever correct *\/ */
  /* if (a_row >= A->nrows) { */
  /*   assert(a_row < A->nrows); */
  /*   for(wi_t i = 0; i < wide; ++i) { */
  /*     c[i] = b[i]; */
  /*   } */
  /* } else { */
#if __M4RI_HAVE_SSE2
  if(wide > __M4RI_SSE2_CUTOFF) {
    /* Align a to 16 bytes first. */
    if (__M4RI_ALIGNMENT(a,16)) {
      *c++ = *b++ ^ *a++;
      wide--;
    }

    if ( (__M4RI_ALIGNMENT(b, 16) | __M4RI_ALIGNMENT(c, 16)) == 0) {
      __m128i *a128 = (__m128i*)a;
      __m128i *b128 = (__m128i*)b;
      __m128i *c128 = (__m128i*)c;
      const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);

      do {
        *c128 = _mm_xor_si128(*a128, *b128);
        ++c128;
        ++b128;
        ++a128;
      } while(a128 < eof);

      a = (word*)a128;
      b = (word*)b128;
      c = (word*)c128;
      wide = ((sizeof(word) * wide) % 16) / sizeof(word);
    }
  }
#endif // __M4RI_HAVE_SSE2

  if (wide > 0) {
    wi_t n = (wide + 7) / 8;
    /* Duff's device: deliberate fallthrough through all cases. */
    switch (wide % 8) {
    case 0: do { *(c++) = *(a++) ^ *(b++);
    case 7:      *(c++) = *(a++) ^ *(b++);
    case 6:      *(c++) = *(a++) ^ *(b++);
    case 5:      *(c++) = *(a++) ^ *(b++);
    case 4:      *(c++) = *(a++) ^ *(b++);
    case 3:      *(c++) = *(a++) ^ *(b++);
    case 2:      *(c++) = *(a++) ^ *(b++);
    case 1:      *(c++) = *(a++) ^ *(b++);
            } while (--n > 0);
    }
  }
  /* Last word, masked to C's column range. */
  *c ^= ((*a ^ *b ^ *c) & __M4RI_LEFT_BITMASK(C->ncols%m4ri_radix));

  __M4RI_DD_MZD(C);
}

/**
 * \brief mzd_read_bits() with the result converted to int.
 */
static inline int mzd_read_bits_int(mzd_t const *M, rci_t const x, rci_t const y, int const n)
{
  return __M4RI_CONVERT_TO_INT(mzd_read_bits(M, x, y, n));
}

/**
 * \brief Return nonzero iff A is the zero matrix.
 */
int mzd_is_zero(mzd_t const *A);

/**
 * \brief Clear row, restricted to columns >= coloffset.
 */
void mzd_row_clear_offset(mzd_t *M, rci_t const row, rci_t const coloffset);

/**
 * \brief Find the first nonzero entry at or after (start_row, start_col);
 * its coordinates are written to *r and *c.  Returns nonzero on success.
 */
int mzd_find_pivot(mzd_t const *M, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c);

/**
 * \brief Proportion of nonzero entries in A, sampling every res-th word
 * (res semantics established in the implementation).
 */
double mzd_density(mzd_t const *A, wi_t res);

/**
 * \brief Density of the submatrix of A below/right of (r, c).
 */
double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c);

/**
 * \brief Index of the first row that is entirely zero.
 */
rci_t mzd_first_zero_row(mzd_t const *A);

#endif // M4RI_PACKEDMATRIX_H