M4RI  1.0.1
mzd.h
Go to the documentation of this file.
1 
10 #ifndef M4RI_MZD
11 #define M4RI_MZD
12 
13 /*******************************************************************
14 *
15 * M4RI: Linear Algebra over GF(2)
16 *
17 * Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
18 * Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
19 * Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
20 *
21 * Distributed under the terms of the GNU General Public License (GPL)
22 * version 2 or higher.
23 *
24 * This code is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 * General Public License for more details.
28 *
29 * The full text of the GPL is available at:
30 *
31 * http://www.gnu.org/licenses/
32 *
33 ********************************************************************/
34 
35 #include <m4ri/m4ri_config.h>
36 
37 #include <math.h>
38 #include <assert.h>
39 #include <stdio.h>
40 
41 #if __M4RI_HAVE_SSE2
42 #include <emmintrin.h>
43 #endif
44 
45 #include <m4ri/misc.h>
46 #include <m4ri/debug_dump.h>
47 
#if __M4RI_HAVE_SSE2

/** Minimum width (in words) a row operation must have before the SSE2 code paths are taken. */
#define __M4RI_SSE2_CUTOFF 10
#endif

/** Maximum number of words allocated in one contiguous block (2^27 words). */
#define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27)

/** Blocking parameter for matrix multiplication, derived from the L3 cache size (capped at 2048). */
#define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L3_CACHE))) / 2, 2048)
76 
/**
 * \brief A contiguous memory block holding packed matrix words.
 */
typedef struct {
  size_t size;   /**< Size in bytes (mzd_row divides by sizeof(word) to get a word count). */
  word* begin;   /**< Pointer to the first word of the block. */
  word* end;     /**< Pointer past the last word of the block. */
} mzd_block_t;
82 
/**
 * \brief Dense matrix over GF(2).
 *
 * NOTE(review): this documentation listing omits several member
 * declarations -- the inline functions below also reference nrows,
 * ncols, width, rowstride, offset_vector, row_offset, blocks and
 * rows, which must be declared in the original header.  Only the
 * members visible here are annotated; consult the full mzd.h.
 */
typedef struct mzd_t {

  /* ... members elided by the documentation extraction ... */

  /** Column offset of the first column; nonzero only for windowed
   * matrices (presumably always < m4ri_radix -- TODO confirm). */
  uint16_t offset;

  /** Bitfield combining the mzd_flag_* constants declared below. */
  uint8_t flags;

  /** log2 of the number of rows per memory block (rows-per-block is a power of two). */
  uint8_t blockrows_log;

#if 0 // Commented out in order to keep the size of mzd_t 64 bytes (one cache line). This could be added back if rows was ever removed.

  int blockrows_mask;
#endif

} mzd_t;
198 
/** Row padding width in words -- presumably used to align rowstride; verify against mzd_init. */
static wi_t const mzd_paddingwidth = 3;

static uint8_t const mzd_flag_nonzero_offset = 0x1;        /**< Set when offset != 0 (see mzd_is_windowed). */
static uint8_t const mzd_flag_nonzero_excess = 0x2;        /**< Set when the last word has unused (excess) bits. */
static uint8_t const mzd_flag_windowed_zerooffset = 0x4;   /**< Window with a word-aligned (zero) column offset. */
static uint8_t const mzd_flag_windowed_zeroexcess = 0x8;   /**< Window whose last word has no excess bits. */
static uint8_t const mzd_flag_windowed_ownsblocks = 0x10;  /**< Window owns its blocks array (see mzd_owns_blocks). */
static uint8_t const mzd_flag_multiple_blocks = 0x20;      /**< Matrix data spans more than one memory block. */
210 
218 static inline int mzd_is_windowed(mzd_t const *M) {
219  return M->flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset);
220 }
221 
229 static inline int mzd_owns_blocks(mzd_t const *M) {
230  return M->blocks && (!mzd_is_windowed(M) || ((M->flags & mzd_flag_windowed_ownsblocks)));
231 }
232 
241 static inline word* mzd_first_row(mzd_t const *M) {
242  word* result = M->blocks[0].begin + M->offset_vector;
243  assert(M->nrows == 0 || result == M->rows[0]);
244  return result;
245 }
246 
257 static inline word* mzd_first_row_next_block(mzd_t const* M, int n) {
258  assert(n > 0);
259  return M->blocks[n].begin + M->offset_vector - M->row_offset * M->rowstride;
260 }
261 
271 static inline int mzd_row_to_block(mzd_t const* M, rci_t row) {
272  return (M->row_offset + row) >> M->blockrows_log;
273 }
274 
288 static inline wi_t mzd_rows_in_block(mzd_t const* M, int n) {
289  if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
290  if (__M4RI_UNLIKELY(n == 0)) {
291  return (1 << M->blockrows_log) - M->row_offset;
292  } else {
293  int const last_block = mzd_row_to_block(M, M->nrows - 1);
294  if (n < last_block)
295  return (1 << M->blockrows_log);
296  return M->nrows + M->row_offset - (n << M->blockrows_log);
297  }
298  }
299  return n ? 0 : M->nrows;
300 }
301 
/**
 * \brief Number of rows in r's block at or below row r.
 *
 * \param M Matrix.
 * \param r Row index.
 *
 * \return mzd_rows_in_block(M, block-of-r) minus r's position inside
 *         that block.
 */
static inline wi_t mzd_remaining_rows_in_block(mzd_t const* M, rci_t r) {
  const int n = mzd_row_to_block(M, r);
  /* Reduce r to its index relative to the start of block n
   * (not yet corrected for row_offset; the branches below are). */
  r = (r - (n << M->blockrows_log));
  if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
    if (__M4RI_UNLIKELY(n == 0)) {
      /* Block 0 holds (2^blockrows_log - row_offset) rows. */
      return (1 << M->blockrows_log) - M->row_offset - r;
    } else {
      int const last_block = mzd_row_to_block(M, M->nrows - 1);
      if (n < last_block)
        return (1 << M->blockrows_log) - r;   /* interior block: full capacity */
      /* Last block: only the rows that actually exist. */
      return M->nrows + M->row_offset - (n << M->blockrows_log) - r;
    }
  }
  /* Single-block matrix. */
  return n ? 0 : M->nrows - r;
}
326 
/**
 * \brief Pointer to the first word of the given row.
 *
 * Computes the address from the block layout rather than reading
 * M->rows, and asserts both agree.
 *
 * \param M   Matrix.
 * \param row Row index.
 *
 * \return Pointer equal to M->rows[row].
 */
static inline word* mzd_row(mzd_t const* M, rci_t row) {
  /* Offset of the row as if all blocks were one contiguous vector. */
  wi_t big_vector = M->offset_vector + row * M->rowstride;
  word* result = M->blocks[0].begin + big_vector;
  if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
    /* Subtract the words covered by the n preceding (equal-sized) blocks. */
    int const n = (M->row_offset + row) >> M->blockrows_log;
    result = M->blocks[n].begin + big_vector - n * (M->blocks[0].size / sizeof(word));
  }
  assert(result == M->rows[row]);
  return result;
}
346 
357 mzd_t *mzd_init(rci_t const r, rci_t const c);
358 
365 void mzd_free(mzd_t *A);
366 
367 
389 mzd_t *mzd_init_window(mzd_t *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);
390 
397 static inline mzd_t const *mzd_init_window_const(mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc)
398 {
399  return mzd_init_window((mzd_t*)M, lowr, lowc, highr, highc);
400 }
401 
408 #define mzd_free_window mzd_free
409 
419 static inline void _mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb, wi_t const startblock) {
420  if ((rowa == rowb) || (startblock >= M->width))
421  return;
422 
423  /* This is the case since we're only called from _mzd_ple_mmpf,
424  * which makes the same assumption. Therefore we don't need
425  * to take a mask_begin into account. */
426  assert(M->offset == 0);
427 
428  wi_t width = M->width - startblock - 1;
429  word *a = M->rows[rowa] + startblock;
430  word *b = M->rows[rowb] + startblock;
431  word tmp;
432  word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);
433 
434  if (width != 0) {
435  for(wi_t i = 0; i < width; ++i) {
436  tmp = a[i];
437  a[i] = b[i];
438  b[i] = tmp;
439  }
440  }
441  tmp = (a[width] ^ b[width]) & mask_end;
442  a[width] ^= tmp;
443  b[width] ^= tmp;
444 
445  __M4RI_DD_ROW(M, rowa);
446  __M4RI_DD_ROW(M, rowb);
447 }
448 
/**
 * \brief Swap rows rowa and rowb of M.
 *
 * Handles a nonzero column offset: only the bits inside
 * [offset, offset + ncols) are exchanged; padding bits in the first
 * and last word are left untouched.
 *
 * \param M    Matrix.
 * \param rowa First row.
 * \param rowb Second row.
 */
static inline void mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb) {
  if(rowa == rowb)
    return;

  wi_t width = M->width - 1;   /* index of the last word of each row */
  word *a = M->rows[rowa];
  word *b = M->rows[rowb];
  word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - M->offset);
  word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);

  /* XOR-swap trick restricted to the valid bits of the first word. */
  word tmp = (a[0] ^ b[0]) & mask_begin;
  if (width != 0) {
    a[0] ^= tmp;
    b[0] ^= tmp;

    /* Interior words are swapped wholesale. */
    for(wi_t i = 1; i < width; ++i) {
      tmp = a[i];
      a[i] = b[i];
      b[i] = tmp;
    }
    /* Last word: swap only the bits covered by mask_end. */
    tmp = (a[width] ^ b[width]) & mask_end;
    a[width] ^= tmp;
    b[width] ^= tmp;

  } else {
    /* Single-word rows: combine both masks before swapping. */
    tmp &= mask_end;
    a[0] ^= tmp;
    b[0] ^= tmp;
  }

  __M4RI_DD_ROW(M, rowa);
  __M4RI_DD_ROW(M, rowb);
}
490 
503 void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j);
504 
513 void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb);
514 
/**
 * \brief Swap columns cola and colb, restricted to rows
 *        [start_row, stop_row).
 *
 * Walks the matrix block by block.  Two cases: both columns fall in
 * the same word of each row (vectorized 4-rows-at-a-time XOR-swap),
 * or they fall in different words (per-row XOR-swap between the two
 * words).
 *
 * \param M         Matrix.
 * \param cola      First column.
 * \param colb      Second column.
 * \param start_row First row (inclusive).
 * \param stop_row  Last row (exclusive).
 */
static inline void mzd_col_swap_in_rows(mzd_t *M, rci_t const cola, rci_t const colb, rci_t const start_row, rci_t const stop_row) {
  if (cola == colb)
    return;

  /* Absolute bit positions including the window's column offset. */
  rci_t const _cola = cola + M->offset;
  rci_t const _colb = colb + M->offset;

  wi_t const a_word = _cola / m4ri_radix;
  wi_t const b_word = _colb / m4ri_radix;

  int const a_bit = _cola % m4ri_radix;
  int const b_bit = _colb % m4ri_radix;

  word* RESTRICT ptr = mzd_row(M, start_row);
  int max_bit = MAX(a_bit, b_bit);
  int count_remaining = stop_row - start_row;
  int min_bit = a_bit + b_bit - max_bit;            /* i.e. MIN(a_bit, b_bit) */
  int block = mzd_row_to_block(M, start_row);
  int offset = max_bit - min_bit;                    /* distance between the two bits */
  word mask = m4ri_one << min_bit;                   /* selects the lower of the two bits */
  int count = MIN(mzd_remaining_rows_in_block(M, start_row), count_remaining);

  // Apparently we're calling with start_row == stop_row sometimes (seems a bug to me).
  if (count <= 0)
    return;

  if (a_word == b_word) {
    /* Both columns live in the same word of every row. */
    while(1) {
      count_remaining -= count;
      ptr += a_word;
      int fast_count = count / 4;                    /* rows handled 4 at a time */
      int rest_count = count - 4 * fast_count;       /* leftover rows */
      word xor_v[4];
      wi_t const rowstride = M->rowstride;
      while (fast_count--) {
        /* xor_v[k] ends up as: (bit_a XOR bit_b) replicated at both
         * positions; XORing it back swaps the two bits. */
        xor_v[0] = ptr[0];
        xor_v[1] = ptr[rowstride];
        xor_v[2] = ptr[2 * rowstride];
        xor_v[3] = ptr[3 * rowstride];
        xor_v[0] ^= xor_v[0] >> offset;
        xor_v[1] ^= xor_v[1] >> offset;
        xor_v[2] ^= xor_v[2] >> offset;
        xor_v[3] ^= xor_v[3] >> offset;
        xor_v[0] &= mask;
        xor_v[1] &= mask;
        xor_v[2] &= mask;
        xor_v[3] &= mask;
        xor_v[0] |= xor_v[0] << offset;
        xor_v[1] |= xor_v[1] << offset;
        xor_v[2] |= xor_v[2] << offset;
        xor_v[3] |= xor_v[3] << offset;
        ptr[0] ^= xor_v[0];
        ptr[rowstride] ^= xor_v[1];
        ptr[2 * rowstride] ^= xor_v[2];
        ptr[3 * rowstride] ^= xor_v[3];
        ptr += 4 * rowstride;
      }
      while (rest_count--) {
        word xor_v = *ptr;
        xor_v ^= xor_v >> offset;
        xor_v &= mask;
        *ptr ^= xor_v | (xor_v << offset);
        ptr += rowstride;
      }
      /* Advance to the next memory block, if any rows remain. */
      block++;
      if ((count = MIN(mzd_rows_in_block(M, block), count_remaining)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
    }
  } else {
    /* Columns live in different words: min_ptr points at the word
     * holding the lower bit, max_offset is the word distance to the
     * word holding the higher bit. */
    word* RESTRICT min_ptr;
    wi_t max_offset;
    if (min_bit == a_bit) {
      min_ptr = ptr + a_word;
      max_offset = b_word - a_word;
    } else {
      min_ptr = ptr + b_word;
      max_offset = a_word - b_word;
    }
    while(1) {
      count_remaining -= count;
      wi_t const rowstride = M->rowstride;
      while(count--) {
        /* Classic two-position XOR swap across two words. */
        word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
        min_ptr[0] ^= xor_v;
        min_ptr[max_offset] ^= xor_v << offset;
        min_ptr += rowstride;
      }
      block++;
      /* NOTE(review): the unary plus in "M,+block" is a no-op kept verbatim. */
      if ((count = MIN(mzd_rows_in_block(M,+block), count_remaining)) <= 0)
        break;
      ptr = mzd_first_row_next_block(M, block);
      if (min_bit == a_bit)
        min_ptr = ptr + a_word;
      else
        min_ptr = ptr + b_word;
    }
  }

  __M4RI_DD_MZD(M);
}
626 
638 static inline BIT mzd_read_bit(mzd_t const *M, rci_t const row, rci_t const col ) {
639  return __M4RI_GET_BIT(M->rows[row][(col+M->offset)/m4ri_radix], (col+M->offset) % m4ri_radix);
640 }
641 
654 static inline void mzd_write_bit(mzd_t *M, rci_t const row, rci_t const col, BIT const value) {
655  __M4RI_WRITE_BIT(M->rows[row][(col + M->offset) / m4ri_radix], (col + M->offset) % m4ri_radix, value);
656 }
657 
658 
669 static inline void mzd_xor_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
670  int const spot = (y + M->offset) % m4ri_radix;
671  wi_t const block = (y + M->offset) / m4ri_radix;
672  M->rows[x][block] ^= values << spot;
673  int const space = m4ri_radix - spot;
674  if (n > space)
675  M->rows[x][block + 1] ^= values >> space;
676 }
677 
/**
 * \brief AND the n most-significant bits of values onto row x,
 *        starting at column y.
 *
 * NOTE(review): unlike mzd_clear_bits, the mask is NOT negated here:
 * "values << spot" has zero bits everywhere outside the n-bit field,
 * so the AND also clears every other bit of the touched word(s).
 * Confirm callers rely on exactly this behavior.
 *
 * \param M      Matrix.
 * \param x      Row index.
 * \param y      Starting column.
 * \param n      Number of bits.
 * \param values Bits to AND (taken from the HIGH end of the word).
 */
static inline void mzd_and_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
  /* This is the best way, since this will drop out once we inverse the bits in values: */
  values >>= (m4ri_radix - n); /* Move the bits to the lowest columns */

  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  M->rows[x][block] &= values << spot;
  int const space = m4ri_radix - spot;
  if (n > space)
    M->rows[x][block + 1] &= values >> space;
}
699 
709 static inline void mzd_clear_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
710  assert(n>0 && n <= m4ri_radix);
711  word values = m4ri_ffff >> (m4ri_radix - n);
712  int const spot = (y + M->offset) % m4ri_radix;
713  wi_t const block = (y + M->offset) / m4ri_radix;
714  M->rows[x][block] &= ~(values << spot);
715  int const space = m4ri_radix - spot;
716  if (n > space)
717  M->rows[x][block + 1] &= ~(values >> space);
718 }
719 
/**
 * \brief Add (XOR) srcrow onto dstrow, starting at column coloffset.
 *
 * Bits left of coloffset in the first touched word are preserved via
 * mask_begin; excess bits possibly flipped past ncols in the last
 * word are reverted at the end.
 *
 * \param M         Matrix.
 * \param dstrow    Destination row (dst ^= src).
 * \param srcrow    Source row.
 * \param coloffset First column to add.
 */
static inline void mzd_row_add_offset(mzd_t *M, rci_t dstrow, rci_t srcrow, rci_t coloffset) {
  assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols);
  coloffset += M->offset;
  wi_t const startblock= coloffset/m4ri_radix;
  wi_t wide = M->width - startblock;
  word *src = M->rows[srcrow] + startblock;
  word *dst = M->rows[dstrow] + startblock;
  word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset % m4ri_radix);
  word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);

  /* First word: only the bits at/after coloffset are added. */
  *dst++ ^= *src++ & mask_begin;
  --wide;

#if __M4RI_HAVE_SSE2
  int not_aligned = __M4RI_ALIGNMENT(src,16) != 0; /* 0: Aligned, 1: Not aligned */
  if (wide > not_aligned + 1) /* Speed up for small matrices */
  {
    if (not_aligned) {
      /* One scalar word brings src to 16-byte alignment. */
      *dst++ ^= *src++;
      --wide;
    }
    /* Now wide > 1 */
    __m128i* __src = (__m128i*)src;
    __m128i* __dst = (__m128i*)dst;
    __m128i* const eof = (__m128i*)((unsigned long)(src + wide) & ~0xFUL);
    do
    {
      __m128i xmm1 = _mm_xor_si128(*__dst, *__src);
      *__dst++ = xmm1;
    }
    while(++__src < eof);
    src = (word*)__src;
    dst = (word*)__dst;
    /* Words that did not fit a full 16-byte vector remain. */
    wide = ((sizeof(word)*wide)%16)/sizeof(word);
  }
#endif
  /* Scalar tail. */
  wi_t i = -1;
  while(++i < wide)
    dst[i] ^= src[i];
  /*
   * Revert possibly non-zero excess bits.
   * Note that i == wide here, and wide can be 0.
   * But really, src[wide - 1] is M->rows[srcrow][M->width - 1] ;)
   * We use i - 1 here to let the compiler know these are the same addresses
   * that we last accessed, in the previous loop.
   */
  dst[i - 1] ^= src[i - 1] & ~mask_end;

  __M4RI_DD_ROW(M, dstrow);
}
782 
794 void mzd_row_add(mzd_t *M, rci_t const sourcerow, rci_t const destrow);
795 
810 mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A);
811 
826 mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);
827 
842 mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);
843 
855 mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, int const clear);
856 
866 mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear);
867 
878 void mzd_randomize(mzd_t *M);
879 
894 void mzd_set_ui(mzd_t *M, unsigned int const value);
895 
911 rci_t mzd_gauss_delayed(mzd_t *M, rci_t const startcol, int const full);
912 
928 rci_t mzd_echelonize_naive(mzd_t *M, int const full);
929 
939 int mzd_equal(mzd_t const *A, mzd_t const *B);
940 
954 int mzd_cmp(mzd_t const *A, mzd_t const *B);
955 
963 mzd_t *mzd_copy(mzd_t *DST, mzd_t const *A);
964 
985 mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B);
986 
1006 mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B);
1007 
1020 mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);
1021 
1035 mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I);
1036 
1048 mzd_t *mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);
1049 
1060 mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);
1061 
1072 #define mzd_sub mzd_add
1073 
1084 #define _mzd_sub _mzd_add
1085 
1086 
1087 
/**
 * \brief Read n consecutive bits of row x starting at column y.
 *
 * \param M Matrix.
 * \param x Row index.
 * \param y Starting column.
 * \param n Number of bits (must satisfy 0 < n <= m4ri_radix; the
 *          shifts below are UB otherwise -- not asserted here).
 *
 * \return The n bits, right-aligned in the returned word.
 */
static inline word mzd_read_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
  int const spot = (y + M->offset) % m4ri_radix;
  wi_t const block = (y + M->offset) / m4ri_radix;
  /* spill > 0 means the field crosses into the next word. */
  int const spill = spot + n - m4ri_radix;
  /* Left-justify the field in temp, gathering from one or two words. */
  word temp = (spill <= 0) ? M->rows[x][block] << -spill : (M->rows[x][block + 1] << (m4ri_radix - spill)) | (M->rows[x][block] >> spill);
  return temp >> (m4ri_radix - n);
}
1104 
1105 
1125 void mzd_combine(mzd_t *DST, rci_t const row3, wi_t const startblock3,
1126  mzd_t const *SC1, rci_t const row1, wi_t const startblock1,
1127  mzd_t const *SC2, rci_t const row2, wi_t const startblock2);
1128 
1129 
/**
 * \brief C[c_row] = A[a_row] XOR B[b_row], bit-window based.
 *
 * Slow path used when the operands' offsets do not line up; works a
 * radix-sized window at a time via read/clear/xor-bits.
 *
 * NOTE(review): the tail uses A->ncols - i, B->ncols - i and
 * C->ncols - i interchangeably, so all three matrices are assumed to
 * have the same ncols -- confirm against callers.
 *
 * \param C     Destination matrix.
 * \param c_row Destination row.
 * \param A     First summand.
 * \param a_row Row in A.
 * \param B     Second summand.
 * \param b_row Row in B.
 */
static inline void mzd_combine_weird(mzd_t *C, rci_t const c_row,
                                     mzd_t const *A, rci_t const a_row,
                                     mzd_t const *B, rci_t const b_row) {
  word tmp;
  rci_t i = 0;

  /* Full radix-wide windows. */
  for(; i + m4ri_radix <= A->ncols; i += m4ri_radix) {
    tmp = mzd_read_bits(A, a_row, i, m4ri_radix) ^ mzd_read_bits(B, b_row, i, m4ri_radix);
    mzd_clear_bits(C, c_row, i, m4ri_radix);
    mzd_xor_bits(C, c_row, i, m4ri_radix, tmp);
  }
  /* Partial tail window, if any columns remain. */
  if(A->ncols - i) {
    tmp = mzd_read_bits(A, a_row, i, (A->ncols - i)) ^ mzd_read_bits(B, b_row, i, (B->ncols - i));
    mzd_clear_bits(C, c_row, i, (C->ncols - i));
    mzd_xor_bits(C, c_row, i, (C->ncols - i), tmp);
  }

  __M4RI_DD_MZD(C);
}
1166 
/**
 * \brief A[a_row] ^= B[b_row], word-aligned fast path.
 *
 * Requires both rows to start at the same bit offset within their
 * words (the "even" case).  Uses SSE2 for the aligned middle part and
 * a Duff's-device loop for the rest; the final word is masked so bits
 * past ncols are untouched.
 *
 * \param A            Destination matrix (modified in place).
 * \param a_row        Row in A.
 * \param a_startblock First word index in A's row.
 * \param B            Source matrix.
 * \param b_row        Row in B.
 * \param b_startblock First word index in B's row.
 */
static inline void mzd_combine_even_in_place(mzd_t *A, rci_t const a_row, wi_t const a_startblock,
                                             mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {

  wi_t wide = A->width - a_startblock - 1;   /* words before the final word */

  word *a = A->rows[a_row] + a_startblock;
  word *b = B->rows[b_row] + b_startblock;

#if __M4RI_HAVE_SSE2
  if(wide > __M4RI_SSE2_CUTOFF) {
    /* Align a to 16 bytes with one scalar XOR if needed. */
    if (__M4RI_ALIGNMENT(a,16)) {
      *a++ ^= *b++;
      wide--;
    }

    /* Vectorize only when both pointers are 16-byte aligned. */
    if (__M4RI_ALIGNMENT(a, 16) == 0 && __M4RI_ALIGNMENT(b, 16) == 0) {
      __m128i *a128 = (__m128i*)a;
      __m128i *b128 = (__m128i*)b;
      const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);

      do {
        *a128 = _mm_xor_si128(*a128, *b128);
        ++b128;
        ++a128;
      } while(a128 < eof);

      a = (word*)a128;
      b = (word*)b128;
      wide = ((sizeof(word) * wide) % 16) / sizeof(word);   /* leftover words */
    }
  }
#endif // __M4RI_HAVE_SSE2

  /* Scalar tail, 8-way unrolled via Duff's device (intentional fallthrough). */
  if (wide > 0) {
    wi_t n = (wide + 7) / 8;
    switch (wide % 8) {
    case 0: do { *(a++) ^= *(b++);
    case 7: *(a++) ^= *(b++);
    case 6: *(a++) ^= *(b++);
    case 5: *(a++) ^= *(b++);
    case 4: *(a++) ^= *(b++);
    case 3: *(a++) ^= *(b++);
    case 2: *(a++) ^= *(b++);
    case 1: *(a++) ^= *(b++);
      } while (--n > 0);
    }
  }

  /* Final word: only the valid (non-excess) bits are XORed. */
  *a ^= *b & __M4RI_LEFT_BITMASK(A->ncols%m4ri_radix);

  __M4RI_DD_MZD(A);
}
1236 
1237 
/**
 * \brief C[c_row] = A[a_row] XOR B[b_row], word-aligned fast path.
 *
 * Requires all three rows to start at the same bit offset within
 * their words (the "even" case).  SSE2 middle, Duff's-device tail,
 * masked final word.
 *
 * \param C            Destination matrix.
 * \param c_row        Row in C.
 * \param c_startblock First word index in C's row.
 * \param A            First summand.
 * \param a_row        Row in A.
 * \param a_startblock First word index in A's row.
 * \param B            Second summand.
 * \param b_row        Row in B.
 * \param b_startblock First word index in B's row.
 */
static inline void mzd_combine_even(mzd_t *C, rci_t const c_row, wi_t const c_startblock,
                                    mzd_t const *A, rci_t const a_row, wi_t const a_startblock,
                                    mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {

  wi_t wide = A->width - a_startblock - 1;   /* words before the final word */
  word *a = A->rows[a_row] + a_startblock;
  word *b = B->rows[b_row] + b_startblock;
  word *c = C->rows[c_row] + c_startblock;

  /* /\* this is a corner case triggered by Strassen multiplication */
  /*  * which assumes certain (virtual) matrix sizes */
  /*  * 2011/03/07: I don't think this was ever correct *\/ */
  /* if (a_row >= A->nrows) { */
  /*   assert(a_row < A->nrows); */
  /*   for(wi_t i = 0; i < wide; ++i) { */
  /*     c[i] = b[i]; */
  /*   } */
  /* } else { */
#if __M4RI_HAVE_SSE2
  if(wide > __M4RI_SSE2_CUTOFF) {
    /* Align a to 16 bytes with one scalar step if needed. */
    if (__M4RI_ALIGNMENT(a,16)) {
      *c++ = *b++ ^ *a++;
      wide--;
    }

    /* Vectorize only when b and c are also 16-byte aligned. */
    if ( (__M4RI_ALIGNMENT(b, 16) | __M4RI_ALIGNMENT(c, 16)) == 0) {
      __m128i *a128 = (__m128i*)a;
      __m128i *b128 = (__m128i*)b;
      __m128i *c128 = (__m128i*)c;
      const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);

      do {
        *c128 = _mm_xor_si128(*a128, *b128);
        ++c128;
        ++b128;
        ++a128;
      } while(a128 < eof);

      a = (word*)a128;
      b = (word*)b128;
      c = (word*)c128;
      wide = ((sizeof(word) * wide) % 16) / sizeof(word);   /* leftover words */
    }
  }
#endif // __M4RI_HAVE_SSE2

  /* Scalar tail, 8-way unrolled via Duff's device (intentional fallthrough). */
  if (wide > 0) {
    wi_t n = (wide + 7) / 8;
    switch (wide % 8) {
    case 0: do { *(c++) = *(a++) ^ *(b++);
    case 7: *(c++) = *(a++) ^ *(b++);
    case 6: *(c++) = *(a++) ^ *(b++);
    case 5: *(c++) = *(a++) ^ *(b++);
    case 4: *(c++) = *(a++) ^ *(b++);
    case 3: *(c++) = *(a++) ^ *(b++);
    case 2: *(c++) = *(a++) ^ *(b++);
    case 1: *(c++) = *(a++) ^ *(b++);
      } while (--n > 0);
    }
  }
  /* Final word: write a^b into the valid bits only, preserving c's excess bits. */
  *c ^= ((*a ^ *b ^ *c) & __M4RI_LEFT_BITMASK(C->ncols%m4ri_radix));

  __M4RI_DD_MZD(C);
}
1322 
1323 
1332 static inline int mzd_read_bits_int(mzd_t const *M, rci_t const x, rci_t const y, int const n)
1333 {
1334  return __M4RI_CONVERT_TO_INT(mzd_read_bits(M, x, y, n));
1335 }
1336 
1337 
1344 int mzd_is_zero(mzd_t const *A);
1345 
1354 void mzd_row_clear_offset(mzd_t *M, rci_t const row, rci_t const coloffset);
1355 
1372 int mzd_find_pivot(mzd_t const *M, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c);
1373 
1374 
1387 double mzd_density(mzd_t const *A, wi_t res);
1388 
1403 double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c);
1404 
1405 
1414 rci_t mzd_first_zero_row(mzd_t const *A);
1415 
1422 static inline word mzd_hash(mzd_t const *A) {
1423  word hash = 0;
1424  for (rci_t r = 0; r < A->nrows; ++r)
1425  hash ^= rotate_word(calculate_hash(A->rows[r], A->width), r % m4ri_radix);
1426  return hash;
1427 }
1428 
1438 mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A);
1439 
1449 mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A);
1450 
1451 #endif // M4RI_MZD