strcpy や memcpy と愉快な仲間たち

以前 strcpy とか memcpy の実装の違いが気になったので、ちょっと調べてみた時の話を加筆・修正した。

まずはとてつもなく適当なテストプログラム。

#include <stdio.h>
#include <string.h>
#include <sys/time.h>

#define BUF_SIZE 0x10000000
#define TRIAL_COUNT 5

char src[BUF_SIZE], dst[BUF_SIZE];

int main(int argc, char** argv)
{
  printf("src : 0x%08x\n", src);
  printf("dst : 0x%08x\n", dst);
  puts("");
  
  memset(src, 0xff, BUF_SIZE);
  src[BUF_SIZE - 1] = 0;
  memcpy(dst, src, BUF_SIZE);
  
  for (int i = 1; i <= TRIAL_COUNT; i++) {
    struct timeval tv1, tv2, tv;
    
    memset(src, 0xff, BUF_SIZE);
    src[BUF_SIZE - 1] = 0;
    
    gettimeofday(&tv1, NULL);
    strcpy(dst, src);
    gettimeofday(&tv2, NULL);
    timersub(&tv2, &tv1, &tv);
    printf("strcpy          [%d] : %08u %08u\n", i, tv.tv_sec, tv.tv_usec);
    
    gettimeofday(&tv1, NULL);
    strncpy(dst, src, BUF_SIZE);
    gettimeofday(&tv2, NULL);
    timersub(&tv2, &tv1, &tv);
    printf("strncpy         [%d] : %08u %08u\n", i, tv.tv_sec, tv.tv_usec);
    
    gettimeofday(&tv1, NULL);
    memcpy(dst, src, BUF_SIZE);
    gettimeofday(&tv2, NULL);
    timersub(&tv2, &tv1, &tv);
    printf("memcpy          [%d] : %08u %08u\n", i, tv.tv_sec, tv.tv_usec);
    
    gettimeofday(&tv1, NULL);
    memmove(dst, src, BUF_SIZE);
    gettimeofday(&tv2, NULL);
    timersub(&tv2, &tv1, &tv);
    printf("memmove         [%d] : %08u %08u\n", i, tv.tv_sec, tv.tv_usec);
    
    gettimeofday(&tv1, NULL);
    memmove(dst+1, src, BUF_SIZE);
    gettimeofday(&tv2, NULL);
    timersub(&tv2, &tv1, &tv);
    printf("memmove(overlap)[%d] : %08u %08u\n", i, tv.tv_sec, tv.tv_usec);
    
    puts("");
  }

  return 0;
}

そして、その実行結果。

src : 0x10403020
dst : 0x00403020

strcpy          [1] : 00000000 00370000
strncpy         [1] : 00000000 00404000
memcpy          [1] : 00000000 00300000
memmove         [1] : 00000000 00312000
memmove(overlap)[1] : 00000000 00504000

strcpy          [2] : 00000000 00372000
strncpy         [2] : 00000000 00401000
memcpy          [2] : 00000000 00304000
memmove         [2] : 00000000 00311000
memmove(overlap)[2] : 00000000 00509000

strcpy          [3] : 00000000 00370000
strncpy         [3] : 00000000 00406000
memcpy          [3] : 00000000 00300000
memmove         [3] : 00000000 00300000
memmove(overlap)[3] : 00000000 00510000

strcpy          [4] : 00000000 00370000
strncpy         [4] : 00000000 00405000
memcpy          [4] : 00000000 00296000
memmove         [4] : 00000000 00311000
memmove(overlap)[4] : 00000000 00509000

strcpy          [5] : 00000000 00369000
strncpy         [5] : 00000000 00404000
memcpy          [5] : 00000000 00305000
memmove         [5] : 00000000 00308000
memmove(overlap)[5] : 00000000 00508000

雰囲気から察するに以下のような順位になる。

memcpy と memmove (重なりなし)
strcpy
strncpy
memmove (重なりあり)

cygwin の C ライブラリは newlib を利用しているため、それのソースを調べてみる。

memcpy

まずは memcpy のソースの一部を抜粋したものを以下に示す。

/* Nonzero if either X or Y is not aligned on a "long" boundary.  */
#define UNALIGNED(X, Y) \
  (((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))

/* How many bytes are copied each iteration of the 4X unrolled loop.  */
#define BIGBLOCKSIZE    (sizeof (long) << 2)

/* How many bytes are copied each iteration of the word copy loop.  */
#define LITTLEBLOCKSIZE (sizeof (long))

/* Threshhold for punting to the byte copier.  */
#define TOO_SMALL(LEN)  ((LEN) < BIGBLOCKSIZE)

_PTR
_DEFUN (memcpy, (dst0, src0, len0),
	_PTR dst0 _AND
	_CONST _PTR src0 _AND
	size_t len0)
{
  char *dst = dst0;
  _CONST char *src = src0;
  long *aligned_dst;
  _CONST long *aligned_src;
  int   len =  len0;

  /* If the size is small, or either SRC or DST is unaligned,
     then punt into the byte copy loop.  This should be rare.  */
  if (!TOO_SMALL(len) && !UNALIGNED (src, dst))
    {
      aligned_dst = (long*)dst;
      aligned_src = (long*)src;

      /* Copy 4X long words at a time if possible.  */
      while (len >= BIGBLOCKSIZE)
        {
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          *aligned_dst++ = *aligned_src++;
          len -= BIGBLOCKSIZE;
        }

      /* Copy one long word at a time if possible.  */
      while (len >= LITTLEBLOCKSIZE)
        {
          *aligned_dst++ = *aligned_src++;
          len -= LITTLEBLOCKSIZE;
        }

       /* Pick up any residual with a byte copier.  */
      dst = (char*)aligned_dst;
      src = (char*)aligned_src;
    }

  while (len--)
    *dst++ = *src++;

  return dst0;
}

まず、if 文で char* を long* に読み替えているが、これはその内側の while ループの代入文で 4byte コピーの命令になるようにするためのようである。加えて、代入文をループアンローリング (X4) することで最速を実現しているようである。

strcpy

次に strcpy の一部抜粋したものを以下に示す。

#if LONG_MAX == 2147483647L
#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
#else
#if LONG_MAX == 9223372036854775807L
/* Nonzero if X (a long int) contains a NULL byte. */
#define DETECTNULL(X) (((X) - 0x0101010101010101) & ~(X) & 0x8080808080808080)
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif

#ifndef DETECTNULL
#error long int is not a 32bit or 64bit byte
#endif

char*
_DEFUN (strcpy, (dst0, src0),
	char *dst0 _AND
	_CONST char *src0)
{
  char *dst = dst0;
  _CONST char *src = src0;
  long *aligned_dst;
  _CONST long *aligned_src;

  /* If SRC or DEST is unaligned, then copy bytes.  */
  if (!UNALIGNED (src, dst))
    {
      aligned_dst = (long*)dst;
      aligned_src = (long*)src;

      /* SRC and DEST are both "long int" aligned, try to do "long int"
         sized copies.  */
      while (!DETECTNULL(*aligned_src))
        {
          *aligned_dst++ = *aligned_src++;
        }

      dst = (char*)aligned_dst;
      src = (char*)aligned_src;
    }

  while ((*dst++ = *src++))
    ;
  return dst0;
}

memcpy と同様に 4byte コピーは使用しているが、ループアンローリングは使用していない。各バイトでヌル文字のチェックが必要であるためであろう。

実はこのソースで使用されているヌル文字検出マクロ DETECTNULL *1 の仕組みがよくわからなかったのだが、本棚から取り出したハッカーのたのしみ―本物のプログラマはいかにして問題を解くかをペラペラとめくって探していると、 $\bar{x} & (x-1)$ で末尾まで連続する 0 の部分のみを 1 とするビットパターンが得られる (例 : 01011000 → 00000111) という記述を見つけた。
ということは最上位ビットがセットされているパターンはビットがすべて 1 の場合のみであり、それは元のビットがすべて 0 の場合しかない。なるほど、これでヌル文字が検出できる仕組みは理解できた。

strncpy

続いては strncpy の一部抜粋。

#define TOO_SMALL(LEN) ((LEN) < sizeof (long))

char *
_DEFUN (strncpy, (dst0, src0),
	char *dst0 _AND
	_CONST char *src0 _AND
	size_t count)
{
  char *dst = dst0;
  _CONST char *src = src0;
  long *aligned_dst;
  _CONST long *aligned_src;

  /* If SRC and DEST is aligned and count large enough, then copy words.  */
  if (!UNALIGNED (src, dst) && !TOO_SMALL (count))
    {
      aligned_dst = (long*)dst;
      aligned_src = (long*)src;

      /* SRC and DEST are both "long int" aligned, try to do "long int"
	 sized copies.  */
      while (count >= sizeof (long int) && !DETECTNULL(*aligned_src))
	{
	  count -= sizeof (long int);
	  *aligned_dst++ = *aligned_src++;
	}

      dst = (char*)aligned_dst;
      src = (char*)aligned_src;
    }

  while (count > 0)
    {
      --count;
      if ((*dst++ = *src++) == '\0')
	break;
    }

  while (count-- > 0)
    *dst++ = '\0';

  return dst0;
}

strcpy と同様に 4byte コピーのみであるが、終了条件として、ヌル文字のチェックに加えて、指定された長さであるかどうかのチェックが毎回必要になるだけ、遅くなっていると思われる。

memmove

最後は memmove の一部抜粋。

_PTR
_DEFUN (memmove, (dst_void, src_void, length),
	_PTR dst_void _AND
	_CONST _PTR src_void _AND
	size_t length)
{
  char *dst = dst_void;
  _CONST char *src = src_void;
  long *aligned_dst;
  _CONST long *aligned_src;
  int   len =  length;

  if (src < dst && dst < src + len)
    {
      /* Destructive overlap...have to copy backwards */
      src += len;
      dst += len;
      while (len--)
	{
	  *--dst = *--src;
	}
    }
  else
    {
      /* Use optimizing algorithm for a non-destructive copy to closely 
         match memcpy. If the size is small or either SRC or DST is unaligned,
         then punt into the byte copy loop.  This should be rare.  */
      if (!TOO_SMALL(len) && !UNALIGNED (src, dst))
        {
          aligned_dst = (long*)dst;
          aligned_src = (long*)src;

          /* Copy 4X long words at a time if possible.  */
          while (len >= BIGBLOCKSIZE)
            {
              *aligned_dst++ = *aligned_src++;
              *aligned_dst++ = *aligned_src++;
              *aligned_dst++ = *aligned_src++;
              *aligned_dst++ = *aligned_src++;
              len -= BIGBLOCKSIZE;
            }

          /* Copy one long word at a time if possible.  */
          while (len >= LITTLEBLOCKSIZE)
            {
              *aligned_dst++ = *aligned_src++;
              len -= LITTLEBLOCKSIZE;
            }

          /* Pick up any residual with a byte copier.  */
          dst = (char*)aligned_dst;
          src = (char*)aligned_src;
        }

      while (len--)
        {
          *dst++ = *src++;
        }
    }

  return dst_void;
}

重なりがない場合は memcpy と同じであるが、重なりがある場合は 1byte ずつちまちまとコピーするらしい。そりゃ遅いだろうね。

まとめ

普段、何気なく使用している標準ライブラリでも、ループアンローリングや char* から long* への読み替えによる 1 命令あたりの byte 数の変更など、高速化の技術がいろいろと使われていて、興味深いものであった。

ちなみに glibc と比較しようと思ったが、こっちはこっちで複雑っぽいので、また気が向いた時にでも調べたい。

*1:誤解を招く名前だから DETECTNULCHAR の方が適切だという意見もある。

いけむランド

はてダからやってきました

strcpy や memcpy と愉快な仲間たち

memcpy

strcpy

strncpy

memmove

まとめ