mknmz のソースコードを読む

作成日 : 2011-05-26

最終更新日 :

1. はじめに

1.1 mknmz と namazu

namazu は大きく二つのプログラムに分かれる。一つはインデックスを作るプログラム、もう一つは検索をするプログラムである。インデックスを作るプログラムは mknmz、検索をするプログラムは namazu である。ここではインデックスを作るプログラムである mknmz のソースコードを読んでみよう。実際には mknmz.in ファイルを読むことにする。両者の違いは、mknmz.in を作る元ソース（テンプレート）であるのに対し、 mknmz は環境等を置き換えたあとで実際にスクリプトを実行する実ソースである、ということである。また、mknmz はプログラミング言語 perl で書かれている。

1.2 ソースコードを表示する

以下の仕組みで表示用のソースコードを加工している。 html では &, >, < の表示にあたり工夫する必要がある。ここで、UNIX のコマンドである sed と cat -n を使っている。

% sed -e 's/\&/\&/g' -e 's/>/\>/g' -e 's/</\</g' scripts/mknmz.in | cat -n > mknmz.in.html

2. ソースコードを読む

2.1 ライブラリなど


     1	#! %PERL% -w
     2	# -*- Perl -*-
     3	# mknmz - indexer of Namazu
     4	# $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
     5	#
     6	# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
     7	# Copyright (C) 2000-2008 Namazu Project All rights reserved.
     8	#     This is free software with ABSOLUTELY NO WARRANTY.
     9	#
    10	#  This program is free software; you can redistribute it and/or modify
    11	#  it under the terms of the GNU General Public License as published by
    12	#  the Free Software Foundation; either versions 2, or (at your option)
    13	#  any later version.
    14	# 
    15	#  This program is distributed in the hope that it will be useful
    16	#  but WITHOUT ANY WARRANTY; without even the implied warranty of
    17	#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    18	#  GNU General Public License for more details.
    19	#
    20	#  You should have received a copy of the GNU General Public License
    21	#  along with this program; if not, write to the Free Software
    22	#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
    23	#  02111-1307, USA
    24	#
    25	#  This file must be encoded in EUC-JP encoding
    26	#
    27	
    28	package mknmz;
    29	require 5.004;
    30	use English;
    31	use lib "%ADDITIONAL_INC%";
    32	use Cwd;
    33	use IO::File;
    34	use File::Find;
    35	use File::MMagic;
    36	use Time::Local;
    37	use strict;  # be strict since v1.2.0
    38	use Getopt::Long;
    39	use File::Copy;
    40	use DirHandle;
    41	use File::Basename;
    42	
    43	use vars qw($SYSTEM);
    44	# It exists only for back compatibility.
    45	$SYSTEM = $English::OSNAME;

28 行め、package mknmz; はパッケージ mknmz の名前空間を宣言している。

29 行め、require 5.004; は依存バージョンである。

30行目以降の use は各種モジュールの宣言である。

lib → ここは mknmz では use lib "."; と展開される。
English → 各種の数などを英語で表すためのモジュール
Cwd → カレントワーキングディレクトリを得るためのモジュール
File::Find　→ファイルの検索など
File::MMagic　→ファイルのマジックナンバーを見る
File::Copy　→　ファイルをコピーする
File::Basename　→ファイルのフルパスから
IO::File　　　→　ＩＯを使う
Time::Local　→　時間（ローカル時間）
strict　→　インタープリターで解釈を厳密にして怪しいところはすべてエラーを出し停止する
Getopt::Long　→　長形式のオプション解釈モジュール
DirHandle　→　ディレクトリを扱うモジュール
vars qw($SYSTEM)

    45 行目の	$SYSTEM = $English::OSNAME; は後方互換性のためだけに存在する。

2.2 変数宣言

    46	
    47	my $NAMAZU_INDEX_VERSION = "2.0";
    48	
    49	my $CodingSystem = "euc";
    50	my $PKGDATADIR    = $ENV{'pkgdatadir'} || "@pkgdatadir@";
    51	my $CONFDIR       = "@CONFDIR@";     # directory where mknmzrc are in.
    52	my $LIBDIR        = $PKGDATADIR . "/pl";      # directory where library etc. are in.
    53	my $FILTERDIR     = $PKGDATADIR . "/filter";   # directory where filters are in.
    54	my $TEMPLATEDIR   = $PKGDATADIR . "/template"; # directory where templates are in.
    55	
    56	my $DeletedFilesCount = 0;
    57	my $UpdatedFilesCount = 0;
    58	my $APPENDMODE = 0;
    59	my %PhraseHash = ();
    60	my %PhraseHashLast = ();
    61	my %KeyIndex = ();
    62	my %KeyIndexLast = ();
    63	my %CheckPoint = ("on" => undef, "continue" => undef);
    64	my $ConfigFile = undef;
    65	my $MediaType  = undef;
    66	
    67	my $ReplaceCode  = undef;  # perl code for transforming URI
    68	my @Seed = ();
    69	my @LoadedRcfiles = ();
    70	my $Magic = new File::MMagic;
    71	
    72	my $ReceiveTERM = 0;
    73	
    74	STDOUT->autoflush(1);
    75	STDERR->autoflush(1);

47 行から 58 行まではスカラー型変数の宣言。その中で、49 行は日本語の表現形式を宣言している。ここでは euc である。 59 行から 63 行まではハッシュ(連想配列)である。ここもほとんど何もない状態の宣言である。 64 行から 67 行はふたたびスカラー型変数の宣言。 68 行と69 行は空の配列の宣言である。 70 行と72　行は再度スカラー型変数の宣言。 73 行は標準出力のバッファをフラッシュする(バッファを貯めない)宣言。 74 行は標準エラー出力に対して同様にバッファをフラッシュする宣言。

2-3 main 関数


    76	main();
    77	sub main {
    78	    my $start_time = time;
    79	
    80	    if ($English::PERL_VERSION == 5.008001) {
    81	        unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
    82	            print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
    83	            exit 1;
    84	        }
    85	    }
    86	
    87	    init();
    88	
    89	    # At first, loading pl/conf.pl to prevent overriding some variables.
    90	    preload_modules();
    91	
    92	    # set LANG and bind textdomain
    93	    util::set_lang();
    94	    textdomain('namazu', $util::LANG_MSG);
    95	
    96	    load_modules();
    97	    my ($output_dir, @targets) = parse_options();
    98	    my ($docid_base, $total_files_num) = prep($output_dir, @targets);
    99	
   100	    my $swap = 1;
   101	    my $docid_count = 0;
   102	    my $file_count = 0;
   103	    my $total_files_size = 0;
   104	    my $key_count = 0;
   105	    my $checkpoint = 0;
   106	    my $flist_ptr = 0;
   107	    my $processed_files_size = 0;
   108	
   109	    if ($CheckPoint{'continue'}) {
   110		# Restore variables
   111		eval util::readfile($var::NMZ{'_checkpoint'}) ;
   112	    } else {
   113		print $total_files_num . _(" files are found to be indexed.\n");
   114	    }
   115	
   116	    {
   117		my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
   118		my $fh_flist = util::efopen($var::NMZ{'_flist'});
   119		my %field_indices = ();
   120		get_field_index_base(\%field_indices);
   121	
   122		if ($CheckPoint{'continue'}) {
   123		    seek($fh_flist, $flist_ptr, 0);
   124		}
   125	
   126		# Process target files one by one
   127		while (defined(my $line = <$fh_flist>)) {
   128		    $flist_ptr += length($line);
   129		    my $cfile = $line;
   130		    chomp $cfile;
   131		    util::dprint(_("target file: ")."$cfile\n");
   132	
   133		    my ($cfile_size, $num) = 
   134			process_file($cfile, $docid_count, $docid_base, 
   135				     $file_count, \%field_indices,
   136				     $fh_errorsfile, $total_files_num);
   137		    if ($num == 0) {
   138			$total_files_num--;
   139			next;
   140		    } else {
   141			$docid_count += $num;
   142			$file_count++;
   143		    }
   144	
   145		    $total_files_size     += $cfile_size;
   146		    $processed_files_size += $cfile_size;
   147		    last if $ReceiveTERM;
   148		    if ($processed_files_size > $conf::ON_MEMORY_MAX) {
   149			if (%KeyIndex) {
   150			    $key_count = write_index();
   151			    print _("Writing index files..."); 
   152			    write_phrase_hash();
   153			    print "\n";
   154			}
   155			$processed_files_size = 0;
   156			$checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
   157		    }
   158		}
   159	
   160	        util::fclose($fh_flist);
   161	        util::fclose($fh_errorsfile);
   162	    }
   163	    # This should be out of above blocks because of file handler closing.
   164	    re_exec($flist_ptr, $docid_count, $docid_base, $start_time, 
   165		    $total_files_size, $total_files_num,
   166		    $file_count, $key_count) if $checkpoint;
   167	
   168	    if (%KeyIndex) {
   169		$key_count = write_index();
   170		print _("Writing index files...");
   171		write_phrase_hash();
   172		print "\n";
   173	    }
   174	
   175	    $key_count = get_total_keys() unless $key_count;
   176	    do_remain_job($total_files_size, $docid_count, $key_count, 
   177			   $start_time);
   178	    exit 0;
   179	}

結構な量があるが、少しずつみていこう。まず 76 行で main を呼び出す。呼び出しの main 関数は 77 行から定義されている。変数 $start_time はまず今の時刻を測りたい、という要求からである。 if ($English::PERL_VERSION == 5.008001) だったら、次に続く条件を吟味したうえで不合格となる。

87 行は初期化を実行する。実際の初期化の内容は別に回す。

90 行は pl/conf.pl をローディングすることにしている。

93 行は、set_lang()関数で LANG が設定される。この関数の詳細も追って書く。

94 行にtextdomain 関数がある。これは gettext() がコールされた際に検索をかけられる条件も知っている。

96 行は自前モジュールのロード (373 行参照)。

97 行はオプションを解析したときの返り値２種類を得ている。

98 行は 909 行で定義された prep 関数による値。 prep は準備 (preparation) の意味であろう。

100 行から　107 行は変数定義と初期化である。

109 行から　114 行はチェックポイントを表す CheckPoint ハッシュの 'continue' 要素の値を調べるている。

116 行から 162 行はブロックになっている。 117

process_file


   180	
   181	#
   182	# FIXME: Very complicated.
   183	#
   184	sub process_file ($$$$\%$$) {
   185	    my ($cfile, $docid_count, $docid_base, $file_count, 
   186		$field_indices, $fh_errorsfile, $total_files_num) = @_;
   187	
   188	    my $processed_num = 0;
   189	    my $file_size = util::filesize($cfile);
   190	
   191	    if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
   192		my @parts;
   193		@parts = htmlsplit::split($cfile, "NMZ.partial")
   194	            if ($file_size <= $conf::FILE_SIZE_MAX);
   195		if (@parts > 1) {
   196		    my $id = 0;
   197		    for my $part (@parts) {
   198			next if (defined $conf::EXCLUDE_PATH &&
   199				 "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
   200			my $fname = util::tmpnam("NMZ.partial.$id");
   201			my $fragment  = defined $part ? $part : undef;
   202			my $uri   = generate_uri($cfile, $fragment);
   203			my $result = namazu_core($fname, 
   204						 $docid_count + $processed_num, 
   205						 $docid_base, $file_count, 
   206						 $field_indices, $fh_errorsfile, 
   207						 $total_files_num, 
   208						 $uri, $id, $#parts);
   209			if ($result > 0) {
   210			    $processed_num++;
   211			    my $rname = defined $part ? "$cfile\t$part" : "$cfile";
   212			    put_registry($rname);
   213			}
   214		    	unlink $fname;
   215			$id++;
   216		    }
   217		    return ($file_size, $processed_num);
   218		}
   219	    }
   220	    my $result = namazu_core($cfile, $docid_count, $docid_base, 
   221				     $file_count, $field_indices, 
   222				     $fh_errorsfile, $total_files_num,
   223				     undef, undef, undef);
   224	    if ($result > 0) {
   225		$processed_num++;
   226		put_registry($cfile);
   227	    }
   228	
   229	    return ($file_size, $processed_num);
   230	}

process_file

まず、sub process_file ($$$$\%$$)の引数に驚いてしまうが、リファレンスに関係があることをまず注意しておく。

   231	
   232	# 
   233	# Load mknmzrcs:
   234	# 
   235	#  1. MKNMZRC environment
   236	# 
   237	#  2. $(sysconfdir)/$(PACKAGE)/mknmzrc
   238	# 
   239	#  3. ~/.mknmzrc
   240	# 
   241	#  4. user-specified mknmzrc set by mknmz --config=file option.
   242	# 
   243	# If multiple files exists, read all of them.
   244	# 
   245	sub load_rcfiles () {
   246	    my (@cand) = ();
   247	
   248	    # To support Windows. Since they have nasty drive letter convention,
   249	    # it is necessary to change mknmzrc dynamically with env. variable.
   250	    push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'};
   251	    push @cand, "$CONFDIR/mknmzrc";
   252	    push @cand, "$ENV{'HOME'}/.mknmzrc";
   253	
   254	    util::vprint(_("Reading rcfile: "));
   255	    for my $rcfile (@cand) {
   256		if (-f $rcfile) {
   257			load_rcfile ($rcfile);
   258		    util::vprint(" $rcfile");
   259		}
   260	    }
   261	    util::vprint("\n");
   262	}
   263	
   264	sub load_rcfile ($) {
   265	    my ($rcfile) = @_;
   266	    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
   267		util::win32_yen_to_slash(\$rcfile);
   268	    }
   269	    return if (grep {m/^$rcfile$/} @LoadedRcfiles);
   270	    do $rcfile;
   271	    if ($@) {
   272	        chop $@;
   273	        push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'";
   274	    }else {
   275	        push @LoadedRcfiles, $rcfile;
   276	    }
   277	
   278	    # Dirty workaround.
   279	    $LIBDIR = $conf::LIBDIR
   280		if (defined $conf::LIBDIR && -d $conf::LIBDIR);
   281	    $FILTERDIR = $conf::FILTERDIR
   282		if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
   283	    $TEMPLATEDIR = $conf::TEMPLATEDIR
   284		if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
   285	}
   286	
   287	sub re_exec($$$$$$$$) {
   288	    my ($flist_ptr, $docid_count, $docid_base, $start_time, 
   289		$total_files_size, $total_files_num, $file_count, $key_count) = @_;
   290	
   291	    # store variables
   292	    {
   293		my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");
   294	
   295		print $fh_checkpoint <<EOM;
   296		\$DeletedFilesCount = $DeletedFilesCount;
   297		\$UpdatedFilesCount = $UpdatedFilesCount;
   298		\$APPENDMODE = $APPENDMODE;
   299		\$flist_ptr = $flist_ptr;
   300		\$docid_count = $docid_count;
   301		\$docid_base = $docid_base;
   302		\$start_time = $start_time;
   303		\$total_files_size = $total_files_size;
   304		\$total_files_num = $total_files_num;
   305		\$key_count = $key_count;
   306		\$file_count = $file_count;
   307		\$\$ = $$;
   308	EOM
   309	        util::fclose($fh_checkpoint);
   310	    }
   311	
   312	    @ARGV = ("-S", @ARGV) ;
   313	    print _("Checkpoint reached: re-exec mknmz...\n");
   314	    util::dprint(join ' ', ("::::", @ARGV, "\n"));
   315	    exec ($0, @ARGV) ;
   316	}
   317	
   318	sub put_registry ($) {
   319	    my ($filename) = @_;
   320	    my $fh_registry = util::efopen(">>$var::NMZ{'_r'}");
   321	    print $fh_registry $filename, "\n";
   322	    util::fclose($fh_registry);
   323	}
   324	
   325	
   326	# Initialization
   327	#   $CodingSystem: Character Coding System 'euc' or 'sjis'
   328	sub init () {
   329	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   330		$CodingSystem = "sjis";
   331		if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
   332		    $CONFDIR = $1 . $CONFDIR ;
   333		}
   334		if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
   335		    $LIBDIR = $1 . $LIBDIR ;
   336		}
   337		if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
   338		    $FILTERDIR = $1 . $FILTERDIR ;
   339		}
   340		if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
   341		    $TEMPLATEDIR = $1 . $TEMPLATEDIR ;
   342		}
   343	    } else {
   344		$CodingSystem = "euc";
   345	    }
   346	
   347	    $SIG{'INT'}  = sub {
   348		util::cdie("SIGINT caught! Aborted.\n");
   349	    };
   350	
   351	    $SIG{'TERM'}  = sub {
   352		print STDERR "SIGTERM caught!\n";
   353		$ReceiveTERM = 1;
   354	    };
   355	}
   356	
   357	sub preload_modules () { 
   358	    unshift @INC, $LIBDIR;
   359	    # workaround for test suites.
   360	    unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};
   361	
   362	    require "var.pl" || die "unable to require \"var.pl\"\n";
   363	    require "conf.pl" || die "unable to require \"conf.pl\"\n";
   364	    require "util.pl" || die "unable to require \"util.pl\"\n";
   365	    require "gettext.pl" || die "unable to require \"gettext.pl\"\n";
   366	    require "ext.pl" || die "unable to require \"ext.pl\"\n";
   367	}
   368	
   369	sub postload_modules () {
   370	    require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n";
   371	}
   372	
   373	sub load_modules () {
   374	    require "usage.pl" || die "unable to require \"usage.pl\"\n";
   375	    require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n";
   376	    require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
   377	    require "seed.pl" || die "unable to require \"seed.pl\"\n";
   378	    require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
   379	
   380	    @Seed = seed::init();
   381	}
   382	
   383	sub load_filtermodules () {
   384	    unshift @INC, $FILTERDIR;
   385	
   386	    #
   387	    # Windows modules must be loaded first.
   388	    # Because OLE filters have low precedence over normal ones.
   389	    #
   390	    load_win32modules() if $English::OSNAME eq "MSWin32";
   391	
   392	    # Check filter modules
   393	    my @filters = ();
   394	    @filters = glob "$FILTERDIR/*.pl";
   395	
   396	    load_filters(@filters);
   397	}
   398	
   399	sub load_win32modules () {
   400	    # Check filter modules
   401	    my @filters = ();
   402	    if (-f "../filter/win32/olemsword.pl") { # to ease developing
   403		@filters = glob "../filter/win32/*.pl";
   404		unshift @INC, "../filter/win32";
   405	    } else {
   406		@filters = glob "$FILTERDIR/win32/*.pl";
   407		unshift @INC, "$FILTERDIR/win32";
   408	    }
   409	
   410	    load_filters(@filters);
   411	}
   412	
   413	sub load_filters (@) {
   414	    my @filters = @_;
   415	   
   416	    for my $filter (@filters) {
   417		$filter =~ m!([-\w]+)\.pl$!;
   418		my $module = $1;
   419		require "$module.pl" || die "unable to require \"$module.pl\"\n";;
   420		my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);
   421	
   422		eval "\@mtypes =    ${module}::mediatype();";
   423		die $@ if $@;  # eval error
   424		eval "\$status =    ${module}::status();";
   425		die $@ if $@;
   426		eval "\$recursive = ${module}::recursive();";
   427		die $@ if $@;
   428		eval "\$pre_codeconv  = ${module}::pre_codeconv();";
   429		die $@ if $@;
   430		eval "\$post_codeconv  = ${module}::post_codeconv();";
   431		die $@ if $@;
   432		eval "${module}::add_magic(\$Magic);";
   433		die $@ if $@;
   434	
   435		for my $mt (@mtypes) {
   436	        next if (defined $var::Supported{$mt} && 
   437	                 $var::Supported{$mt} eq 'yes' && $status eq 'no');
   438		    $var::Supported{$mt} = $status;
   439		    $var::REQUIRE_ACTIONS{$mt} = $module;
   440		    $var::RECURSIVE_ACTIONS{$mt} = $recursive;
   441		    $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv;
   442		    $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv;
   443		}
   444	    }
   445	}
   446	
   447	# Core routine.
   448	#
   449	# FIXME: Too many parameters. They must be cleared.
   450	#
   451	sub namazu_core ($$$$$$$$$$) {
   452	    my ($cfile, $docid_count, $docid_base, 
   453		$file_count, $field_indices, $fh_errorsfile, $total_files_num,
   454		$uri, $part_id, $part_num) = @_;
   455	
   456	    my $headings = "";
   457	    my $content = "";
   458	    my $weighted_str = "";
   459	    my %fields;
   460	    my $msg_prefix;
   461	
   462	    if ($part_id) {
   463		$msg_prefix = "    $part_id/$part_num - ";
   464	    } else {
   465		$msg_prefix = $file_count + 1 . "/$total_files_num - ";
   466	    }
   467	
   468	    unless ($uri) {
   469		$uri = generate_uri($cfile);  # Make a URI from a file name.
   470	    }
   471	    my ($cfile_size, $text_size, $kanji, $mtype) = 
   472		load_document(\$cfile, \$content, \$weighted_str,
   473			      \$headings, \%fields);
   474	
   475	    {
   476	        $fields{'mtime'} = (stat($cfile))[9];
   477	        my $utc = $fields{'mtime'};
   478	        $utc = time::rfc822time_to_mtime($fields{'date'})
   479	                if (defined $fields{'date'});
   480	        if ($utc == -1) {
   481	            my $date = $fields{'date'};
   482	            print "$cfile Illegal date format. : $date\n";
   483	            print $fh_errorsfile "$cfile Illegal date format. : $date\n";
   484	            $utc = $fields{'mtime'};
   485	            delete $fields{'date'};
   486	        }
   487	        $fields{'utc'} = $utc;
   488	    }
   489	
   490	    util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");
   491	
   492	    # Check if the file is acceptable.
   493	    my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri);
   494	    if (defined $err) {
   495		if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   496		    my $uri2 = codeconv::eucjp_to_shiftjis($uri);
   497		    print $msg_prefix . "$uri2 $err\n";
   498		} else {
   499		    print $msg_prefix . "$uri $err\n";
   500		}
   501		print $fh_errorsfile "$cfile $err\n"; 
   502		return 0;  # return 0 if error
   503	    }
   504	
   505	    # Print processing file name as URI.
   506	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   507		my $uri2 = codeconv::eucjp_to_shiftjis($uri);
   508		print $msg_prefix . "$uri2 [$mtype]\n";
   509	    } else {
   510		print $msg_prefix . "$uri [$mtype]\n";
   511	    }
   512	
   513	    # Add filename.
   514	    my $filename = defined $cfile ? $cfile : '';
   515	    codeconv::toeuc(\$filename);
   516	    $filename = basename($filename);
   517	    $fields{'filename'} = $filename;
   518	
   519	    complete_field_info(\%fields, $cfile, $uri, 
   520				\$headings, \$content, \$weighted_str);
   521	    put_field_index(\%fields, $field_indices);
   522	
   523	    put_dateindex($cfile);
   524	    $content .= "\n\n$filename\n\n";    # add filename
   525	    $content .= $weighted_str;          # add weights
   526	    count_words($docid_count, $docid_base, \$content, $kanji);
   527	    make_phrase_hash($docid_count, $docid_base, \$content);
   528	
   529	    # assertion
   530	    util::assert($cfile_size != 0, 
   531			 "cfile_size == 0 at the end of namazu_core.");
   532	
   533	    return $cfile_size;
   534	}
   535	
   536	#
   537	# Make the URI from the given file name.
   538	#
   539	sub generate_uri (@) {
   540	    my ($file, $fragment) = @_;
   541	    return "" unless defined $file;
   542	
   543	    # omit a file name if omittable
   544	    $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o; 
   545	
   546	    if (defined $ReplaceCode) {
   547		# transforming URI by evaling
   548		$_ = $file;
   549		eval $ReplaceCode;
   550		$file = $_;
   551	    }
   552	
   553	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   554		$file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C|
   555	    }
   556	
   557	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   558		$file = codeconv::shiftjis_to_eucjp($file);
   559	    }
   560	    if (defined $fragment) {
   561	        codeconv::toeuc(\$fragment);
   562	    }
   563	
   564	    unless ($var::Opt{'noencodeuri'}) {
   565		for my $tmp ($file, $fragment) {
   566		    next unless defined $tmp;
   567	
   568		    # Escape unsafe characters (not strict)
   569		    $tmp =~ s/\%/%25/g;  # Convert original '%' into '%25' v1.1.1.2
   570		    $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/
   571			sprintf("%%%02X",ord($1))/ge;
   572		}
   573	    }
   574	
   575	
   576	    my $uri = $file;
   577	    $uri .= "#" . $fragment if defined $fragment;
   578	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   579	        # restore '|' for drive letter rule of Win32, OS/2
   580	        $uri =~ s!^/([A-Z])%7C!/$1|!i;
   581	    }
   582	    return $uri;
   583	}
   584	
   585	
   586	sub get_field_index_base (\%) {
   587	    my ($field_indices) = @_;
   588	
   589	    my @keys = split('\|', $conf::SEARCH_FIELD);
   590	    if ($var::Opt{'meta'}) {
   591		push @keys, (split '\|', $conf::META_TAGS);
   592	    }
   593	    for my $key (@keys) {
   594		$key = lc($key);
   595		my $fname    = "$var::NMZ{'field'}.$key";
   596		my $tmp_fname = util::tmpnam("NMZ.field.$key");
   597		my $size = 0;
   598		$size = -s $fname if -f $fname;
   599		$size += -s $tmp_fname if -f $tmp_fname;
   600		$field_indices->{$key} = $size;
   601	    }
   602	}
   603	
   604	sub complete_field_info (\%$$\$\$\$) {
   605	    my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;
   606	
   607	    for my $field (keys %{$fields}) {
   608	        if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) {
   609	            delete $fields->{$field};
   610	        }
   611	    }
   612	
   613	    unless (defined($fields->{'title'})) {
   614		$fields->{'title'} = gfilter::filename_to_title($cfile, $wsref);
   615	    }
   616	    unless (defined($fields->{'date'})) {
   617		my $mtime = $fields->{'mtime'};
   618		my $date = util::rfc822time($mtime);
   619		$fields->{'date'} = $date;
   620	    }
   621	    unless (defined($fields->{'uri'})) {
   622		$fields->{'uri'} = $uri;
   623	    }
   624	    unless (defined($fields->{'size'})) {
   625		$fields->{'size'} = -s $cfile;
   626	    }
   627	    unless (defined($fields->{'summary'})) {
   628		$fields->{'summary'} = make_summary($contref, $headings, $cfile);
   629	    }
   630	    unless (defined($fields->{'from'}) || defined($fields->{'author'})) {
   631		$fields->{'from'} = getmsg("unknown");
   632	    }
   633	}
   634	
   635	#
   636	# Currently, messages for NMZ.* files should be encoded in
   637	# EUC-JP currently. We cannot use gettext.pl for the messsage
   638	# because gettext.pl may use Shift_JIS encoded messages.
   639	# So, we should use the function instead of gettext().
   640	#
   641	# FIXME: Ad hoc impl.  getmsg() is effective only for "unknown".
   642	#
   643	sub getmsg($) {
   644	    my ($msg) = @_;
   645	
   646	    if (util::islang_msg("ja")) {
   647		if ($msg eq "unknown") {
   648		    return "不明";
   649		}
   650	    }
   651	    return $msg;
   652	}
   653	
   654	sub make_summary ($$$) {
   655	    my ($contref, $headings, $cfile) = @_;
   656	
   657	    # pick up $conf::MAX_FIELD_LENGTH bytes string
   658	    my $tmp = "";
   659	    if ($$headings ne "") {
   660		$$headings =~ s/^\s+//;
   661		$$headings =~ s/\s+/ /g;
   662		$tmp = $$headings;
   663	    } else {
   664		$tmp = "";
   665	    }
   666	
   667	    my $offset = 0;
   668	    my $tmplen = 0;
   669	    while (($tmplen = $conf::MAX_FIELD_LENGTH + 1 - length($tmp)) > 0
   670	           && $offset < length($$contref))
   671	    {
   672	        $tmp .= substr $$contref, $offset, $tmplen;
   673	        $offset += $tmplen;
   674	        $tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge;
   675	        $tmp =~ s/([-=*\#])\1{2,}/$1$1/g;
   676	    }
   677	
   678	    # -1 means "LF"
   679	    my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH - 1;
   680	    # Remove a garbage Kanji 1st char at the end.
   681	    $summary = codeconv::chomp_eucjp($summary);
   682	
   683	    $summary =~ s/^\s+//;
   684	    $summary =~ s/\s+/ /g;   # normalize white spaces
   685	
   686	    return $summary;
   687	}
   688	
   689	
   690	# output the field infomation into NMZ.fields.* files
   691	sub put_field_index (\%$) {
   692	    my ($fields, $field_indices) = @_;
   693	
   694	    my $aliases_regex = 
   695		join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES);
   696	
   697	    for my $field (keys %{$fields}) {
   698	        util::dprint("Field: $field: $fields->{$field}\n");
   699		if ($field =~ /^($aliases_regex)$/o) {
   700		    unless (defined($fields->{$conf::FIELD_ALIASES{$field}})) {
   701			$fields->{$conf::FIELD_ALIASES{$field}} = $fields->{$field};
   702		    }
   703		    undef $fields->{$field};
   704		}
   705	    }
   706	
   707	    my @keys = split '\|', $conf::SEARCH_FIELD;
   708	    if ($var::Opt{'meta'}) {
   709	        my @meta = split '\|', $conf::META_TAGS;
   710	        while (my $meta = shift(@meta)) {
   711	            $meta = $conf::FIELD_ALIASES{$meta}
   712	                if (defined $conf::FIELD_ALIASES{$meta});
   713	
   714	            push @keys, $meta;
   715	        }
   716	
   717	        # uniq @keys
   718	        my %mark = ();
   719	        @keys = grep {$mark{$_}++; $mark{$_} == 1} @keys;
   720	    }
   721	    for my $key (@keys) {
   722		my $lkey = lc($key);
   723		my $fname    = util::tmpnam("NMZ.field.$lkey");
   724		my $fh_field = util::efopen(">>$fname");
   725		my $output = "";
   726		if (defined($fields->{$key})) {
   727		    if ($key ne 'uri') { # workaround for namazu-bugs-ja#30
   728			$fields->{$key} =~ s/\s+/ /g;
   729			$fields->{$key} =~ s/\s+$//;
   730			$fields->{$key} =~ s/^\s+//;
   731		    }
   732		    $output = $fields->{$key};
   733	
   734		    # -1 means "LF"
   735		    $output = substr $output, 0, $conf::MAX_FIELD_LENGTH - 1;
   736		    # Remove a garbage Kanji 1st char at the end.
   737		    $output = codeconv::chomp_eucjp($output);
   738	
   739	            $output =~ s/\n.*$//s;
   740	            $output .= "\n";
   741		} else {
   742		    $output = "\n";
   743		}
   744		print $fh_field $output;
   745	        util::fclose($fh_field);
   746	
   747		# put index of field index
   748		{
   749		    my $fname        = util::tmpnam("NMZ.field.$lkey.i");
   750		    my $fh_field_idx = util::efopen(">>$fname");
   751		    print $fh_field_idx pack("N", $field_indices->{$lkey});
   752		    $field_indices->{$lkey} += length $output;
   753	            util::fclose($fh_field_idx);
   754		}
   755	    }
   756	
   757	}
   758	
   759	# put the date infomation into NMZ.t file
   760	sub put_dateindex ($) {
   761	    my ($cfile) = @_;
   762	    my $mtime = (stat($cfile))[9];
   763	
   764	    my $fh_dataindex = util::efopen(">>$var::NMZ{'_t'}");
   765	    print $fh_dataindex pack("N", $mtime);
   766	    util::fclose($fh_dataindex);
   767	}
   768	
   769	
   770	# load a document file
   771	sub load_document ($$$$\%) {
   772	    my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
   773	      = @_;
   774	    my $cfile = $$orig_cfile;
   775	
   776	    return (0, 0, 0, 0) unless (-f $cfile && util::canopen($cfile));
   777	
   778	    # for handling a filename which contains Shift_JIS code for Windows.
   779	    # for handling a filename which contains including space.
   780	    my $shelter_cfile = "";
   781	    if (($cfile =~ /\s/) ||
   782	        ($English::OSNAME eq "MSWin32"
   783	        && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) )
   784	    {
   785		$shelter_cfile = $cfile;
   786		$cfile = util::tmpnam("NMZ.win32");
   787		unlink $cfile if (-e $cfile);
   788		copy($shelter_cfile, $cfile);
   789	    }
   790	
   791	    my $file_size;
   792	    $file_size = util::filesize($cfile); # not only file in feature.
   793	    if ($file_size > $conf::FILE_SIZE_MAX) {
   794		return ($file_size, $file_size, 0, 'x-system/x-error; x-error=file_size_max');
   795	    }
   796	
   797	    $$contref = util::readfile($cfile);
   798	#    $file_size = length($$contref);
   799	
   800	    my ($kanji, $mtype) = apply_filter($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, undef);
   801	
   802	    if ($English::OSNAME eq "MSWin32" && $shelter_cfile ne "") {
   803		unlink $cfile;
   804		$cfile = $shelter_cfile;
   805	    }
   806	
   807	    # Measure the text size at this time.
   808	    my $text_size = length($$contref) + length($$weighted_str); 
   809	
   810	    return ($file_size, $text_size, $kanji, $mtype);
   811	}
   812	
   813	sub apply_filter($$$$$$$) {
   814	    my ($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, $mmtype)
   815	      = @_;
   816	    my $cfile = $shelter_cfile ne "" ? $shelter_cfile : $$orig_cfile;
   817	
   818	    # Filtering process.
   819	    my $mtype;
   820	    my $called_dt = 0;
   821	    while (1) {
   822		if (defined $MediaType) {
   823		    $mtype = $MediaType;
   824		} elsif (defined $mmtype) {
   825		    $mtype = $mmtype;
   826		} else {
   827		    my $mtype_n = $Magic->checktype_byfilename($cfile);
   828		    my $mtype_c = $Magic->checktype_data($$contref);
   829		    my $mtype_m;
   830		    $mtype_m = $Magic->checktype_magic($$contref) 
   831		      if ((! defined $mtype_c) ||
   832			  $mtype_c =~ 
   833			  /^(text\/html|text\/plain|application\/octet-stream)$/);
   834		    $mtype_c = $mtype_m 
   835			if (defined $mtype_m && 
   836			    $mtype_m !~ 
   837			    /^(text\/html|text\/plain|application\/octet-stream)$/);
   838		    $mtype_c = 'text/plain' unless defined $mtype_c;
   839		    if ($called_dt) {
   840			$mtype = $mtype_c;
   841		    } else {
   842			$mtype = decide_type($mtype_n, $mtype_c);
   843			$called_dt = 1;
   844		    }
   845		}
   846		util::dprint(_("Detected type: ")."$mtype\n");
   847	
   848		# Pre code conversion.
   849		if ($var::REQUIRE_PRE_CODECONV{$mtype}) {
   850		    util::dprint("pre_codeconv\n");
   851		    codeconv_document($contref);
   852		}
   853	
   854		if (! $var::Supported{$mtype} || 
   855		    $var::Supported{$mtype} ne 'yes') 
   856		{
   857		    util::vprint(_("Unsupported media type ")."$mtype\n");
   858		    return (0, "$mtype; x-system=unsupported");
   859		}
   860	
   861		if ($var::REQUIRE_ACTIONS{$mtype}) {
   862		    util::vprint(_("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n");
   863		    require $var::REQUIRE_ACTIONS{$mtype}.'.pl'
   864		        || die _("unable to require ") . 
   865			    "\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n";
   866		    my $err = undef;
   867		    {
   868			local $SIG{'PIPE'} = \&trapintr;
   869			eval '$err = ' . $var::REQUIRE_ACTIONS{$mtype} .
   870			  '::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);';
   871		    }
   872		    if ($err) {
   873			if ($err =~ m/; x-system=unsupported$/) {
   874			    return (0, $err);
   875			}
   876			return (0, "$mtype; x-error=$err");
   877		    }
   878	
   879		    if ($@) {
   880			util::vprint(_("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n");
   881			return (0, "$mtype; x-error=$@");
   882		    }
   883	
   884		    # Post code conversion.
   885		    if ($var::REQUIRE_POST_CODECONV{$mtype}) {
   886		        util::dprint("post_codeconv\n");
   887			codeconv_document($contref);
   888		    }
   889	
   890		    next if ($var::RECURSIVE_ACTIONS{$mtype});
   891		}
   892		last;
   893	    }
   894	
   895	    my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/;  # Kanji contained?
   896	    $kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/;
   897	
   898	    return ($kanji, $mtype);
   899	}
   900	
   901	sub codeconv_document ($) {
   902	    my ($textref) = @_;
   903	    codeconv::toeuc($textref);
   904	    $$textref =~ s/\r\n/\n/g;
   905	    $$textref =~ s/\r/\n/g;
   906	    $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /; # Remove control char.
   907	}
   908	
   909	sub prep () {
   910	    my $docid_base = 0;
   911	    my $output_dir = shift @_ ;
   912	    my @targets = @_ ;
   913	    my @flist = ();
   914	
   915	    $var::OUTPUT_DIR = $output_dir;
   916	
   917	    require_modules();
   918	    change_filenames();
   919	    check_present_index();
   920	
   921	    # if Checkpoint mode, return
   922	    return (0, 0) if $CheckPoint{'continue'};
   923	
   924	    check_lockfile($var::NMZ{'lock2'});
   925	    print _("Looking for indexing files...\n");
   926	    @flist = find_target(@targets);
   927	    ($docid_base, @flist) = append_index(@flist) 
   928		if -f $var::NMZ{'r'};
   929	    unless (@flist) { # if @flist is empty
   930		print _("No files to index.\n");
   931		exit 0;
   932	    }
   933	    set_lockfile($var::NMZ{'lock2'});
   934	    save_flist(@flist);
   935	    my $total_files_num = @flist;
   936	
   937	    return ($docid_base, $total_files_num);
   938	}
   939	
   940	sub save_flist(@) {
   941	    my @flist = @_;
   942	    return if (@flist == 0);
   943	
   944	    my $fh_flist = util::efopen(">$var::NMZ{'_flist'}");
   945	    print $fh_flist join("\n", @flist), "\n";
   946	    util::fclose($fh_flist);
   947	}
   948	
   949	sub require_modules() {
   950	    if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
   951		require NKF || die "unable to require \"NKF\"\n";
   952	        util::dprint(_("code conversion: using NKF module\n"));
   953		$var::USE_NKF_MODULE = 1;
   954	    }
   955	    if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
   956		require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n";
   957	        util::dprint(_("wakati: using Text::Kakasi module\n"));
   958		my $res = Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-oeuc', '-w');
   959	    }
   960	    if (util::islang("ja") && $conf::WAKATI =~ /^module_chasen/) {
   961		require Text::ChaSen || die "unable to require \"Text::ChaSen\"\n";
   962	        util::dprint(_("wakati: using Text::ChaSen module\n"));
   963		my @arg = ('-i', 'e', '-j', '-F', '%m ');
   964		@arg    = ('-i', 'e', '-j', '-F', '%m %H\\n') if $var::Opt{'noun'};
   965		my $res = Text::ChaSen::getopt_argv('chasen-perl', @arg);
   966	    }
   967	    if (util::islang("ja") && $conf::WAKATI =~ /^module_mecab/) {
   968	        require MeCab || die "unable to require \"MeCab\"\n";
   969	        util::dprint(_("wakati: using MeCab module\n"));
   970	    }
   971	}
   972	
   973	sub check_lockfile ($) {
   974	    # warn if check file exists in case other process is running or abnormal
   975	    # stop execution (later is not the major purpose, though).
   976	    # This is mainly for early detection before longish find_target.
   977	    my ($file) = @_;
   978	
   979	    if (-f $file) {
   980		print "$file "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
   981		exit 1;
   982	    }
   983	}
   984	
   985	sub set_lockfile ($) {
   986	    my ($file) = @_;
   987	
   988	    # make a lock file
   989	    if (-f $file) {
   990		print "$file found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n";
   991		exit 1;
   992	    } else {
   993		my $fh_lockfile = util::efopen(">$file");
   994		print $fh_lockfile "$$"; # save pid
   995	        util::fclose($fh_lockfile);
   996	    }
   997	}
   998	
   999	sub remove_lockfile ($) {
  1000	    my ($file) = @_;
  1001	
  1002	    # remove lock file
  1003	    unlink $file if -f $file;
  1004	}
  1005	
  1006	# check present index whether it is old type of not
  1007	sub check_present_index () {
  1008	    if (-f $var::NMZ{'i'} && ! -f "$var::NMZ{'wi'}") 
  1009	    {
  1010		util::cdie(_("Present index is old type. it's unsupported.\n"));
  1011	    }
  1012	}
  1013	
  1014	# remain
  1015	sub do_remain_job ($$$$) {
  1016	    my ($total_files_size, $docid_count, $key_count, $start_time) = @_;
  1017	
  1018	    if ($docid_count == 0) {
  1019		# No files are indexed
  1020		if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
  1021		    update_dateindex();
  1022		    update_registry($docid_count);
  1023		}
  1024	    } else {
  1025		set_lockfile($var::NMZ{'lock'});
  1026		write_version();
  1027		write_body_msg();
  1028		write_tips_msg();
  1029		write_result_file();
  1030		update_field_index();
  1031		update_dateindex();
  1032		update_registry($docid_count);
  1033		write_nmz_files();
  1034		make_slog_file();
  1035		remove_lockfile($var::NMZ{'lock'});
  1036	    }
  1037	    make_headfoot_pages($docid_count, $key_count);
  1038	    put_log($total_files_size, $start_time, $docid_count, $key_count);
  1039	    util::remove_tmpfiles();
  1040	    unlink $var::NMZ{'_flist'};
  1041	}
  1042	
  1043	sub make_headfoot_pages($$) {
  1044	    my ($docid_count, $key_count) = @_;
  1045	
  1046	    for my $file (glob "$TEMPLATEDIR/NMZ.head*") {
  1047	        if ($file =~ m!^.*/NMZ\.head(\.[-\w\.]+)?$!){
  1048	            my $suffix = $1 ? $1 : '';
  1049	            make_headfoot("$var::NMZ{'head'}${suffix}", $docid_count, $key_count);
  1050	        }
  1051	    }
  1052	    for my $file (glob "$TEMPLATEDIR/NMZ.foot*") {
  1053	        if ($file =~ m!^.*/NMZ\.foot(\.[-\w\.]+)?$!){
  1054	            my $suffix = $1 ? $1 : '';
  1055	            make_headfoot("$var::NMZ{'foot'}${suffix}", $docid_count, $key_count);
  1056	       }
  1057	    }
  1058	}
  1059	
  1060	# Parse command line options.
  1061	sub parse_options
  1062	{
  1063	    if (@ARGV == 0) {
  1064		show_mini_usage();
  1065		exit 1;
  1066	    }
  1067	
  1068	    my @targets = ();
  1069	    my $targets_loaded = 0;
  1070	    my @argv = @ARGV;
  1071	    my $cwd = cwd();
  1072	
  1073	    my $opt_dummy = 0;
  1074	    my $opt_version = 0;
  1075	    my $opt_help = 0;
  1076	    my $opt_all = 0;
  1077	    my $opt_chasen = 0;
  1078	    my $opt_chasen_noun = 0;
  1079	    my $opt_kakasi = 0;
  1080	    my $opt_mecab = 0;
  1081	    my $opt_checkpoint_sub = 0;
  1082	    my $opt_show_config = 0;
  1083	    my $opt_mailnews = 0;
  1084	    my $opt_mhonarc = 0;
  1085	    my $opt_norc = 0;
  1086	
  1087	    my $opt_quiet = undef;
  1088	    my $opt_config = undef;
  1089	    my $output_dir = undef;
  1090	    my $update_index = undef;
  1091	    my $include_file = undef;
  1092	    my $target_list = undef;
  1093	    my $index_lang = undef;
  1094	
  1095	    my %opt_conf;
  1096	
  1097	#    Getopt::Long::Configure('bundling');
  1098	    Getopt::Long::config('bundling');
  1099	    GetOptions(
  1100	       	       '0|help'              => \$opt_help,
  1101		       '1|exclude=s'         => \$opt_conf{'EXCLUDE_PATH'},
  1102		       '2|deny=s'            => \$opt_conf{'DENY_FILE'},
  1103		       '3|allow=s'           => \$opt_conf{'ALLOW_FILE'},
  1104		       '4|update=s'          => \$update_index,
  1105		       '5|mhonarc'           => \$opt_mhonarc,
  1106		       '6|mtime=s'           => \$var::Opt{'mtime'},
  1107		       '7|html-split'        => \$var::Opt{'htmlsplit'},
  1108		       'C|show-config'       => \$opt_show_config,
  1109		       'E|no-edge-symbol'    => \$var::Opt{'noedgesymbol'},
  1110		       'F|target-list=s'     => \$target_list,
  1111		       'G|no-okurigana'      => \$var::Opt{'okurigana'},
  1112		       'H|no-hiragana'       => \$var::Opt{'hiragana'},
  1113		       'I|include=s'         => \$include_file,
  1114		       'K|no-symbol'         => \$var::Opt{'nosymbol'},
  1115		       'L|indexing-lang=s'     => \$index_lang,
  1116		       'M|meta'              => \$var::Opt{'meta'},
  1117		       'O|output-dir=s'      => \$output_dir,
  1118		       'S|checkpoint-sub'    => \$opt_checkpoint_sub,
  1119		       'T|template-dir=s'    => \$TEMPLATEDIR,
  1120		       'U|no-encode-uri'     => \$var::Opt{'noencodeuri'} ,
  1121		       'V|verbose'           => \$var::Opt{'verbose'},
  1122		       'Y|no-delete'         => \$var::Opt{'nodelete'},
  1123		       'Z|no-update'         => \$var::Opt{'noupdate'},
  1124		       'a|all'               => \$opt_all,
  1125	               'b|use-mecab'         => \$opt_mecab,
  1126		       'c|use-chasen'        => \$opt_chasen,
  1127		       'd|debug'             => \$var::Opt{'debug'},
  1128		       'e|robots'            => \$var::Opt{'robotexclude'},
  1129		       'f|config=s'          => \$opt_config,
  1130		       'h|mailnews'          => \$opt_mailnews,
  1131		       'k|use-kakasi'        => \$opt_kakasi,
  1132		       'm|use-chasen-noun'   => \$opt_chasen_noun,
  1133		       'q|quiet'             => \$opt_quiet,
  1134		       'r|replace=s'         => \$ReplaceCode,
  1135		       's|checkpoint'        => \$CheckPoint{'on'},
  1136		       't|media-type=s'      => \$MediaType,
  1137		       'u|uuencode'          => \$opt_dummy, # for backward compat.
  1138		       'v|version'           => \$opt_version,
  1139		       'x|no-heading-summary'=> \$var::Opt{'noheadabst'},
  1140		       'z|check-filesize'    => \$var::Opt{'checkfilesize'},
  1141		       'decode-base64'       => \$var::Opt{'decodebase64'},
  1142		       'norc'                => \$opt_norc,
  1143		       );
  1144	
  1145	    if ($opt_quiet) {
  1146		# Make STDOUT quiet by redirecting STDOUT to null device.
  1147		my $devnull = util::devnull();
  1148		open(STDOUT, ">$devnull") || die "$devnull: $!";
  1149	    }
  1150	
  1151	    if (defined $update_index) {
  1152		unless (-d $update_index) {
  1153		    print _("No such index: "), "$update_index\n";
  1154		    exit 1;
  1155		}
  1156	
  1157		my $orig_status = $var::NMZ{'status'};
  1158		$var::NMZ{'status'} = "$update_index/$var::NMZ{'status'}";
  1159	
  1160		my $argv = get_status("argv");
  1161		if (!defined $argv) {
  1162		    print _("No such index: "), "$update_index\n";
  1163		    exit 1;
  1164		}
  1165		@ARGV = split /\t/, $argv;
  1166		util::dprint(_("Inherited argv: ")."@ARGV\n");
  1167	
  1168		my $cwd  = get_status("cwd");
  1169		if (!defined $cwd) {
  1170		    print _("No such index: "), "$update_index\n";
  1171		    exit 1;
  1172		}
  1173		chdir $cwd;
  1174		util::dprint(_("Inherited cwd: ")."$cwd\n");
  1175	
  1176		($output_dir, @targets) = parse_options();
  1177		$output_dir = $update_index;
  1178		$var::NMZ{'status'} = $orig_status;  # See also change_filenames()
  1179		return ($output_dir, @targets);
  1180	    }
  1181	
  1182	    if (!$opt_norc && !(defined $ENV{'MKNMZNORC'})){
  1183	        load_rcfiles();
  1184	    }
  1185	    if ($opt_config) {
  1186	        if (-f $opt_config) {
  1187	            util::vprint(_("Reading rcfile: "));
  1188	            load_rcfile($ConfigFile = $opt_config);
  1189	            util::vprint(" $opt_config\n");
  1190	        }
  1191	    }
  1192	
  1193	    if ($index_lang) {
  1194		$util::LANG = $index_lang;
  1195	      util::dprint("Override indexing language: $util::LANG\n");
  1196	    }
  1197	
  1198	    if ($opt_help) {
  1199		show_usage();
  1200		exit 1;
  1201	    }
  1202	
  1203	    if ($opt_version) {
  1204		show_version();
  1205		exit 1;
  1206	    }
  1207	
  1208	    load_filtermodules(); # to make effect $opt_config, $index_lang.
  1209	    postload_modules();
  1210	
  1211	    foreach my $key (keys %opt_conf){
  1212		if (defined ($opt_conf{$key})) { 
  1213		     ${*{$conf::{$key}}{SCALAR}} = $opt_conf{$key};
  1214		}
  1215	    }
  1216	
  1217	    if ($opt_mailnews) {
  1218		$MediaType = 'message/rfc822';
  1219	    }
  1220	    if ($opt_mhonarc) {
  1221		$MediaType = 'text/html; x-type=mhonarc';
  1222	    }
  1223	    if ($opt_all) {
  1224		$conf::ALLOW_FILE = ".*";
  1225	    }
  1226	    if ($opt_chasen) {
  1227		$conf::WAKATI = $conf::CHASEN;
  1228		$var::Opt{'noun'} = 0;
  1229	    }
  1230	    if ($opt_chasen_noun) {
  1231		$conf::WAKATI = $conf::CHASEN_NOUN;
  1232		$var::Opt{'noun'} = 1;
  1233	    }
  1234	    if ($opt_kakasi) {
  1235		$conf::WAKATI = $conf::KAKASI;
  1236		$var::Opt{'noun'} = 0;
  1237	    }
  1238	    if ($opt_mecab) {
  1239	        $conf::WAKATI = $conf::MECAB;
  1240	        $var::Opt{'noun'} = 0;
  1241	    }
  1242	    if ($include_file) {
  1243		do $include_file;
  1244	        util::dprint("Included: $include_file\n");
  1245	    }
  1246	    if ($target_list) {
  1247		if ($CheckPoint{'continue'}) {
  1248		    @targets = ("dummy");
  1249		} else {
  1250		    @targets = load_target_list($target_list);
  1251		    util::dprint(_("Loaded: ")."$target_list\n");
  1252		}
  1253		$targets_loaded = 1;
  1254	    }
  1255	    if ($opt_checkpoint_sub) {
  1256		$CheckPoint{'on'}           = 1;
  1257		$CheckPoint{'continue'}     = 1;
  1258		@argv = grep {! /^-S$/} @argv;  # remove -S
  1259	    }
  1260	
  1261	    if (defined $ReplaceCode) {
  1262		my $orig = "/foo/bar/baz/quux.html";
  1263		$_ = $orig;
  1264		eval $ReplaceCode;
  1265		if ($@) {  # eval error
  1266		    util::cdie(_("Invalid replace: ")."$ReplaceCode\n");
  1267		}
  1268		util::dprint(_("Replace: ")."$orig -> $_\n");
  1269	    }
  1270	
  1271	    if ($opt_show_config) {
  1272		show_config();
  1273		exit 1;
  1274	    }
  1275	
  1276	    if (@ARGV == 0 && $targets_loaded == 0) {
  1277		show_mini_usage();
  1278		exit 1;
  1279	    }
  1280	
  1281	    $output_dir = $cwd unless defined $output_dir;
  1282	    util::cdie("$output_dir: "._("invalid output directory\n"))
  1283		unless (-d $output_dir && -w $output_dir);
  1284	
  1285	    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
  1286		util::win32_yen_to_slash(\$output_dir);
  1287	    }
  1288	
  1289	    # take remaining @ARGV as targets
  1290	    if (@ARGV > 0 && $targets_loaded == 0) {
  1291		@targets = @ARGV ;
  1292	    }
  1293	    
  1294	    # revert @ARGV
  1295	    # unshift @ARGV, splice(@argv, 0, @argv - @ARGV);
  1296	    @ARGV = @argv;
  1297	
  1298	    return ($output_dir, @targets);
  1299	}
  1300	
  1301	sub show_config () {
  1302	    print _("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles;
  1303	    print _("System: ") . "$English::OSNAME\n" if $English::OSNAME;
  1304	    print _("Namazu: ") . "$var::VERSION\n" if $var::VERSION;
  1305	    print _("Perl: ") . sprintf("%f\n", $English::PERL_VERSION);
  1306	    print _("File-MMagic: ") . "$File::MMagic::VERSION\n" if $File::MMagic::VERSION;
  1307	    print _("NKF: ") . "$conf::NKF\n" if $conf::NKF;
  1308	    print _("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI;
  1309	    print _("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN;
  1310	    print _("MeCab: ") . "$conf::MECAB\n" if $conf::MECAB;
  1311	    print _("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI;
  1312	    print _("Lang_Msg: ") . "$util::LANG_MSG\n";
  1313	    print _("Lang: ") . "$util::LANG\n";
  1314	    print _("Coding System: ") . "$CodingSystem\n";
  1315	    print _("CONFDIR: ") . "$CONFDIR\n";
  1316	    print _("LIBDIR: ") . "$LIBDIR\n";
  1317	    print _("FILTERDIR: ") . "$FILTERDIR\n";
  1318	    print _("TEMPLATEDIR: ") . "$TEMPLATEDIR\n";
  1319	
  1320	    my @all_types =     keys %var::Supported;
  1321	    my @supported = sort grep { $var::Supported{$_} eq "yes" } @all_types;
  1322	
  1323	    my $num_supported = @supported;
  1324	    my $num_unsupported = @all_types - @supported;
  1325	    print _("Supported media types:   ") . "($num_supported)\n";
  1326	    print _("Unsupported media types: ") . "($num_unsupported) " . _("marked with minus (-) probably missing application in your \$path.\n");
  1327	    for my $mtype (sort keys %var::Supported) {
  1328		my $yn = $var::Supported{$mtype};
  1329		if ($yn eq 'yes') { $yn = ' ' } else {$yn = '-'};
  1330		print "$yn $mtype";
  1331		if ($var::REQUIRE_ACTIONS{$mtype}){
  1332		    print ": $var::REQUIRE_ACTIONS{$mtype}.pl";
  1333		}
  1334		print "\n";
  1335	    }
  1336	}
  1337	
  1338	sub load_target_list ($) {
  1339	    my ($file) = @_;
  1340	    my $fh_targets = util::efopen($file);
  1341	    my @targets = <$fh_targets>;
  1342	    util::fclose($fh_targets);
  1343	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
  1344		foreach my $tmp (@targets){
  1345		    $tmp =~ s/\r//g;
  1346		    util::win32_yen_to_slash(\$tmp);
  1347		}
  1348	    }
  1349	    chomp @targets; 
  1350	    return @targets;
  1351	}
  1352	
  1353	# convert a relative path into an absolute path
  1354	sub absolute_path($$) {
  1355	    my ($cwd, $path) = @_;
  1356	
  1357	    $path =~ s!^\.$!\./!;
  1358	    $path =~ s!^\.[/\\]!$cwd/!;
  1359	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
  1360		util::win32_yen_to_slash(\$path);
  1361		if ($path =~ m!^//!) {
  1362		} elsif ($path =~ m!^/[^/]!) {
  1363		    my $driveletter = $cwd;
  1364		    if ($driveletter =~ m!^([A-Z]:)!i){
  1365			$driveletter = $1;
  1366		    }
  1367		    $path = "$driveletter$path";
  1368		} elsif ($path !~ m!^[A-Z]:/!i) {
  1369	            $path = "$cwd/$path";
  1370	        }
  1371	    } else {
  1372	        $path =~ s!^([^/])!$cwd/$1!; 
  1373	    }
  1374	    return $path;
  1375	}
  1376	
  1377	sub find_target (@) {
  1378	    my @targets = @_;
  1379	
  1380	    my $cwd = cwd();
  1381	    @targets = map { absolute_path($cwd, $_) } @targets;
  1382	
  1383	    # Convert \ to / with consideration for Shift_JIS encoding.
  1384	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
  1385	        foreach my $tmp (@targets){
  1386		    util::win32_yen_to_slash(\$tmp);
  1387		}
  1388	    }
  1389	
  1390	    # For reporting effects of --allow, --deny, --exclude, --mtime
  1391	    # options in --verbose mode.
  1392	    my %counts = (); 
  1393	    $counts{'possible'} = 0;
  1394	    $counts{'excluded'} = 0;
  1395	    $counts{'too_old'} = 0;
  1396	    $counts{'too_new'} = 0;
  1397	    $counts{'not_allowed'} = 0;
  1398	    $counts{'denied'} = 0;
  1399	
  1400	    # Traverse directories.
  1401	    # This routine is not efficent but I prefer reliable logic.
  1402	    my @flist = ();
  1403	    my $start = time();
  1404	    util::vprint(_("find_target starting: "). localtime($start). "\n");
  1405	    while (@targets) {
  1406		my $target = shift @targets;
  1407	
  1408		if ($target eq '') {
  1409		    print STDERR "Warning: target contains empty line, skip it\n";
  1410		    next;
  1411		}
  1412		
  1413		if (-f $target) { # target is a file.
  1414		    add_target($target, \@flist, \%counts);
  1415		} elsif (-d $target) { # target is a directory.
  1416		    my @subtargets = ();
  1417		    # Find subdirectories in target directory
  1418		    # because File::Find::find() does not follow symlink.
  1419		    if (-l $target) {
  1420			my $dh = new DirHandle($target);
  1421			while (defined(my $ent = $dh->read)) {
  1422			    next if ($ent =~ /^\.{1,2}$/);
  1423			    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
  1424	                        next if ($ent =~ m!^($conf::DENY_DDN)$!i);
  1425	                        my $tmp = $ent;
  1426	                        util::win32_yen_to_slash(\$tmp);
  1427	                        next if ($ent ne $tmp);
  1428	                    }
  1429			    my $fname = "$target/$ent";
  1430			    next if ($fname eq '.' || $fname eq '..');
  1431			    if (-d $fname) {
  1432				push(@subtargets, $fname);
  1433			    } else {
  1434				add_target($fname, \@flist, \%counts);
  1435			    }
  1436			}
  1437		    } else {
  1438			@subtargets = ($target);
  1439		    }
  1440	
  1441		    #
  1442		    # Wanted routine for File::Find's find().
  1443		    #
  1444		    my $wanted_closure = sub {
  1445			my $fname = "$File::Find::dir/$_";
  1446			add_target($fname, \@flist, \%counts);
  1447		    };
  1448	
  1449		    find($wanted_closure, @subtargets) if (@subtargets > 0);
  1450		} else {
  1451		    print STDERR _("unsupported target: ") . $target;
  1452		}
  1453	    }
  1454	
  1455	    # uniq @flist
  1456	    my %mark = ();
  1457	    @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist;
  1458	
  1459	    # Sort file names with consideration for numbers.
  1460	    @flist = map  { $_->[0] }
  1461		     sort { $a->[1] cmp $b->[1] } 
  1462		     map  { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge; 
  1463			    [ $_, $tmp ] } @flist;
  1464	
  1465	    my $elapsed = time() - $start ;
  1466	    $elapsed += 1 ;   # to round up and avoid 0 
  1467	
  1468	    # For --verbose option.
  1469	    report_find_target($elapsed, $#flist + 1, %counts);
  1470	
  1471	    return @flist;
  1472	}
  1473	
  1474	sub add_target ($\@\%) {
  1475	    my ($target, $flists_ref, $counts_ref) = @_;
  1476	
  1477	    if ($target =~ /[\n\r\t]/) {
  1478		$target =~ s/[\n\r\t]//g;
  1479		print STDERR "Warning: $target contains LF/CR/TAB chars, skip it\n";
  1480		return;   # skip a file name containing LF/CR/TAB chars.
  1481	    }
  1482	
  1483	    return unless -f $target;  # Only file is targeted.
  1484	
  1485	    $counts_ref->{'possible'}++;
  1486	
  1487	    unless (util::canopen($target)) {
  1488	        util::vprint(sprintf(_("Unreadable:	%s"), $target));
  1489		$counts_ref->{'excluded'}++;
  1490		return;
  1491	    }
  1492	
  1493	
  1494	    if (defined $conf::EXCLUDE_PATH && 
  1495		$target =~ /$conf::EXCLUDE_PATH/ ) 
  1496	    {
  1497	        util::vprint(sprintf(_("Excluded:	%s"), $target));
  1498		$counts_ref->{'excluded'}++;
  1499		return; 
  1500	    }
  1501	
  1502	    #
  1503	    # Do processing just like find's  --mtime option.
  1504	    #
  1505	    if (defined $var::Opt{'mtime'}) {
  1506		my $mtime = -M $_;
  1507		if ($var::Opt{'mtime'} < 0) {
  1508	
  1509		    # This must be `>=' not `>' for consistency with find(1).
  1510		    if (int($mtime) >= - $var::Opt{'mtime'}) {
  1511		        util::vprint(sprintf(_("Too old:	%s"), $target));
  1512		        $counts_ref->{'too_old'}++;
  1513			return;
  1514		    }
  1515		} elsif ($var::Opt{'mtime'} > 0) {
  1516		    if ($var::Opt{'mtime'} =~ /^\+/) {
  1517			if ((int($mtime) < $var::Opt{'mtime'})) {
  1518			    util::vprint(sprintf(_("Too new:	%s"), $target));
  1519			    $counts_ref->{'too_new'}++;
  1520			    return;
  1521			}
  1522		    } else {
  1523			if (int($mtime) != $var::Opt{'mtime'}) {
  1524			    if (int($mtime) > $var::Opt{'mtime'}) {
  1525			        util::vprint(sprintf(_("Too old:	%s"),$target));
  1526				$counts_ref->{'too_old'}++;
  1527			    } else {
  1528			        util::vprint(sprintf(_("Too new:	%s"),$target));
  1529				$counts_ref->{'too_new'}++;
  1530			    }
  1531			    return;
  1532			}
  1533		    }
  1534		} else {
  1535		    # $var::Opt{'mtime'} == 0 ;
  1536		    return;
  1537		}
  1538	    }
  1539	
  1540	    # Extract the file name of the target.
  1541	    $target =~ m!^.*/([^/]+)$!;
  1542	    my $fname = $1;
  1543	
  1544	    if ($fname =~ m!^($conf::DENY_FILE)$!i ) {
  1545	        util::vprint(sprintf(_("Denied:	%s"), $target));
  1546		$counts_ref->{'denied'}++; 
  1547		return;
  1548	    }
  1549	    if ($fname !~ m!^($conf::ALLOW_FILE)$!i) {
  1550	        util::vprint(sprintf(_("Not allowed:	%s"), $target));
  1551		$counts_ref->{'not_allowed'}++; 
  1552		return;
  1553	    } else{
  1554	        util::vprint(sprintf(_("Targeted:	%s"), $target));
  1555		push @$flists_ref, $target;
  1556	    }
  1557	
  1558	}
  1559	
  1560	sub report_find_target ($$%) {
  1561	    my ($elapsed, $num_targeted, %counts) = @_;
  1562	
  1563	    util::vprint(_("find_target finished: ") . localtime(time()). "\n");
  1564	    util::vprint(sprintf(_("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"), 
  1565				 $num_targeted, $elapsed, 
  1566				 $num_targeted /$elapsed));
  1567	    util::vprint(sprintf(_("  Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"), 
  1568				 $counts{'possible'}, 
  1569				 $counts{'not_allowed'},  
  1570				 $counts{'denied'}, 
  1571				 $counts{'excluded'}));
  1572	    util::vprint(sprintf(_("  MTIME too old: %d, MTIME too new: %d"),
  1573				 $counts{'too_old'}, 
  1574				 $counts{'too_new'}));
  1575	}
  1576	
  1577	sub show_usage () {
  1578	    util::dprint(_("lang_msg: ")."$util::LANG_MSG\n");
  1579	    util::dprint(_("lang: ")."$util::LANG\n");
  1580	
  1581	    my $usage = $usage::USAGE;
  1582	    $usage = _($usage);
  1583	    printf "$usage", $var::VERSION, $var::TRAC_URI, $var::MAILING_ADDRESS;
  1584	}
  1585	
  1586	sub show_mini_usage () {
  1587	    print _("Usage: mknmz [options] <target>...\n");
  1588	    print _("Try `mknmz --help' for more information.\n");
  1589	}
  1590	
  1591	sub show_version () {
  1592	    print $usage::VERSION_INFO;
  1593	}
  1594	
  1595	#
  1596	# check the file. No $msg is good.
  1597	#
  1598	sub check_file ($$$$$) {
  1599	    my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_;
  1600	
  1601	    my $msg = undef;
  1602	    if ($mtype =~ /; x-system=unsupported$/) {
  1603		$mtype =~ s/; x-system=unsupported$//;
  1604		$msg = _("Unsupported media type ")."($mtype)"._(" skipped.");
  1605	    } elsif ($mtype =~ /; x-error=file_size_max/) {
  1606	        $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
  1607	    } elsif ($mtype =~ /; x-error=.*$/) {
  1608		$mtype =~ s/^.*; x-error=(.*)$/$1/;
  1609		$msg = $mtype;
  1610	    } elsif ($mtype =~ /^x-system/) {
  1611		$msg = _("system error occurred! ")."($mtype)"._(" skipped.");
  1612	    } elsif (! -e $cfile) {
  1613		$msg = _("does NOT EXIST! skipped.");
  1614	    } elsif (! util::canopen($cfile)) {
  1615		$msg = _("is NOT READABLE! skipped.");
  1616	    } elsif ($text_size == 0 || $cfile_size == 0) {
  1617		$msg = _("is 0 size! skipped.");
  1618	    } elsif ($mtype =~ /^application\/octet-stream/) {
  1619		$msg = _("may be a BINARY file! skipped.");
  1620	    } elsif ($cfile_size > $conf::FILE_SIZE_MAX) {
  1621		$msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
  1622	    } elsif ($text_size > $conf::TEXT_SIZE_MAX) {
  1623		$msg = _("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX . ') < '. $text_size ;
  1624	    } 
  1625	
  1626	    return $msg;
  1627	}
  1628	
  1629	
  1630	#
  1631	# Write NMZ.version file.
  1632	#
  1633	sub write_version() {
  1634	    unless (-f $var::NMZ{'version'}) {
  1635		my $fh = util::efopen(">$var::NMZ{'version'}");
  1636		print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n";
  1637	        util::fclose($fh);
  1638	    }
  1639	}
  1640	
  1641	#
  1642	# rename each temporary file to a real file name.
  1643	#
  1644	sub write_nmz_files () {
  1645	    util::Rename($var::NMZ{'_i'},   $var::NMZ{'i'});
  1646	    util::Rename($var::NMZ{'_ii'}, $var::NMZ{'ii'});
  1647	    util::Rename($var::NMZ{'_w'},  $var::NMZ{'w'});
  1648	    util::Rename($var::NMZ{'_wi'}, $var::NMZ{'wi'});
  1649	    util::Rename($var::NMZ{'_p'},  $var::NMZ{'p'});
  1650	    util::Rename($var::NMZ{'_pi'}, $var::NMZ{'pi'});
  1651	}
  1652	
  1653	# output NMZ.body
  1654	sub write_body_msg () {
  1655	    for my $file (glob "$TEMPLATEDIR/NMZ.body*") {
  1656	        if ($file =~ m!^.*/NMZ\.body(\.[-\w\.]+)?$!){
  1657	            my $suffix = $1 ? $1 : '';
  1658	            write_message("$var::NMZ{'body'}${suffix}");
  1659	        }
  1660	    }
  1661	}
  1662	
  1663	# output NMZ.tips
  1664	sub write_tips_msg () {
  1665	    for my $file (glob "$TEMPLATEDIR/NMZ.tips*") {
  1666	        if ($file =~ m!^.*/NMZ\.tips(\.[-\w\.]+)?$!){
  1667	            my $suffix = $1 ? $1 : '';
  1668	            write_message("$var::NMZ{'tips'}${suffix}");
  1669	        }
  1670	    }
  1671	}
  1672	
  1673	
  1674	# output NMZ.result.*
  1675	sub write_result_file () {
  1676	    my $fname = "NMZ.result.normal";
  1677	
  1678	    my @files = glob "$TEMPLATEDIR/NMZ.result.*";
  1679	
  1680	    for my $file (@files) {
  1681		$file =~ m!(NMZ\.result\.[^/]*)$!;
  1682		my $target = "$var::OUTPUT_DIR/$1";
  1683		if (-f $target) {  # already exist;
  1684		    next;
  1685		} else {
  1686		    my $buf = util::readfile($file);
  1687		    my $fh_file = util::efopen(">$target");
  1688		    print $fh_file $buf;
  1689	            util::fclose($fh_file);
  1690		}
  1691	    }
  1692	}
  1693	
  1694	# write NMZ.body and etc.
  1695	sub write_message ($) {
  1696	    my ($msgfile) = @_;
  1697	
  1698	    if (! -f $msgfile) {
  1699		my ($template, $fname);
  1700		
  1701		$msgfile =~ m!.*/(.*)$!;
  1702		$fname = $1;
  1703		$template = "$TEMPLATEDIR/$fname";
  1704	
  1705		if (-f $template) {
  1706		    my $buf = util::readfile($template);
  1707		    my $fh_output = util::efopen(">$msgfile");
  1708		    print $fh_output $buf;
  1709	            util::fclose($fh_output);
  1710		}
  1711	    }
  1712	}
  1713	
  1714	
  1715	#
  1716	# Make the NMZ.slog file for logging.
  1717	#
  1718	sub make_slog_file () {
  1719	    if (! -f $var::NMZ{'slog'}) {
  1720		my $fh = util::efopen(">$var::NMZ{'slog'}");
  1721	        util::fclose($fh);
  1722	        undef $fh;
  1723	        chmod 0666, $var::NMZ{'slog'};
  1724	    }
  1725	    {
  1726		my $fh_slogfile = util::efopen(">>$var::NMZ{'slog'}");
  1727	        util::fclose($fh_slogfile);
  1728	    }
  1729	}
  1730	
  1731	
  1732	#
  1733	# Concatenate $CURRENTDIR to the head of each file.
  1734	#
  1735	sub change_filenames ($) {
  1736	    my $dir = $var::OUTPUT_DIR;
  1737	
  1738	    for my $key (sort keys %var::NMZ) {
  1739		next if $key =~ /^_/;    # exclude temporary file
  1740		$var::NMZ{$key} = "$dir/$var::NMZ{$key}";
  1741	    }
  1742	
  1743	    # temporary files
  1744	    for my $key (sort keys %var::NMZ) {
  1745		if ($key =~ /^_/) {
  1746		    $var::NMZ{$key} = util::tmpnam($var::NMZ{$key});
  1747		}
  1748	    }
  1749	
  1750	    if ($var::Opt{'debug'}) {
  1751		for my $key (sort keys %var::NMZ) {
  1752		    util::dprint("NMZ: $var::NMZ{$key}\n");
  1753		}
  1754	    }
  1755	}
  1756	
  1757	
  1758	#
  1759	# Preparation processing for appending index files.
  1760	#
  1761	sub append_index (@) {
  1762	    my @flist = @_;
  1763	
  1764	    my $docid_base = 0;
  1765	    ($docid_base, @flist) = set_target_files(@flist);
  1766	
  1767	    unless (@flist) { 	# if @flist is empty
  1768		if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
  1769		    set_lockfile($var::NMZ{'lock2'});
  1770		    update_dateindex();
  1771		    update_registry(0);
  1772		    make_headfoot_pages(0, get_total_keys());
  1773		    put_log(0, 0, 0, get_total_keys());
  1774		    make_headfoot_pages(get_status("files"), get_status("keys"));
  1775		    util::remove_tmpfiles();
  1776		}
  1777		print _("No files to index.\n");
  1778		exit 0;
  1779	    }
  1780	
  1781	    $APPENDMODE = 1;
  1782	    # conserve files by copying
  1783	    copy($var::NMZ{'i'},  $var::NMZ{'_i'});
  1784	    copy($var::NMZ{'w'},  $var::NMZ{'_w'});
  1785	    copy($var::NMZ{'t'},  $var::NMZ{'_t'}) 
  1786		unless -f $var::NMZ{'_t'}; # preupdated ?
  1787	    copy($var::NMZ{'p'},  $var::NMZ{'_p'});
  1788	    copy($var::NMZ{'pi'}, $var::NMZ{'_pi'});
  1789	
  1790	    return ($docid_base, @flist);
  1791	}
  1792	
  1793	#
  1794	# Set target files to @flist and return with the number of regiested files.
  1795	#
  1796	sub set_target_files() {
  1797	    my %rdocs;    # 'rdocs' means 'registered documents'
  1798	    my @found_files = @_;
  1799	
  1800	    # Load the list of registered documents
  1801	    $rdocs{'name'} = load_registry();
  1802	
  1803	    # Pick up overlapped documents and do marking
  1804	    my %mark1;
  1805	    my @overlapped_files;
  1806	    grep {$_ !~ /^\# / && $mark1{$_}++ } @{$rdocs{'name'}};
  1807	    $rdocs{'overlapped'} = {}; # Prepare an anonymous hash.
  1808	    for my $overlapped (grep { $mark1{$_} } @found_files) {
  1809		$rdocs{'overlapped'}{$overlapped} = 1;
  1810		push @overlapped_files, $overlapped;
  1811	    };
  1812	
  1813	    # Pick up not overlapped documents which are files to index.
  1814	    my @flist = grep { ! $mark1{$_} } @found_files;
  1815		 
  1816	    if ($var::Opt{'noupdate'}) {
  1817		return (scalar @{$rdocs{'name'}}, @flist);
  1818	    };
  1819	
  1820	    # Load the date index.
  1821	    $rdocs{'mtime'} = load_dateindex();
  1822	
  1823	    if (@{$rdocs{'mtime'}} == 0) {
  1824		return (scalar @{$rdocs{'name'}}, @flist); 
  1825	    };
  1826	
  1827	    util::assert(@{$rdocs{'name'}} == @{$rdocs{'mtime'}},
  1828			 "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!");
  1829	
  1830	    # Pick up deleted documents and do marking
  1831	    # (registered in the NMZ.r but not existent in the filesystem)
  1832	    my @deleted_documents;
  1833	    unless ($var::Opt{'nodelete'}) {
  1834		my %mark2;
  1835		grep { $mark2{$_}++ } @found_files;
  1836		for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} && 
  1837					! $rdocs{'overlapped'}{$_} } 
  1838				 @{$rdocs{'name'}}) 
  1839		{
  1840		    $rdocs{'deleted'}{$deleted} = 1;
  1841		    push @deleted_documents, $deleted;
  1842		}
  1843	    }
  1844	
  1845	    # check filesize
  1846	    if ($var::Opt{'checkfilesize'}) {
  1847		$rdocs{'size'} = load_sizefield();
  1848	    }
  1849	
  1850	    # Pick up updated documents and set the missing number for deleted files.
  1851	    my @updated_documents = pickup_updated_documents(\%rdocs);
  1852	
  1853	    # Append updated files to the list of files to index.
  1854	    if (@updated_documents) {
  1855		push @flist, @updated_documents;
  1856	    }
  1857	
  1858	    # Remove duplicates.
  1859	    my %seen = ();
  1860	    @flist = grep { ! $seen{$_}++ } @flist;
  1861	
  1862	    util::dprint(_("\n\n== found files ==\n"), join("\n", @found_files), "\n");
  1863	    util::dprint(_("\n\n== registered documents ==\n"), join("\n", @{$rdocs{'name'}}), "\n");
  1864	    util::dprint(_("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n");
  1865	    util::dprint(_("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n");
  1866	    util::dprint(_("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n");
  1867	    util::dprint(_("\n\n== files to index ==\n"), join("\n", @flist), "\n");
  1868	
  1869	    # Update NMZ.t with the missing number infomation and
  1870	    # append updated files and deleted files to NMZ.r with leading '# '
  1871	    if (@updated_documents || @deleted_documents) {
  1872		$DeletedFilesCount = 0;
  1873		$UpdatedFilesCount = 0;
  1874		$UpdatedFilesCount += @updated_documents;
  1875	#	$DeletedFilesCount += @updated_documents;
  1876		$DeletedFilesCount += @deleted_documents;
  1877		preupdate_dateindex(@{$rdocs{'mtime'}});
  1878		preupdate_registry(@updated_documents, @deleted_documents);
  1879	    }
  1880	
  1881	    # Return the number of registered documents and list of files to index.
  1882	    return (scalar @{$rdocs{'name'}}, @flist);
  1883	}
  1884	
  1885	sub preupdate_registry(@) {
  1886	    my (@list) = @_;
  1887	
  1888	    my $fh_registry = util::efopen(">$var::NMZ{'_r'}");
  1889	    @list = grep { s/(.*)/\# $1\n/ } @list;
  1890	    print $fh_registry @list;
  1891	    print $fh_registry &_("## deleted: ") . util::rfc822time(time()) . "\n\n";
  1892	    util::fclose($fh_registry);
  1893	}
  1894	
  1895	sub preupdate_dateindex(@) {
  1896	    my @mtimes = @_;
  1897	
  1898	    # Since rewriting the entire file, it is not efficient, 
  1899	    # but simple and reliable. this would be revised in the future.
  1900	    my $fh_dateindex = util::efopen(">$var::NMZ{'_t'}");
  1901	#    print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n";
  1902	    print $fh_dateindex pack("N*", @mtimes);
  1903	    util::fclose($fh_dateindex);
  1904	}
  1905	
  1906	sub update_registry ($) {
  1907	    my ($docid_count) = @_;
  1908	
  1909	    {
  1910		my $fh_registry = util::efopen(">>$var::NMZ{'r'}");
  1911		my $fh_registry_ = util::efopen($var::NMZ{'_r'});
  1912		while (defined(my $line = <$fh_registry_>)) {
  1913		    print $fh_registry $line;
  1914		}
  1915		if ($docid_count > 0) {
  1916		    print $fh_registry &_("## indexed: ") . util::rfc822time(time()) . "\n\n";
  1917		}
  1918	        util::fclose($fh_registry_) if (defined $fh_registry_);
  1919	        util::fclose($fh_registry);
  1920	    }
  1921	    unlink $var::NMZ{'_r'};
  1922	}
  1923	
  1924	sub update_dateindex () {
  1925	    util::Rename($var::NMZ{'_t'}, $var::NMZ{'t'});
  1926	}
  1927	
  1928	sub update_field_index () {
  1929	    my @list = glob "$var::NMZ{'field'}.*.tmp";
  1930	    for my $tmp (@list) {
  1931	        if ($tmp =~ m!((^.*/NMZ\.field\..+?(?:\.i)?)\.tmp$)!) {
  1932		    my $fname_tmp = $1;
  1933		    my $fname_out = $2;
  1934		    {
  1935			my $fh_field = util::efopen(">>$fname_out");
  1936			my $fh_tmp = util::efopen($fname_tmp);
  1937	
  1938			while (defined(my $line = <$fh_tmp>)) {
  1939			    print $fh_field $line;
  1940			}
  1941	                util::fclose($fh_tmp) if (defined $fh_tmp);
  1942	                util::fclose($fh_field);
  1943		    }
  1944		    unlink $fname_tmp;
  1945		} else {
  1946		    util::cdie(_("update_field_index: ")."@list");
  1947		}
  1948	    }
  1949	}
  1950	
  1951	sub pickup_updated_documents (\%) {
  1952	    my ($rdocs_ref) = @_;
  1953	    my @updated_documents = ();
  1954	
  1955	    # To avoid duplicated outputs caused by --html-split support.
  1956	    my %printed = ();
  1957	    my $i = 0;
  1958	    for my $cfile (@{$rdocs_ref->{'name'}}) {
  1959		if (defined($rdocs_ref->{'deleted'}{$cfile})) {
  1960		    unless ($printed{$cfile}) {
  1961			print "$cfile " . _("was deleted!\n");
  1962			$printed{$cfile} = 1;
  1963		    }
  1964		    $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
  1965		} elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) {
  1966		    my $cfile_mtime = (stat($cfile))[9];
  1967		    my $rfile_mtime = $rdocs_ref->{'mtime'}[$i];
  1968		    my ($cfile_size, $rfile_size);
  1969		    if ($var::Opt{'checkfilesize'}) {
  1970			$cfile_size = (stat($cfile))[7];
  1971			$rfile_size = $rdocs_ref->{'size'}[$i];
  1972		    }
  1973	
  1974		    if ($rfile_mtime != $cfile_mtime || 
  1975			($var::Opt{'checkfilesize'} && ($cfile_size != $rfile_size))) {
  1976			# The file is updated!
  1977			unless ($printed{$cfile}) {
  1978			    print "$cfile " . _("was updated!\n");
  1979			    $printed{$cfile} = 1;
  1980			}
  1981			push(@updated_documents, $cfile);
  1982			$rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
  1983		    }
  1984		}
  1985		$i++;
  1986	    }
  1987	
  1988	    return @updated_documents
  1989	}
  1990	
  1991	sub load_dateindex() {
  1992	    my $fh_dateindex = util::efopen($var::NMZ{'t'});
  1993	
  1994	    my $size = -s $var::NMZ{'t'};
  1995	    my $buf  = "";
  1996	    read($fh_dateindex, $buf, $size);
  1997	    my @list = unpack("N*", $buf);  # load date index
  1998	#    print "\nload_dateindex\n", join("\n", @list), "\n\n";
  1999	
  2000	    util::fclose($fh_dateindex);
  2001	    return [ @list ];
  2002	}
  2003	
  2004	sub load_registry () {
  2005	    my $fh_registry = util::efopen($var::NMZ{'r'});
  2006	
  2007	    my @list = ();
  2008	    my %deleted    = ();
  2009	    my @registered = ();
  2010	
  2011	    while (defined(my $line = <$fh_registry>)) {
  2012		chomp($line);
  2013		next if $line =~ /^\s*$/;  # an empty line
  2014		next if $line =~ /^##/;    # a comment
  2015		if ($line =~ s/^\#\s+//) { # deleted document
  2016		    $deleted{$line}++;
  2017		} else {
  2018		    # Remove HTML's anchor generated by --html-split option.
  2019		    $line =~ s/\t.*$//g;  
  2020		    push @registered, $line;
  2021		}
  2022	    }
  2023	
  2024	    util::fclose($fh_registry) if (defined $fh_registry);
  2025	
  2026	    # Exclude deleted documents.
  2027	    for my $doc (@registered) {
  2028		if ($deleted{$doc}) {
  2029		    push @list, "# $doc";
  2030		    $deleted{$doc}--;
  2031		} else {
  2032		    push @list, $doc;
  2033		}
  2034	    }
  2035	
  2036	    return [ @list ];
  2037	}
  2038	
  2039	# get file size information from NMZ.field.size
  2040	sub load_sizefield() {
  2041	    my $fh_sizefield = util::efopen($var::NMZ{'field'} . '.size');
  2042	    return [] unless defined $fh_sizefield;
  2043	    my $line;
  2044	    my @ret = ();
  2045	    while (defined($line = <$fh_sizefield>)) {
  2046		chomp $line;
  2047		push @ret, $line;
  2048	    }
  2049	    util::fclose($fh_sizefield) if (defined $fh_sizefield);
  2050	    return \@ret;
  2051	}
  2052	
  2053	sub get_total_keys() {
  2054	    my $keys = get_status("keys");
  2055	    $keys =~ s/,//g if (defined $keys);
  2056	    $keys = 0 unless defined $keys;
  2057	    return $keys;
  2058	}
  2059	
  2060	sub get_total_files() {
  2061	    my $files = get_status("files");
  2062	    $files =~ s/,//g if (defined $files);
  2063	    $files = 0 unless defined $files;
  2064	    return $files;
  2065	}
  2066	
  2067	sub get_status($) {
  2068	    my ($key) = @_;
  2069	
  2070	    my $fh = util::fopen($var::NMZ{'status'});
  2071	    return undef unless defined $fh;
  2072	
  2073	    while (defined(my $line = <$fh>)) {
  2074		if ($line =~ /^$key\s+(.*)$/) {
  2075		    util::dprint("status: $key = $1\n");
  2076		    $fh->close;
  2077		    return $1;
  2078		}
  2079	    }
  2080	    util::fclose($fh) if (defined $fh);
  2081	    return undef;
  2082	}
  2083	
  2084	sub put_total_files($) {
  2085	    my ($number) = @_;
  2086	    $number =~ tr/,//d;
  2087	    put_status("files", $number);
  2088	}
  2089	
  2090	sub put_total_keys($) {
  2091	    my ($number) = @_;
  2092	    $number =~ tr/,//d;
  2093	    put_status("keys", $number);
  2094	}
  2095	
  2096	sub put_status($$) {
  2097	    my ($key, $value) = @_;
  2098	
  2099	    # remove NMZ.status file if the file has a previous value.
  2100	    unlink $var::NMZ{'status'} if defined get_status($key);
  2101	
  2102	    my $fh = util::efopen(">> $var::NMZ{'status'}");
  2103	    print $fh "$key $value\n";
  2104	    util::fclose($fh);
  2105	}
  2106	
  2107	# do logging
  2108	sub put_log ($$$$) {
  2109	    my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_;
  2110	
  2111	    my $date = localtime;
  2112	    my $added_files_count   = $docid_count;
  2113	    my $deleted_documents_count = $DeletedFilesCount;
  2114	    my $updated_documents_count = $UpdatedFilesCount;
  2115	    my $total_files_count   = get_total_files() + $docid_count
  2116				      - $DeletedFilesCount - $UpdatedFilesCount;
  2117	    my $added_keys_count    = 0;
  2118	    $added_keys_count       = $total_keys_count - get_total_keys();
  2119	
  2120	    my $processtime         = time - $start_time;
  2121	    $processtime            = 0 if $start_time == 0;
  2122	    $total_files_size       = $total_files_size;
  2123	    $total_keys_count       = $total_keys_count;
  2124	
  2125	    my @logmsgs = ();
  2126	    if ($APPENDMODE) {
  2127		push @logmsgs, N_("[Append]");
  2128	    } else {
  2129		push @logmsgs, N_("[Base]");
  2130	    }
  2131	    push @logmsgs, N_("Date:"), "$date" if $date;
  2132	    push @logmsgs, N_("Added Documents:"), util::commas("$added_files_count")
  2133		if $added_files_count;
  2134	    push @logmsgs, N_("Deleted Documents:"), 
  2135		util::commas("$deleted_documents_count") if $deleted_documents_count;
  2136	    push @logmsgs, N_("Updated Documents:"), 
  2137		util::commas("$updated_documents_count") if $updated_documents_count;
  2138	    push @logmsgs, N_("Size (bytes):"), util::commas("$total_files_size")
  2139		if $total_files_size;
  2140	    push @logmsgs, N_("Total Documents:"), util::commas("$total_files_count") 
  2141		if $total_files_count;
  2142	    push @logmsgs, N_("Added Keywords:"), util::commas("$added_keys_count") 
  2143		if $added_keys_count;
  2144	    push @logmsgs, N_("Total Keywords:"), util::commas("$total_keys_count") 
  2145		if $total_keys_count;
  2146	    push @logmsgs, N_("Wakati:"), "$conf::WAKATI" if $conf::WAKATI;
  2147	    push @logmsgs, N_("Time (sec):"), util::commas("$processtime")
  2148		if $processtime;
  2149	    push @logmsgs, N_("File/Sec:"),  sprintf "%.2f", 
  2150	    	(($added_files_count + $updated_documents_count) / $processtime) 
  2151		if $processtime;
  2152	    push @logmsgs, N_("System:"), "$English::OSNAME" if $English::OSNAME;
  2153	    push @logmsgs, N_("Perl:"),   sprintf("%f", $English::PERL_VERSION);
  2154	    push @logmsgs, N_("Namazu:"), "$var::VERSION" if $var::VERSION;
  2155	
  2156	    my $log_for_file = "";
  2157	
  2158	    my $msg = shift @logmsgs;	# [Base] or [Append]
  2159	    # To stdout, use gettext.
  2160	    print _($msg), "\n";
  2161	    # To log file, do not use gettext.
  2162	    $log_for_file = $msg . "\n";
  2163	    while (@logmsgs) {
  2164		my $field = shift @logmsgs;
  2165		my $value = shift @logmsgs;
  2166		printf "%-20s %s\n", _($field), "$value";
  2167		$log_for_file .= sprintf "%-20s %s\n", $field, "$value";
  2168	    }
  2169	    print "\n";
  2170	    $log_for_file .= "\n";
  2171	
  2172	    put_log_to_logfile($log_for_file);
  2173	    put_total_files($total_files_count);
  2174	    put_total_keys($total_keys_count);
  2175	
  2176	    my $argv = join "\t", @ARGV;
  2177	    my $cwd  = cwd();
  2178	    put_status("argv", $argv);
  2179	    put_status("cwd",  $cwd);
  2180	}
  2181	
  2182	sub put_log_to_logfile ($) {
  2183	    my ($logmsg) = @_;
  2184	    my $fh_logfile = util::efopen(">>$var::NMZ{'log'}");
  2185	    print $fh_logfile $logmsg;
  2186	    util::fclose($fh_logfile);
  2187	}
  2188	
  2189	sub get_year() {
  2190	    my $year = (localtime)[5] + 1900;
  2191	
  2192	    return $year;
  2193	}
  2194	
  2195	# Compose NMZ.head and NMZ.foot. Prepare samples if necessary.
  2196	# Insert $docid_count, $key_count, and $month/$day/$year respectively.
  2197	sub make_headfoot ($$$) {
  2198	    my ($file, $docid_count, $key_count) = @_;
  2199	
  2200	    my $day   = sprintf("%02d", (localtime)[3]);
  2201	    my $month = sprintf("%02d", (localtime)[4] + 1);
  2202	    my $year  = get_year();
  2203	    my $buf   = "";
  2204	
  2205	    if (-f $file) {
  2206		$buf = util::readfile($file);
  2207	    } else {
  2208		$file =~ m!.*/(.*)$!;
  2209		my $fname = $1;
  2210		my $template = "$TEMPLATEDIR/$fname";
  2211	
  2212		if (-f $template) {
  2213		    $buf = util::readfile($template);
  2214		} else {
  2215		    return;
  2216		}
  2217	    }
  2218	
  2219	    my $fh_file = util::efopen(">$file");
  2220	
  2221	    if ($buf =~ /(<!-- FILE -->)\s*(.*)\s*(<!-- FILE -->)/) {
  2222	        my $total_files_count = util::commas(get_total_files() + $docid_count 
  2223	                                   - $DeletedFilesCount - $UpdatedFilesCount);
  2224		$buf =~ s/(<!-- FILE -->)(.*)(<!-- FILE -->)/$1 $total_files_count $3/;
  2225	
  2226	    }
  2227	    if ($buf =~ /(<!-- KEY -->)\s*(.*)\s*(<!-- KEY -->)/) {
  2228		my $tmp = $2;
  2229		$tmp =~ tr/,//d;
  2230		$tmp = $key_count;
  2231		$tmp = util::commas($tmp);
  2232		$buf =~ s/(<!-- KEY -->)(.*)(<!-- KEY -->)/$1 $tmp $3/;
  2233	    }
  2234	    $buf =~ s#(<!-- DATE -->)(.*)(<!-- DATE -->)#$1 $year-$month-$day $3#gs;
  2235	    $buf =~ s/(<!-- VERSION -->)(.*)(<!-- VERSION -->)/$1 v$var::VERSION $3/gs;
  2236	    $buf =~ s{(<!-- ADDRESS -->)(.*)(<!-- ADDRESS -->)}
  2237		     {$1\n<a href="mailto:$conf::ADDRESS">$conf::ADDRESS</a>\n$3}gs;
  2238	    $buf =~ s{(<!-- LINK-REV-MADE -->)(.*)(<!-- LINK-REV-MADE -->)}
  2239		     {$1\n<link rev="made" href="mailto:$conf::ADDRESS">\n$3}gs;
  2240	
  2241	    print $fh_file $buf;
  2242	    util::fclose($fh_file);
  2243	}
  2244	
  2245	# Make phrase hashes for NMZ.p
  2246	# Handle two words each for calculating a hash value ranged 0-65535.
  2247	sub make_phrase_hash ($$$) {
  2248	    my ($docid_count, $docid_base, $contref) = @_;
  2249	
  2250	    my %tmp = ();
  2251	    $$contref =~ s!\x7f */? *\d+ *\x7f!!g;  # remove tags of weight
  2252	    $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols
  2253	    my @words = split(/\s+/, $$contref);
  2254	    @words = grep {$_ ne ""} @words;   # remove empty words
  2255	    my $word_b = shift @words;
  2256	    my $docid = $docid_count + $docid_base;
  2257	    for my $word (@words) {
  2258		next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
  2259		my $hash = hash($word_b . $word);
  2260		unless (defined $tmp{$hash}) {
  2261		    $tmp{$hash} = 1;
  2262		    $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash};
  2263		    $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash});
  2264	#	    util::dprint("<$word_b, $word> $hash\n");
  2265		    $PhraseHashLast{$hash} = $docid;
  2266		}
  2267		$word_b = $word;
  2268	    }
  2269	}
  2270	
  2271	# Construct NMZ.p and NMZ.pi file. this processing is rather complex.
  2272	sub write_phrase_hash () {
  2273	    write_phrase_hash_sub();
  2274	    util::Rename($var::NMZ{'__p'}, $var::NMZ{'_p'});
  2275	    util::Rename($var::NMZ{'__pi'}, $var::NMZ{'_pi'});
  2276	}
  2277	
  2278	sub write_phrase_hash_sub () {
  2279	    my $opened = 0;
  2280	
  2281	    return 0 if %PhraseHash eq '';    # namazu-devel-ja #3146
  2282	    util::dprint(_("doing write_phrase_hash() processing.\n"));
  2283	
  2284	    my $fh_tmp_pi = util::efopen(">$var::NMZ{'__pi'}");
  2285	    my $fh_tmp_p  = util::efopen(">$var::NMZ{'__p'}");
  2286	
  2287	    my $fh_phrase = util::fopen($var::NMZ{'_p'});
  2288	    my $fh_phraseindex = undef;
  2289	    if ($fh_phrase) {
  2290		$fh_phraseindex = util::efopen($var::NMZ{'_pi'});
  2291		$opened = 1;
  2292	    }
  2293		
  2294	    my $ptr = 0;
  2295	    for (my $i = 0; $i < 65536; $i++) {
  2296	
  2297		my $baserecord = "";
  2298		my $baseleng = 0;
  2299	
  2300		if ($opened) {
  2301		    my $tmp = 0;
  2302		    read($fh_phraseindex, $tmp, $var::INTSIZE);
  2303		    $tmp = unpack("N", $tmp);
  2304		    if ($tmp != 0xffffffff) { # 0xffffffff
  2305			$baseleng = readw($fh_phrase);
  2306			read($fh_phrase, $baserecord, $baseleng);
  2307		    }
  2308		}
  2309		if (defined($PhraseHash{$i})) {
  2310		    if ($baserecord eq "") {
  2311			print $fh_tmp_pi pack("N", $ptr);
  2312			my $record = $PhraseHash{$i};
  2313			my $n2 = length($record);
  2314			my $data = pack("w", $n2) . $record;
  2315			print $fh_tmp_p $data;
  2316			$ptr += length($data);
  2317		    } else {
  2318			print $fh_tmp_pi pack("N", $ptr);
  2319			my $record = $PhraseHash{$i};
  2320			my $last_docid = get_last_docid($baserecord, 1);
  2321			my $adjrecord = adjust_first_docid($record, $last_docid);
  2322			check_records(\$record, \$baserecord, 1) unless defined $adjrecord; # namazu-bugs-ja#31
  2323			$record = $adjrecord;
  2324			my $n2 = length($record) + $baseleng;
  2325			my $data = pack("w", $n2) .  $baserecord . $record;
  2326			print $fh_tmp_p $data;
  2327			$ptr += length($data);
  2328		    }
  2329		} else {
  2330		    if ($baserecord eq "") {
  2331			# if $baserecord has no data, set to 0xffffffff
  2332			print $fh_tmp_pi pack("N", 0xffffffff);
  2333		    } else {
  2334			print $fh_tmp_pi pack("N", $ptr);
  2335			my $data = pack("w", $baseleng) . $baserecord;
  2336			print $fh_tmp_p $data;
  2337			$ptr += length($data);
  2338		    }
  2339		}
  2340	    }
  2341	
  2342	    if ($opened) {
  2343	        util::fclose($fh_phraseindex);
  2344	    }
  2345	    if (defined $fh_phrase) {
  2346	        util::fclose($fh_phrase);
  2347	    }
  2348	    util::fclose($fh_tmp_p);
  2349	    util::fclose($fh_tmp_pi);
  2350	
  2351	    %PhraseHash = ();
  2352	    %PhraseHashLast = ();
  2353	}
  2354	
  2355	# Dr. Knuth's  ``hash'' from (UNIX MAGAZINE May 1998)
  2356	sub hash ($) {
  2357	    my ($word) = @_;
  2358	
  2359	    my $hash = 0;
  2360	    for (my $i = 0; $word ne ""; $i++) {
  2361		$hash ^= $Seed[$i & 0x03][ord($word)];
  2362	        $word = substr $word, 1;
  2363		# $word =~ s/^.//;  is slower
  2364	    }
  2365	    return $hash & 65535;
  2366	}
  2367	
  2368	# Count frequencies of words.
  2369	sub count_words ($$$$) {
  2370	    my ($docid_count, $docid_base, $contref, $kanji) = @_;
  2371	    my (@tmp);
  2372	
  2373	    # Normalize into small letter.
  2374	    $$contref =~ tr/A-Z/a-z/;
  2375	
  2376	    # Remove control char.
  2377	    $$contref =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a/ /;
  2378	
  2379	    # It corresponds to -j option of ChaSen.
  2380	    $$contref =~ s/^[ \t\f]+//gm;    # except "\r\n"
  2381	    $$contref =~ s/[ \t\f]+$//gm;    # except "\r\n"
  2382	    $$contref =~ s/([a-z])-\n([a-z])/$1$2/gsi;   # for hyphenation
  2383	    if (util::islang("ja")) {
  2384	        $$contref =~ s/([\x80-\xff])\n([\x80-\xff])/$1$2/gs;
  2385	        $$contref =~ s/(｡｣|｡｢)/$1\n/gs;
  2386	    }
  2387	    $$contref =~ s/\n+/\n/gs;
  2388	
  2389	    # Do wakatigaki if necessary.
  2390	    if (util::islang("ja")) {
  2391		wakati::wakatize_japanese($contref) if $kanji;
  2392	    }
  2393	
  2394	    my $part1 = "";
  2395	    my $part2 = "";
  2396	    if ($$contref =~ /\x7f/) {
  2397		$part1 = substr $$contref, 0, index($$contref, "\x7f");
  2398		$part2 = substr $$contref, index($$contref, "\x7f");
  2399	#	$part1 = $PREMATCH;  # $& and friends are not efficient
  2400	#	$part2 = $MATCH . $POSTMATCH;
  2401	    } else {
  2402		$part1 = $$contref;
  2403		$part2 = "";
  2404	    }
  2405	
  2406	    # do scoring
  2407	    my %word_count = ();
  2408	    $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
  2409		wordcount_sub($2, $1, \%word_count)!ge;
  2410	    wordcount_sub($part1, 1, \%word_count);
  2411	
  2412	    # Add them to whole index
  2413	    my $docid = $docid_count + $docid_base;
  2414	    for my $word (keys(%word_count)) {
  2415		next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
  2416		$KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
  2417		$KeyIndex{$word} .= pack("w2", 
  2418					 $docid - $KeyIndexLast{$word}, 
  2419					 $word_count{$word});
  2420		$KeyIndexLast{$word} = $docid;
  2421	    }
  2422	}
  2423	
  2424	#
  2425	# Count words and do score weighting
  2426	#
  2427	sub wordcount_sub ($$\%) {
  2428	    my ($text, $weight, $word_count) = @_;
  2429	
  2430	    # Remove all symbols when -K option is specified.
  2431	    $text =~ tr/\xa1-\xfea-z0-9/   /c if $var::Opt{'nosymbol'};
  2432	
  2433	    # Count frequencies of words in a current document.
  2434	    # Handle symbols as follows.
  2435	    #
  2436	    # tcp/ip      ->  tcp/ip,     tcp,      ip
  2437	    # (tcp/ip)    ->  (tcp/ip),   tcp/ip,   tcp, ip
  2438	    # ((tcpi/ip)) ->  ((tcp/ip)), (tcp/ip), tcp
  2439	    #
  2440	    # Don't do processing for nested symbols.
  2441	    # NOTE: When -K is specified, all symbols are already removed.
  2442	
  2443	    my @words = split /\s+/, $text;
  2444	    for my $word (@words) {
  2445		next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
  2446		if ($var::Opt{'noedgesymbol'}) {
  2447		    # remove symbols at both ends
  2448		    $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
  2449		}
  2450		$word_count->{$word} = 0 unless defined($word_count->{$word});
  2451		$word_count->{$word} += $weight;
  2452		unless ($var::Opt{'nosymbol'}) {
  2453		    if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
  2454			$word_count->{$1} = 0 unless defined($word_count->{$1});
  2455			$word_count->{$1} += $weight;
  2456			next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
  2457		    } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
  2458			$word_count->{$1} = 0 unless defined($word_count->{$1});
  2459			$word_count->{$1} += $weight;
  2460			next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
  2461		    } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
  2462			$word_count->{$1} = 0 unless defined($word_count->{$1});
  2463			$word_count->{$1} += $weight;
  2464			next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
  2465		    }
  2466		    my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
  2467			if $word =~ /[^\xa1-\xfea-z_0-9]/;
  2468		    for my $tmp (@words_) {
  2469			next if $tmp eq "";
  2470			$word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
  2471			$word_count->{$tmp} += $weight;
  2472		    }
  2473		    @words_ = ();
  2474		}
  2475	    }
  2476	    return "";
  2477	}
  2478	
  2479	# Construct NMZ.i and NMZ.ii file. this processing is rather complex.
  2480	sub write_index () {
  2481	    my $key_count = write_index_sub();
  2482	    util::Rename($var::NMZ{'__i'}, $var::NMZ{'_i'});
  2483	    util::Rename($var::NMZ{'__w'}, $var::NMZ{'_w'});
  2484	
  2485	    return $key_count;
  2486	}
  2487	
  2488	# readw: read one pack 'w' word.
  2489	# This code was contributed by <furukawa@tcp-ip.or.jp>.
  2490	sub readw ($) {
  2491	    my $fh = shift;
  2492	    my $ret = '';
  2493	    my $c;
  2494	    
  2495	    while (read($fh, $c, 1)){
  2496		$ret .= $c;
  2497		last unless 0x80 & ord $c;
  2498	    }
  2499	    return unpack('w', $ret);
  2500	}
  2501	
  2502	sub get_last_docid ($$) {
  2503	    my ($record, $step) = @_;
  2504	    my (@data) = unpack 'w*', $record;
  2505	
  2506	    my $sum = 0;
  2507	    for (my $i = 0; $i < @data; $i += $step) {
  2508		$sum += $data[$i];
  2509	    }
  2510	    my $leng = @data / $step;
  2511	    return $sum;
  2512	}
  2513	
  2514	sub adjust_first_docid ($$) {
  2515	    my ($record, $last_docid) = @_;
  2516	    my (@data) = unpack 'w*', $record;
  2517	
  2518	    $data[0] = $data[0] - $last_docid;
  2519	    return undef if ($data[0] < 0); # namazu-bug-ja#31
  2520	    $record = pack 'w*', @data;
  2521	    return $record;
  2522	}
  2523	
  2524	sub write_index_sub () {
  2525	    my @words = sort keys(%KeyIndex);
  2526	    return 0 if $#words == -1;
  2527	
  2528	    my $cnt = 0;
  2529	    my $ptr_i = 0;
  2530	    my $ptr_w = 0;
  2531	    my $key_count = 0;
  2532	    my $baserecord = "";
  2533	
  2534	    util::dprint(_("doing write_index() processing.\n"));
  2535	    my $fh_tmp_i  = util::efopen(">$var::NMZ{'__i'}");
  2536	    my $fh_tmp_w  = util::efopen(">$var::NMZ{'__w'}");
  2537	    my $fh_i      = util::fopen($var::NMZ{'_i'});
  2538	    my $fh_ii     = util::efopen(">$var::NMZ{'_ii'}");
  2539	    my $fh_w      = util::fopen($var::NMZ{'_w'});
  2540	    my $fh_wi = util::efopen(">$var::NMZ{'_wi'}");
  2541	
  2542	    if ($fh_w) {
  2543	      FOO:
  2544		while (defined(my $line = <$fh_w>)) {
  2545		    chop $line;
  2546		    my $current_word = $line;
  2547	
  2548		    my $baseleng = readw($fh_i);
  2549		    read($fh_i, $baserecord, $baseleng);
  2550	
  2551	 	    for (; $cnt < @words; $cnt++) {
  2552			last unless $words[$cnt] le $current_word;
  2553			my $record = $KeyIndex{$words[$cnt]};
  2554			my $leng = length($record);
  2555	
  2556			if ($current_word eq $words[$cnt]) {
  2557			    my $last_docid = get_last_docid($baserecord, 2);
  2558			    my $adjrecord = adjust_first_docid($record, $last_docid);
  2559			    check_records(\$record, \$baserecord, 2) unless defined $adjrecord; # namazu-bugs-ja#31
  2560			    $record = $adjrecord;
  2561			    $leng = length($record);  # re-measure
  2562			    my $tmp = pack("w", $leng + $baseleng);
  2563	
  2564			    my $data_i = "$tmp$baserecord$record";
  2565			    my $data_w = "$current_word\n";
  2566			    print $fh_tmp_i $data_i;
  2567			    print $fh_tmp_w $data_w;
  2568			    print $fh_ii pack("N", $ptr_i);
  2569			    print $fh_wi pack("N", $ptr_w);
  2570			    $ptr_i += length($data_i);
  2571			    $ptr_w += length($data_w);
  2572			    $key_count++;
  2573	
  2574			    $cnt++;
  2575			    next FOO;
  2576			} else {
  2577			    my $tmp = pack("w", $leng);
  2578			    my $data_i = "$tmp$record";
  2579			    my $data_w = "$words[$cnt]\n";
  2580			    print $fh_tmp_i $data_i;
  2581			    print $fh_tmp_w $data_w;
  2582			    print $fh_ii pack("N", $ptr_i);
  2583			    print $fh_wi pack("N", $ptr_w);
  2584			    $ptr_i += length($data_i);
  2585			    $ptr_w += length($data_w);
  2586			    $key_count++;
  2587			}
  2588		    }
  2589		    my $tmp  = pack("w", $baseleng);
  2590		    my $data_i = "$tmp$baserecord";
  2591		    my $data_w = "$current_word\n";
  2592		    print $fh_tmp_i $data_i;
  2593		    print $fh_tmp_w $data_w;
  2594		    print $fh_ii pack("N", $ptr_i);
  2595		    print $fh_wi pack("N", $ptr_w);
  2596		    $ptr_i += length($data_i);
  2597		    $ptr_w += length($data_w);
  2598		    $key_count++;
  2599		}
  2600	    }
  2601	    while ($cnt < @words) {
  2602		my $leng = length($KeyIndex{$words[$cnt]});
  2603		my $tmp = pack("w", $leng);
  2604		my $record = $KeyIndex{$words[$cnt]};
  2605	
  2606		my $data_i = "$tmp$record";
  2607		my $data_w = "$words[$cnt]\n";
  2608		print $fh_tmp_i $data_i;
  2609		print $fh_tmp_w $data_w;
  2610		print $fh_ii pack("N", $ptr_i);
  2611		print $fh_wi pack("N", $ptr_w);
  2612		$ptr_i += length($data_i);
  2613		$ptr_w += length($data_w);
  2614		$key_count++;
  2615		$cnt++;
  2616	    }
  2617	    %KeyIndex = ();
  2618	    %KeyIndexLast = ();
  2619	
  2620	    util::fclose($fh_wi);
  2621	    util::fclose($fh_w) if (defined $fh_w);
  2622	    util::fclose($fh_ii);
  2623	    util::fclose($fh_i) if (defined $fh_i);
  2624	    util::fclose($fh_tmp_w);
  2625	    util::fclose($fh_tmp_i);
  2626	
  2627	    return $key_count;
  2628	}
  2629	
  2630	#
  2631	# Decide the media type. 
  2632	# FIXME: Very ad hoc. It's just a compromise. -- satoru
  2633	#
  2634	sub decide_type ($$) {
  2635	    my ($name, $cont) = @_;
  2636	    return $name if (!defined $cont || $name eq $cont);
  2637	
  2638	    util::dprint("decide_type: name: $name, cont: $cont\n");
  2639	    if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) {
  2640		return $name;
  2641	    } elsif ($cont =~ m!^application/octet-stream! &&
  2642	             $name !~ m!^text/!) {
  2643		return $name;
  2644	    } elsif ($cont =~ m!^application/(excel|powerpoint|msword)! &&
  2645		     $name !~ m!^application/octet-stream!)  {
  2646		# FIXME: Currently File::MMagic 1.02's checktype_data() 
  2647		# is unreliable for them.
  2648		return $name;
  2649	    } elsif ($cont =~ m!^application/x-zip! &&
  2650	             $name =~ m!^application/!) {
  2651	        # zip format is used other applications e.g. OpenOffice.
  2652	        # It is necessary to add to check extention.
  2653	        return $name;
  2654	    }
  2655	
  2656	    return $cont;
  2657	}
  2658	
  2659	#
  2660	# Debugging code for the "negative numbers" problem.
  2661	#
  2662	sub check_records ($$$) {
  2663	    my ($recref, $baserecref, $step) = @_;
  2664	    dump_record($baserecref, $step);
  2665	    dump_record($recref, $step);
  2666	    print STDERR "The \x22negative number\x22 problem occurred.\n";
  2667	    exit(1);
  2668	}
  2669	
  2670	sub dump_record($$) {
  2671	    my ($recref, $step) = @_;
  2672	    my (@data) = unpack 'w*', $$recref;
  2673	    print STDERR "dump record data to NMZ.bug.info (step: $step)...";
  2674	    my $fh_info = util::fopen(">> NMZ.bug.info");
  2675	    print $fh_info "dumped record data (step: $step)...";
  2676	    foreach (@data) {
  2677		print $fh_info sprintf(" %08x", $_);
  2678	    }
  2679	    print $fh_info "\n";
  2680	    util::fclose($fh_info);
  2681	    return;
  2682	}
  2683	
  2684	sub trapintr {
  2685	    my ($signame) = @_;
  2686	    print STDERR "Warning: signal $signame occured.\n";
  2687	}
  2688	
  2689	#
  2690	# For avoiding "used only once: possible typo at ..." warnings.
  2691	#
  2692	muda($conf::ON_MEMORY_MAX,
  2693	     $conf::WORD_LENG_MAX, $conf::TEXT_SIZE_MAX,
  2694	     $conf::DENY_FILE, $var::INTSIZE,
  2695	     $conf::CHASEN_NOUN, $conf::CHASEN,
  2696	     $conf::KAKASI, $var::Opt{'okurigana'},
  2697	     $var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX,
  2698	     $usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO,
  2699	     $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
  2700	     $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
  2701	     $conf::ADDRESS, $var::MAILING_ADDRESS,
  2702	     $conf::FILE_SIZE_MAX,
  2703	     $conf::MECAB,
  2704	     $conf::DENY_DDN,
  2705	     $var::TRAC_URI,
  2706	     );
  2707	
  2708	sub muda {}
  2709

まりんきょ学問所＞全文検索システムNamazu ＞ mknmz のソースコードを読む

MARUYAMA Satosi