mknmz のソースコードを読む

作成日 : 2011-05-26
最終更新日 :

1. はじめに

1.1 mknmz と namazu

namazu は大きく二つのプログラムに分かれる。一つはインデックスを作るプログラム、 もう一つは検索をするプログラムである。インデックスを作るプログラムは mknmz、 検索をするプログラムは namazu である。 ここではインデックスを作るプログラムである mknmz のソースコードを読んでみよう。 実際には mknmz.in ファイルを読むことにする。 両者の違いは、mknmz.in を作る元ソース(テンプレート)であるのに対し、 mknmz は環境等を置き換えたあとで実際にスクリプトを実行する実ソースである、ということである。 また、mknmz はプログラミング言語 perl で書かれている。

1.2 ソースコードを表示する

以下の仕組みで表示用のソースコードを加工している。 html では &, >, < の表示にあたり工夫する必要がある。 ここで、UNIX のコマンドである sed と cat -n を使っている。

% sed -e 's/\&/\&amp;/g' -e 's/>/\&gt;/g' -e 's/</\&lt;/g' scripts/mknmz.in | cat -n > mknmz.in.html

2. ソースコードを読む

2.1 ライブラリなど

     1	#! %PERL% -w
     2	# -*- Perl -*-
     3	# mknmz - indexer of Namazu
     4	# $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
     5	#
     6	# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
     7	# Copyright (C) 2000-2008 Namazu Project All rights reserved.
     8	#     This is free software with ABSOLUTELY NO WARRANTY.
     9	#
    10	#  This program is free software; you can redistribute it and/or modify
    11	#  it under the terms of the GNU General Public License as published by
    12	#  the Free Software Foundation; either versions 2, or (at your option)
    13	#  any later version.
    14	# 
    15	#  This program is distributed in the hope that it will be useful
    16	#  but WITHOUT ANY WARRANTY; without even the implied warranty of
    17	#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    18	#  GNU General Public License for more details.
    19	#
    20	#  You should have received a copy of the GNU General Public License
    21	#  along with this program; if not, write to the Free Software
    22	#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
    23	#  02111-1307, USA
    24	#
    25	#  This file must be encoded in EUC-JP encoding
    26	#
    27	
    28	package mknmz;
    29	require 5.004;
    30	use English;
    31	use lib "%ADDITIONAL_INC%";
    32	use Cwd;
    33	use IO::File;
    34	use File::Find;
    35	use File::MMagic;
    36	use Time::Local;
    37	use strict;  # be strict since v1.2.0
    38	use Getopt::Long;
    39	use File::Copy;
    40	use DirHandle;
    41	use File::Basename;
    42	
    43	use vars qw($SYSTEM);
    44	# It exists only for back compatibility.
    45	$SYSTEM = $English::OSNAME;

28 行め、package mknmz; はパッケージ mknmz の名前空間を宣言している。

29 行め、require 5.004; は依存バージョンである。

30行目以降の use は各種モジュールの宣言である。

lib → ここは mknmz では use lib "."; と展開される。
English → 各種の数などを英語で表すためのモジュール
Cwd → カレントワーキングディレクトリを得るためのモジュール
File::Find →ファイルの検索など
File::MMagic →ファイルのマジックナンバーを見る
File::Copy → ファイルをコピーする
File::Basename →ファイルのフルパスから
IO::File   → IOを使う
Time::Local → 時間(ローカル時間)
strict → インタープリターで解釈を厳密にして怪しいところはすべてエラーを出し停止する
Getopt::Long → 長形式のオプション解釈モジュール
DirHandle → ディレクトリを扱うモジュール
vars qw($SYSTEM)

    45 行目の	$SYSTEM = $English::OSNAME; は後方互換性のためだけに存在する。

2.2 変数宣言

    46	
    47	my $NAMAZU_INDEX_VERSION = "2.0";
    48	
    49	my $CodingSystem = "euc";
    50	my $PKGDATADIR    = $ENV{'pkgdatadir'} || "@pkgdatadir@";
    51	my $CONFDIR       = "@CONFDIR@";     # directory where mknmzrc are in.
    52	my $LIBDIR        = $PKGDATADIR . "/pl";      # directory where library etc. are in.
    53	my $FILTERDIR     = $PKGDATADIR . "/filter";   # directory where filters are in.
    54	my $TEMPLATEDIR   = $PKGDATADIR . "/template"; # directory where templates are in.
    55	
    56	my $DeletedFilesCount = 0;
    57	my $UpdatedFilesCount = 0;
    58	my $APPENDMODE = 0;
    59	my %PhraseHash = ();
    60	my %PhraseHashLast = ();
    61	my %KeyIndex = ();
    62	my %KeyIndexLast = ();
    63	my %CheckPoint = ("on" => undef, "continue" => undef);
    64	my $ConfigFile = undef;
    65	my $MediaType  = undef;
    66	
    67	my $ReplaceCode  = undef;  # perl code for transforming URI
    68	my @Seed = ();
    69	my @LoadedRcfiles = ();
    70	my $Magic = new File::MMagic;
    71	
    72	my $ReceiveTERM = 0;
    73	
    74	STDOUT->autoflush(1);
    75	STDERR->autoflush(1);

47 行から 58 行まではスカラー型変数の宣言。 その中で、49 行は日本語の表現形式を宣言している。ここでは euc である。 59 行から 63 行まではハッシュ(連想配列)である。ここもほとんど何もない状態の宣言である。 64 行から 67 行はふたたびスカラー型変数の宣言。 68 行と69 行は空の配列の宣言である。 70 行と72 行は再度スカラー型変数の宣言。 73 行は標準出力のバッファをフラッシュする(バッファを貯めない)宣言。 74 行は標準エラー出力に対して同様にバッファをフラッシュする宣言。

2-3 main 関数


    76	main();
    77	sub main {
    78	    my $start_time = time;
    79	
    80	    if ($English::PERL_VERSION == 5.008001) {
    81	        unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
    82	            print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
    83	            exit 1;
    84	        }
    85	    }
    86	
    87	    init();
    88	
    89	    # At first, loading pl/conf.pl to prevent overriding some variables.
    90	    preload_modules();
    91	
    92	    # set LANG and bind textdomain
    93	    util::set_lang();
    94	    textdomain('namazu', $util::LANG_MSG);
    95	
    96	    load_modules();
    97	    my ($output_dir, @targets) = parse_options();
    98	    my ($docid_base, $total_files_num) = prep($output_dir, @targets);
    99	
   100	    my $swap = 1;
   101	    my $docid_count = 0;
   102	    my $file_count = 0;
   103	    my $total_files_size = 0;
   104	    my $key_count = 0;
   105	    my $checkpoint = 0;
   106	    my $flist_ptr = 0;
   107	    my $processed_files_size = 0;
   108	
   109	    if ($CheckPoint{'continue'}) {
   110		# Restore variables
   111		eval util::readfile($var::NMZ{'_checkpoint'}) ;
   112	    } else {
   113		print $total_files_num . _(" files are found to be indexed.\n");
   114	    }
   115	
   116	    {
   117		my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
   118		my $fh_flist = util::efopen($var::NMZ{'_flist'});
   119		my %field_indices = ();
   120		get_field_index_base(\%field_indices);
   121	
   122		if ($CheckPoint{'continue'}) {
   123		    seek($fh_flist, $flist_ptr, 0);
   124		}
   125	
   126		# Process target files one by one
   127		while (defined(my $line = <$fh_flist>)) {
   128		    $flist_ptr += length($line);
   129		    my $cfile = $line;
   130		    chomp $cfile;
   131		    util::dprint(_("target file: ")."$cfile\n");
   132	
   133		    my ($cfile_size, $num) = 
   134			process_file($cfile, $docid_count, $docid_base, 
   135				     $file_count, \%field_indices,
   136				     $fh_errorsfile, $total_files_num);
   137		    if ($num == 0) {
   138			$total_files_num--;
   139			next;
   140		    } else {
   141			$docid_count += $num;
   142			$file_count++;
   143		    }
   144	
   145		    $total_files_size     += $cfile_size;
   146		    $processed_files_size += $cfile_size;
   147		    last if $ReceiveTERM;
   148		    if ($processed_files_size > $conf::ON_MEMORY_MAX) {
   149			if (%KeyIndex) {
   150			    $key_count = write_index();
   151			    print _("Writing index files..."); 
   152			    write_phrase_hash();
   153			    print "\n";
   154			}
   155			$processed_files_size = 0;
   156			$checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
   157		    }
   158		}
   159	
   160	        util::fclose($fh_flist);
   161	        util::fclose($fh_errorsfile);
   162	    }
   163	    # This should be out of above blocks because of file handler closing.
   164	    re_exec($flist_ptr, $docid_count, $docid_base, $start_time, 
   165		    $total_files_size, $total_files_num,
   166		    $file_count, $key_count) if $checkpoint;
   167	
   168	    if (%KeyIndex) {
   169		$key_count = write_index();
   170		print _("Writing index files...");
   171		write_phrase_hash();
   172		print "\n";
   173	    }
   174	
   175	    $key_count = get_total_keys() unless $key_count;
   176	    do_remain_job($total_files_size, $docid_count, $key_count, 
   177			   $start_time);
   178	    exit 0;
   179	}

結構な量があるが、少しずつみていこう。まず 76 行で main を呼び出す。 呼び出しの main 関数は 77 行から定義されている。 変数 $start_time はまず今の時刻を測りたい、という要求からである。 if ($English::PERL_VERSION == 5.008001) だったら、 次に続く条件を吟味したうえで不合格となる。

87 行は初期化を実行する。実際の初期化の内容は別に回す。

90 行は pl/conf.pl をローディングすることにしている。

93 行は、set_lang()関数で LANG が設定される。 この関数の詳細も追って書く。

94 行にtextdomain 関数がある。これは gettext() がコールされた 際に検索をかけられる条件も知っている。

96 行は自前モジュールのロード (373 行 参照)。

97 行は オプションを解析したときの返り値2種類を得ている。

98 行は 909 行で定義された prep 関数による値。 prep は 準備 (preparation) の意味であろう。

100 行から 107 行は変数定義と初期化である。

109 行から 114 行はチェックポイントを表す CheckPoint ハッシュの 'continue' 要素の値を調べるている。

116 行から 162 行はブロックになっている。 117

process_file


   180	
   181	#
   182	# FIXME: Very complicated.
   183	#
   184	sub process_file ($$$$\%$$) {
   185	    my ($cfile, $docid_count, $docid_base, $file_count, 
   186		$field_indices, $fh_errorsfile, $total_files_num) = @_;
   187	
   188	    my $processed_num = 0;
   189	    my $file_size = util::filesize($cfile);
   190	
   191	    if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
   192		my @parts;
   193		@parts = htmlsplit::split($cfile, "NMZ.partial")
   194	            if ($file_size <= $conf::FILE_SIZE_MAX);
   195		if (@parts > 1) {
   196		    my $id = 0;
   197		    for my $part (@parts) {
   198			next if (defined $conf::EXCLUDE_PATH &&
   199				 "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
   200			my $fname = util::tmpnam("NMZ.partial.$id");
   201			my $fragment  = defined $part ? $part : undef;
   202			my $uri   = generate_uri($cfile, $fragment);
   203			my $result = namazu_core($fname, 
   204						 $docid_count + $processed_num, 
   205						 $docid_base, $file_count, 
   206						 $field_indices, $fh_errorsfile, 
   207						 $total_files_num, 
   208						 $uri, $id, $#parts);
   209			if ($result > 0) {
   210			    $processed_num++;
   211			    my $rname = defined $part ? "$cfile\t$part" : "$cfile";
   212			    put_registry($rname);
   213			}
   214		    	unlink $fname;
   215			$id++;
   216		    }
   217		    return ($file_size, $processed_num);
   218		}
   219	    }
   220	    my $result = namazu_core($cfile, $docid_count, $docid_base, 
   221				     $file_count, $field_indices, 
   222				     $fh_errorsfile, $total_files_num,
   223				     undef, undef, undef);
   224	    if ($result > 0) {
   225		$processed_num++;
   226		put_registry($cfile);
   227	    }
   228	
   229	    return ($file_size, $processed_num);
   230	}
   

process_file

まず、sub process_file ($$$$\%$$)の引数に驚いてしまうが、 リファレンスに関係があることをまず注意しておく。

   231	
   232	# 
   233	# Load mknmzrcs:
   234	# 
   235	#  1. MKNMZRC environment
   236	# 
   237	#  2. $(sysconfdir)/$(PACKAGE)/mknmzrc
   238	# 
   239	#  3. ~/.mknmzrc
   240	# 
   241	#  4. user-specified mknmzrc set by mknmz --config=file option.
   242	# 
   243	# If multiple files exists, read all of them.
   244	# 
   245	sub load_rcfiles () {
   246	    my (@cand) = ();
   247	
   248	    # To support Windows. Since they have nasty drive letter convention,
   249	    # it is necessary to change mknmzrc dynamically with env. variable.
   250	    push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'};
   251	    push @cand, "$CONFDIR/mknmzrc";
   252	    push @cand, "$ENV{'HOME'}/.mknmzrc";
   253	
   254	    util::vprint(_("Reading rcfile: "));
   255	    for my $rcfile (@cand) {
   256		if (-f $rcfile) {
   257			load_rcfile ($rcfile);
   258		    util::vprint(" $rcfile");
   259		}
   260	    }
   261	    util::vprint("\n");
   262	}
   263	
   264	sub load_rcfile ($) {
   265	    my ($rcfile) = @_;
   266	    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
   267		util::win32_yen_to_slash(\$rcfile);
   268	    }
   269	    return if (grep {m/^$rcfile$/} @LoadedRcfiles);
   270	    do $rcfile;
   271	    if ($@) {
   272	        chop $@;
   273	        push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'";
   274	    }else {
   275	        push @LoadedRcfiles, $rcfile;
   276	    }
   277	
   278	    # Dirty workaround.
   279	    $LIBDIR = $conf::LIBDIR
   280		if (defined $conf::LIBDIR && -d $conf::LIBDIR);
   281	    $FILTERDIR = $conf::FILTERDIR
   282		if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
   283	    $TEMPLATEDIR = $conf::TEMPLATEDIR
   284		if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
   285	}
   286	
   287	sub re_exec($$$$$$$$) {
   288	    my ($flist_ptr, $docid_count, $docid_base, $start_time, 
   289		$total_files_size, $total_files_num, $file_count, $key_count) = @_;
   290	
   291	    # store variables
   292	    {
   293		my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");
   294	
   295		print $fh_checkpoint <<EOM;
   296		\$DeletedFilesCount = $DeletedFilesCount;
   297		\$UpdatedFilesCount = $UpdatedFilesCount;
   298		\$APPENDMODE = $APPENDMODE;
   299		\$flist_ptr = $flist_ptr;
   300		\$docid_count = $docid_count;
   301		\$docid_base = $docid_base;
   302		\$start_time = $start_time;
   303		\$total_files_size = $total_files_size;
   304		\$total_files_num = $total_files_num;
   305		\$key_count = $key_count;
   306		\$file_count = $file_count;
   307		\$\$ = $$;
   308	EOM
   309	        util::fclose($fh_checkpoint);
   310	    }
   311	
   312	    @ARGV = ("-S", @ARGV) ;
   313	    print _("Checkpoint reached: re-exec mknmz...\n");
   314	    util::dprint(join ' ', ("::::", @ARGV, "\n"));
   315	    exec ($0, @ARGV) ;
   316	}
   317	
   318	sub put_registry ($) {
   319	    my ($filename) = @_;
   320	    my $fh_registry = util::efopen(">>$var::NMZ{'_r'}");
   321	    print $fh_registry $filename, "\n";
   322	    util::fclose($fh_registry);
   323	}
   324	
   325	
   326	# Initialization
   327	#   $CodingSystem: Character Coding System 'euc' or 'sjis'
   328	sub init () {
   329	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   330		$CodingSystem = "sjis";
   331		if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
   332		    $CONFDIR = $1 . $CONFDIR ;
   333		}
   334		if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
   335		    $LIBDIR = $1 . $LIBDIR ;
   336		}
   337		if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
   338		    $FILTERDIR = $1 . $FILTERDIR ;
   339		}
   340		if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
   341		    $TEMPLATEDIR = $1 . $TEMPLATEDIR ;
   342		}
   343	    } else {
   344		$CodingSystem = "euc";
   345	    }
   346	
   347	    $SIG{'INT'}  = sub {
   348		util::cdie("SIGINT caught! Aborted.\n");
   349	    };
   350	
   351	    $SIG{'TERM'}  = sub {
   352		print STDERR "SIGTERM caught!\n";
   353		$ReceiveTERM = 1;
   354	    };
   355	}
   356	
   357	sub preload_modules () { 
   358	    unshift @INC, $LIBDIR;
   359	    # workaround for test suites.
   360	    unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};
   361	
   362	    require "var.pl" || die "unable to require \"var.pl\"\n";
   363	    require "conf.pl" || die "unable to require \"conf.pl\"\n";
   364	    require "util.pl" || die "unable to require \"util.pl\"\n";
   365	    require "gettext.pl" || die "unable to require \"gettext.pl\"\n";
   366	    require "ext.pl" || die "unable to require \"ext.pl\"\n";
   367	}
   368	
   369	sub postload_modules () {
   370	    require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n";
   371	}
   372	
   373	sub load_modules () {
   374	    require "usage.pl" || die "unable to require \"usage.pl\"\n";
   375	    require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n";
   376	    require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
   377	    require "seed.pl" || die "unable to require \"seed.pl\"\n";
   378	    require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
   379	
   380	    @Seed = seed::init();
   381	}
   382	
   383	sub load_filtermodules () {
   384	    unshift @INC, $FILTERDIR;
   385	
   386	    #
   387	    # Windows modules must be loaded first.
   388	    # Because OLE filters have low precedence over normal ones.
   389	    #
   390	    load_win32modules() if $English::OSNAME eq "MSWin32";
   391	
   392	    # Check filter modules
   393	    my @filters = ();
   394	    @filters = glob "$FILTERDIR/*.pl";
   395	
   396	    load_filters(@filters);
   397	}
   398	
   399	sub load_win32modules () {
   400	    # Check filter modules
   401	    my @filters = ();
   402	    if (-f "../filter/win32/olemsword.pl") { # to ease developing
   403		@filters = glob "../filter/win32/*.pl";
   404		unshift @INC, "../filter/win32";
   405	    } else {
   406		@filters = glob "$FILTERDIR/win32/*.pl";
   407		unshift @INC, "$FILTERDIR/win32";
   408	    }
   409	
   410	    load_filters(@filters);
   411	}
   412	
   413	sub load_filters (@) {
   414	    my @filters = @_;
   415	   
   416	    for my $filter (@filters) {
   417		$filter =~ m!([-\w]+)\.pl$!;
   418		my $module = $1;
   419		require "$module.pl" || die "unable to require \"$module.pl\"\n";;
   420		my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);
   421	
   422		eval "\@mtypes =    ${module}::mediatype();";
   423		die $@ if $@;  # eval error
   424		eval "\$status =    ${module}::status();";
   425		die $@ if $@;
   426		eval "\$recursive = ${module}::recursive();";
   427		die $@ if $@;
   428		eval "\$pre_codeconv  = ${module}::pre_codeconv();";
   429		die $@ if $@;
   430		eval "\$post_codeconv  = ${module}::post_codeconv();";
   431		die $@ if $@;
   432		eval "${module}::add_magic(\$Magic);";
   433		die $@ if $@;
   434	
   435		for my $mt (@mtypes) {
   436	        next if (defined $var::Supported{$mt} && 
   437	                 $var::Supported{$mt} eq 'yes' && $status eq 'no');
   438		    $var::Supported{$mt} = $status;
   439		    $var::REQUIRE_ACTIONS{$mt} = $module;
   440		    $var::RECURSIVE_ACTIONS{$mt} = $recursive;
   441		    $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv;
   442		    $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv;
   443		}
   444	    }
   445	}
   446	
   447	# Core routine.
   448	#
   449	# FIXME: Too many parameters. They must be cleared.
   450	#
   451	sub namazu_core ($$$$$$$$$$) {
   452	    my ($cfile, $docid_count, $docid_base, 
   453		$file_count, $field_indices, $fh_errorsfile, $total_files_num,
   454		$uri, $part_id, $part_num) = @_;
   455	
   456	    my $headings = "";
   457	    my $content = "";
   458	    my $weighted_str = "";
   459	    my %fields;
   460	    my $msg_prefix;
   461	
   462	    if ($part_id) {
   463		$msg_prefix = "    $part_id/$part_num - ";
   464	    } else {
   465		$msg_prefix = $file_count + 1 . "/$total_files_num - ";
   466	    }
   467	
   468	    unless ($uri) {
   469		$uri = generate_uri($cfile);  # Make a URI from a file name.
   470	    }
   471	    my ($cfile_size, $text_size, $kanji, $mtype) = 
   472		load_document(\$cfile, \$content, \$weighted_str,
   473			      \$headings, \%fields);
   474	
   475	    {
   476	        $fields{'mtime'} = (stat($cfile))[9];
   477	        my $utc = $fields{'mtime'};
   478	        $utc = time::rfc822time_to_mtime($fields{'date'})
   479	                if (defined $fields{'date'});
   480	        if ($utc == -1) {
   481	            my $date = $fields{'date'};
   482	            print "$cfile Illegal date format. : $date\n";
   483	            print $fh_errorsfile "$cfile Illegal date format. : $date\n";
   484	            $utc = $fields{'mtime'};
   485	            delete $fields{'date'};
   486	        }
   487	        $fields{'utc'} = $utc;
   488	    }
   489	
   490	    util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");
   491	
   492	    # Check if the file is acceptable.
   493	    my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri);
   494	    if (defined $err) {
   495		if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   496		    my $uri2 = codeconv::eucjp_to_shiftjis($uri);
   497		    print $msg_prefix . "$uri2 $err\n";
   498		} else {
   499		    print $msg_prefix . "$uri $err\n";
   500		}
   501		print $fh_errorsfile "$cfile $err\n"; 
   502		return 0;  # return 0 if error
   503	    }
   504	
   505	    # Print processing file name as URI.
   506	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   507		my $uri2 = codeconv::eucjp_to_shiftjis($uri);
   508		print $msg_prefix . "$uri2 [$mtype]\n";
   509	    } else {
   510		print $msg_prefix . "$uri [$mtype]\n";
   511	    }
   512	
   513	    # Add filename.
   514	    my $filename = defined $cfile ? $cfile : '';
   515	    codeconv::toeuc(\$filename);
   516	    $filename = basename($filename);
   517	    $fields{'filename'} = $filename;
   518	
   519	    complete_field_info(\%fields, $cfile, $uri, 
   520				\$headings, \$content, \$weighted_str);
   521	    put_field_index(\%fields, $field_indices);
   522	
   523	    put_dateindex($cfile);
   524	    $content .= "\n\n$filename\n\n";    # add filename
   525	    $content .= $weighted_str;          # add weights
   526	    count_words($docid_count, $docid_base, \$content, $kanji);
   527	    make_phrase_hash($docid_count, $docid_base, \$content);
   528	
   529	    # assertion
   530	    util::assert($cfile_size != 0, 
   531			 "cfile_size == 0 at the end of namazu_core.");
   532	
   533	    return $cfile_size;
   534	}
   535	
   536	#
   537	# Make the URI from the given file name.
   538	#
   539	sub generate_uri (@) {
   540	    my ($file, $fragment) = @_;
   541	    return "" unless defined $file;
   542	
   543	    # omit a file name if omittable
   544	    $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o; 
   545	
   546	    if (defined $ReplaceCode) {
   547		# transforming URI by evaling
   548		$_ = $file;
   549		eval $ReplaceCode;
   550		$file = $_;
   551	    }
   552	
   553	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   554		$file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C|
   555	    }
   556	
   557	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   558		$file = codeconv::shiftjis_to_eucjp($file);
   559	    }
   560	    if (defined $fragment) {
   561	        codeconv::toeuc(\$fragment);
   562	    }
   563	
   564	    unless ($var::Opt{'noencodeuri'}) {
   565		for my $tmp ($file, $fragment) {
   566		    next unless defined $tmp;
   567	
   568		    # Escape unsafe characters (not strict)
   569		    $tmp =~ s/\%/%25/g;  # Convert original '%' into '%25' v1.1.1.2
   570		    $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/
   571			sprintf("%%%02X",ord($1))/ge;
   572		}
   573	    }
   574	
   575	
   576	    my $uri = $file;
   577	    $uri .= "#" . $fragment if defined $fragment;
   578	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
   579	        # restore '|' for drive letter rule of Win32, OS/2
   580	        $uri =~ s!^/([A-Z])%7C!/$1|!i;
   581	    }
   582	    return $uri;
   583	}
   584	
   585	
   586	sub get_field_index_base (\%) {
   587	    my ($field_indices) = @_;
   588	
   589	    my @keys = split('\|', $conf::SEARCH_FIELD);
   590	    if ($var::Opt{'meta'}) {
   591		push @keys, (split '\|', $conf::META_TAGS);
   592	    }
   593	    for my $key (@keys) {
   594		$key = lc($key);
   595		my $fname    = "$var::NMZ{'field'}.$key";
   596		my $tmp_fname = util::tmpnam("NMZ.field.$key");
   597		my $size = 0;
   598		$size = -s $fname if -f $fname;
   599		$size += -s $tmp_fname if -f $tmp_fname;
   600		$field_indices->{$key} = $size;
   601	    }
   602	}
   603	
   604	sub complete_field_info (\%$$\$\$\$) {
   605	    my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;
   606	
   607	    for my $field (keys %{$fields}) {
   608	        if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) {
   609	            delete $fields->{$field};
   610	        }
   611	    }
   612	
   613	    unless (defined($fields->{'title'})) {
   614		$fields->{'title'} = gfilter::filename_to_title($cfile, $wsref);
   615	    }
   616	    unless (defined($fields->{'date'})) {
   617		my $mtime = $fields->{'mtime'};
   618		my $date = util::rfc822time($mtime);
   619		$fields->{'date'} = $date;
   620	    }
   621	    unless (defined($fields->{'uri'})) {
   622		$fields->{'uri'} = $uri;
   623	    }
   624	    unless (defined($fields->{'size'})) {
   625		$fields->{'size'} = -s $cfile;
   626	    }
   627	    unless (defined($fields->{'summary'})) {
   628		$fields->{'summary'} = make_summary($contref, $headings, $cfile);
   629	    }
   630	    unless (defined($fields->{'from'}) || defined($fields->{'author'})) {
   631		$fields->{'from'} = getmsg("unknown");
   632	    }
   633	}
   634	
   635	#
   636	# Currently, messages for NMZ.* files should be encoded in
   637	# EUC-JP currently. We cannot use gettext.pl for the messsage
   638	# because gettext.pl may use Shift_JIS encoded messages.
   639	# So, we should use the function instead of gettext().
   640	#
   641	# FIXME: Ad hoc impl.  getmsg() is effective only for "unknown".
   642	#
   643	sub getmsg($) {
   644	    my ($msg) = @_;
   645	
   646	    if (util::islang_msg("ja")) {
   647		if ($msg eq "unknown") {
   648		    return "不明";
   649		}
   650	    }
   651	    return $msg;
   652	}
   653	
   654	sub make_summary ($$$) {
   655	    my ($contref, $headings, $cfile) = @_;
   656	
   657	    # pick up $conf::MAX_FIELD_LENGTH bytes string
   658	    my $tmp = "";
   659	    if ($$headings ne "") {
   660		$$headings =~ s/^\s+//;
   661		$$headings =~ s/\s+/ /g;
   662		$tmp = $$headings;
   663	    } else {
   664		$tmp = "";
   665	    }
   666	
   667	    my $offset = 0;
   668	    my $tmplen = 0;
   669	    while (($tmplen = $conf::MAX_FIELD_LENGTH + 1 - length($tmp)) > 0
   670	           && $offset < length($$contref))
   671	    {
   672	        $tmp .= substr $$contref, $offset, $tmplen;
   673	        $offset += $tmplen;
   674	        $tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge;
   675	        $tmp =~ s/([-=*\#])\1{2,}/$1$1/g;
   676	    }
   677	
   678	    # -1 means "LF"
   679	    my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH - 1;
   680	    # Remove a garbage Kanji 1st char at the end.
   681	    $summary = codeconv::chomp_eucjp($summary);
   682	
   683	    $summary =~ s/^\s+//;
   684	    $summary =~ s/\s+/ /g;   # normalize white spaces
   685	
   686	    return $summary;
   687	}
   688	
   689	
   690	# output the field infomation into NMZ.fields.* files
   691	sub put_field_index (\%$) {
   692	    my ($fields, $field_indices) = @_;
   693	
   694	    my $aliases_regex = 
   695		join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES);
   696	
   697	    for my $field (keys %{$fields}) {
   698	        util::dprint("Field: $field: $fields->{$field}\n");
   699		if ($field =~ /^($aliases_regex)$/o) {
   700		    unless (defined($fields->{$conf::FIELD_ALIASES{$field}})) {
   701			$fields->{$conf::FIELD_ALIASES{$field}} = $fields->{$field};
   702		    }
   703		    undef $fields->{$field};
   704		}
   705	    }
   706	
   707	    my @keys = split '\|', $conf::SEARCH_FIELD;
   708	    if ($var::Opt{'meta'}) {
   709	        my @meta = split '\|', $conf::META_TAGS;
   710	        while (my $meta = shift(@meta)) {
   711	            $meta = $conf::FIELD_ALIASES{$meta}
   712	                if (defined $conf::FIELD_ALIASES{$meta});
   713	
   714	            push @keys, $meta;
   715	        }
   716	
   717	        # uniq @keys
   718	        my %mark = ();
   719	        @keys = grep {$mark{$_}++; $mark{$_} == 1} @keys;
   720	    }
   721	    for my $key (@keys) {
   722		my $lkey = lc($key);
   723		my $fname    = util::tmpnam("NMZ.field.$lkey");
   724		my $fh_field = util::efopen(">>$fname");
   725		my $output = "";
   726		if (defined($fields->{$key})) {
   727		    if ($key ne 'uri') { # workaround for namazu-bugs-ja#30
   728			$fields->{$key} =~ s/\s+/ /g;
   729			$fields->{$key} =~ s/\s+$//;
   730			$fields->{$key} =~ s/^\s+//;
   731		    }
   732		    $output = $fields->{$key};
   733	
   734		    # -1 means "LF"
   735		    $output = substr $output, 0, $conf::MAX_FIELD_LENGTH - 1;
   736		    # Remove a garbage Kanji 1st char at the end.
   737		    $output = codeconv::chomp_eucjp($output);
   738	
   739	            $output =~ s/\n.*$//s;
   740	            $output .= "\n";
   741		} else {
   742		    $output = "\n";
   743		}
   744		print $fh_field $output;
   745	        util::fclose($fh_field);
   746	
   747		# put index of field index
   748		{
   749		    my $fname        = util::tmpnam("NMZ.field.$lkey.i");
   750		    my $fh_field_idx = util::efopen(">>$fname");
   751		    print $fh_field_idx pack("N", $field_indices->{$lkey});
   752		    $field_indices->{$lkey} += length $output;
   753	            util::fclose($fh_field_idx);
   754		}
   755	    }
   756	
   757	}
   758	
   759	# put the date infomation into NMZ.t file
   760	sub put_dateindex ($) {
   761	    my ($cfile) = @_;
   762	    my $mtime = (stat($cfile))[9];
   763	
   764	    my $fh_dataindex = util::efopen(">>$var::NMZ{'_t'}");
   765	    print $fh_dataindex pack("N", $mtime);
   766	    util::fclose($fh_dataindex);
   767	}
   768	
   769	
   770	# load a document file
   771	sub load_document ($$$$\%) {
   772	    my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
   773	      = @_;
   774	    my $cfile = $$orig_cfile;
   775	
   776	    return (0, 0, 0, 0) unless (-f $cfile && util::canopen($cfile));
   777	
   778	    # for handling a filename which contains Shift_JIS code for Windows.
   779	    # for handling a filename which contains including space.
   780	    my $shelter_cfile = "";
   781	    if (($cfile =~ /\s/) ||
   782	        ($English::OSNAME eq "MSWin32"
   783	        && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) )
   784	    {
   785		$shelter_cfile = $cfile;
   786		$cfile = util::tmpnam("NMZ.win32");
   787		unlink $cfile if (-e $cfile);
   788		copy($shelter_cfile, $cfile);
   789	    }
   790	
   791	    my $file_size;
   792	    $file_size = util::filesize($cfile); # not only file in feature.
   793	    if ($file_size > $conf::FILE_SIZE_MAX) {
   794		return ($file_size, $file_size, 0, 'x-system/x-error; x-error=file_size_max');
   795	    }
   796	
   797	    $$contref = util::readfile($cfile);
   798	#    $file_size = length($$contref);
   799	
   800	    my ($kanji, $mtype) = apply_filter($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, undef);
   801	
   802	    if ($English::OSNAME eq "MSWin32" && $shelter_cfile ne "") {
   803		unlink $cfile;
   804		$cfile = $shelter_cfile;
   805	    }
   806	
   807	    # Measure the text size at this time.
   808	    my $text_size = length($$contref) + length($$weighted_str); 
   809	
   810	    return ($file_size, $text_size, $kanji, $mtype);
   811	}
   812	
   813	sub apply_filter($$$$$$$) {
   814	    my ($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, $mmtype)
   815	      = @_;
   816	    my $cfile = $shelter_cfile ne "" ? $shelter_cfile : $$orig_cfile;
   817	
   818	    # Filtering process.
   819	    my $mtype;
   820	    my $called_dt = 0;
   821	    while (1) {
   822		if (defined $MediaType) {
   823		    $mtype = $MediaType;
   824		} elsif (defined $mmtype) {
   825		    $mtype = $mmtype;
   826		} else {
   827		    my $mtype_n = $Magic->checktype_byfilename($cfile);
   828		    my $mtype_c = $Magic->checktype_data($$contref);
   829		    my $mtype_m;
   830		    $mtype_m = $Magic->checktype_magic($$contref) 
   831		      if ((! defined $mtype_c) ||
   832			  $mtype_c =~ 
   833			  /^(text\/html|text\/plain|application\/octet-stream)$/);
   834		    $mtype_c = $mtype_m 
   835			if (defined $mtype_m && 
   836			    $mtype_m !~ 
   837			    /^(text\/html|text\/plain|application\/octet-stream)$/);
   838		    $mtype_c = 'text/plain' unless defined $mtype_c;
   839		    if ($called_dt) {
   840			$mtype = $mtype_c;
   841		    } else {
   842			$mtype = decide_type($mtype_n, $mtype_c);
   843			$called_dt = 1;
   844		    }
   845		}
   846		util::dprint(_("Detected type: ")."$mtype\n");
   847	
   848		# Pre code conversion.
   849		if ($var::REQUIRE_PRE_CODECONV{$mtype}) {
   850		    util::dprint("pre_codeconv\n");
   851		    codeconv_document($contref);
   852		}
   853	
   854		if (! $var::Supported{$mtype} || 
   855		    $var::Supported{$mtype} ne 'yes') 
   856		{
   857		    util::vprint(_("Unsupported media type ")."$mtype\n");
   858		    return (0, "$mtype; x-system=unsupported");
   859		}
   860	
   861		if ($var::REQUIRE_ACTIONS{$mtype}) {
   862		    util::vprint(_("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n");
   863		    require $var::REQUIRE_ACTIONS{$mtype}.'.pl'
   864		        || die _("unable to require ") . 
   865			    "\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n";
   866		    my $err = undef;
   867		    {
   868			local $SIG{'PIPE'} = \&trapintr;
   869			eval '$err = ' . $var::REQUIRE_ACTIONS{$mtype} .
   870			  '::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);';
   871		    }
   872		    if ($err) {
   873			if ($err =~ m/; x-system=unsupported$/) {
   874			    return (0, $err);
   875			}
   876			return (0, "$mtype; x-error=$err");
   877		    }
   878	
   879		    if ($@) {
   880			util::vprint(_("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n");
   881			return (0, "$mtype; x-error=$@");
   882		    }
   883	
   884		    # Post code conversion.
   885		    if ($var::REQUIRE_POST_CODECONV{$mtype}) {
   886		        util::dprint("post_codeconv\n");
   887			codeconv_document($contref);
   888		    }
   889	
   890		    next if ($var::RECURSIVE_ACTIONS{$mtype});
   891		}
   892		last;
   893	    }
   894	
   895	    my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/;  # Kanji contained?
   896	    $kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/;
   897	
   898	    return ($kanji, $mtype);
   899	}
   900	
   901	sub codeconv_document ($) {
   902	    my ($textref) = @_;
   903	    codeconv::toeuc($textref);
   904	    $$textref =~ s/\r\n/\n/g;
   905	    $$textref =~ s/\r/\n/g;
   906	    $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /; # Remove control char.
   907	}
   908	
   909	sub prep () {
   910	    my $docid_base = 0;
   911	    my $output_dir = shift @_ ;
   912	    my @targets = @_ ;
   913	    my @flist = ();
   914	
   915	    $var::OUTPUT_DIR = $output_dir;
   916	
   917	    require_modules();
   918	    change_filenames();
   919	    check_present_index();
   920	
   921	    # if Checkpoint mode, return
   922	    return (0, 0) if $CheckPoint{'continue'};
   923	
   924	    check_lockfile($var::NMZ{'lock2'});
   925	    print _("Looking for indexing files...\n");
   926	    @flist = find_target(@targets);
   927	    ($docid_base, @flist) = append_index(@flist) 
   928		if -f $var::NMZ{'r'};
   929	    unless (@flist) { # if @flist is empty
   930		print _("No files to index.\n");
   931		exit 0;
   932	    }
   933	    set_lockfile($var::NMZ{'lock2'});
   934	    save_flist(@flist);
   935	    my $total_files_num = @flist;
   936	
   937	    return ($docid_base, $total_files_num);
   938	}
   939	
   940	sub save_flist(@) {
   941	    my @flist = @_;
   942	    return if (@flist == 0);
   943	
   944	    my $fh_flist = util::efopen(">$var::NMZ{'_flist'}");
   945	    print $fh_flist join("\n", @flist), "\n";
   946	    util::fclose($fh_flist);
   947	}
   948	
   949	sub require_modules() {
   950	    if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
   951		require NKF || die "unable to require \"NKF\"\n";
   952	        util::dprint(_("code conversion: using NKF module\n"));
   953		$var::USE_NKF_MODULE = 1;
   954	    }
   955	    if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
   956		require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n";
   957	        util::dprint(_("wakati: using Text::Kakasi module\n"));
   958		my $res = Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-oeuc', '-w');
   959	    }
   960	    if (util::islang("ja") && $conf::WAKATI =~ /^module_chasen/) {
   961		require Text::ChaSen || die "unable to require \"Text::ChaSen\"\n";
   962	        util::dprint(_("wakati: using Text::ChaSen module\n"));
   963		my @arg = ('-i', 'e', '-j', '-F', '%m ');
   964		@arg    = ('-i', 'e', '-j', '-F', '%m %H\\n') if $var::Opt{'noun'};
   965		my $res = Text::ChaSen::getopt_argv('chasen-perl', @arg);
   966	    }
   967	    if (util::islang("ja") && $conf::WAKATI =~ /^module_mecab/) {
   968	        require MeCab || die "unable to require \"MeCab\"\n";
   969	        util::dprint(_("wakati: using MeCab module\n"));
   970	    }
   971	}
   972	
   973	sub check_lockfile ($) {
   974	    # warn if check file exists in case other process is running or abnormal
   975	    # stop execution (later is not the major purpose, though).
   976	    # This is mainly for early detection before longish find_target.
   977	    my ($file) = @_;
   978	
   979	    if (-f $file) {
   980		print "$file "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
   981		exit 1;
   982	    }
   983	}
   984	
   985	sub set_lockfile ($) {
   986	    my ($file) = @_;
   987	
   988	    # make a lock file
   989	    if (-f $file) {
   990		print "$file found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n";
   991		exit 1;
   992	    } else {
   993		my $fh_lockfile = util::efopen(">$file");
   994		print $fh_lockfile "$$"; # save pid
   995	        util::fclose($fh_lockfile);
   996	    }
   997	}
   998	
   999	sub remove_lockfile ($) {
  1000	    my ($file) = @_;
  1001	
  1002	    # remove lock file
  1003	    unlink $file if -f $file;
  1004	}
  1005	
  1006	# check present index whether it is old type of not
  1007	sub check_present_index () {
  1008	    if (-f $var::NMZ{'i'} && ! -f "$var::NMZ{'wi'}") 
  1009	    {
  1010		util::cdie(_("Present index is old type. it's unsupported.\n"));
  1011	    }
  1012	}
  1013	
  1014	# remain
  1015	sub do_remain_job ($$$$) {
  1016	    my ($total_files_size, $docid_count, $key_count, $start_time) = @_;
  1017	
  1018	    if ($docid_count == 0) {
  1019		# No files are indexed
  1020		if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
  1021		    update_dateindex();
  1022		    update_registry($docid_count);
  1023		}
  1024	    } else {
  1025		set_lockfile($var::NMZ{'lock'});
  1026		write_version();
  1027		write_body_msg();
  1028		write_tips_msg();
  1029		write_result_file();
  1030		update_field_index();
  1031		update_dateindex();
  1032		update_registry($docid_count);
  1033		write_nmz_files();
  1034		make_slog_file();
  1035		remove_lockfile($var::NMZ{'lock'});
  1036	    }
  1037	    make_headfoot_pages($docid_count, $key_count);
  1038	    put_log($total_files_size, $start_time, $docid_count, $key_count);
  1039	    util::remove_tmpfiles();
  1040	    unlink $var::NMZ{'_flist'};
  1041	}
  1042	
  1043	sub make_headfoot_pages($$) {
  1044	    my ($docid_count, $key_count) = @_;
  1045	
  1046	    for my $file (glob "$TEMPLATEDIR/NMZ.head*") {
  1047	        if ($file =~ m!^.*/NMZ\.head(\.[-\w\.]+)?$!){
  1048	            my $suffix = $1 ? $1 : '';
  1049	            make_headfoot("$var::NMZ{'head'}${suffix}", $docid_count, $key_count);
  1050	        }
  1051	    }
  1052	    for my $file (glob "$TEMPLATEDIR/NMZ.foot*") {
  1053	        if ($file =~ m!^.*/NMZ\.foot(\.[-\w\.]+)?$!){
  1054	            my $suffix = $1 ? $1 : '';
  1055	            make_headfoot("$var::NMZ{'foot'}${suffix}", $docid_count, $key_count);
  1056	       }
  1057	    }
  1058	}
  1059	
  1060	# Parse command line options.
  1061	sub parse_options
  1062	{
  1063	    if (@ARGV == 0) {
  1064		show_mini_usage();
  1065		exit 1;
  1066	    }
  1067	
  1068	    my @targets = ();
  1069	    my $targets_loaded = 0;
  1070	    my @argv = @ARGV;
  1071	    my $cwd = cwd();
  1072	
  1073	    my $opt_dummy = 0;
  1074	    my $opt_version = 0;
  1075	    my $opt_help = 0;
  1076	    my $opt_all = 0;
  1077	    my $opt_chasen = 0;
  1078	    my $opt_chasen_noun = 0;
  1079	    my $opt_kakasi = 0;
  1080	    my $opt_mecab = 0;
  1081	    my $opt_checkpoint_sub = 0;
  1082	    my $opt_show_config = 0;
  1083	    my $opt_mailnews = 0;
  1084	    my $opt_mhonarc = 0;
  1085	    my $opt_norc = 0;
  1086	
  1087	    my $opt_quiet = undef;
  1088	    my $opt_config = undef;
  1089	    my $output_dir = undef;
  1090	    my $update_index = undef;
  1091	    my $include_file = undef;
  1092	    my $target_list = undef;
  1093	    my $index_lang = undef;
  1094	
  1095	    my %opt_conf;
  1096	
  1097	#    Getopt::Long::Configure('bundling');
  1098	    Getopt::Long::config('bundling');
  1099	    GetOptions(
  1100	       	       '0|help'              => \$opt_help,
  1101		       '1|exclude=s'         => \$opt_conf{'EXCLUDE_PATH'},
  1102		       '2|deny=s'            => \$opt_conf{'DENY_FILE'},
  1103		       '3|allow=s'           => \$opt_conf{'ALLOW_FILE'},
  1104		       '4|update=s'          => \$update_index,
  1105		       '5|mhonarc'           => \$opt_mhonarc,
  1106		       '6|mtime=s'           => \$var::Opt{'mtime'},
  1107		       '7|html-split'        => \$var::Opt{'htmlsplit'},
  1108		       'C|show-config'       => \$opt_show_config,
  1109		       'E|no-edge-symbol'    => \$var::Opt{'noedgesymbol'},
  1110		       'F|target-list=s'     => \$target_list,
  1111		       'G|no-okurigana'      => \$var::Opt{'okurigana'},
  1112		       'H|no-hiragana'       => \$var::Opt{'hiragana'},
  1113		       'I|include=s'         => \$include_file,
  1114		       'K|no-symbol'         => \$var::Opt{'nosymbol'},
  1115		       'L|indexing-lang=s'     => \$index_lang,
  1116		       'M|meta'              => \$var::Opt{'meta'},
  1117		       'O|output-dir=s'      => \$output_dir,
  1118		       'S|checkpoint-sub'    => \$opt_checkpoint_sub,
  1119		       'T|template-dir=s'    => \$TEMPLATEDIR,
  1120		       'U|no-encode-uri'     => \$var::Opt{'noencodeuri'} ,
  1121		       'V|verbose'           => \$var::Opt{'verbose'},
  1122		       'Y|no-delete'         => \$var::Opt{'nodelete'},
  1123		       'Z|no-update'         => \$var::Opt{'noupdate'},
  1124		       'a|all'               => \$opt_all,
  1125	               'b|use-mecab'         => \$opt_mecab,
  1126		       'c|use-chasen'        => \$opt_chasen,
  1127		       'd|debug'             => \$var::Opt{'debug'},
  1128		       'e|robots'            => \$var::Opt{'robotexclude'},
  1129		       'f|config=s'          => \$opt_config,
  1130		       'h|mailnews'          => \$opt_mailnews,
  1131		       'k|use-kakasi'        => \$opt_kakasi,
  1132		       'm|use-chasen-noun'   => \$opt_chasen_noun,
  1133		       'q|quiet'             => \$opt_quiet,
  1134		       'r|replace=s'         => \$ReplaceCode,
  1135		       's|checkpoint'        => \$CheckPoint{'on'},
  1136		       't|media-type=s'      => \$MediaType,
  1137		       'u|uuencode'          => \$opt_dummy, # for backward compat.
  1138		       'v|version'           => \$opt_version,
  1139		       'x|no-heading-summary'=> \$var::Opt{'noheadabst'},
  1140		       'z|check-filesize'    => \$var::Opt{'checkfilesize'},
  1141		       'decode-base64'       => \$var::Opt{'decodebase64'},
  1142		       'norc'                => \$opt_norc,
  1143		       );
  1144	
  1145	    if ($opt_quiet) {
  1146		# Make STDOUT quiet by redirecting STDOUT to null device.
  1147		my $devnull = util::devnull();
  1148		open(STDOUT, ">$devnull") || die "$devnull: $!";
  1149	    }
  1150	
  1151	    if (defined $update_index) {
  1152		unless (-d $update_index) {
  1153		    print _("No such index: "), "$update_index\n";
  1154		    exit 1;
  1155		}
  1156	
  1157		my $orig_status = $var::NMZ{'status'};
  1158		$var::NMZ{'status'} = "$update_index/$var::NMZ{'status'}";
  1159	
  1160		my $argv = get_status("argv");
  1161		if (!defined $argv) {
  1162		    print _("No such index: "), "$update_index\n";
  1163		    exit 1;
  1164		}
  1165		@ARGV = split /\t/, $argv;
  1166		util::dprint(_("Inherited argv: ")."@ARGV\n");
  1167	
  1168		my $cwd  = get_status("cwd");
  1169		if (!defined $cwd) {
  1170		    print _("No such index: "), "$update_index\n";
  1171		    exit 1;
  1172		}
  1173		chdir $cwd;
  1174		util::dprint(_("Inherited cwd: ")."$cwd\n");
  1175	
  1176		($output_dir, @targets) = parse_options();
  1177		$output_dir = $update_index;
  1178		$var::NMZ{'status'} = $orig_status;  # See also change_filenames()
  1179		return ($output_dir, @targets);
  1180	    }
  1181	
  1182	    if (!$opt_norc && !(defined $ENV{'MKNMZNORC'})){
  1183	        load_rcfiles();
  1184	    }
  1185	    if ($opt_config) {
  1186	        if (-f $opt_config) {
  1187	            util::vprint(_("Reading rcfile: "));
  1188	            load_rcfile($ConfigFile = $opt_config);
  1189	            util::vprint(" $opt_config\n");
  1190	        }
  1191	    }
  1192	
  1193	    if ($index_lang) {
  1194		$util::LANG = $index_lang;
  1195	      util::dprint("Override indexing language: $util::LANG\n");
  1196	    }
  1197	
  1198	    if ($opt_help) {
  1199		show_usage();
  1200		exit 1;
  1201	    }
  1202	
  1203	    if ($opt_version) {
  1204		show_version();
  1205		exit 1;
  1206	    }
  1207	
  1208	    load_filtermodules(); # to make effect $opt_config, $index_lang.
  1209	    postload_modules();
  1210	
  1211	    foreach my $key (keys %opt_conf){
  1212		if (defined ($opt_conf{$key})) { 
  1213		     ${*{$conf::{$key}}{SCALAR}} = $opt_conf{$key};
  1214		}
  1215	    }
  1216	
  1217	    if ($opt_mailnews) {
  1218		$MediaType = 'message/rfc822';
  1219	    }
  1220	    if ($opt_mhonarc) {
  1221		$MediaType = 'text/html; x-type=mhonarc';
  1222	    }
  1223	    if ($opt_all) {
  1224		$conf::ALLOW_FILE = ".*";
  1225	    }
  1226	    if ($opt_chasen) {
  1227		$conf::WAKATI = $conf::CHASEN;
  1228		$var::Opt{'noun'} = 0;
  1229	    }
  1230	    if ($opt_chasen_noun) {
  1231		$conf::WAKATI = $conf::CHASEN_NOUN;
  1232		$var::Opt{'noun'} = 1;
  1233	    }
  1234	    if ($opt_kakasi) {
  1235		$conf::WAKATI = $conf::KAKASI;
  1236		$var::Opt{'noun'} = 0;
  1237	    }
  1238	    if ($opt_mecab) {
  1239	        $conf::WAKATI = $conf::MECAB;
  1240	        $var::Opt{'noun'} = 0;
  1241	    }
  1242	    if ($include_file) {
  1243		do $include_file;
  1244	        util::dprint("Included: $include_file\n");
  1245	    }
  1246	    if ($target_list) {
  1247		if ($CheckPoint{'continue'}) {
  1248		    @targets = ("dummy");
  1249		} else {
  1250		    @targets = load_target_list($target_list);
  1251		    util::dprint(_("Loaded: ")."$target_list\n");
  1252		}
  1253		$targets_loaded = 1;
  1254	    }
  1255	    if ($opt_checkpoint_sub) {
  1256		$CheckPoint{'on'}           = 1;
  1257		$CheckPoint{'continue'}     = 1;
  1258		@argv = grep {! /^-S$/} @argv;  # remove -S
  1259	    }
  1260	
  1261	    if (defined $ReplaceCode) {
  1262		my $orig = "/foo/bar/baz/quux.html";
  1263		$_ = $orig;
  1264		eval $ReplaceCode;
  1265		if ($@) {  # eval error
  1266		    util::cdie(_("Invalid replace: ")."$ReplaceCode\n");
  1267		}
  1268		util::dprint(_("Replace: ")."$orig -> $_\n");
  1269	    }
  1270	
  1271	    if ($opt_show_config) {
  1272		show_config();
  1273		exit 1;
  1274	    }
  1275	
  1276	    if (@ARGV == 0 && $targets_loaded == 0) {
  1277		show_mini_usage();
  1278		exit 1;
  1279	    }
  1280	
  1281	    $output_dir = $cwd unless defined $output_dir;
  1282	    util::cdie("$output_dir: "._("invalid output directory\n"))
  1283		unless (-d $output_dir && -w $output_dir);
  1284	
  1285	    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
  1286		util::win32_yen_to_slash(\$output_dir);
  1287	    }
  1288	
  1289	    # take remaining @ARGV as targets
  1290	    if (@ARGV > 0 && $targets_loaded == 0) {
  1291		@targets = @ARGV ;
  1292	    }
  1293	    
  1294	    # revert @ARGV
  1295	    # unshift @ARGV, splice(@argv, 0, @argv - @ARGV);
  1296	    @ARGV = @argv;
  1297	
  1298	    return ($output_dir, @targets);
  1299	}
  1300	
  1301	sub show_config () {
  1302	    print _("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles;
  1303	    print _("System: ") . "$English::OSNAME\n" if $English::OSNAME;
  1304	    print _("Namazu: ") . "$var::VERSION\n" if $var::VERSION;
  1305	    print _("Perl: ") . sprintf("%f\n", $English::PERL_VERSION);
  1306	    print _("File-MMagic: ") . "$File::MMagic::VERSION\n" if $File::MMagic::VERSION;
  1307	    print _("NKF: ") . "$conf::NKF\n" if $conf::NKF;
  1308	    print _("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI;
  1309	    print _("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN;
  1310	    print _("MeCab: ") . "$conf::MECAB\n" if $conf::MECAB;
  1311	    print _("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI;
  1312	    print _("Lang_Msg: ") . "$util::LANG_MSG\n";
  1313	    print _("Lang: ") . "$util::LANG\n";
  1314	    print _("Coding System: ") . "$CodingSystem\n";
  1315	    print _("CONFDIR: ") . "$CONFDIR\n";
  1316	    print _("LIBDIR: ") . "$LIBDIR\n";
  1317	    print _("FILTERDIR: ") . "$FILTERDIR\n";
  1318	    print _("TEMPLATEDIR: ") . "$TEMPLATEDIR\n";
  1319	
  1320	    my @all_types =     keys %var::Supported;
  1321	    my @supported = sort grep { $var::Supported{$_} eq "yes" } @all_types;
  1322	
  1323	    my $num_supported = @supported;
  1324	    my $num_unsupported = @all_types - @supported;
  1325	    print _("Supported media types:   ") . "($num_supported)\n";
  1326	    print _("Unsupported media types: ") . "($num_unsupported) " . _("marked with minus (-) probably missing application in your \$path.\n");
  1327	    for my $mtype (sort keys %var::Supported) {
  1328		my $yn = $var::Supported{$mtype};
  1329		if ($yn eq 'yes') { $yn = ' ' } else {$yn = '-'};
  1330		print "$yn $mtype";
  1331		if ($var::REQUIRE_ACTIONS{$mtype}){
  1332		    print ": $var::REQUIRE_ACTIONS{$mtype}.pl";
  1333		}
  1334		print "\n";
  1335	    }
  1336	}
  1337	
  1338	sub load_target_list ($) {
  1339	    my ($file) = @_;
  1340	    my $fh_targets = util::efopen($file);
  1341	    my @targets = <$fh_targets>;
  1342	    util::fclose($fh_targets);
  1343	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
  1344		foreach my $tmp (@targets){
  1345		    $tmp =~ s/\r//g;
  1346		    util::win32_yen_to_slash(\$tmp);
  1347		}
  1348	    }
  1349	    chomp @targets; 
  1350	    return @targets;
  1351	}
  1352	
  1353	# convert a relative path into an absolute path
  1354	sub absolute_path($$) {
  1355	    my ($cwd, $path) = @_;
  1356	
  1357	    $path =~ s!^\.$!\./!;
  1358	    $path =~ s!^\.[/\\]!$cwd/!;
  1359	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
  1360		util::win32_yen_to_slash(\$path);
  1361		if ($path =~ m!^//!) {
  1362		} elsif ($path =~ m!^/[^/]!) {
  1363		    my $driveletter = $cwd;
  1364		    if ($driveletter =~ m!^([A-Z]:)!i){
  1365			$driveletter = $1;
  1366		    }
  1367		    $path = "$driveletter$path";
  1368		} elsif ($path !~ m!^[A-Z]:/!i) {
  1369	            $path = "$cwd/$path";
  1370	        }
  1371	    } else {
  1372	        $path =~ s!^([^/])!$cwd/$1!; 
  1373	    }
  1374	    return $path;
  1375	}
  1376	
  1377	sub find_target (@) {
  1378	    my @targets = @_;
  1379	
  1380	    my $cwd = cwd();
  1381	    @targets = map { absolute_path($cwd, $_) } @targets;
  1382	
  1383	    # Convert \ to / with consideration for Shift_JIS encoding.
  1384	    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
  1385	        foreach my $tmp (@targets){
  1386		    util::win32_yen_to_slash(\$tmp);
  1387		}
  1388	    }
  1389	
  1390	    # For reporting effects of --allow, --deny, --exclude, --mtime
  1391	    # options in --verbose mode.
  1392	    my %counts = (); 
  1393	    $counts{'possible'} = 0;
  1394	    $counts{'excluded'} = 0;
  1395	    $counts{'too_old'} = 0;
  1396	    $counts{'too_new'} = 0;
  1397	    $counts{'not_allowed'} = 0;
  1398	    $counts{'denied'} = 0;
  1399	
  1400	    # Traverse directories.
  1401	    # This routine is not efficent but I prefer reliable logic.
  1402	    my @flist = ();
  1403	    my $start = time();
  1404	    util::vprint(_("find_target starting: "). localtime($start). "\n");
  1405	    while (@targets) {
  1406		my $target = shift @targets;
  1407	
  1408		if ($target eq '') {
  1409		    print STDERR "Warning: target contains empty line, skip it\n";
  1410		    next;
  1411		}
  1412		
  1413		if (-f $target) { # target is a file.
  1414		    add_target($target, \@flist, \%counts);
  1415		} elsif (-d $target) { # target is a directory.
  1416		    my @subtargets = ();
  1417		    # Find subdirectories in target directory
  1418		    # because File::Find::find() does not follow symlink.
  1419		    if (-l $target) {
  1420			my $dh = new DirHandle($target);
  1421			while (defined(my $ent = $dh->read)) {
  1422			    next if ($ent =~ /^\.{1,2}$/);
  1423			    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
  1424	                        next if ($ent =~ m!^($conf::DENY_DDN)$!i);
  1425	                        my $tmp = $ent;
  1426	                        util::win32_yen_to_slash(\$tmp);
  1427	                        next if ($ent ne $tmp);
  1428	                    }
  1429			    my $fname = "$target/$ent";
  1430			    next if ($fname eq '.' || $fname eq '..');
  1431			    if (-d $fname) {
  1432				push(@subtargets, $fname);
  1433			    } else {
  1434				add_target($fname, \@flist, \%counts);
  1435			    }
  1436			}
  1437		    } else {
  1438			@subtargets = ($target);
  1439		    }
  1440	
  1441		    #
  1442		    # Wanted routine for File::Find's find().
  1443		    #
  1444		    my $wanted_closure = sub {
  1445			my $fname = "$File::Find::dir/$_";
  1446			add_target($fname, \@flist, \%counts);
  1447		    };
  1448	
  1449		    find($wanted_closure, @subtargets) if (@subtargets > 0);
  1450		} else {
  1451		    print STDERR _("unsupported target: ") . $target;
  1452		}
  1453	    }
  1454	
  1455	    # uniq @flist
  1456	    my %mark = ();
  1457	    @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist;
  1458	
  1459	    # Sort file names with consideration for numbers.
  1460	    @flist = map  { $_->[0] }
  1461		     sort { $a->[1] cmp $b->[1] } 
  1462		     map  { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge; 
  1463			    [ $_, $tmp ] } @flist;
  1464	
  1465	    my $elapsed = time() - $start ;
  1466	    $elapsed += 1 ;   # to round up and avoid 0 
  1467	
  1468	    # For --verbose option.
  1469	    report_find_target($elapsed, $#flist + 1, %counts);
  1470	
  1471	    return @flist;
  1472	}
  1473	
  1474	sub add_target ($\@\%) {
  1475	    my ($target, $flists_ref, $counts_ref) = @_;
  1476	
  1477	    if ($target =~ /[\n\r\t]/) {
  1478		$target =~ s/[\n\r\t]//g;
  1479		print STDERR "Warning: $target contains LF/CR/TAB chars, skip it\n";
  1480		return;   # skip a file name containing LF/CR/TAB chars.
  1481	    }
  1482	
  1483	    return unless -f $target;  # Only file is targeted.
  1484	
  1485	    $counts_ref->{'possible'}++;
  1486	
  1487	    unless (util::canopen($target)) {
  1488	        util::vprint(sprintf(_("Unreadable:	%s"), $target));
  1489		$counts_ref->{'excluded'}++;
  1490		return;
  1491	    }
  1492	
  1493	
  1494	    if (defined $conf::EXCLUDE_PATH && 
  1495		$target =~ /$conf::EXCLUDE_PATH/ ) 
  1496	    {
  1497	        util::vprint(sprintf(_("Excluded:	%s"), $target));
  1498		$counts_ref->{'excluded'}++;
  1499		return; 
  1500	    }
  1501	
  1502	    #
  1503	    # Do processing just like find's  --mtime option.
  1504	    #
  1505	    if (defined $var::Opt{'mtime'}) {
  1506		my $mtime = -M $_;
  1507		if ($var::Opt{'mtime'} < 0) {
  1508	
  1509		    # This must be `>=' not `>' for consistency with find(1).
  1510		    if (int($mtime) >= - $var::Opt{'mtime'}) {
  1511		        util::vprint(sprintf(_("Too old:	%s"), $target));
  1512		        $counts_ref->{'too_old'}++;
  1513			return;
  1514		    }
  1515		} elsif ($var::Opt{'mtime'} > 0) {
  1516		    if ($var::Opt{'mtime'} =~ /^\+/) {
  1517			if ((int($mtime) < $var::Opt{'mtime'})) {
  1518			    util::vprint(sprintf(_("Too new:	%s"), $target));
  1519			    $counts_ref->{'too_new'}++;
  1520			    return;
  1521			}
  1522		    } else {
  1523			if (int($mtime) != $var::Opt{'mtime'}) {
  1524			    if (int($mtime) > $var::Opt{'mtime'}) {
  1525			        util::vprint(sprintf(_("Too old:	%s"),$target));
  1526				$counts_ref->{'too_old'}++;
  1527			    } else {
  1528			        util::vprint(sprintf(_("Too new:	%s"),$target));
  1529				$counts_ref->{'too_new'}++;
  1530			    }
  1531			    return;
  1532			}
  1533		    }
  1534		} else {
  1535		    # $var::Opt{'mtime'} == 0 ;
  1536		    return;
  1537		}
  1538	    }
  1539	
  1540	    # Extract the file name of the target.
  1541	    $target =~ m!^.*/([^/]+)$!;
  1542	    my $fname = $1;
  1543	
  1544	    if ($fname =~ m!^($conf::DENY_FILE)$!i ) {
  1545	        util::vprint(sprintf(_("Denied:	%s"), $target));
  1546		$counts_ref->{'denied'}++; 
  1547		return;
  1548	    }
  1549	    if ($fname !~ m!^($conf::ALLOW_FILE)$!i) {
  1550	        util::vprint(sprintf(_("Not allowed:	%s"), $target));
  1551		$counts_ref->{'not_allowed'}++; 
  1552		return;
  1553	    } else{
  1554	        util::vprint(sprintf(_("Targeted:	%s"), $target));
  1555		push @$flists_ref, $target;
  1556	    }
  1557	
  1558	}
  1559	
  1560	sub report_find_target ($$%) {
  1561	    my ($elapsed, $num_targeted, %counts) = @_;
  1562	
  1563	    util::vprint(_("find_target finished: ") . localtime(time()). "\n");
  1564	    util::vprint(sprintf(_("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"), 
  1565				 $num_targeted, $elapsed, 
  1566				 $num_targeted /$elapsed));
  1567	    util::vprint(sprintf(_("  Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"), 
  1568				 $counts{'possible'}, 
  1569				 $counts{'not_allowed'},  
  1570				 $counts{'denied'}, 
  1571				 $counts{'excluded'}));
  1572	    util::vprint(sprintf(_("  MTIME too old: %d, MTIME too new: %d"),
  1573				 $counts{'too_old'}, 
  1574				 $counts{'too_new'}));
  1575	}
  1576	
  1577	sub show_usage () {
  1578	    util::dprint(_("lang_msg: ")."$util::LANG_MSG\n");
  1579	    util::dprint(_("lang: ")."$util::LANG\n");
  1580	
  1581	    my $usage = $usage::USAGE;
  1582	    $usage = _($usage);
  1583	    printf "$usage", $var::VERSION, $var::TRAC_URI, $var::MAILING_ADDRESS;
  1584	}
  1585	
  1586	sub show_mini_usage () {
  1587	    print _("Usage: mknmz [options] <target>...\n");
  1588	    print _("Try `mknmz --help' for more information.\n");
  1589	}
  1590	
  1591	sub show_version () {
  1592	    print $usage::VERSION_INFO;
  1593	}
  1594	
  1595	#
  1596	# check the file. No $msg is good.
  1597	#
  1598	sub check_file ($$$$$) {
  1599	    my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_;
  1600	
  1601	    my $msg = undef;
  1602	    if ($mtype =~ /; x-system=unsupported$/) {
  1603		$mtype =~ s/; x-system=unsupported$//;
  1604		$msg = _("Unsupported media type ")."($mtype)"._(" skipped.");
  1605	    } elsif ($mtype =~ /; x-error=file_size_max/) {
  1606	        $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
  1607	    } elsif ($mtype =~ /; x-error=.*$/) {
  1608		$mtype =~ s/^.*; x-error=(.*)$/$1/;
  1609		$msg = $mtype;
  1610	    } elsif ($mtype =~ /^x-system/) {
  1611		$msg = _("system error occurred! ")."($mtype)"._(" skipped.");
  1612	    } elsif (! -e $cfile) {
  1613		$msg = _("does NOT EXIST! skipped.");
  1614	    } elsif (! util::canopen($cfile)) {
  1615		$msg = _("is NOT READABLE! skipped.");
  1616	    } elsif ($text_size == 0 || $cfile_size == 0) {
  1617		$msg = _("is 0 size! skipped.");
  1618	    } elsif ($mtype =~ /^application\/octet-stream/) {
  1619		$msg = _("may be a BINARY file! skipped.");
  1620	    } elsif ($cfile_size > $conf::FILE_SIZE_MAX) {
  1621		$msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
  1622	    } elsif ($text_size > $conf::TEXT_SIZE_MAX) {
  1623		$msg = _("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX . ') < '. $text_size ;
  1624	    } 
  1625	
  1626	    return $msg;
  1627	}
  1628	
  1629	
  1630	#
  1631	# Write NMZ.version file.
  1632	#
  1633	sub write_version() {
  1634	    unless (-f $var::NMZ{'version'}) {
  1635		my $fh = util::efopen(">$var::NMZ{'version'}");
  1636		print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n";
  1637	        util::fclose($fh);
  1638	    }
  1639	}
  1640	
  1641	#
  1642	# rename each temporary file to a real file name.
  1643	#
  1644	sub write_nmz_files () {
  1645	    util::Rename($var::NMZ{'_i'},   $var::NMZ{'i'});
  1646	    util::Rename($var::NMZ{'_ii'}, $var::NMZ{'ii'});
  1647	    util::Rename($var::NMZ{'_w'},  $var::NMZ{'w'});
  1648	    util::Rename($var::NMZ{'_wi'}, $var::NMZ{'wi'});
  1649	    util::Rename($var::NMZ{'_p'},  $var::NMZ{'p'});
  1650	    util::Rename($var::NMZ{'_pi'}, $var::NMZ{'pi'});
  1651	}
  1652	
  1653	# output NMZ.body
  1654	sub write_body_msg () {
  1655	    for my $file (glob "$TEMPLATEDIR/NMZ.body*") {
  1656	        if ($file =~ m!^.*/NMZ\.body(\.[-\w\.]+)?$!){
  1657	            my $suffix = $1 ? $1 : '';
  1658	            write_message("$var::NMZ{'body'}${suffix}");
  1659	        }
  1660	    }
  1661	}
  1662	
  1663	# output NMZ.tips
  1664	sub write_tips_msg () {
  1665	    for my $file (glob "$TEMPLATEDIR/NMZ.tips*") {
  1666	        if ($file =~ m!^.*/NMZ\.tips(\.[-\w\.]+)?$!){
  1667	            my $suffix = $1 ? $1 : '';
  1668	            write_message("$var::NMZ{'tips'}${suffix}");
  1669	        }
  1670	    }
  1671	}
  1672	
  1673	
  1674	# output NMZ.result.*
  1675	sub write_result_file () {
  1676	    my $fname = "NMZ.result.normal";
  1677	
  1678	    my @files = glob "$TEMPLATEDIR/NMZ.result.*";
  1679	
  1680	    for my $file (@files) {
  1681		$file =~ m!(NMZ\.result\.[^/]*)$!;
  1682		my $target = "$var::OUTPUT_DIR/$1";
  1683		if (-f $target) {  # already exist;
  1684		    next;
  1685		} else {
  1686		    my $buf = util::readfile($file);
  1687		    my $fh_file = util::efopen(">$target");
  1688		    print $fh_file $buf;
  1689	            util::fclose($fh_file);
  1690		}
  1691	    }
  1692	}
  1693	
  1694	# write NMZ.body and etc.
  1695	sub write_message ($) {
  1696	    my ($msgfile) = @_;
  1697	
  1698	    if (! -f $msgfile) {
  1699		my ($template, $fname);
  1700		
  1701		$msgfile =~ m!.*/(.*)$!;
  1702		$fname = $1;
  1703		$template = "$TEMPLATEDIR/$fname";
  1704	
  1705		if (-f $template) {
  1706		    my $buf = util::readfile($template);
  1707		    my $fh_output = util::efopen(">$msgfile");
  1708		    print $fh_output $buf;
  1709	            util::fclose($fh_output);
  1710		}
  1711	    }
  1712	}
  1713	
  1714	
  1715	#
  1716	# Make the NMZ.slog file for logging.
  1717	#
  1718	sub make_slog_file () {
  1719	    if (! -f $var::NMZ{'slog'}) {
  1720		my $fh = util::efopen(">$var::NMZ{'slog'}");
  1721	        util::fclose($fh);
  1722	        undef $fh;
  1723	        chmod 0666, $var::NMZ{'slog'};
  1724	    }
  1725	    {
  1726		my $fh_slogfile = util::efopen(">>$var::NMZ{'slog'}");
  1727	        util::fclose($fh_slogfile);
  1728	    }
  1729	}
  1730	
  1731	
  1732	#
  1733	# Concatenate $CURRENTDIR to the head of each file.
  1734	#
  1735	sub change_filenames ($) {
  1736	    my $dir = $var::OUTPUT_DIR;
  1737	
  1738	    for my $key (sort keys %var::NMZ) {
  1739		next if $key =~ /^_/;    # exclude temporary file
  1740		$var::NMZ{$key} = "$dir/$var::NMZ{$key}";
  1741	    }
  1742	
  1743	    # temporary files
  1744	    for my $key (sort keys %var::NMZ) {
  1745		if ($key =~ /^_/) {
  1746		    $var::NMZ{$key} = util::tmpnam($var::NMZ{$key});
  1747		}
  1748	    }
  1749	
  1750	    if ($var::Opt{'debug'}) {
  1751		for my $key (sort keys %var::NMZ) {
  1752		    util::dprint("NMZ: $var::NMZ{$key}\n");
  1753		}
  1754	    }
  1755	}
  1756	
  1757	
  1758	#
  1759	# Preparation processing for appending index files.
  1760	#
  1761	sub append_index (@) {
  1762	    my @flist = @_;
  1763	
  1764	    my $docid_base = 0;
  1765	    ($docid_base, @flist) = set_target_files(@flist);
  1766	
  1767	    unless (@flist) { 	# if @flist is empty
  1768		if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
  1769		    set_lockfile($var::NMZ{'lock2'});
  1770		    update_dateindex();
  1771		    update_registry(0);
  1772		    make_headfoot_pages(0, get_total_keys());
  1773		    put_log(0, 0, 0, get_total_keys());
  1774		    make_headfoot_pages(get_status("files"), get_status("keys"));
  1775		    util::remove_tmpfiles();
  1776		}
  1777		print _("No files to index.\n");
  1778		exit 0;
  1779	    }
  1780	
  1781	    $APPENDMODE = 1;
  1782	    # conserve files by copying
  1783	    copy($var::NMZ{'i'},  $var::NMZ{'_i'});
  1784	    copy($var::NMZ{'w'},  $var::NMZ{'_w'});
  1785	    copy($var::NMZ{'t'},  $var::NMZ{'_t'}) 
  1786		unless -f $var::NMZ{'_t'}; # preupdated ?
  1787	    copy($var::NMZ{'p'},  $var::NMZ{'_p'});
  1788	    copy($var::NMZ{'pi'}, $var::NMZ{'_pi'});
  1789	
  1790	    return ($docid_base, @flist);
  1791	}
  1792	
  1793	#
  1794	# Set target files to @flist and return with the number of regiested files.
  1795	#
  1796	sub set_target_files() {
  1797	    my %rdocs;    # 'rdocs' means 'registered documents'
  1798	    my @found_files = @_;
  1799	
  1800	    # Load the list of registered documents
  1801	    $rdocs{'name'} = load_registry();
  1802	
  1803	    # Pick up overlapped documents and do marking
  1804	    my %mark1;
  1805	    my @overlapped_files;
  1806	    grep {$_ !~ /^\# / && $mark1{$_}++ } @{$rdocs{'name'}};
  1807	    $rdocs{'overlapped'} = {}; # Prepare an anonymous hash.
  1808	    for my $overlapped (grep { $mark1{$_} } @found_files) {
  1809		$rdocs{'overlapped'}{$overlapped} = 1;
  1810		push @overlapped_files, $overlapped;
  1811	    };
  1812	
  1813	    # Pick up not overlapped documents which are files to index.
  1814	    my @flist = grep { ! $mark1{$_} } @found_files;
  1815		 
  1816	    if ($var::Opt{'noupdate'}) {
  1817		return (scalar @{$rdocs{'name'}}, @flist);
  1818	    };
  1819	
  1820	    # Load the date index.
  1821	    $rdocs{'mtime'} = load_dateindex();
  1822	
  1823	    if (@{$rdocs{'mtime'}} == 0) {
  1824		return (scalar @{$rdocs{'name'}}, @flist); 
  1825	    };
  1826	
  1827	    util::assert(@{$rdocs{'name'}} == @{$rdocs{'mtime'}},
  1828			 "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!");
  1829	
  1830	    # Pick up deleted documents and do marking
  1831	    # (registered in the NMZ.r but not existent in the filesystem)
  1832	    my @deleted_documents;
  1833	    unless ($var::Opt{'nodelete'}) {
  1834		my %mark2;
  1835		grep { $mark2{$_}++ } @found_files;
  1836		for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} && 
  1837					! $rdocs{'overlapped'}{$_} } 
  1838				 @{$rdocs{'name'}}) 
  1839		{
  1840		    $rdocs{'deleted'}{$deleted} = 1;
  1841		    push @deleted_documents, $deleted;
  1842		}
  1843	    }
  1844	
  1845	    # check filesize
  1846	    if ($var::Opt{'checkfilesize'}) {
  1847		$rdocs{'size'} = load_sizefield();
  1848	    }
  1849	
  1850	    # Pick up updated documents and set the missing number for deleted files.
  1851	    my @updated_documents = pickup_updated_documents(\%rdocs);
  1852	
  1853	    # Append updated files to the list of files to index.
  1854	    if (@updated_documents) {
  1855		push @flist, @updated_documents;
  1856	    }
  1857	
  1858	    # Remove duplicates.
  1859	    my %seen = ();
  1860	    @flist = grep { ! $seen{$_}++ } @flist;
  1861	
  1862	    util::dprint(_("\n\n== found files ==\n"), join("\n", @found_files), "\n");
  1863	    util::dprint(_("\n\n== registered documents ==\n"), join("\n", @{$rdocs{'name'}}), "\n");
  1864	    util::dprint(_("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n");
  1865	    util::dprint(_("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n");
  1866	    util::dprint(_("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n");
  1867	    util::dprint(_("\n\n== files to index ==\n"), join("\n", @flist), "\n");
  1868	
  1869	    # Update NMZ.t with the missing number infomation and
  1870	    # append updated files and deleted files to NMZ.r with leading '# '
  1871	    if (@updated_documents || @deleted_documents) {
  1872		$DeletedFilesCount = 0;
  1873		$UpdatedFilesCount = 0;
  1874		$UpdatedFilesCount += @updated_documents;
  1875	#	$DeletedFilesCount += @updated_documents;
  1876		$DeletedFilesCount += @deleted_documents;
  1877		preupdate_dateindex(@{$rdocs{'mtime'}});
  1878		preupdate_registry(@updated_documents, @deleted_documents);
  1879	    }
  1880	
  1881	    # Return the number of registered documents and list of files to index.
  1882	    return (scalar @{$rdocs{'name'}}, @flist);
  1883	}
  1884	
  1885	sub preupdate_registry(@) {
  1886	    my (@list) = @_;
  1887	
  1888	    my $fh_registry = util::efopen(">$var::NMZ{'_r'}");
  1889	    @list = grep { s/(.*)/\# $1\n/ } @list;
  1890	    print $fh_registry @list;
  1891	    print $fh_registry &_("## deleted: ") . util::rfc822time(time()) . "\n\n";
  1892	    util::fclose($fh_registry);
  1893	}
  1894	
  1895	sub preupdate_dateindex(@) {
  1896	    my @mtimes = @_;
  1897	
  1898	    # Since rewriting the entire file, it is not efficient, 
  1899	    # but simple and reliable. this would be revised in the future.
  1900	    my $fh_dateindex = util::efopen(">$var::NMZ{'_t'}");
  1901	#    print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n";
  1902	    print $fh_dateindex pack("N*", @mtimes);
  1903	    util::fclose($fh_dateindex);
  1904	}
  1905	
  1906	sub update_registry ($) {
  1907	    my ($docid_count) = @_;
  1908	
  1909	    {
  1910		my $fh_registry = util::efopen(">>$var::NMZ{'r'}");
  1911		my $fh_registry_ = util::efopen($var::NMZ{'_r'});
  1912		while (defined(my $line = <$fh_registry_>)) {
  1913		    print $fh_registry $line;
  1914		}
  1915		if ($docid_count > 0) {
  1916		    print $fh_registry &_("## indexed: ") . util::rfc822time(time()) . "\n\n";
  1917		}
  1918	        util::fclose($fh_registry_) if (defined $fh_registry_);
  1919	        util::fclose($fh_registry);
  1920	    }
  1921	    unlink $var::NMZ{'_r'};
  1922	}
  1923	
  1924	sub update_dateindex () {
  1925	    util::Rename($var::NMZ{'_t'}, $var::NMZ{'t'});
  1926	}
  1927	
  1928	sub update_field_index () {
  1929	    my @list = glob "$var::NMZ{'field'}.*.tmp";
  1930	    for my $tmp (@list) {
  1931	        if ($tmp =~ m!((^.*/NMZ\.field\..+?(?:\.i)?)\.tmp$)!) {
  1932		    my $fname_tmp = $1;
  1933		    my $fname_out = $2;
  1934		    {
  1935			my $fh_field = util::efopen(">>$fname_out");
  1936			my $fh_tmp = util::efopen($fname_tmp);
  1937	
  1938			while (defined(my $line = <$fh_tmp>)) {
  1939			    print $fh_field $line;
  1940			}
  1941	                util::fclose($fh_tmp) if (defined $fh_tmp);
  1942	                util::fclose($fh_field);
  1943		    }
  1944		    unlink $fname_tmp;
  1945		} else {
  1946		    util::cdie(_("update_field_index: ")."@list");
  1947		}
  1948	    }
  1949	}
  1950	
  1951	sub pickup_updated_documents (\%) {
  1952	    my ($rdocs_ref) = @_;
  1953	    my @updated_documents = ();
  1954	
  1955	    # To avoid duplicated outputs caused by --html-split support.
  1956	    my %printed = ();
  1957	    my $i = 0;
  1958	    for my $cfile (@{$rdocs_ref->{'name'}}) {
  1959		if (defined($rdocs_ref->{'deleted'}{$cfile})) {
  1960		    unless ($printed{$cfile}) {
  1961			print "$cfile " . _("was deleted!\n");
  1962			$printed{$cfile} = 1;
  1963		    }
  1964		    $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
  1965		} elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) {
  1966		    my $cfile_mtime = (stat($cfile))[9];
  1967		    my $rfile_mtime = $rdocs_ref->{'mtime'}[$i];
  1968		    my ($cfile_size, $rfile_size);
  1969		    if ($var::Opt{'checkfilesize'}) {
  1970			$cfile_size = (stat($cfile))[7];
  1971			$rfile_size = $rdocs_ref->{'size'}[$i];
  1972		    }
  1973	
  1974		    if ($rfile_mtime != $cfile_mtime || 
  1975			($var::Opt{'checkfilesize'} && ($cfile_size != $rfile_size))) {
  1976			# The file is updated!
  1977			unless ($printed{$cfile}) {
  1978			    print "$cfile " . _("was updated!\n");
  1979			    $printed{$cfile} = 1;
  1980			}
  1981			push(@updated_documents, $cfile);
  1982			$rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
  1983		    }
  1984		}
  1985		$i++;
  1986	    }
  1987	
  1988	    return @updated_documents
  1989	}
  1990	
  1991	sub load_dateindex() {
  1992	    my $fh_dateindex = util::efopen($var::NMZ{'t'});
  1993	
  1994	    my $size = -s $var::NMZ{'t'};
  1995	    my $buf  = "";
  1996	    read($fh_dateindex, $buf, $size);
  1997	    my @list = unpack("N*", $buf);  # load date index
  1998	#    print "\nload_dateindex\n", join("\n", @list), "\n\n";
  1999	
  2000	    util::fclose($fh_dateindex);
  2001	    return [ @list ];
  2002	}
  2003	
  2004	sub load_registry () {
  2005	    my $fh_registry = util::efopen($var::NMZ{'r'});
  2006	
  2007	    my @list = ();
  2008	    my %deleted    = ();
  2009	    my @registered = ();
  2010	
  2011	    while (defined(my $line = <$fh_registry>)) {
  2012		chomp($line);
  2013		next if $line =~ /^\s*$/;  # an empty line
  2014		next if $line =~ /^##/;    # a comment
  2015		if ($line =~ s/^\#\s+//) { # deleted document
  2016		    $deleted{$line}++;
  2017		} else {
  2018		    # Remove HTML's anchor generated by --html-split option.
  2019		    $line =~ s/\t.*$//g;  
  2020		    push @registered, $line;
  2021		}
  2022	    }
  2023	
  2024	    util::fclose($fh_registry) if (defined $fh_registry);
  2025	
  2026	    # Exclude deleted documents.
  2027	    for my $doc (@registered) {
  2028		if ($deleted{$doc}) {
  2029		    push @list, "# $doc";
  2030		    $deleted{$doc}--;
  2031		} else {
  2032		    push @list, $doc;
  2033		}
  2034	    }
  2035	
  2036	    return [ @list ];
  2037	}
  2038	
  2039	# get file size information from NMZ.field.size
  2040	sub load_sizefield() {
  2041	    my $fh_sizefield = util::efopen($var::NMZ{'field'} . '.size');
  2042	    return [] unless defined $fh_sizefield;
  2043	    my $line;
  2044	    my @ret = ();
  2045	    while (defined($line = <$fh_sizefield>)) {
  2046		chomp $line;
  2047		push @ret, $line;
  2048	    }
  2049	    util::fclose($fh_sizefield) if (defined $fh_sizefield);
  2050	    return \@ret;
  2051	}
  2052	
  2053	sub get_total_keys() {
  2054	    my $keys = get_status("keys");
  2055	    $keys =~ s/,//g if (defined $keys);
  2056	    $keys = 0 unless defined $keys;
  2057	    return $keys;
  2058	}
  2059	
  2060	sub get_total_files() {
  2061	    my $files = get_status("files");
  2062	    $files =~ s/,//g if (defined $files);
  2063	    $files = 0 unless defined $files;
  2064	    return $files;
  2065	}
  2066	
  2067	sub get_status($) {
  2068	    my ($key) = @_;
  2069	
  2070	    my $fh = util::fopen($var::NMZ{'status'});
  2071	    return undef unless defined $fh;
  2072	
  2073	    while (defined(my $line = <$fh>)) {
  2074		if ($line =~ /^$key\s+(.*)$/) {
  2075		    util::dprint("status: $key = $1\n");
  2076		    $fh->close;
  2077		    return $1;
  2078		}
  2079	    }
  2080	    util::fclose($fh) if (defined $fh);
  2081	    return undef;
  2082	}
  2083	
  2084	sub put_total_files($) {
  2085	    my ($number) = @_;
  2086	    $number =~ tr/,//d;
  2087	    put_status("files", $number);
  2088	}
  2089	
  2090	sub put_total_keys($) {
  2091	    my ($number) = @_;
  2092	    $number =~ tr/,//d;
  2093	    put_status("keys", $number);
  2094	}
  2095	
  2096	sub put_status($$) {
  2097	    my ($key, $value) = @_;
  2098	
  2099	    # remove NMZ.status file if the file has a previous value.
  2100	    unlink $var::NMZ{'status'} if defined get_status($key);
  2101	
  2102	    my $fh = util::efopen(">> $var::NMZ{'status'}");
  2103	    print $fh "$key $value\n";
  2104	    util::fclose($fh);
  2105	}
  2106	
  2107	# do logging
  2108	sub put_log ($$$$) {
  2109	    my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_;
  2110	
  2111	    my $date = localtime;
  2112	    my $added_files_count   = $docid_count;
  2113	    my $deleted_documents_count = $DeletedFilesCount;
  2114	    my $updated_documents_count = $UpdatedFilesCount;
  2115	    my $total_files_count   = get_total_files() + $docid_count
  2116				      - $DeletedFilesCount - $UpdatedFilesCount;
  2117	    my $added_keys_count    = 0;
  2118	    $added_keys_count       = $total_keys_count - get_total_keys();
  2119	
  2120	    my $processtime         = time - $start_time;
  2121	    $processtime            = 0 if $start_time == 0;
  2122	    $total_files_size       = $total_files_size;
  2123	    $total_keys_count       = $total_keys_count;
  2124	
  2125	    my @logmsgs = ();
  2126	    if ($APPENDMODE) {
  2127		push @logmsgs, N_("[Append]");
  2128	    } else {
  2129		push @logmsgs, N_("[Base]");
  2130	    }
  2131	    push @logmsgs, N_("Date:"), "$date" if $date;
  2132	    push @logmsgs, N_("Added Documents:"), util::commas("$added_files_count")
  2133		if $added_files_count;
  2134	    push @logmsgs, N_("Deleted Documents:"), 
  2135		util::commas("$deleted_documents_count") if $deleted_documents_count;
  2136	    push @logmsgs, N_("Updated Documents:"), 
  2137		util::commas("$updated_documents_count") if $updated_documents_count;
  2138	    push @logmsgs, N_("Size (bytes):"), util::commas("$total_files_size")
  2139		if $total_files_size;
  2140	    push @logmsgs, N_("Total Documents:"), util::commas("$total_files_count") 
  2141		if $total_files_count;
  2142	    push @logmsgs, N_("Added Keywords:"), util::commas("$added_keys_count") 
  2143		if $added_keys_count;
  2144	    push @logmsgs, N_("Total Keywords:"), util::commas("$total_keys_count") 
  2145		if $total_keys_count;
  2146	    push @logmsgs, N_("Wakati:"), "$conf::WAKATI" if $conf::WAKATI;
  2147	    push @logmsgs, N_("Time (sec):"), util::commas("$processtime")
  2148		if $processtime;
  2149	    push @logmsgs, N_("File/Sec:"),  sprintf "%.2f", 
  2150	    	(($added_files_count + $updated_documents_count) / $processtime) 
  2151		if $processtime;
  2152	    push @logmsgs, N_("System:"), "$English::OSNAME" if $English::OSNAME;
  2153	    push @logmsgs, N_("Perl:"),   sprintf("%f", $English::PERL_VERSION);
  2154	    push @logmsgs, N_("Namazu:"), "$var::VERSION" if $var::VERSION;
  2155	
  2156	    my $log_for_file = "";
  2157	
  2158	    my $msg = shift @logmsgs;	# [Base] or [Append]
  2159	    # To stdout, use gettext.
  2160	    print _($msg), "\n";
  2161	    # To log file, do not use gettext.
  2162	    $log_for_file = $msg . "\n";
  2163	    while (@logmsgs) {
  2164		my $field = shift @logmsgs;
  2165		my $value = shift @logmsgs;
  2166		printf "%-20s %s\n", _($field), "$value";
  2167		$log_for_file .= sprintf "%-20s %s\n", $field, "$value";
  2168	    }
  2169	    print "\n";
  2170	    $log_for_file .= "\n";
  2171	
  2172	    put_log_to_logfile($log_for_file);
  2173	    put_total_files($total_files_count);
  2174	    put_total_keys($total_keys_count);
  2175	
  2176	    my $argv = join "\t", @ARGV;
  2177	    my $cwd  = cwd();
  2178	    put_status("argv", $argv);
  2179	    put_status("cwd",  $cwd);
  2180	}
  2181	
  2182	sub put_log_to_logfile ($) {
  2183	    my ($logmsg) = @_;
  2184	    my $fh_logfile = util::efopen(">>$var::NMZ{'log'}");
  2185	    print $fh_logfile $logmsg;
  2186	    util::fclose($fh_logfile);
  2187	}
  2188	
  2189	sub get_year() {
  2190	    my $year = (localtime)[5] + 1900;
  2191	
  2192	    return $year;
  2193	}
  2194	
  2195	# Compose NMZ.head and NMZ.foot. Prepare samples if necessary.
  2196	# Insert $docid_count, $key_count, and $month/$day/$year respectively.
  2197	sub make_headfoot ($$$) {
  2198	    my ($file, $docid_count, $key_count) = @_;
  2199	
  2200	    my $day   = sprintf("%02d", (localtime)[3]);
  2201	    my $month = sprintf("%02d", (localtime)[4] + 1);
  2202	    my $year  = get_year();
  2203	    my $buf   = "";
  2204	
  2205	    if (-f $file) {
  2206		$buf = util::readfile($file);
  2207	    } else {
  2208		$file =~ m!.*/(.*)$!;
  2209		my $fname = $1;
  2210		my $template = "$TEMPLATEDIR/$fname";
  2211	
  2212		if (-f $template) {
  2213		    $buf = util::readfile($template);
  2214		} else {
  2215		    return;
  2216		}
  2217	    }
  2218	
  2219	    my $fh_file = util::efopen(">$file");
  2220	
  2221	    if ($buf =~ /(<!-- FILE -->)\s*(.*)\s*(<!-- FILE -->)/) {
  2222	        my $total_files_count = util::commas(get_total_files() + $docid_count 
  2223	                                   - $DeletedFilesCount - $UpdatedFilesCount);
  2224		$buf =~ s/(<!-- FILE -->)(.*)(<!-- FILE -->)/$1 $total_files_count $3/;
  2225	
  2226	    }
  2227	    if ($buf =~ /(<!-- KEY -->)\s*(.*)\s*(<!-- KEY -->)/) {
  2228		my $tmp = $2;
  2229		$tmp =~ tr/,//d;
  2230		$tmp = $key_count;
  2231		$tmp = util::commas($tmp);
  2232		$buf =~ s/(<!-- KEY -->)(.*)(<!-- KEY -->)/$1 $tmp $3/;
  2233	    }
  2234	    $buf =~ s#(<!-- DATE -->)(.*)(<!-- DATE -->)#$1 $year-$month-$day $3#gs;
  2235	    $buf =~ s/(<!-- VERSION -->)(.*)(<!-- VERSION -->)/$1 v$var::VERSION $3/gs;
  2236	    $buf =~ s{(<!-- ADDRESS -->)(.*)(<!-- ADDRESS -->)}
  2237		     {$1\n<a href="mailto:$conf::ADDRESS">$conf::ADDRESS</a>\n$3}gs;
  2238	    $buf =~ s{(<!-- LINK-REV-MADE -->)(.*)(<!-- LINK-REV-MADE -->)}
  2239		     {$1\n<link rev="made" href="mailto:$conf::ADDRESS">\n$3}gs;
  2240	
  2241	    print $fh_file $buf;
  2242	    util::fclose($fh_file);
  2243	}
  2244	
  2245	# Make phrase hashes for NMZ.p
  2246	# Handle two words each for calculating a hash value ranged 0-65535.
  2247	sub make_phrase_hash ($$$) {
  2248	    my ($docid_count, $docid_base, $contref) = @_;
  2249	
  2250	    my %tmp = ();
  2251	    $$contref =~ s!\x7f */? *\d+ *\x7f!!g;  # remove tags of weight
  2252	    $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols
  2253	    my @words = split(/\s+/, $$contref);
  2254	    @words = grep {$_ ne ""} @words;   # remove empty words
  2255	    my $word_b = shift @words;
  2256	    my $docid = $docid_count + $docid_base;
  2257	    for my $word (@words) {
  2258		next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
  2259		my $hash = hash($word_b . $word);
  2260		unless (defined $tmp{$hash}) {
  2261		    $tmp{$hash} = 1;
  2262		    $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash};
  2263		    $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash});
  2264	#	    util::dprint("<$word_b, $word> $hash\n");
  2265		    $PhraseHashLast{$hash} = $docid;
  2266		}
  2267		$word_b = $word;
  2268	    }
  2269	}
  2270	
  2271	# Construct NMZ.p and NMZ.pi file. this processing is rather complex.
  2272	sub write_phrase_hash () {
  2273	    write_phrase_hash_sub();
  2274	    util::Rename($var::NMZ{'__p'}, $var::NMZ{'_p'});
  2275	    util::Rename($var::NMZ{'__pi'}, $var::NMZ{'_pi'});
  2276	}
  2277	
  2278	sub write_phrase_hash_sub () {
  2279	    my $opened = 0;
  2280	
  2281	    return 0 if %PhraseHash eq '';    # namazu-devel-ja #3146
  2282	    util::dprint(_("doing write_phrase_hash() processing.\n"));
  2283	
  2284	    my $fh_tmp_pi = util::efopen(">$var::NMZ{'__pi'}");
  2285	    my $fh_tmp_p  = util::efopen(">$var::NMZ{'__p'}");
  2286	
  2287	    my $fh_phrase = util::fopen($var::NMZ{'_p'});
  2288	    my $fh_phraseindex = undef;
  2289	    if ($fh_phrase) {
  2290		$fh_phraseindex = util::efopen($var::NMZ{'_pi'});
  2291		$opened = 1;
  2292	    }
  2293		
  2294	    my $ptr = 0;
  2295	    for (my $i = 0; $i < 65536; $i++) {
  2296	
  2297		my $baserecord = "";
  2298		my $baseleng = 0;
  2299	
  2300		if ($opened) {
  2301		    my $tmp = 0;
  2302		    read($fh_phraseindex, $tmp, $var::INTSIZE);
  2303		    $tmp = unpack("N", $tmp);
  2304		    if ($tmp != 0xffffffff) { # 0xffffffff
  2305			$baseleng = readw($fh_phrase);
  2306			read($fh_phrase, $baserecord, $baseleng);
  2307		    }
  2308		}
  2309		if (defined($PhraseHash{$i})) {
  2310		    if ($baserecord eq "") {
  2311			print $fh_tmp_pi pack("N", $ptr);
  2312			my $record = $PhraseHash{$i};
  2313			my $n2 = length($record);
  2314			my $data = pack("w", $n2) . $record;
  2315			print $fh_tmp_p $data;
  2316			$ptr += length($data);
  2317		    } else {
  2318			print $fh_tmp_pi pack("N", $ptr);
  2319			my $record = $PhraseHash{$i};
  2320			my $last_docid = get_last_docid($baserecord, 1);
  2321			my $adjrecord = adjust_first_docid($record, $last_docid);
  2322			check_records(\$record, \$baserecord, 1) unless defined $adjrecord; # namazu-bugs-ja#31
  2323			$record = $adjrecord;
  2324			my $n2 = length($record) + $baseleng;
  2325			my $data = pack("w", $n2) .  $baserecord . $record;
  2326			print $fh_tmp_p $data;
  2327			$ptr += length($data);
  2328		    }
  2329		} else {
  2330		    if ($baserecord eq "") {
  2331			# if $baserecord has no data, set to 0xffffffff
  2332			print $fh_tmp_pi pack("N", 0xffffffff);
  2333		    } else {
  2334			print $fh_tmp_pi pack("N", $ptr);
  2335			my $data = pack("w", $baseleng) . $baserecord;
  2336			print $fh_tmp_p $data;
  2337			$ptr += length($data);
  2338		    }
  2339		}
  2340	    }
  2341	
  2342	    if ($opened) {
  2343	        util::fclose($fh_phraseindex);
  2344	    }
  2345	    if (defined $fh_phrase) {
  2346	        util::fclose($fh_phrase);
  2347	    }
  2348	    util::fclose($fh_tmp_p);
  2349	    util::fclose($fh_tmp_pi);
  2350	
  2351	    %PhraseHash = ();
  2352	    %PhraseHashLast = ();
  2353	}
  2354	
  2355	# Dr. Knuth's  ``hash'' from (UNIX MAGAZINE May 1998)
  2356	sub hash ($) {
  2357	    my ($word) = @_;
  2358	
  2359	    my $hash = 0;
  2360	    for (my $i = 0; $word ne ""; $i++) {
  2361		$hash ^= $Seed[$i & 0x03][ord($word)];
  2362	        $word = substr $word, 1;
  2363		# $word =~ s/^.//;  is slower
  2364	    }
  2365	    return $hash & 65535;
  2366	}
  2367	
  2368	# Count frequencies of words.
  2369	sub count_words ($$$$) {
  2370	    my ($docid_count, $docid_base, $contref, $kanji) = @_;
  2371	    my (@tmp);
  2372	
  2373	    # Normalize into small letter.
  2374	    $$contref =~ tr/A-Z/a-z/;
  2375	
  2376	    # Remove control char.
  2377	    $$contref =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a/ /;
  2378	
  2379	    # It corresponds to -j option of ChaSen.
  2380	    $$contref =~ s/^[ \t\f]+//gm;    # except "\r\n"
  2381	    $$contref =~ s/[ \t\f]+$//gm;    # except "\r\n"
  2382	    $$contref =~ s/([a-z])-\n([a-z])/$1$2/gsi;   # for hyphenation
  2383	    if (util::islang("ja")) {
  2384	        $$contref =~ s/([\x80-\xff])\n([\x80-\xff])/$1$2/gs;
  2385	        $$contref =~ s/(。」|。「)/$1\n/gs;
  2386	    }
  2387	    $$contref =~ s/\n+/\n/gs;
  2388	
  2389	    # Do wakatigaki if necessary.
  2390	    if (util::islang("ja")) {
  2391		wakati::wakatize_japanese($contref) if $kanji;
  2392	    }
  2393	
  2394	    my $part1 = "";
  2395	    my $part2 = "";
  2396	    if ($$contref =~ /\x7f/) {
  2397		$part1 = substr $$contref, 0, index($$contref, "\x7f");
  2398		$part2 = substr $$contref, index($$contref, "\x7f");
  2399	#	$part1 = $PREMATCH;  # $& and friends are not efficient
  2400	#	$part2 = $MATCH . $POSTMATCH;
  2401	    } else {
  2402		$part1 = $$contref;
  2403		$part2 = "";
  2404	    }
  2405	
  2406	    # do scoring
  2407	    my %word_count = ();
  2408	    $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
  2409		wordcount_sub($2, $1, \%word_count)!ge;
  2410	    wordcount_sub($part1, 1, \%word_count);
  2411	
  2412	    # Add them to whole index
  2413	    my $docid = $docid_count + $docid_base;
  2414	    for my $word (keys(%word_count)) {
  2415		next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
  2416		$KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
  2417		$KeyIndex{$word} .= pack("w2", 
  2418					 $docid - $KeyIndexLast{$word}, 
  2419					 $word_count{$word});
  2420		$KeyIndexLast{$word} = $docid;
  2421	    }
  2422	}
  2423	
  2424	#
  2425	# Count words and do score weighting
  2426	#
  2427	sub wordcount_sub ($$\%) {
  2428	    my ($text, $weight, $word_count) = @_;
  2429	
  2430	    # Remove all symbols when -K option is specified.
  2431	    $text =~ tr/\xa1-\xfea-z0-9/   /c if $var::Opt{'nosymbol'};
  2432	
  2433	    # Count frequencies of words in a current document.
  2434	    # Handle symbols as follows.
  2435	    #
  2436	    # tcp/ip      ->  tcp/ip,     tcp,      ip
  2437	    # (tcp/ip)    ->  (tcp/ip),   tcp/ip,   tcp, ip
  2438	    # ((tcpi/ip)) ->  ((tcp/ip)), (tcp/ip), tcp
  2439	    #
  2440	    # Don't do processing for nested symbols.
  2441	    # NOTE: When -K is specified, all symbols are already removed.
  2442	
  2443	    my @words = split /\s+/, $text;
  2444	    for my $word (@words) {
  2445		next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
  2446		if ($var::Opt{'noedgesymbol'}) {
  2447		    # remove symbols at both ends
  2448		    $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
  2449		}
  2450		$word_count->{$word} = 0 unless defined($word_count->{$word});
  2451		$word_count->{$word} += $weight;
  2452		unless ($var::Opt{'nosymbol'}) {
  2453		    if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
  2454			$word_count->{$1} = 0 unless defined($word_count->{$1});
  2455			$word_count->{$1} += $weight;
  2456			next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
  2457		    } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
  2458			$word_count->{$1} = 0 unless defined($word_count->{$1});
  2459			$word_count->{$1} += $weight;
  2460			next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
  2461		    } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
  2462			$word_count->{$1} = 0 unless defined($word_count->{$1});
  2463			$word_count->{$1} += $weight;
  2464			next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
  2465		    }
  2466		    my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
  2467			if $word =~ /[^\xa1-\xfea-z_0-9]/;
  2468		    for my $tmp (@words_) {
  2469			next if $tmp eq "";
  2470			$word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
  2471			$word_count->{$tmp} += $weight;
  2472		    }
  2473		    @words_ = ();
  2474		}
  2475	    }
  2476	    return "";
  2477	}
  2478	
  2479	# Construct NMZ.i and NMZ.ii file. this processing is rather complex.
  2480	sub write_index () {
  2481	    my $key_count = write_index_sub();
  2482	    util::Rename($var::NMZ{'__i'}, $var::NMZ{'_i'});
  2483	    util::Rename($var::NMZ{'__w'}, $var::NMZ{'_w'});
  2484	
  2485	    return $key_count;
  2486	}
  2487	
  2488	# readw: read one pack 'w' word.
  2489	# This code was contributed by <furukawa@tcp-ip.or.jp>.
  2490	sub readw ($) {
  2491	    my $fh = shift;
  2492	    my $ret = '';
  2493	    my $c;
  2494	    
  2495	    while (read($fh, $c, 1)){
  2496		$ret .= $c;
  2497		last unless 0x80 & ord $c;
  2498	    }
  2499	    return unpack('w', $ret);
  2500	}
  2501	
  2502	sub get_last_docid ($$) {
  2503	    my ($record, $step) = @_;
  2504	    my (@data) = unpack 'w*', $record;
  2505	
  2506	    my $sum = 0;
  2507	    for (my $i = 0; $i < @data; $i += $step) {
  2508		$sum += $data[$i];
  2509	    }
  2510	    my $leng = @data / $step;
  2511	    return $sum;
  2512	}
  2513	
  2514	sub adjust_first_docid ($$) {
  2515	    my ($record, $last_docid) = @_;
  2516	    my (@data) = unpack 'w*', $record;
  2517	
  2518	    $data[0] = $data[0] - $last_docid;
  2519	    return undef if ($data[0] < 0); # namazu-bug-ja#31
  2520	    $record = pack 'w*', @data;
  2521	    return $record;
  2522	}
  2523	
  2524	sub write_index_sub () {
  2525	    my @words = sort keys(%KeyIndex);
  2526	    return 0 if $#words == -1;
  2527	
  2528	    my $cnt = 0;
  2529	    my $ptr_i = 0;
  2530	    my $ptr_w = 0;
  2531	    my $key_count = 0;
  2532	    my $baserecord = "";
  2533	
  2534	    util::dprint(_("doing write_index() processing.\n"));
  2535	    my $fh_tmp_i  = util::efopen(">$var::NMZ{'__i'}");
  2536	    my $fh_tmp_w  = util::efopen(">$var::NMZ{'__w'}");
  2537	    my $fh_i      = util::fopen($var::NMZ{'_i'});
  2538	    my $fh_ii     = util::efopen(">$var::NMZ{'_ii'}");
  2539	    my $fh_w      = util::fopen($var::NMZ{'_w'});
  2540	    my $fh_wi = util::efopen(">$var::NMZ{'_wi'}");
  2541	
  2542	    if ($fh_w) {
  2543	      FOO:
  2544		while (defined(my $line = <$fh_w>)) {
  2545		    chop $line;
  2546		    my $current_word = $line;
  2547	
  2548		    my $baseleng = readw($fh_i);
  2549		    read($fh_i, $baserecord, $baseleng);
  2550	
  2551	 	    for (; $cnt < @words; $cnt++) {
  2552			last unless $words[$cnt] le $current_word;
  2553			my $record = $KeyIndex{$words[$cnt]};
  2554			my $leng = length($record);
  2555	
  2556			if ($current_word eq $words[$cnt]) {
  2557			    my $last_docid = get_last_docid($baserecord, 2);
  2558			    my $adjrecord = adjust_first_docid($record, $last_docid);
  2559			    check_records(\$record, \$baserecord, 2) unless defined $adjrecord; # namazu-bugs-ja#31
  2560			    $record = $adjrecord;
  2561			    $leng = length($record);  # re-measure
  2562			    my $tmp = pack("w", $leng + $baseleng);
  2563	
  2564			    my $data_i = "$tmp$baserecord$record";
  2565			    my $data_w = "$current_word\n";
  2566			    print $fh_tmp_i $data_i;
  2567			    print $fh_tmp_w $data_w;
  2568			    print $fh_ii pack("N", $ptr_i);
  2569			    print $fh_wi pack("N", $ptr_w);
  2570			    $ptr_i += length($data_i);
  2571			    $ptr_w += length($data_w);
  2572			    $key_count++;
  2573	
  2574			    $cnt++;
  2575			    next FOO;
  2576			} else {
  2577			    my $tmp = pack("w", $leng);
  2578			    my $data_i = "$tmp$record";
  2579			    my $data_w = "$words[$cnt]\n";
  2580			    print $fh_tmp_i $data_i;
  2581			    print $fh_tmp_w $data_w;
  2582			    print $fh_ii pack("N", $ptr_i);
  2583			    print $fh_wi pack("N", $ptr_w);
  2584			    $ptr_i += length($data_i);
  2585			    $ptr_w += length($data_w);
  2586			    $key_count++;
  2587			}
  2588		    }
  2589		    my $tmp  = pack("w", $baseleng);
  2590		    my $data_i = "$tmp$baserecord";
  2591		    my $data_w = "$current_word\n";
  2592		    print $fh_tmp_i $data_i;
  2593		    print $fh_tmp_w $data_w;
  2594		    print $fh_ii pack("N", $ptr_i);
  2595		    print $fh_wi pack("N", $ptr_w);
  2596		    $ptr_i += length($data_i);
  2597		    $ptr_w += length($data_w);
  2598		    $key_count++;
  2599		}
  2600	    }
  2601	    while ($cnt < @words) {
  2602		my $leng = length($KeyIndex{$words[$cnt]});
  2603		my $tmp = pack("w", $leng);
  2604		my $record = $KeyIndex{$words[$cnt]};
  2605	
  2606		my $data_i = "$tmp$record";
  2607		my $data_w = "$words[$cnt]\n";
  2608		print $fh_tmp_i $data_i;
  2609		print $fh_tmp_w $data_w;
  2610		print $fh_ii pack("N", $ptr_i);
  2611		print $fh_wi pack("N", $ptr_w);
  2612		$ptr_i += length($data_i);
  2613		$ptr_w += length($data_w);
  2614		$key_count++;
  2615		$cnt++;
  2616	    }
  2617	    %KeyIndex = ();
  2618	    %KeyIndexLast = ();
  2619	
  2620	    util::fclose($fh_wi);
  2621	    util::fclose($fh_w) if (defined $fh_w);
  2622	    util::fclose($fh_ii);
  2623	    util::fclose($fh_i) if (defined $fh_i);
  2624	    util::fclose($fh_tmp_w);
  2625	    util::fclose($fh_tmp_i);
  2626	
  2627	    return $key_count;
  2628	}
  2629	
  2630	#
  2631	# Decide the media type. 
  2632	# FIXME: Very ad hoc. It's just a compromise. -- satoru
  2633	#
  2634	sub decide_type ($$) {
  2635	    my ($name, $cont) = @_;
  2636	    return $name if (!defined $cont || $name eq $cont);
  2637	
  2638	    util::dprint("decide_type: name: $name, cont: $cont\n");
  2639	    if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) {
  2640		return $name;
  2641	    } elsif ($cont =~ m!^application/octet-stream! &&
  2642	             $name !~ m!^text/!) {
  2643		return $name;
  2644	    } elsif ($cont =~ m!^application/(excel|powerpoint|msword)! &&
  2645		     $name !~ m!^application/octet-stream!)  {
  2646		# FIXME: Currently File::MMagic 1.02's checktype_data() 
  2647		# is unreliable for them.
  2648		return $name;
  2649	    } elsif ($cont =~ m!^application/x-zip! &&
  2650	             $name =~ m!^application/!) {
  2651	        # zip format is used other applications e.g. OpenOffice.
  2652	        # It is necessary to add to check extention.
  2653	        return $name;
  2654	    }
  2655	
  2656	    return $cont;
  2657	}
  2658	
  2659	#
  2660	# Debugging code for the "negative numbers" problem.
  2661	#
  2662	sub check_records ($$$) {
  2663	    my ($recref, $baserecref, $step) = @_;
  2664	    dump_record($baserecref, $step);
  2665	    dump_record($recref, $step);
  2666	    print STDERR "The \x22negative number\x22 problem occurred.\n";
  2667	    exit(1);
  2668	}
  2669	
  2670	sub dump_record($$) {
  2671	    my ($recref, $step) = @_;
  2672	    my (@data) = unpack 'w*', $$recref;
  2673	    print STDERR "dump record data to NMZ.bug.info (step: $step)...";
  2674	    my $fh_info = util::fopen(">> NMZ.bug.info");
  2675	    print $fh_info "dumped record data (step: $step)...";
  2676	    foreach (@data) {
  2677		print $fh_info sprintf(" %08x", $_);
  2678	    }
  2679	    print $fh_info "\n";
  2680	    util::fclose($fh_info);
  2681	    return;
  2682	}
  2683	
  2684	sub trapintr {
  2685	    my ($signame) = @_;
  2686	    print STDERR "Warning: signal $signame occured.\n";
  2687	}
  2688	
  2689	#
  2690	# For avoiding "used only once: possible typo at ..." warnings.
  2691	#
  2692	muda($conf::ON_MEMORY_MAX,
  2693	     $conf::WORD_LENG_MAX, $conf::TEXT_SIZE_MAX,
  2694	     $conf::DENY_FILE, $var::INTSIZE,
  2695	     $conf::CHASEN_NOUN, $conf::CHASEN,
  2696	     $conf::KAKASI, $var::Opt{'okurigana'},
  2697	     $var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX,
  2698	     $usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO,
  2699	     $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
  2700	     $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
  2701	     $conf::ADDRESS, $var::MAILING_ADDRESS,
  2702	     $conf::FILE_SIZE_MAX,
  2703	     $conf::MECAB,
  2704	     $conf::DENY_DDN,
  2705	     $var::TRAC_URI,
  2706	     );
  2707	
  2708	sub muda {}
  2709	

まりんきょ学問所全文検索システムNamazu > mknmz のソースコードを読む


MARUYAMA Satosi