namazu は大きく二つのプログラムに分かれる。一つはインデックスを作るプログラム、 もう一つは検索をするプログラムである。インデックスを作るプログラムは mknmz、 検索をするプログラムは namazu である。 ここではインデックスを作るプログラムである mknmz のソースコードを読んでみよう。 実際には mknmz.in ファイルを読むことにする。 両者の違いは、mknmz.in を作る元ソース(テンプレート)であるのに対し、 mknmz は環境等を置き換えたあとで実際にスクリプトを実行する実ソースである、ということである。 また、mknmz はプログラミング言語 perl で書かれている。
以下の仕組みで表示用のソースコードを加工している。 html では &, >, < の表示にあたり工夫する必要がある。 ここで、UNIX のコマンドである sed と cat -n を使っている。
% sed -e 's/\&/\&/g' -e 's/>/\>/g' -e 's/</\</g' scripts/mknmz.in | cat -n > mknmz.in.html
1 #! %PERL% -w
2 # -*- Perl -*-
3 # mknmz - indexer of Namazu
4 # $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
5 #
6 # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
7 # Copyright (C) 2000-2008 Namazu Project All rights reserved.
8 # This is free software with ABSOLUTELY NO WARRANTY.
9 #
10 # This program is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation; either versions 2, or (at your option)
13 # any later version.
14 #
15 # This program is distributed in the hope that it will be useful
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
19 #
20 # You should have received a copy of the GNU General Public License
21 # along with this program; if not, write to the Free Software
22 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 # 02111-1307, USA
24 #
25 # This file must be encoded in EUC-JP encoding
26 #
27
28 package mknmz;
29 require 5.004;
30 use English;
31 use lib "%ADDITIONAL_INC%";
32 use Cwd;
33 use IO::File;
34 use File::Find;
35 use File::MMagic;
36 use Time::Local;
37 use strict; # be strict since v1.2.0
38 use Getopt::Long;
39 use File::Copy;
40 use DirHandle;
41 use File::Basename;
42
43 use vars qw($SYSTEM);
44 # It exists only for back compatibility.
45 $SYSTEM = $English::OSNAME;
28 行め、package mknmz; はパッケージ mknmz の名前空間を宣言している。
29 行め、require 5.004; は依存バージョンである。
30行目以降の use は各種モジュールの宣言である。
lib → ここは mknmz では use lib "."; と展開される。 English → 各種の数などを英語で表すためのモジュール Cwd → カレントワーキングディレクトリを得るためのモジュール File::Find →ファイルの検索など File::MMagic →ファイルのマジックナンバーを見る File::Copy → ファイルをコピーする File::Basename →ファイルのフルパスから IO::File → IOを使う Time::Local → 時間(ローカル時間) strict → インタープリターで解釈を厳密にして怪しいところはすべてエラーを出し停止する Getopt::Long → 長形式のオプション解釈モジュール DirHandle → ディレクトリを扱うモジュール vars qw($SYSTEM) 45 行目の $SYSTEM = $English::OSNAME; は後方互換性のためだけに存在する。
46
47 my $NAMAZU_INDEX_VERSION = "2.0";
48
49 my $CodingSystem = "euc";
50 my $PKGDATADIR = $ENV{'pkgdatadir'} || "@pkgdatadir@";
51 my $CONFDIR = "@CONFDIR@"; # directory where mknmzrc are in.
52 my $LIBDIR = $PKGDATADIR . "/pl"; # directory where library etc. are in.
53 my $FILTERDIR = $PKGDATADIR . "/filter"; # directory where filters are in.
54 my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in.
55
56 my $DeletedFilesCount = 0;
57 my $UpdatedFilesCount = 0;
58 my $APPENDMODE = 0;
59 my %PhraseHash = ();
60 my %PhraseHashLast = ();
61 my %KeyIndex = ();
62 my %KeyIndexLast = ();
63 my %CheckPoint = ("on" => undef, "continue" => undef);
64 my $ConfigFile = undef;
65 my $MediaType = undef;
66
67 my $ReplaceCode = undef; # perl code for transforming URI
68 my @Seed = ();
69 my @LoadedRcfiles = ();
70 my $Magic = new File::MMagic;
71
72 my $ReceiveTERM = 0;
73
74 STDOUT->autoflush(1);
75 STDERR->autoflush(1);
47 行から 58 行まではスカラー型変数の宣言。 その中で、49 行は日本語の表現形式を宣言している。ここでは euc である。 59 行から 63 行まではハッシュ(連想配列)である。ここもほとんど何もない状態の宣言である。 64 行から 67 行はふたたびスカラー型変数の宣言。 68 行と69 行は空の配列の宣言である。 70 行と72 行は再度スカラー型変数の宣言。 73 行は標準出力のバッファをフラッシュする(バッファを貯めない)宣言。 74 行は標準エラー出力に対して同様にバッファをフラッシュする宣言。
76 main();
77 sub main {
78 my $start_time = time;
79
80 if ($English::PERL_VERSION == 5.008001) {
81 unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
82 print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
83 exit 1;
84 }
85 }
86
87 init();
88
89 # At first, loading pl/conf.pl to prevent overriding some variables.
90 preload_modules();
91
92 # set LANG and bind textdomain
93 util::set_lang();
94 textdomain('namazu', $util::LANG_MSG);
95
96 load_modules();
97 my ($output_dir, @targets) = parse_options();
98 my ($docid_base, $total_files_num) = prep($output_dir, @targets);
99
100 my $swap = 1;
101 my $docid_count = 0;
102 my $file_count = 0;
103 my $total_files_size = 0;
104 my $key_count = 0;
105 my $checkpoint = 0;
106 my $flist_ptr = 0;
107 my $processed_files_size = 0;
108
109 if ($CheckPoint{'continue'}) {
110 # Restore variables
111 eval util::readfile($var::NMZ{'_checkpoint'}) ;
112 } else {
113 print $total_files_num . _(" files are found to be indexed.\n");
114 }
115
116 {
117 my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
118 my $fh_flist = util::efopen($var::NMZ{'_flist'});
119 my %field_indices = ();
120 get_field_index_base(\%field_indices);
121
122 if ($CheckPoint{'continue'}) {
123 seek($fh_flist, $flist_ptr, 0);
124 }
125
126 # Process target files one by one
127 while (defined(my $line = <$fh_flist>)) {
128 $flist_ptr += length($line);
129 my $cfile = $line;
130 chomp $cfile;
131 util::dprint(_("target file: ")."$cfile\n");
132
133 my ($cfile_size, $num) =
134 process_file($cfile, $docid_count, $docid_base,
135 $file_count, \%field_indices,
136 $fh_errorsfile, $total_files_num);
137 if ($num == 0) {
138 $total_files_num--;
139 next;
140 } else {
141 $docid_count += $num;
142 $file_count++;
143 }
144
145 $total_files_size += $cfile_size;
146 $processed_files_size += $cfile_size;
147 last if $ReceiveTERM;
148 if ($processed_files_size > $conf::ON_MEMORY_MAX) {
149 if (%KeyIndex) {
150 $key_count = write_index();
151 print _("Writing index files...");
152 write_phrase_hash();
153 print "\n";
154 }
155 $processed_files_size = 0;
156 $checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
157 }
158 }
159
160 util::fclose($fh_flist);
161 util::fclose($fh_errorsfile);
162 }
163 # This should be out of above blocks because of file handler closing.
164 re_exec($flist_ptr, $docid_count, $docid_base, $start_time,
165 $total_files_size, $total_files_num,
166 $file_count, $key_count) if $checkpoint;
167
168 if (%KeyIndex) {
169 $key_count = write_index();
170 print _("Writing index files...");
171 write_phrase_hash();
172 print "\n";
173 }
174
175 $key_count = get_total_keys() unless $key_count;
176 do_remain_job($total_files_size, $docid_count, $key_count,
177 $start_time);
178 exit 0;
179 }
結構な量があるが、少しずつみていこう。まず 76 行で main を呼び出す。 呼び出しの main 関数は 77 行から定義されている。 変数 $start_time はまず今の時刻を測りたい、という要求からである。 if ($English::PERL_VERSION == 5.008001) だったら、 次に続く条件を吟味したうえで不合格となる。
87 行は初期化を実行する。実際の初期化の内容は別に回す。
90 行は pl/conf.pl をローディングすることにしている。
93 行は、set_lang()関数で LANG が設定される。 この関数の詳細も追って書く。
94 行にtextdomain 関数がある。これは gettext() がコールされた 際に検索をかけられる条件も知っている。
96 行は自前モジュールのロード (373 行 参照)。
97 行は オプションを解析したときの返り値2種類を得ている。98 行は 909 行で定義された prep 関数による値。 prep は 準備 (preparation) の意味であろう。
100 行から 107 行は変数定義と初期化である。
109 行から 114 行はチェックポイントを表す CheckPoint ハッシュの 'continue' 要素の値を調べるている。
116 行から 162 行はブロックになっている。 117
180
181 #
182 # FIXME: Very complicated.
183 #
184 sub process_file ($$$$\%$$) {
185 my ($cfile, $docid_count, $docid_base, $file_count,
186 $field_indices, $fh_errorsfile, $total_files_num) = @_;
187
188 my $processed_num = 0;
189 my $file_size = util::filesize($cfile);
190
191 if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
192 my @parts;
193 @parts = htmlsplit::split($cfile, "NMZ.partial")
194 if ($file_size <= $conf::FILE_SIZE_MAX);
195 if (@parts > 1) {
196 my $id = 0;
197 for my $part (@parts) {
198 next if (defined $conf::EXCLUDE_PATH &&
199 "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
200 my $fname = util::tmpnam("NMZ.partial.$id");
201 my $fragment = defined $part ? $part : undef;
202 my $uri = generate_uri($cfile, $fragment);
203 my $result = namazu_core($fname,
204 $docid_count + $processed_num,
205 $docid_base, $file_count,
206 $field_indices, $fh_errorsfile,
207 $total_files_num,
208 $uri, $id, $#parts);
209 if ($result > 0) {
210 $processed_num++;
211 my $rname = defined $part ? "$cfile\t$part" : "$cfile";
212 put_registry($rname);
213 }
214 unlink $fname;
215 $id++;
216 }
217 return ($file_size, $processed_num);
218 }
219 }
220 my $result = namazu_core($cfile, $docid_count, $docid_base,
221 $file_count, $field_indices,
222 $fh_errorsfile, $total_files_num,
223 undef, undef, undef);
224 if ($result > 0) {
225 $processed_num++;
226 put_registry($cfile);
227 }
228
229 return ($file_size, $processed_num);
230 }
まず、sub process_file ($$$$\%$$)の引数に驚いてしまうが、 リファレンスに関係があることをまず注意しておく。
231 232 # 233 # Load mknmzrcs: 234 # 235 # 1. MKNMZRC environment 236 # 237 # 2. $(sysconfdir)/$(PACKAGE)/mknmzrc 238 # 239 # 3. ~/.mknmzrc 240 # 241 # 4. user-specified mknmzrc set by mknmz --config=file option. 242 # 243 # If multiple files exists, read all of them. 244 # 245 sub load_rcfiles () { 246 my (@cand) = (); 247 248 # To support Windows. Since they have nasty drive letter convention, 249 # it is necessary to change mknmzrc dynamically with env. variable. 250 push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'}; 251 push @cand, "$CONFDIR/mknmzrc"; 252 push @cand, "$ENV{'HOME'}/.mknmzrc"; 253 254 util::vprint(_("Reading rcfile: ")); 255 for my $rcfile (@cand) { 256 if (-f $rcfile) { 257 load_rcfile ($rcfile); 258 util::vprint(" $rcfile"); 259 } 260 } 261 util::vprint("\n"); 262 } 263 264 sub load_rcfile ($) { 265 my ($rcfile) = @_; 266 if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") { 267 util::win32_yen_to_slash(\$rcfile); 268 } 269 return if (grep {m/^$rcfile$/} @LoadedRcfiles); 270 do $rcfile; 271 if ($@) { 272 chop $@; 273 push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'"; 274 }else { 275 push @LoadedRcfiles, $rcfile; 276 } 277 278 # Dirty workaround. 279 $LIBDIR = $conf::LIBDIR 280 if (defined $conf::LIBDIR && -d $conf::LIBDIR); 281 $FILTERDIR = $conf::FILTERDIR 282 if (defined $conf::FILTERDIR && -d $conf::FILTERDIR); 283 $TEMPLATEDIR = $conf::TEMPLATEDIR 284 if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR); 285 } 286 287 sub re_exec($$$$$$$$) { 288 my ($flist_ptr, $docid_count, $docid_base, $start_time, 289 $total_files_size, $total_files_num, $file_count, $key_count) = @_; 290 291 # store variables 292 { 293 my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}"); 294 295 print $fh_checkpoint <<EOM; 296 \$DeletedFilesCount = $DeletedFilesCount; 297 \$UpdatedFilesCount = $UpdatedFilesCount; 298 \$APPENDMODE = $APPENDMODE; 299 \$flist_ptr = $flist_ptr; 300 \$docid_count = $docid_count; 301 \$docid_base = $docid_base; 302 \$start_time = $start_time; 303 \$total_files_size = $total_files_size; 304 \$total_files_num = $total_files_num; 305 \$key_count = $key_count; 306 \$file_count = $file_count; 307 \$\$ = $$; 308 EOM 309 util::fclose($fh_checkpoint); 310 } 311 312 @ARGV = ("-S", @ARGV) ; 313 print _("Checkpoint reached: re-exec mknmz...\n"); 314 util::dprint(join ' ', ("::::", @ARGV, "\n")); 315 exec ($0, @ARGV) ; 316 } 317 318 sub put_registry ($) { 319 my ($filename) = @_; 320 my $fh_registry = util::efopen(">>$var::NMZ{'_r'}"); 321 print $fh_registry $filename, "\n"; 322 util::fclose($fh_registry); 323 } 324 325 326 # Initialization 327 # $CodingSystem: Character Coding System 'euc' or 'sjis' 328 sub init () { 329 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { 330 $CodingSystem = "sjis"; 331 if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { 332 $CONFDIR = $1 . $CONFDIR ; 333 } 334 if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { 335 $LIBDIR = $1 . $LIBDIR ; 336 } 337 if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { 338 $FILTERDIR = $1 . $FILTERDIR ; 339 } 340 if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { 341 $TEMPLATEDIR = $1 . $TEMPLATEDIR ; 342 } 343 } else { 344 $CodingSystem = "euc"; 345 } 346 347 $SIG{'INT'} = sub { 348 util::cdie("SIGINT caught! Aborted.\n"); 349 }; 350 351 $SIG{'TERM'} = sub { 352 print STDERR "SIGTERM caught!\n"; 353 $ReceiveTERM = 1; 354 }; 355 } 356 357 sub preload_modules () { 358 unshift @INC, $LIBDIR; 359 # workaround for test suites. 360 unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'}; 361 362 require "var.pl" || die "unable to require \"var.pl\"\n"; 363 require "conf.pl" || die "unable to require \"conf.pl\"\n"; 364 require "util.pl" || die "unable to require \"util.pl\"\n"; 365 require "gettext.pl" || die "unable to require \"gettext.pl\"\n"; 366 require "ext.pl" || die "unable to require \"ext.pl\"\n"; 367 } 368 369 sub postload_modules () { 370 require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n"; 371 } 372 373 sub load_modules () { 374 require "usage.pl" || die "unable to require \"usage.pl\"\n"; 375 require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n"; 376 require "wakati.pl" || die "unable to require \"wakati.pl\"\n"; 377 require "seed.pl" || die "unable to require \"seed.pl\"\n"; 378 require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n"; 379 380 @Seed = seed::init(); 381 } 382 383 sub load_filtermodules () { 384 unshift @INC, $FILTERDIR; 385 386 # 387 # Windows modules must be loaded first. 388 # Because OLE filters have low precedence over normal ones. 389 # 390 load_win32modules() if $English::OSNAME eq "MSWin32"; 391 392 # Check filter modules 393 my @filters = (); 394 @filters = glob "$FILTERDIR/*.pl"; 395 396 load_filters(@filters); 397 } 398 399 sub load_win32modules () { 400 # Check filter modules 401 my @filters = (); 402 if (-f "../filter/win32/olemsword.pl") { # to ease developing 403 @filters = glob "../filter/win32/*.pl"; 404 unshift @INC, "../filter/win32"; 405 } else { 406 @filters = glob "$FILTERDIR/win32/*.pl"; 407 unshift @INC, "$FILTERDIR/win32"; 408 } 409 410 load_filters(@filters); 411 } 412 413 sub load_filters (@) { 414 my @filters = @_; 415 416 for my $filter (@filters) { 417 $filter =~ m!([-\w]+)\.pl$!; 418 my $module = $1; 419 require "$module.pl" || die "unable to require \"$module.pl\"\n";; 420 my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv); 421 422 eval "\@mtypes = ${module}::mediatype();"; 423 die $@ if $@; # eval error 424 eval "\$status = ${module}::status();"; 425 die $@ if $@; 426 eval "\$recursive = ${module}::recursive();"; 427 die $@ if $@; 428 eval "\$pre_codeconv = ${module}::pre_codeconv();"; 429 die $@ if $@; 430 eval "\$post_codeconv = ${module}::post_codeconv();"; 431 die $@ if $@; 432 eval "${module}::add_magic(\$Magic);"; 433 die $@ if $@; 434 435 for my $mt (@mtypes) { 436 next if (defined $var::Supported{$mt} && 437 $var::Supported{$mt} eq 'yes' && $status eq 'no'); 438 $var::Supported{$mt} = $status; 439 $var::REQUIRE_ACTIONS{$mt} = $module; 440 $var::RECURSIVE_ACTIONS{$mt} = $recursive; 441 $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv; 442 $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv; 443 } 444 } 445 } 446 447 # Core routine. 448 # 449 # FIXME: Too many parameters. They must be cleared. 450 # 451 sub namazu_core ($$$$$$$$$$) { 452 my ($cfile, $docid_count, $docid_base, 453 $file_count, $field_indices, $fh_errorsfile, $total_files_num, 454 $uri, $part_id, $part_num) = @_; 455 456 my $headings = ""; 457 my $content = ""; 458 my $weighted_str = ""; 459 my %fields; 460 my $msg_prefix; 461 462 if ($part_id) { 463 $msg_prefix = " $part_id/$part_num - "; 464 } else { 465 $msg_prefix = $file_count + 1 . "/$total_files_num - "; 466 } 467 468 unless ($uri) { 469 $uri = generate_uri($cfile); # Make a URI from a file name. 470 } 471 my ($cfile_size, $text_size, $kanji, $mtype) = 472 load_document(\$cfile, \$content, \$weighted_str, 473 \$headings, \%fields); 474 475 { 476 $fields{'mtime'} = (stat($cfile))[9]; 477 my $utc = $fields{'mtime'}; 478 $utc = time::rfc822time_to_mtime($fields{'date'}) 479 if (defined $fields{'date'}); 480 if ($utc == -1) { 481 my $date = $fields{'date'}; 482 print "$cfile Illegal date format. : $date\n"; 483 print $fh_errorsfile "$cfile Illegal date format. : $date\n"; 484 $utc = $fields{'mtime'}; 485 delete $fields{'date'}; 486 } 487 $fields{'utc'} = $utc; 488 } 489 490 util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n"); 491 492 # Check if the file is acceptable. 493 my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri); 494 if (defined $err) { 495 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { 496 my $uri2 = codeconv::eucjp_to_shiftjis($uri); 497 print $msg_prefix . "$uri2 $err\n"; 498 } else { 499 print $msg_prefix . "$uri $err\n"; 500 } 501 print $fh_errorsfile "$cfile $err\n"; 502 return 0; # return 0 if error 503 } 504 505 # Print processing file name as URI. 506 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { 507 my $uri2 = codeconv::eucjp_to_shiftjis($uri); 508 print $msg_prefix . "$uri2 [$mtype]\n"; 509 } else { 510 print $msg_prefix . "$uri [$mtype]\n"; 511 } 512 513 # Add filename. 514 my $filename = defined $cfile ? $cfile : ''; 515 codeconv::toeuc(\$filename); 516 $filename = basename($filename); 517 $fields{'filename'} = $filename; 518 519 complete_field_info(\%fields, $cfile, $uri, 520 \$headings, \$content, \$weighted_str); 521 put_field_index(\%fields, $field_indices); 522 523 put_dateindex($cfile); 524 $content .= "\n\n$filename\n\n"; # add filename 525 $content .= $weighted_str; # add weights 526 count_words($docid_count, $docid_base, \$content, $kanji); 527 make_phrase_hash($docid_count, $docid_base, \$content); 528 529 # assertion 530 util::assert($cfile_size != 0, 531 "cfile_size == 0 at the end of namazu_core."); 532 533 return $cfile_size; 534 } 535 536 # 537 # Make the URI from the given file name. 538 # 539 sub generate_uri (@) { 540 my ($file, $fragment) = @_; 541 return "" unless defined $file; 542 543 # omit a file name if omittable 544 $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o; 545 546 if (defined $ReplaceCode) { 547 # transforming URI by evaling 548 $_ = $file; 549 eval $ReplaceCode; 550 $file = $_; 551 } 552 553 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { 554 $file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C| 555 } 556 557 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { 558 $file = codeconv::shiftjis_to_eucjp($file); 559 } 560 if (defined $fragment) { 561 codeconv::toeuc(\$fragment); 562 } 563 564 unless ($var::Opt{'noencodeuri'}) { 565 for my $tmp ($file, $fragment) { 566 next unless defined $tmp; 567 568 # Escape unsafe characters (not strict) 569 $tmp =~ s/\%/%25/g; # Convert original '%' into '%25' v1.1.1.2 570 $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/ 571 sprintf("%%%02X",ord($1))/ge; 572 } 573 } 574 575 576 my $uri = $file; 577 $uri .= "#" . $fragment if defined $fragment; 578 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { 579 # restore '|' for drive letter rule of Win32, OS/2 580 $uri =~ s!^/([A-Z])%7C!/$1|!i; 581 } 582 return $uri; 583 } 584 585 586 sub get_field_index_base (\%) { 587 my ($field_indices) = @_; 588 589 my @keys = split('\|', $conf::SEARCH_FIELD); 590 if ($var::Opt{'meta'}) { 591 push @keys, (split '\|', $conf::META_TAGS); 592 } 593 for my $key (@keys) { 594 $key = lc($key); 595 my $fname = "$var::NMZ{'field'}.$key"; 596 my $tmp_fname = util::tmpnam("NMZ.field.$key"); 597 my $size = 0; 598 $size = -s $fname if -f $fname; 599 $size += -s $tmp_fname if -f $tmp_fname; 600 $field_indices->{$key} = $size; 601 } 602 } 603 604 sub complete_field_info (\%$$\$\$\$) { 605 my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_; 606 607 for my $field (keys %{$fields}) { 608 if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) { 609 delete $fields->{$field}; 610 } 611 } 612 613 unless (defined($fields->{'title'})) { 614 $fields->{'title'} = gfilter::filename_to_title($cfile, $wsref); 615 } 616 unless (defined($fields->{'date'})) { 617 my $mtime = $fields->{'mtime'}; 618 my $date = util::rfc822time($mtime); 619 $fields->{'date'} = $date; 620 } 621 unless (defined($fields->{'uri'})) { 622 $fields->{'uri'} = $uri; 623 } 624 unless (defined($fields->{'size'})) { 625 $fields->{'size'} = -s $cfile; 626 } 627 unless (defined($fields->{'summary'})) { 628 $fields->{'summary'} = make_summary($contref, $headings, $cfile); 629 } 630 unless (defined($fields->{'from'}) || defined($fields->{'author'})) { 631 $fields->{'from'} = getmsg("unknown"); 632 } 633 } 634 635 # 636 # Currently, messages for NMZ.* files should be encoded in 637 # EUC-JP currently. We cannot use gettext.pl for the messsage 638 # because gettext.pl may use Shift_JIS encoded messages. 639 # So, we should use the function instead of gettext(). 640 # 641 # FIXME: Ad hoc impl. getmsg() is effective only for "unknown". 642 # 643 sub getmsg($) { 644 my ($msg) = @_; 645 646 if (util::islang_msg("ja")) { 647 if ($msg eq "unknown") { 648 return "不明"; 649 } 650 } 651 return $msg; 652 } 653 654 sub make_summary ($$$) { 655 my ($contref, $headings, $cfile) = @_; 656 657 # pick up $conf::MAX_FIELD_LENGTH bytes string 658 my $tmp = ""; 659 if ($$headings ne "") { 660 $$headings =~ s/^\s+//; 661 $$headings =~ s/\s+/ /g; 662 $tmp = $$headings; 663 } else { 664 $tmp = ""; 665 } 666 667 my $offset = 0; 668 my $tmplen = 0; 669 while (($tmplen = $conf::MAX_FIELD_LENGTH + 1 - length($tmp)) > 0 670 && $offset < length($$contref)) 671 { 672 $tmp .= substr $$contref, $offset, $tmplen; 673 $offset += $tmplen; 674 $tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge; 675 $tmp =~ s/([-=*\#])\1{2,}/$1$1/g; 676 } 677 678 # -1 means "LF" 679 my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH - 1; 680 # Remove a garbage Kanji 1st char at the end. 681 $summary = codeconv::chomp_eucjp($summary); 682 683 $summary =~ s/^\s+//; 684 $summary =~ s/\s+/ /g; # normalize white spaces 685 686 return $summary; 687 } 688 689 690 # output the field infomation into NMZ.fields.* files 691 sub put_field_index (\%$) { 692 my ($fields, $field_indices) = @_; 693 694 my $aliases_regex = 695 join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES); 696 697 for my $field (keys %{$fields}) { 698 util::dprint("Field: $field: $fields->{$field}\n"); 699 if ($field =~ /^($aliases_regex)$/o) { 700 unless (defined($fields->{$conf::FIELD_ALIASES{$field}})) { 701 $fields->{$conf::FIELD_ALIASES{$field}} = $fields->{$field}; 702 } 703 undef $fields->{$field}; 704 } 705 } 706 707 my @keys = split '\|', $conf::SEARCH_FIELD; 708 if ($var::Opt{'meta'}) { 709 my @meta = split '\|', $conf::META_TAGS; 710 while (my $meta = shift(@meta)) { 711 $meta = $conf::FIELD_ALIASES{$meta} 712 if (defined $conf::FIELD_ALIASES{$meta}); 713 714 push @keys, $meta; 715 } 716 717 # uniq @keys 718 my %mark = (); 719 @keys = grep {$mark{$_}++; $mark{$_} == 1} @keys; 720 } 721 for my $key (@keys) { 722 my $lkey = lc($key); 723 my $fname = util::tmpnam("NMZ.field.$lkey"); 724 my $fh_field = util::efopen(">>$fname"); 725 my $output = ""; 726 if (defined($fields->{$key})) { 727 if ($key ne 'uri') { # workaround for namazu-bugs-ja#30 728 $fields->{$key} =~ s/\s+/ /g; 729 $fields->{$key} =~ s/\s+$//; 730 $fields->{$key} =~ s/^\s+//; 731 } 732 $output = $fields->{$key}; 733 734 # -1 means "LF" 735 $output = substr $output, 0, $conf::MAX_FIELD_LENGTH - 1; 736 # Remove a garbage Kanji 1st char at the end. 737 $output = codeconv::chomp_eucjp($output); 738 739 $output =~ s/\n.*$//s; 740 $output .= "\n"; 741 } else { 742 $output = "\n"; 743 } 744 print $fh_field $output; 745 util::fclose($fh_field); 746 747 # put index of field index 748 { 749 my $fname = util::tmpnam("NMZ.field.$lkey.i"); 750 my $fh_field_idx = util::efopen(">>$fname"); 751 print $fh_field_idx pack("N", $field_indices->{$lkey}); 752 $field_indices->{$lkey} += length $output; 753 util::fclose($fh_field_idx); 754 } 755 } 756 757 } 758 759 # put the date infomation into NMZ.t file 760 sub put_dateindex ($) { 761 my ($cfile) = @_; 762 my $mtime = (stat($cfile))[9]; 763 764 my $fh_dataindex = util::efopen(">>$var::NMZ{'_t'}"); 765 print $fh_dataindex pack("N", $mtime); 766 util::fclose($fh_dataindex); 767 } 768 769 770 # load a document file 771 sub load_document ($$$$\%) { 772 my ($orig_cfile, $contref, $weighted_str, $headings, $fields) 773 = @_; 774 my $cfile = $$orig_cfile; 775 776 return (0, 0, 0, 0) unless (-f $cfile && util::canopen($cfile)); 777 778 # for handling a filename which contains Shift_JIS code for Windows. 779 # for handling a filename which contains including space. 780 my $shelter_cfile = ""; 781 if (($cfile =~ /\s/) || 782 ($English::OSNAME eq "MSWin32" 783 && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) ) 784 { 785 $shelter_cfile = $cfile; 786 $cfile = util::tmpnam("NMZ.win32"); 787 unlink $cfile if (-e $cfile); 788 copy($shelter_cfile, $cfile); 789 } 790 791 my $file_size; 792 $file_size = util::filesize($cfile); # not only file in feature. 793 if ($file_size > $conf::FILE_SIZE_MAX) { 794 return ($file_size, $file_size, 0, 'x-system/x-error; x-error=file_size_max'); 795 } 796 797 $$contref = util::readfile($cfile); 798 # $file_size = length($$contref); 799 800 my ($kanji, $mtype) = apply_filter($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, undef); 801 802 if ($English::OSNAME eq "MSWin32" && $shelter_cfile ne "") { 803 unlink $cfile; 804 $cfile = $shelter_cfile; 805 } 806 807 # Measure the text size at this time. 808 my $text_size = length($$contref) + length($$weighted_str); 809 810 return ($file_size, $text_size, $kanji, $mtype); 811 } 812 813 sub apply_filter($$$$$$$) { 814 my ($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, $mmtype) 815 = @_; 816 my $cfile = $shelter_cfile ne "" ? $shelter_cfile : $$orig_cfile; 817 818 # Filtering process. 819 my $mtype; 820 my $called_dt = 0; 821 while (1) { 822 if (defined $MediaType) { 823 $mtype = $MediaType; 824 } elsif (defined $mmtype) { 825 $mtype = $mmtype; 826 } else { 827 my $mtype_n = $Magic->checktype_byfilename($cfile); 828 my $mtype_c = $Magic->checktype_data($$contref); 829 my $mtype_m; 830 $mtype_m = $Magic->checktype_magic($$contref) 831 if ((! defined $mtype_c) || 832 $mtype_c =~ 833 /^(text\/html|text\/plain|application\/octet-stream)$/); 834 $mtype_c = $mtype_m 835 if (defined $mtype_m && 836 $mtype_m !~ 837 /^(text\/html|text\/plain|application\/octet-stream)$/); 838 $mtype_c = 'text/plain' unless defined $mtype_c; 839 if ($called_dt) { 840 $mtype = $mtype_c; 841 } else { 842 $mtype = decide_type($mtype_n, $mtype_c); 843 $called_dt = 1; 844 } 845 } 846 util::dprint(_("Detected type: ")."$mtype\n"); 847 848 # Pre code conversion. 849 if ($var::REQUIRE_PRE_CODECONV{$mtype}) { 850 util::dprint("pre_codeconv\n"); 851 codeconv_document($contref); 852 } 853 854 if (! $var::Supported{$mtype} || 855 $var::Supported{$mtype} ne 'yes') 856 { 857 util::vprint(_("Unsupported media type ")."$mtype\n"); 858 return (0, "$mtype; x-system=unsupported"); 859 } 860 861 if ($var::REQUIRE_ACTIONS{$mtype}) { 862 util::vprint(_("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n"); 863 require $var::REQUIRE_ACTIONS{$mtype}.'.pl' 864 || die _("unable to require ") . 865 "\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n"; 866 my $err = undef; 867 { 868 local $SIG{'PIPE'} = \&trapintr; 869 eval '$err = ' . $var::REQUIRE_ACTIONS{$mtype} . 870 '::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);'; 871 } 872 if ($err) { 873 if ($err =~ m/; x-system=unsupported$/) { 874 return (0, $err); 875 } 876 return (0, "$mtype; x-error=$err"); 877 } 878 879 if ($@) { 880 util::vprint(_("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n"); 881 return (0, "$mtype; x-error=$@"); 882 } 883 884 # Post code conversion. 885 if ($var::REQUIRE_POST_CODECONV{$mtype}) { 886 util::dprint("post_codeconv\n"); 887 codeconv_document($contref); 888 } 889 890 next if ($var::RECURSIVE_ACTIONS{$mtype}); 891 } 892 last; 893 } 894 895 my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/; # Kanji contained? 896 $kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/; 897 898 return ($kanji, $mtype); 899 } 900 901 sub codeconv_document ($) { 902 my ($textref) = @_; 903 codeconv::toeuc($textref); 904 $$textref =~ s/\r\n/\n/g; 905 $$textref =~ s/\r/\n/g; 906 $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /; # Remove control char. 907 } 908 909 sub prep () { 910 my $docid_base = 0; 911 my $output_dir = shift @_ ; 912 my @targets = @_ ; 913 my @flist = (); 914 915 $var::OUTPUT_DIR = $output_dir; 916 917 require_modules(); 918 change_filenames(); 919 check_present_index(); 920 921 # if Checkpoint mode, return 922 return (0, 0) if $CheckPoint{'continue'}; 923 924 check_lockfile($var::NMZ{'lock2'}); 925 print _("Looking for indexing files...\n"); 926 @flist = find_target(@targets); 927 ($docid_base, @flist) = append_index(@flist) 928 if -f $var::NMZ{'r'}; 929 unless (@flist) { # if @flist is empty 930 print _("No files to index.\n"); 931 exit 0; 932 } 933 set_lockfile($var::NMZ{'lock2'}); 934 save_flist(@flist); 935 my $total_files_num = @flist; 936 937 return ($docid_base, $total_files_num); 938 } 939 940 sub save_flist(@) { 941 my @flist = @_; 942 return if (@flist == 0); 943 944 my $fh_flist = util::efopen(">$var::NMZ{'_flist'}"); 945 print $fh_flist join("\n", @flist), "\n"; 946 util::fclose($fh_flist); 947 } 948 949 sub require_modules() { 950 if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) { 951 require NKF || die "unable to require \"NKF\"\n"; 952 util::dprint(_("code conversion: using NKF module\n")); 953 $var::USE_NKF_MODULE = 1; 954 } 955 if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) { 956 require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n"; 957 util::dprint(_("wakati: using Text::Kakasi module\n")); 958 my $res = Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-oeuc', '-w'); 959 } 960 if (util::islang("ja") && $conf::WAKATI =~ /^module_chasen/) { 961 require Text::ChaSen || die "unable to require \"Text::ChaSen\"\n"; 962 util::dprint(_("wakati: using Text::ChaSen module\n")); 963 my @arg = ('-i', 'e', '-j', '-F', '%m '); 964 @arg = ('-i', 'e', '-j', '-F', '%m %H\\n') if $var::Opt{'noun'}; 965 my $res = Text::ChaSen::getopt_argv('chasen-perl', @arg); 966 } 967 if (util::islang("ja") && $conf::WAKATI =~ /^module_mecab/) { 968 require MeCab || die "unable to require \"MeCab\"\n"; 969 util::dprint(_("wakati: using MeCab module\n")); 970 } 971 } 972 973 sub check_lockfile ($) { 974 # warn if check file exists in case other process is running or abnormal 975 # stop execution (later is not the major purpose, though). 976 # This is mainly for early detection before longish find_target. 977 my ($file) = @_; 978 979 if (-f $file) { 980 print "$file "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n"); 981 exit 1; 982 } 983 } 984 985 sub set_lockfile ($) { 986 my ($file) = @_; 987 988 # make a lock file 989 if (-f $file) { 990 print "$file found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n"; 991 exit 1; 992 } else { 993 my $fh_lockfile = util::efopen(">$file"); 994 print $fh_lockfile "$$"; # save pid 995 util::fclose($fh_lockfile); 996 } 997 } 998 999 sub remove_lockfile ($) { 1000 my ($file) = @_; 1001 1002 # remove lock file 1003 unlink $file if -f $file; 1004 } 1005 1006 # check present index whether it is old type of not 1007 sub check_present_index () { 1008 if (-f $var::NMZ{'i'} && ! -f "$var::NMZ{'wi'}") 1009 { 1010 util::cdie(_("Present index is old type. it's unsupported.\n")); 1011 } 1012 } 1013 1014 # remain 1015 sub do_remain_job ($$$$) { 1016 my ($total_files_size, $docid_count, $key_count, $start_time) = @_; 1017 1018 if ($docid_count == 0) { 1019 # No files are indexed 1020 if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) { 1021 update_dateindex(); 1022 update_registry($docid_count); 1023 } 1024 } else { 1025 set_lockfile($var::NMZ{'lock'}); 1026 write_version(); 1027 write_body_msg(); 1028 write_tips_msg(); 1029 write_result_file(); 1030 update_field_index(); 1031 update_dateindex(); 1032 update_registry($docid_count); 1033 write_nmz_files(); 1034 make_slog_file(); 1035 remove_lockfile($var::NMZ{'lock'}); 1036 } 1037 make_headfoot_pages($docid_count, $key_count); 1038 put_log($total_files_size, $start_time, $docid_count, $key_count); 1039 util::remove_tmpfiles(); 1040 unlink $var::NMZ{'_flist'}; 1041 } 1042 1043 sub make_headfoot_pages($$) { 1044 my ($docid_count, $key_count) = @_; 1045 1046 for my $file (glob "$TEMPLATEDIR/NMZ.head*") { 1047 if ($file =~ m!^.*/NMZ\.head(\.[-\w\.]+)?$!){ 1048 my $suffix = $1 ? $1 : ''; 1049 make_headfoot("$var::NMZ{'head'}${suffix}", $docid_count, $key_count); 1050 } 1051 } 1052 for my $file (glob "$TEMPLATEDIR/NMZ.foot*") { 1053 if ($file =~ m!^.*/NMZ\.foot(\.[-\w\.]+)?$!){ 1054 my $suffix = $1 ? $1 : ''; 1055 make_headfoot("$var::NMZ{'foot'}${suffix}", $docid_count, $key_count); 1056 } 1057 } 1058 } 1059 1060 # Parse command line options. 1061 sub parse_options 1062 { 1063 if (@ARGV == 0) { 1064 show_mini_usage(); 1065 exit 1; 1066 } 1067 1068 my @targets = (); 1069 my $targets_loaded = 0; 1070 my @argv = @ARGV; 1071 my $cwd = cwd(); 1072 1073 my $opt_dummy = 0; 1074 my $opt_version = 0; 1075 my $opt_help = 0; 1076 my $opt_all = 0; 1077 my $opt_chasen = 0; 1078 my $opt_chasen_noun = 0; 1079 my $opt_kakasi = 0; 1080 my $opt_mecab = 0; 1081 my $opt_checkpoint_sub = 0; 1082 my $opt_show_config = 0; 1083 my $opt_mailnews = 0; 1084 my $opt_mhonarc = 0; 1085 my $opt_norc = 0; 1086 1087 my $opt_quiet = undef; 1088 my $opt_config = undef; 1089 my $output_dir = undef; 1090 my $update_index = undef; 1091 my $include_file = undef; 1092 my $target_list = undef; 1093 my $index_lang = undef; 1094 1095 my %opt_conf; 1096 1097 # Getopt::Long::Configure('bundling'); 1098 Getopt::Long::config('bundling'); 1099 GetOptions( 1100 '0|help' => \$opt_help, 1101 '1|exclude=s' => \$opt_conf{'EXCLUDE_PATH'}, 1102 '2|deny=s' => \$opt_conf{'DENY_FILE'}, 1103 '3|allow=s' => \$opt_conf{'ALLOW_FILE'}, 1104 '4|update=s' => \$update_index, 1105 '5|mhonarc' => \$opt_mhonarc, 1106 '6|mtime=s' => \$var::Opt{'mtime'}, 1107 '7|html-split' => \$var::Opt{'htmlsplit'}, 1108 'C|show-config' => \$opt_show_config, 1109 'E|no-edge-symbol' => \$var::Opt{'noedgesymbol'}, 1110 'F|target-list=s' => \$target_list, 1111 'G|no-okurigana' => \$var::Opt{'okurigana'}, 1112 'H|no-hiragana' => \$var::Opt{'hiragana'}, 1113 'I|include=s' => \$include_file, 1114 'K|no-symbol' => \$var::Opt{'nosymbol'}, 1115 'L|indexing-lang=s' => \$index_lang, 1116 'M|meta' => \$var::Opt{'meta'}, 1117 'O|output-dir=s' => \$output_dir, 1118 'S|checkpoint-sub' => \$opt_checkpoint_sub, 1119 'T|template-dir=s' => \$TEMPLATEDIR, 1120 'U|no-encode-uri' => \$var::Opt{'noencodeuri'} , 1121 'V|verbose' => \$var::Opt{'verbose'}, 1122 'Y|no-delete' => \$var::Opt{'nodelete'}, 1123 'Z|no-update' => \$var::Opt{'noupdate'}, 1124 'a|all' => \$opt_all, 1125 'b|use-mecab' => \$opt_mecab, 1126 'c|use-chasen' => \$opt_chasen, 1127 'd|debug' => \$var::Opt{'debug'}, 1128 'e|robots' => \$var::Opt{'robotexclude'}, 1129 'f|config=s' => \$opt_config, 1130 'h|mailnews' => \$opt_mailnews, 1131 'k|use-kakasi' => \$opt_kakasi, 1132 'm|use-chasen-noun' => \$opt_chasen_noun, 1133 'q|quiet' => \$opt_quiet, 1134 'r|replace=s' => \$ReplaceCode, 1135 's|checkpoint' => \$CheckPoint{'on'}, 1136 't|media-type=s' => \$MediaType, 1137 'u|uuencode' => \$opt_dummy, # for backward compat. 1138 'v|version' => \$opt_version, 1139 'x|no-heading-summary'=> \$var::Opt{'noheadabst'}, 1140 'z|check-filesize' => \$var::Opt{'checkfilesize'}, 1141 'decode-base64' => \$var::Opt{'decodebase64'}, 1142 'norc' => \$opt_norc, 1143 ); 1144 1145 if ($opt_quiet) { 1146 # Make STDOUT quiet by redirecting STDOUT to null device. 1147 my $devnull = util::devnull(); 1148 open(STDOUT, ">$devnull") || die "$devnull: $!"; 1149 } 1150 1151 if (defined $update_index) { 1152 unless (-d $update_index) { 1153 print _("No such index: "), "$update_index\n"; 1154 exit 1; 1155 } 1156 1157 my $orig_status = $var::NMZ{'status'}; 1158 $var::NMZ{'status'} = "$update_index/$var::NMZ{'status'}"; 1159 1160 my $argv = get_status("argv"); 1161 if (!defined $argv) { 1162 print _("No such index: "), "$update_index\n"; 1163 exit 1; 1164 } 1165 @ARGV = split /\t/, $argv; 1166 util::dprint(_("Inherited argv: ")."@ARGV\n"); 1167 1168 my $cwd = get_status("cwd"); 1169 if (!defined $cwd) { 1170 print _("No such index: "), "$update_index\n"; 1171 exit 1; 1172 } 1173 chdir $cwd; 1174 util::dprint(_("Inherited cwd: ")."$cwd\n"); 1175 1176 ($output_dir, @targets) = parse_options(); 1177 $output_dir = $update_index; 1178 $var::NMZ{'status'} = $orig_status; # See also change_filenames() 1179 return ($output_dir, @targets); 1180 } 1181 1182 if (!$opt_norc && !(defined $ENV{'MKNMZNORC'})){ 1183 load_rcfiles(); 1184 } 1185 if ($opt_config) { 1186 if (-f $opt_config) { 1187 util::vprint(_("Reading rcfile: ")); 1188 load_rcfile($ConfigFile = $opt_config); 1189 util::vprint(" $opt_config\n"); 1190 } 1191 } 1192 1193 if ($index_lang) { 1194 $util::LANG = $index_lang; 1195 util::dprint("Override indexing language: $util::LANG\n"); 1196 } 1197 1198 if ($opt_help) { 1199 show_usage(); 1200 exit 1; 1201 } 1202 1203 if ($opt_version) { 1204 show_version(); 1205 exit 1; 1206 } 1207 1208 load_filtermodules(); # to make effect $opt_config, $index_lang. 1209 postload_modules(); 1210 1211 foreach my $key (keys %opt_conf){ 1212 if (defined ($opt_conf{$key})) { 1213 ${*{$conf::{$key}}{SCALAR}} = $opt_conf{$key}; 1214 } 1215 } 1216 1217 if ($opt_mailnews) { 1218 $MediaType = 'message/rfc822'; 1219 } 1220 if ($opt_mhonarc) { 1221 $MediaType = 'text/html; x-type=mhonarc'; 1222 } 1223 if ($opt_all) { 1224 $conf::ALLOW_FILE = ".*"; 1225 } 1226 if ($opt_chasen) { 1227 $conf::WAKATI = $conf::CHASEN; 1228 $var::Opt{'noun'} = 0; 1229 } 1230 if ($opt_chasen_noun) { 1231 $conf::WAKATI = $conf::CHASEN_NOUN; 1232 $var::Opt{'noun'} = 1; 1233 } 1234 if ($opt_kakasi) { 1235 $conf::WAKATI = $conf::KAKASI; 1236 $var::Opt{'noun'} = 0; 1237 } 1238 if ($opt_mecab) { 1239 $conf::WAKATI = $conf::MECAB; 1240 $var::Opt{'noun'} = 0; 1241 } 1242 if ($include_file) { 1243 do $include_file; 1244 util::dprint("Included: $include_file\n"); 1245 } 1246 if ($target_list) { 1247 if ($CheckPoint{'continue'}) { 1248 @targets = ("dummy"); 1249 } else { 1250 @targets = load_target_list($target_list); 1251 util::dprint(_("Loaded: ")."$target_list\n"); 1252 } 1253 $targets_loaded = 1; 1254 } 1255 if ($opt_checkpoint_sub) { 1256 $CheckPoint{'on'} = 1; 1257 $CheckPoint{'continue'} = 1; 1258 @argv = grep {! /^-S$/} @argv; # remove -S 1259 } 1260 1261 if (defined $ReplaceCode) { 1262 my $orig = "/foo/bar/baz/quux.html"; 1263 $_ = $orig; 1264 eval $ReplaceCode; 1265 if ($@) { # eval error 1266 util::cdie(_("Invalid replace: ")."$ReplaceCode\n"); 1267 } 1268 util::dprint(_("Replace: ")."$orig -> $_\n"); 1269 } 1270 1271 if ($opt_show_config) { 1272 show_config(); 1273 exit 1; 1274 } 1275 1276 if (@ARGV == 0 && $targets_loaded == 0) { 1277 show_mini_usage(); 1278 exit 1; 1279 } 1280 1281 $output_dir = $cwd unless defined $output_dir; 1282 util::cdie("$output_dir: "._("invalid output directory\n")) 1283 unless (-d $output_dir && -w $output_dir); 1284 1285 if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") { 1286 util::win32_yen_to_slash(\$output_dir); 1287 } 1288 1289 # take remaining @ARGV as targets 1290 if (@ARGV > 0 && $targets_loaded == 0) { 1291 @targets = @ARGV ; 1292 } 1293 1294 # revert @ARGV 1295 # unshift @ARGV, splice(@argv, 0, @argv - @ARGV); 1296 @ARGV = @argv; 1297 1298 return ($output_dir, @targets); 1299 } 1300 1301 sub show_config () { 1302 print _("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles; 1303 print _("System: ") . "$English::OSNAME\n" if $English::OSNAME; 1304 print _("Namazu: ") . "$var::VERSION\n" if $var::VERSION; 1305 print _("Perl: ") . sprintf("%f\n", $English::PERL_VERSION); 1306 print _("File-MMagic: ") . "$File::MMagic::VERSION\n" if $File::MMagic::VERSION; 1307 print _("NKF: ") . "$conf::NKF\n" if $conf::NKF; 1308 print _("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI; 1309 print _("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN; 1310 print _("MeCab: ") . "$conf::MECAB\n" if $conf::MECAB; 1311 print _("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI; 1312 print _("Lang_Msg: ") . "$util::LANG_MSG\n"; 1313 print _("Lang: ") . "$util::LANG\n"; 1314 print _("Coding System: ") . "$CodingSystem\n"; 1315 print _("CONFDIR: ") . "$CONFDIR\n"; 1316 print _("LIBDIR: ") . "$LIBDIR\n"; 1317 print _("FILTERDIR: ") . "$FILTERDIR\n"; 1318 print _("TEMPLATEDIR: ") . "$TEMPLATEDIR\n"; 1319 1320 my @all_types = keys %var::Supported; 1321 my @supported = sort grep { $var::Supported{$_} eq "yes" } @all_types; 1322 1323 my $num_supported = @supported; 1324 my $num_unsupported = @all_types - @supported; 1325 print _("Supported media types: ") . "($num_supported)\n"; 1326 print _("Unsupported media types: ") . "($num_unsupported) " . _("marked with minus (-) probably missing application in your \$path.\n"); 1327 for my $mtype (sort keys %var::Supported) { 1328 my $yn = $var::Supported{$mtype}; 1329 if ($yn eq 'yes') { $yn = ' ' } else {$yn = '-'}; 1330 print "$yn $mtype"; 1331 if ($var::REQUIRE_ACTIONS{$mtype}){ 1332 print ": $var::REQUIRE_ACTIONS{$mtype}.pl"; 1333 } 1334 print "\n"; 1335 } 1336 } 1337 1338 sub load_target_list ($) { 1339 my ($file) = @_; 1340 my $fh_targets = util::efopen($file); 1341 my @targets = <$fh_targets>; 1342 util::fclose($fh_targets); 1343 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { 1344 foreach my $tmp (@targets){ 1345 $tmp =~ s/\r//g; 1346 util::win32_yen_to_slash(\$tmp); 1347 } 1348 } 1349 chomp @targets; 1350 return @targets; 1351 } 1352 1353 # convert a relative path into an absolute path 1354 sub absolute_path($$) { 1355 my ($cwd, $path) = @_; 1356 1357 $path =~ s!^\.$!\./!; 1358 $path =~ s!^\.[/\\]!$cwd/!; 1359 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { 1360 util::win32_yen_to_slash(\$path); 1361 if ($path =~ m!^//!) { 1362 } elsif ($path =~ m!^/[^/]!) { 1363 my $driveletter = $cwd; 1364 if ($driveletter =~ m!^([A-Z]:)!i){ 1365 $driveletter = $1; 1366 } 1367 $path = "$driveletter$path"; 1368 } elsif ($path !~ m!^[A-Z]:/!i) { 1369 $path = "$cwd/$path"; 1370 } 1371 } else { 1372 $path =~ s!^([^/])!$cwd/$1!; 1373 } 1374 return $path; 1375 } 1376 1377 sub find_target (@) { 1378 my @targets = @_; 1379 1380 my $cwd = cwd(); 1381 @targets = map { absolute_path($cwd, $_) } @targets; 1382 1383 # Convert \ to / with consideration for Shift_JIS encoding. 1384 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { 1385 foreach my $tmp (@targets){ 1386 util::win32_yen_to_slash(\$tmp); 1387 } 1388 } 1389 1390 # For reporting effects of --allow, --deny, --exclude, --mtime 1391 # options in --verbose mode. 1392 my %counts = (); 1393 $counts{'possible'} = 0; 1394 $counts{'excluded'} = 0; 1395 $counts{'too_old'} = 0; 1396 $counts{'too_new'} = 0; 1397 $counts{'not_allowed'} = 0; 1398 $counts{'denied'} = 0; 1399 1400 # Traverse directories. 1401 # This routine is not efficent but I prefer reliable logic. 1402 my @flist = (); 1403 my $start = time(); 1404 util::vprint(_("find_target starting: "). localtime($start). "\n"); 1405 while (@targets) { 1406 my $target = shift @targets; 1407 1408 if ($target eq '') { 1409 print STDERR "Warning: target contains empty line, skip it\n"; 1410 next; 1411 } 1412 1413 if (-f $target) { # target is a file. 1414 add_target($target, \@flist, \%counts); 1415 } elsif (-d $target) { # target is a directory. 1416 my @subtargets = (); 1417 # Find subdirectories in target directory 1418 # because File::Find::find() does not follow symlink. 1419 if (-l $target) { 1420 my $dh = new DirHandle($target); 1421 while (defined(my $ent = $dh->read)) { 1422 next if ($ent =~ /^\.{1,2}$/); 1423 if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") { 1424 next if ($ent =~ m!^($conf::DENY_DDN)$!i); 1425 my $tmp = $ent; 1426 util::win32_yen_to_slash(\$tmp); 1427 next if ($ent ne $tmp); 1428 } 1429 my $fname = "$target/$ent"; 1430 next if ($fname eq '.' || $fname eq '..'); 1431 if (-d $fname) { 1432 push(@subtargets, $fname); 1433 } else { 1434 add_target($fname, \@flist, \%counts); 1435 } 1436 } 1437 } else { 1438 @subtargets = ($target); 1439 } 1440 1441 # 1442 # Wanted routine for File::Find's find(). 1443 # 1444 my $wanted_closure = sub { 1445 my $fname = "$File::Find::dir/$_"; 1446 add_target($fname, \@flist, \%counts); 1447 }; 1448 1449 find($wanted_closure, @subtargets) if (@subtargets > 0); 1450 } else { 1451 print STDERR _("unsupported target: ") . $target; 1452 } 1453 } 1454 1455 # uniq @flist 1456 my %mark = (); 1457 @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist; 1458 1459 # Sort file names with consideration for numbers. 1460 @flist = map { $_->[0] } 1461 sort { $a->[1] cmp $b->[1] } 1462 map { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge; 1463 [ $_, $tmp ] } @flist; 1464 1465 my $elapsed = time() - $start ; 1466 $elapsed += 1 ; # to round up and avoid 0 1467 1468 # For --verbose option. 1469 report_find_target($elapsed, $#flist + 1, %counts); 1470 1471 return @flist; 1472 } 1473 1474 sub add_target ($\@\%) { 1475 my ($target, $flists_ref, $counts_ref) = @_; 1476 1477 if ($target =~ /[\n\r\t]/) { 1478 $target =~ s/[\n\r\t]//g; 1479 print STDERR "Warning: $target contains LF/CR/TAB chars, skip it\n"; 1480 return; # skip a file name containing LF/CR/TAB chars. 1481 } 1482 1483 return unless -f $target; # Only file is targeted. 1484 1485 $counts_ref->{'possible'}++; 1486 1487 unless (util::canopen($target)) { 1488 util::vprint(sprintf(_("Unreadable: %s"), $target)); 1489 $counts_ref->{'excluded'}++; 1490 return; 1491 } 1492 1493 1494 if (defined $conf::EXCLUDE_PATH && 1495 $target =~ /$conf::EXCLUDE_PATH/ ) 1496 { 1497 util::vprint(sprintf(_("Excluded: %s"), $target)); 1498 $counts_ref->{'excluded'}++; 1499 return; 1500 } 1501 1502 # 1503 # Do processing just like find's --mtime option. 1504 # 1505 if (defined $var::Opt{'mtime'}) { 1506 my $mtime = -M $_; 1507 if ($var::Opt{'mtime'} < 0) { 1508 1509 # This must be `>=' not `>' for consistency with find(1). 1510 if (int($mtime) >= - $var::Opt{'mtime'}) { 1511 util::vprint(sprintf(_("Too old: %s"), $target)); 1512 $counts_ref->{'too_old'}++; 1513 return; 1514 } 1515 } elsif ($var::Opt{'mtime'} > 0) { 1516 if ($var::Opt{'mtime'} =~ /^\+/) { 1517 if ((int($mtime) < $var::Opt{'mtime'})) { 1518 util::vprint(sprintf(_("Too new: %s"), $target)); 1519 $counts_ref->{'too_new'}++; 1520 return; 1521 } 1522 } else { 1523 if (int($mtime) != $var::Opt{'mtime'}) { 1524 if (int($mtime) > $var::Opt{'mtime'}) { 1525 util::vprint(sprintf(_("Too old: %s"),$target)); 1526 $counts_ref->{'too_old'}++; 1527 } else { 1528 util::vprint(sprintf(_("Too new: %s"),$target)); 1529 $counts_ref->{'too_new'}++; 1530 } 1531 return; 1532 } 1533 } 1534 } else { 1535 # $var::Opt{'mtime'} == 0 ; 1536 return; 1537 } 1538 } 1539 1540 # Extract the file name of the target. 1541 $target =~ m!^.*/([^/]+)$!; 1542 my $fname = $1; 1543 1544 if ($fname =~ m!^($conf::DENY_FILE)$!i ) { 1545 util::vprint(sprintf(_("Denied: %s"), $target)); 1546 $counts_ref->{'denied'}++; 1547 return; 1548 } 1549 if ($fname !~ m!^($conf::ALLOW_FILE)$!i) { 1550 util::vprint(sprintf(_("Not allowed: %s"), $target)); 1551 $counts_ref->{'not_allowed'}++; 1552 return; 1553 } else{ 1554 util::vprint(sprintf(_("Targeted: %s"), $target)); 1555 push @$flists_ref, $target; 1556 } 1557 1558 } 1559 1560 sub report_find_target ($$%) { 1561 my ($elapsed, $num_targeted, %counts) = @_; 1562 1563 util::vprint(_("find_target finished: ") . localtime(time()). "\n"); 1564 util::vprint(sprintf(_("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"), 1565 $num_targeted, $elapsed, 1566 $num_targeted /$elapsed)); 1567 util::vprint(sprintf(_(" Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"), 1568 $counts{'possible'}, 1569 $counts{'not_allowed'}, 1570 $counts{'denied'}, 1571 $counts{'excluded'})); 1572 util::vprint(sprintf(_(" MTIME too old: %d, MTIME too new: %d"), 1573 $counts{'too_old'}, 1574 $counts{'too_new'})); 1575 } 1576 1577 sub show_usage () { 1578 util::dprint(_("lang_msg: ")."$util::LANG_MSG\n"); 1579 util::dprint(_("lang: ")."$util::LANG\n"); 1580 1581 my $usage = $usage::USAGE; 1582 $usage = _($usage); 1583 printf "$usage", $var::VERSION, $var::TRAC_URI, $var::MAILING_ADDRESS; 1584 } 1585 1586 sub show_mini_usage () { 1587 print _("Usage: mknmz [options] <target>...\n"); 1588 print _("Try `mknmz --help' for more information.\n"); 1589 } 1590 1591 sub show_version () { 1592 print $usage::VERSION_INFO; 1593 } 1594 1595 # 1596 # check the file. No $msg is good. 1597 # 1598 sub check_file ($$$$$) { 1599 my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_; 1600 1601 my $msg = undef; 1602 if ($mtype =~ /; x-system=unsupported$/) { 1603 $mtype =~ s/; x-system=unsupported$//; 1604 $msg = _("Unsupported media type ")."($mtype)"._(" skipped."); 1605 } elsif ($mtype =~ /; x-error=file_size_max/) { 1606 $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ; 1607 } elsif ($mtype =~ /; x-error=.*$/) { 1608 $mtype =~ s/^.*; x-error=(.*)$/$1/; 1609 $msg = $mtype; 1610 } elsif ($mtype =~ /^x-system/) { 1611 $msg = _("system error occurred! ")."($mtype)"._(" skipped."); 1612 } elsif (! -e $cfile) { 1613 $msg = _("does NOT EXIST! skipped."); 1614 } elsif (! util::canopen($cfile)) { 1615 $msg = _("is NOT READABLE! skipped."); 1616 } elsif ($text_size == 0 || $cfile_size == 0) { 1617 $msg = _("is 0 size! skipped."); 1618 } elsif ($mtype =~ /^application\/octet-stream/) { 1619 $msg = _("may be a BINARY file! skipped."); 1620 } elsif ($cfile_size > $conf::FILE_SIZE_MAX) { 1621 $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ; 1622 } elsif ($text_size > $conf::TEXT_SIZE_MAX) { 1623 $msg = _("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX . ') < '. $text_size ; 1624 } 1625 1626 return $msg; 1627 } 1628 1629 1630 # 1631 # Write NMZ.version file. 1632 # 1633 sub write_version() { 1634 unless (-f $var::NMZ{'version'}) { 1635 my $fh = util::efopen(">$var::NMZ{'version'}"); 1636 print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n"; 1637 util::fclose($fh); 1638 } 1639 } 1640 1641 # 1642 # rename each temporary file to a real file name. 1643 # 1644 sub write_nmz_files () { 1645 util::Rename($var::NMZ{'_i'}, $var::NMZ{'i'}); 1646 util::Rename($var::NMZ{'_ii'}, $var::NMZ{'ii'}); 1647 util::Rename($var::NMZ{'_w'}, $var::NMZ{'w'}); 1648 util::Rename($var::NMZ{'_wi'}, $var::NMZ{'wi'}); 1649 util::Rename($var::NMZ{'_p'}, $var::NMZ{'p'}); 1650 util::Rename($var::NMZ{'_pi'}, $var::NMZ{'pi'}); 1651 } 1652 1653 # output NMZ.body 1654 sub write_body_msg () { 1655 for my $file (glob "$TEMPLATEDIR/NMZ.body*") { 1656 if ($file =~ m!^.*/NMZ\.body(\.[-\w\.]+)?$!){ 1657 my $suffix = $1 ? $1 : ''; 1658 write_message("$var::NMZ{'body'}${suffix}"); 1659 } 1660 } 1661 } 1662 1663 # output NMZ.tips 1664 sub write_tips_msg () { 1665 for my $file (glob "$TEMPLATEDIR/NMZ.tips*") { 1666 if ($file =~ m!^.*/NMZ\.tips(\.[-\w\.]+)?$!){ 1667 my $suffix = $1 ? $1 : ''; 1668 write_message("$var::NMZ{'tips'}${suffix}"); 1669 } 1670 } 1671 } 1672 1673 1674 # output NMZ.result.* 1675 sub write_result_file () { 1676 my $fname = "NMZ.result.normal"; 1677 1678 my @files = glob "$TEMPLATEDIR/NMZ.result.*"; 1679 1680 for my $file (@files) { 1681 $file =~ m!(NMZ\.result\.[^/]*)$!; 1682 my $target = "$var::OUTPUT_DIR/$1"; 1683 if (-f $target) { # already exist; 1684 next; 1685 } else { 1686 my $buf = util::readfile($file); 1687 my $fh_file = util::efopen(">$target"); 1688 print $fh_file $buf; 1689 util::fclose($fh_file); 1690 } 1691 } 1692 } 1693 1694 # write NMZ.body and etc. 1695 sub write_message ($) { 1696 my ($msgfile) = @_; 1697 1698 if (! -f $msgfile) { 1699 my ($template, $fname); 1700 1701 $msgfile =~ m!.*/(.*)$!; 1702 $fname = $1; 1703 $template = "$TEMPLATEDIR/$fname"; 1704 1705 if (-f $template) { 1706 my $buf = util::readfile($template); 1707 my $fh_output = util::efopen(">$msgfile"); 1708 print $fh_output $buf; 1709 util::fclose($fh_output); 1710 } 1711 } 1712 } 1713 1714 1715 # 1716 # Make the NMZ.slog file for logging. 1717 # 1718 sub make_slog_file () { 1719 if (! -f $var::NMZ{'slog'}) { 1720 my $fh = util::efopen(">$var::NMZ{'slog'}"); 1721 util::fclose($fh); 1722 undef $fh; 1723 chmod 0666, $var::NMZ{'slog'}; 1724 } 1725 { 1726 my $fh_slogfile = util::efopen(">>$var::NMZ{'slog'}"); 1727 util::fclose($fh_slogfile); 1728 } 1729 } 1730 1731 1732 # 1733 # Concatenate $CURRENTDIR to the head of each file. 1734 # 1735 sub change_filenames ($) { 1736 my $dir = $var::OUTPUT_DIR; 1737 1738 for my $key (sort keys %var::NMZ) { 1739 next if $key =~ /^_/; # exclude temporary file 1740 $var::NMZ{$key} = "$dir/$var::NMZ{$key}"; 1741 } 1742 1743 # temporary files 1744 for my $key (sort keys %var::NMZ) { 1745 if ($key =~ /^_/) { 1746 $var::NMZ{$key} = util::tmpnam($var::NMZ{$key}); 1747 } 1748 } 1749 1750 if ($var::Opt{'debug'}) { 1751 for my $key (sort keys %var::NMZ) { 1752 util::dprint("NMZ: $var::NMZ{$key}\n"); 1753 } 1754 } 1755 } 1756 1757 1758 # 1759 # Preparation processing for appending index files. 1760 # 1761 sub append_index (@) { 1762 my @flist = @_; 1763 1764 my $docid_base = 0; 1765 ($docid_base, @flist) = set_target_files(@flist); 1766 1767 unless (@flist) { # if @flist is empty 1768 if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) { 1769 set_lockfile($var::NMZ{'lock2'}); 1770 update_dateindex(); 1771 update_registry(0); 1772 make_headfoot_pages(0, get_total_keys()); 1773 put_log(0, 0, 0, get_total_keys()); 1774 make_headfoot_pages(get_status("files"), get_status("keys")); 1775 util::remove_tmpfiles(); 1776 } 1777 print _("No files to index.\n"); 1778 exit 0; 1779 } 1780 1781 $APPENDMODE = 1; 1782 # conserve files by copying 1783 copy($var::NMZ{'i'}, $var::NMZ{'_i'}); 1784 copy($var::NMZ{'w'}, $var::NMZ{'_w'}); 1785 copy($var::NMZ{'t'}, $var::NMZ{'_t'}) 1786 unless -f $var::NMZ{'_t'}; # preupdated ? 1787 copy($var::NMZ{'p'}, $var::NMZ{'_p'}); 1788 copy($var::NMZ{'pi'}, $var::NMZ{'_pi'}); 1789 1790 return ($docid_base, @flist); 1791 } 1792 1793 # 1794 # Set target files to @flist and return with the number of regiested files. 1795 # 1796 sub set_target_files() { 1797 my %rdocs; # 'rdocs' means 'registered documents' 1798 my @found_files = @_; 1799 1800 # Load the list of registered documents 1801 $rdocs{'name'} = load_registry(); 1802 1803 # Pick up overlapped documents and do marking 1804 my %mark1; 1805 my @overlapped_files; 1806 grep {$_ !~ /^\# / && $mark1{$_}++ } @{$rdocs{'name'}}; 1807 $rdocs{'overlapped'} = {}; # Prepare an anonymous hash. 1808 for my $overlapped (grep { $mark1{$_} } @found_files) { 1809 $rdocs{'overlapped'}{$overlapped} = 1; 1810 push @overlapped_files, $overlapped; 1811 }; 1812 1813 # Pick up not overlapped documents which are files to index. 1814 my @flist = grep { ! $mark1{$_} } @found_files; 1815 1816 if ($var::Opt{'noupdate'}) { 1817 return (scalar @{$rdocs{'name'}}, @flist); 1818 }; 1819 1820 # Load the date index. 1821 $rdocs{'mtime'} = load_dateindex(); 1822 1823 if (@{$rdocs{'mtime'}} == 0) { 1824 return (scalar @{$rdocs{'name'}}, @flist); 1825 }; 1826 1827 util::assert(@{$rdocs{'name'}} == @{$rdocs{'mtime'}}, 1828 "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!"); 1829 1830 # Pick up deleted documents and do marking 1831 # (registered in the NMZ.r but not existent in the filesystem) 1832 my @deleted_documents; 1833 unless ($var::Opt{'nodelete'}) { 1834 my %mark2; 1835 grep { $mark2{$_}++ } @found_files; 1836 for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} && 1837 ! $rdocs{'overlapped'}{$_} } 1838 @{$rdocs{'name'}}) 1839 { 1840 $rdocs{'deleted'}{$deleted} = 1; 1841 push @deleted_documents, $deleted; 1842 } 1843 } 1844 1845 # check filesize 1846 if ($var::Opt{'checkfilesize'}) { 1847 $rdocs{'size'} = load_sizefield(); 1848 } 1849 1850 # Pick up updated documents and set the missing number for deleted files. 1851 my @updated_documents = pickup_updated_documents(\%rdocs); 1852 1853 # Append updated files to the list of files to index. 1854 if (@updated_documents) { 1855 push @flist, @updated_documents; 1856 } 1857 1858 # Remove duplicates. 1859 my %seen = (); 1860 @flist = grep { ! $seen{$_}++ } @flist; 1861 1862 util::dprint(_("\n\n== found files ==\n"), join("\n", @found_files), "\n"); 1863 util::dprint(_("\n\n== registered documents ==\n"), join("\n", @{$rdocs{'name'}}), "\n"); 1864 util::dprint(_("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n"); 1865 util::dprint(_("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n"); 1866 util::dprint(_("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n"); 1867 util::dprint(_("\n\n== files to index ==\n"), join("\n", @flist), "\n"); 1868 1869 # Update NMZ.t with the missing number infomation and 1870 # append updated files and deleted files to NMZ.r with leading '# ' 1871 if (@updated_documents || @deleted_documents) { 1872 $DeletedFilesCount = 0; 1873 $UpdatedFilesCount = 0; 1874 $UpdatedFilesCount += @updated_documents; 1875 # $DeletedFilesCount += @updated_documents; 1876 $DeletedFilesCount += @deleted_documents; 1877 preupdate_dateindex(@{$rdocs{'mtime'}}); 1878 preupdate_registry(@updated_documents, @deleted_documents); 1879 } 1880 1881 # Return the number of registered documents and list of files to index. 1882 return (scalar @{$rdocs{'name'}}, @flist); 1883 } 1884 1885 sub preupdate_registry(@) { 1886 my (@list) = @_; 1887 1888 my $fh_registry = util::efopen(">$var::NMZ{'_r'}"); 1889 @list = grep { s/(.*)/\# $1\n/ } @list; 1890 print $fh_registry @list; 1891 print $fh_registry &_("## deleted: ") . util::rfc822time(time()) . "\n\n"; 1892 util::fclose($fh_registry); 1893 } 1894 1895 sub preupdate_dateindex(@) { 1896 my @mtimes = @_; 1897 1898 # Since rewriting the entire file, it is not efficient, 1899 # but simple and reliable. this would be revised in the future. 1900 my $fh_dateindex = util::efopen(">$var::NMZ{'_t'}"); 1901 # print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n"; 1902 print $fh_dateindex pack("N*", @mtimes); 1903 util::fclose($fh_dateindex); 1904 } 1905 1906 sub update_registry ($) { 1907 my ($docid_count) = @_; 1908 1909 { 1910 my $fh_registry = util::efopen(">>$var::NMZ{'r'}"); 1911 my $fh_registry_ = util::efopen($var::NMZ{'_r'}); 1912 while (defined(my $line = <$fh_registry_>)) { 1913 print $fh_registry $line; 1914 } 1915 if ($docid_count > 0) { 1916 print $fh_registry &_("## indexed: ") . util::rfc822time(time()) . "\n\n"; 1917 } 1918 util::fclose($fh_registry_) if (defined $fh_registry_); 1919 util::fclose($fh_registry); 1920 } 1921 unlink $var::NMZ{'_r'}; 1922 } 1923 1924 sub update_dateindex () { 1925 util::Rename($var::NMZ{'_t'}, $var::NMZ{'t'}); 1926 } 1927 1928 sub update_field_index () { 1929 my @list = glob "$var::NMZ{'field'}.*.tmp"; 1930 for my $tmp (@list) { 1931 if ($tmp =~ m!((^.*/NMZ\.field\..+?(?:\.i)?)\.tmp$)!) { 1932 my $fname_tmp = $1; 1933 my $fname_out = $2; 1934 { 1935 my $fh_field = util::efopen(">>$fname_out"); 1936 my $fh_tmp = util::efopen($fname_tmp); 1937 1938 while (defined(my $line = <$fh_tmp>)) { 1939 print $fh_field $line; 1940 } 1941 util::fclose($fh_tmp) if (defined $fh_tmp); 1942 util::fclose($fh_field); 1943 } 1944 unlink $fname_tmp; 1945 } else { 1946 util::cdie(_("update_field_index: ")."@list"); 1947 } 1948 } 1949 } 1950 1951 sub pickup_updated_documents (\%) { 1952 my ($rdocs_ref) = @_; 1953 my @updated_documents = (); 1954 1955 # To avoid duplicated outputs caused by --html-split support. 1956 my %printed = (); 1957 my $i = 0; 1958 for my $cfile (@{$rdocs_ref->{'name'}}) { 1959 if (defined($rdocs_ref->{'deleted'}{$cfile})) { 1960 unless ($printed{$cfile}) { 1961 print "$cfile " . _("was deleted!\n"); 1962 $printed{$cfile} = 1; 1963 } 1964 $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number. 1965 } elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) { 1966 my $cfile_mtime = (stat($cfile))[9]; 1967 my $rfile_mtime = $rdocs_ref->{'mtime'}[$i]; 1968 my ($cfile_size, $rfile_size); 1969 if ($var::Opt{'checkfilesize'}) { 1970 $cfile_size = (stat($cfile))[7]; 1971 $rfile_size = $rdocs_ref->{'size'}[$i]; 1972 } 1973 1974 if ($rfile_mtime != $cfile_mtime || 1975 ($var::Opt{'checkfilesize'} && ($cfile_size != $rfile_size))) { 1976 # The file is updated! 1977 unless ($printed{$cfile}) { 1978 print "$cfile " . _("was updated!\n"); 1979 $printed{$cfile} = 1; 1980 } 1981 push(@updated_documents, $cfile); 1982 $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number. 1983 } 1984 } 1985 $i++; 1986 } 1987 1988 return @updated_documents 1989 } 1990 1991 sub load_dateindex() { 1992 my $fh_dateindex = util::efopen($var::NMZ{'t'}); 1993 1994 my $size = -s $var::NMZ{'t'}; 1995 my $buf = ""; 1996 read($fh_dateindex, $buf, $size); 1997 my @list = unpack("N*", $buf); # load date index 1998 # print "\nload_dateindex\n", join("\n", @list), "\n\n"; 1999 2000 util::fclose($fh_dateindex); 2001 return [ @list ]; 2002 } 2003 2004 sub load_registry () { 2005 my $fh_registry = util::efopen($var::NMZ{'r'}); 2006 2007 my @list = (); 2008 my %deleted = (); 2009 my @registered = (); 2010 2011 while (defined(my $line = <$fh_registry>)) { 2012 chomp($line); 2013 next if $line =~ /^\s*$/; # an empty line 2014 next if $line =~ /^##/; # a comment 2015 if ($line =~ s/^\#\s+//) { # deleted document 2016 $deleted{$line}++; 2017 } else { 2018 # Remove HTML's anchor generated by --html-split option. 2019 $line =~ s/\t.*$//g; 2020 push @registered, $line; 2021 } 2022 } 2023 2024 util::fclose($fh_registry) if (defined $fh_registry); 2025 2026 # Exclude deleted documents. 2027 for my $doc (@registered) { 2028 if ($deleted{$doc}) { 2029 push @list, "# $doc"; 2030 $deleted{$doc}--; 2031 } else { 2032 push @list, $doc; 2033 } 2034 } 2035 2036 return [ @list ]; 2037 } 2038 2039 # get file size information from NMZ.field.size 2040 sub load_sizefield() { 2041 my $fh_sizefield = util::efopen($var::NMZ{'field'} . '.size'); 2042 return [] unless defined $fh_sizefield; 2043 my $line; 2044 my @ret = (); 2045 while (defined($line = <$fh_sizefield>)) { 2046 chomp $line; 2047 push @ret, $line; 2048 } 2049 util::fclose($fh_sizefield) if (defined $fh_sizefield); 2050 return \@ret; 2051 } 2052 2053 sub get_total_keys() { 2054 my $keys = get_status("keys"); 2055 $keys =~ s/,//g if (defined $keys); 2056 $keys = 0 unless defined $keys; 2057 return $keys; 2058 } 2059 2060 sub get_total_files() { 2061 my $files = get_status("files"); 2062 $files =~ s/,//g if (defined $files); 2063 $files = 0 unless defined $files; 2064 return $files; 2065 } 2066 2067 sub get_status($) { 2068 my ($key) = @_; 2069 2070 my $fh = util::fopen($var::NMZ{'status'}); 2071 return undef unless defined $fh; 2072 2073 while (defined(my $line = <$fh>)) { 2074 if ($line =~ /^$key\s+(.*)$/) { 2075 util::dprint("status: $key = $1\n"); 2076 $fh->close; 2077 return $1; 2078 } 2079 } 2080 util::fclose($fh) if (defined $fh); 2081 return undef; 2082 } 2083 2084 sub put_total_files($) { 2085 my ($number) = @_; 2086 $number =~ tr/,//d; 2087 put_status("files", $number); 2088 } 2089 2090 sub put_total_keys($) { 2091 my ($number) = @_; 2092 $number =~ tr/,//d; 2093 put_status("keys", $number); 2094 } 2095 2096 sub put_status($$) { 2097 my ($key, $value) = @_; 2098 2099 # remove NMZ.status file if the file has a previous value. 2100 unlink $var::NMZ{'status'} if defined get_status($key); 2101 2102 my $fh = util::efopen(">> $var::NMZ{'status'}"); 2103 print $fh "$key $value\n"; 2104 util::fclose($fh); 2105 } 2106 2107 # do logging 2108 sub put_log ($$$$) { 2109 my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_; 2110 2111 my $date = localtime; 2112 my $added_files_count = $docid_count; 2113 my $deleted_documents_count = $DeletedFilesCount; 2114 my $updated_documents_count = $UpdatedFilesCount; 2115 my $total_files_count = get_total_files() + $docid_count 2116 - $DeletedFilesCount - $UpdatedFilesCount; 2117 my $added_keys_count = 0; 2118 $added_keys_count = $total_keys_count - get_total_keys(); 2119 2120 my $processtime = time - $start_time; 2121 $processtime = 0 if $start_time == 0; 2122 $total_files_size = $total_files_size; 2123 $total_keys_count = $total_keys_count; 2124 2125 my @logmsgs = (); 2126 if ($APPENDMODE) { 2127 push @logmsgs, N_("[Append]"); 2128 } else { 2129 push @logmsgs, N_("[Base]"); 2130 } 2131 push @logmsgs, N_("Date:"), "$date" if $date; 2132 push @logmsgs, N_("Added Documents:"), util::commas("$added_files_count") 2133 if $added_files_count; 2134 push @logmsgs, N_("Deleted Documents:"), 2135 util::commas("$deleted_documents_count") if $deleted_documents_count; 2136 push @logmsgs, N_("Updated Documents:"), 2137 util::commas("$updated_documents_count") if $updated_documents_count; 2138 push @logmsgs, N_("Size (bytes):"), util::commas("$total_files_size") 2139 if $total_files_size; 2140 push @logmsgs, N_("Total Documents:"), util::commas("$total_files_count") 2141 if $total_files_count; 2142 push @logmsgs, N_("Added Keywords:"), util::commas("$added_keys_count") 2143 if $added_keys_count; 2144 push @logmsgs, N_("Total Keywords:"), util::commas("$total_keys_count") 2145 if $total_keys_count; 2146 push @logmsgs, N_("Wakati:"), "$conf::WAKATI" if $conf::WAKATI; 2147 push @logmsgs, N_("Time (sec):"), util::commas("$processtime") 2148 if $processtime; 2149 push @logmsgs, N_("File/Sec:"), sprintf "%.2f", 2150 (($added_files_count + $updated_documents_count) / $processtime) 2151 if $processtime; 2152 push @logmsgs, N_("System:"), "$English::OSNAME" if $English::OSNAME; 2153 push @logmsgs, N_("Perl:"), sprintf("%f", $English::PERL_VERSION); 2154 push @logmsgs, N_("Namazu:"), "$var::VERSION" if $var::VERSION; 2155 2156 my $log_for_file = ""; 2157 2158 my $msg = shift @logmsgs; # [Base] or [Append] 2159 # To stdout, use gettext. 2160 print _($msg), "\n"; 2161 # To log file, do not use gettext. 2162 $log_for_file = $msg . "\n"; 2163 while (@logmsgs) { 2164 my $field = shift @logmsgs; 2165 my $value = shift @logmsgs; 2166 printf "%-20s %s\n", _($field), "$value"; 2167 $log_for_file .= sprintf "%-20s %s\n", $field, "$value"; 2168 } 2169 print "\n"; 2170 $log_for_file .= "\n"; 2171 2172 put_log_to_logfile($log_for_file); 2173 put_total_files($total_files_count); 2174 put_total_keys($total_keys_count); 2175 2176 my $argv = join "\t", @ARGV; 2177 my $cwd = cwd(); 2178 put_status("argv", $argv); 2179 put_status("cwd", $cwd); 2180 } 2181 2182 sub put_log_to_logfile ($) { 2183 my ($logmsg) = @_; 2184 my $fh_logfile = util::efopen(">>$var::NMZ{'log'}"); 2185 print $fh_logfile $logmsg; 2186 util::fclose($fh_logfile); 2187 } 2188 2189 sub get_year() { 2190 my $year = (localtime)[5] + 1900; 2191 2192 return $year; 2193 } 2194 2195 # Compose NMZ.head and NMZ.foot. Prepare samples if necessary. 2196 # Insert $docid_count, $key_count, and $month/$day/$year respectively. 2197 sub make_headfoot ($$$) { 2198 my ($file, $docid_count, $key_count) = @_; 2199 2200 my $day = sprintf("%02d", (localtime)[3]); 2201 my $month = sprintf("%02d", (localtime)[4] + 1); 2202 my $year = get_year(); 2203 my $buf = ""; 2204 2205 if (-f $file) { 2206 $buf = util::readfile($file); 2207 } else { 2208 $file =~ m!.*/(.*)$!; 2209 my $fname = $1; 2210 my $template = "$TEMPLATEDIR/$fname"; 2211 2212 if (-f $template) { 2213 $buf = util::readfile($template); 2214 } else { 2215 return; 2216 } 2217 } 2218 2219 my $fh_file = util::efopen(">$file"); 2220 2221 if ($buf =~ /(<!-- FILE -->)\s*(.*)\s*(<!-- FILE -->)/) { 2222 my $total_files_count = util::commas(get_total_files() + $docid_count 2223 - $DeletedFilesCount - $UpdatedFilesCount); 2224 $buf =~ s/(<!-- FILE -->)(.*)(<!-- FILE -->)/$1 $total_files_count $3/; 2225 2226 } 2227 if ($buf =~ /(<!-- KEY -->)\s*(.*)\s*(<!-- KEY -->)/) { 2228 my $tmp = $2; 2229 $tmp =~ tr/,//d; 2230 $tmp = $key_count; 2231 $tmp = util::commas($tmp); 2232 $buf =~ s/(<!-- KEY -->)(.*)(<!-- KEY -->)/$1 $tmp $3/; 2233 } 2234 $buf =~ s#(<!-- DATE -->)(.*)(<!-- DATE -->)#$1 $year-$month-$day $3#gs; 2235 $buf =~ s/(<!-- VERSION -->)(.*)(<!-- VERSION -->)/$1 v$var::VERSION $3/gs; 2236 $buf =~ s{(<!-- ADDRESS -->)(.*)(<!-- ADDRESS -->)} 2237 {$1\n<a href="mailto:$conf::ADDRESS">$conf::ADDRESS</a>\n$3}gs; 2238 $buf =~ s{(<!-- LINK-REV-MADE -->)(.*)(<!-- LINK-REV-MADE -->)} 2239 {$1\n<link rev="made" href="mailto:$conf::ADDRESS">\n$3}gs; 2240 2241 print $fh_file $buf; 2242 util::fclose($fh_file); 2243 } 2244 2245 # Make phrase hashes for NMZ.p 2246 # Handle two words each for calculating a hash value ranged 0-65535. 2247 sub make_phrase_hash ($$$) { 2248 my ($docid_count, $docid_base, $contref) = @_; 2249 2250 my %tmp = (); 2251 $$contref =~ s!\x7f */? *\d+ *\x7f!!g; # remove tags of weight 2252 $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols 2253 my @words = split(/\s+/, $$contref); 2254 @words = grep {$_ ne ""} @words; # remove empty words 2255 my $word_b = shift @words; 2256 my $docid = $docid_count + $docid_base; 2257 for my $word (@words) { 2258 next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX); 2259 my $hash = hash($word_b . $word); 2260 unless (defined $tmp{$hash}) { 2261 $tmp{$hash} = 1; 2262 $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash}; 2263 $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash}); 2264 # util::dprint("<$word_b, $word> $hash\n"); 2265 $PhraseHashLast{$hash} = $docid; 2266 } 2267 $word_b = $word; 2268 } 2269 } 2270 2271 # Construct NMZ.p and NMZ.pi file. this processing is rather complex. 2272 sub write_phrase_hash () { 2273 write_phrase_hash_sub(); 2274 util::Rename($var::NMZ{'__p'}, $var::NMZ{'_p'}); 2275 util::Rename($var::NMZ{'__pi'}, $var::NMZ{'_pi'}); 2276 } 2277 2278 sub write_phrase_hash_sub () { 2279 my $opened = 0; 2280 2281 return 0 if %PhraseHash eq ''; # namazu-devel-ja #3146 2282 util::dprint(_("doing write_phrase_hash() processing.\n")); 2283 2284 my $fh_tmp_pi = util::efopen(">$var::NMZ{'__pi'}"); 2285 my $fh_tmp_p = util::efopen(">$var::NMZ{'__p'}"); 2286 2287 my $fh_phrase = util::fopen($var::NMZ{'_p'}); 2288 my $fh_phraseindex = undef; 2289 if ($fh_phrase) { 2290 $fh_phraseindex = util::efopen($var::NMZ{'_pi'}); 2291 $opened = 1; 2292 } 2293 2294 my $ptr = 0; 2295 for (my $i = 0; $i < 65536; $i++) { 2296 2297 my $baserecord = ""; 2298 my $baseleng = 0; 2299 2300 if ($opened) { 2301 my $tmp = 0; 2302 read($fh_phraseindex, $tmp, $var::INTSIZE); 2303 $tmp = unpack("N", $tmp); 2304 if ($tmp != 0xffffffff) { # 0xffffffff 2305 $baseleng = readw($fh_phrase); 2306 read($fh_phrase, $baserecord, $baseleng); 2307 } 2308 } 2309 if (defined($PhraseHash{$i})) { 2310 if ($baserecord eq "") { 2311 print $fh_tmp_pi pack("N", $ptr); 2312 my $record = $PhraseHash{$i}; 2313 my $n2 = length($record); 2314 my $data = pack("w", $n2) . $record; 2315 print $fh_tmp_p $data; 2316 $ptr += length($data); 2317 } else { 2318 print $fh_tmp_pi pack("N", $ptr); 2319 my $record = $PhraseHash{$i}; 2320 my $last_docid = get_last_docid($baserecord, 1); 2321 my $adjrecord = adjust_first_docid($record, $last_docid); 2322 check_records(\$record, \$baserecord, 1) unless defined $adjrecord; # namazu-bugs-ja#31 2323 $record = $adjrecord; 2324 my $n2 = length($record) + $baseleng; 2325 my $data = pack("w", $n2) . $baserecord . $record; 2326 print $fh_tmp_p $data; 2327 $ptr += length($data); 2328 } 2329 } else { 2330 if ($baserecord eq "") { 2331 # if $baserecord has no data, set to 0xffffffff 2332 print $fh_tmp_pi pack("N", 0xffffffff); 2333 } else { 2334 print $fh_tmp_pi pack("N", $ptr); 2335 my $data = pack("w", $baseleng) . $baserecord; 2336 print $fh_tmp_p $data; 2337 $ptr += length($data); 2338 } 2339 } 2340 } 2341 2342 if ($opened) { 2343 util::fclose($fh_phraseindex); 2344 } 2345 if (defined $fh_phrase) { 2346 util::fclose($fh_phrase); 2347 } 2348 util::fclose($fh_tmp_p); 2349 util::fclose($fh_tmp_pi); 2350 2351 %PhraseHash = (); 2352 %PhraseHashLast = (); 2353 } 2354 2355 # Dr. Knuth's ``hash'' from (UNIX MAGAZINE May 1998) 2356 sub hash ($) { 2357 my ($word) = @_; 2358 2359 my $hash = 0; 2360 for (my $i = 0; $word ne ""; $i++) { 2361 $hash ^= $Seed[$i & 0x03][ord($word)]; 2362 $word = substr $word, 1; 2363 # $word =~ s/^.//; is slower 2364 } 2365 return $hash & 65535; 2366 } 2367 2368 # Count frequencies of words. 2369 sub count_words ($$$$) { 2370 my ($docid_count, $docid_base, $contref, $kanji) = @_; 2371 my (@tmp); 2372 2373 # Normalize into small letter. 2374 $$contref =~ tr/A-Z/a-z/; 2375 2376 # Remove control char. 2377 $$contref =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a/ /; 2378 2379 # It corresponds to -j option of ChaSen. 2380 $$contref =~ s/^[ \t\f]+//gm; # except "\r\n" 2381 $$contref =~ s/[ \t\f]+$//gm; # except "\r\n" 2382 $$contref =~ s/([a-z])-\n([a-z])/$1$2/gsi; # for hyphenation 2383 if (util::islang("ja")) { 2384 $$contref =~ s/([\x80-\xff])\n([\x80-\xff])/$1$2/gs; 2385 $$contref =~ s/(。」|。「)/$1\n/gs; 2386 } 2387 $$contref =~ s/\n+/\n/gs; 2388 2389 # Do wakatigaki if necessary. 2390 if (util::islang("ja")) { 2391 wakati::wakatize_japanese($contref) if $kanji; 2392 } 2393 2394 my $part1 = ""; 2395 my $part2 = ""; 2396 if ($$contref =~ /\x7f/) { 2397 $part1 = substr $$contref, 0, index($$contref, "\x7f"); 2398 $part2 = substr $$contref, index($$contref, "\x7f"); 2399 # $part1 = $PREMATCH; # $& and friends are not efficient 2400 # $part2 = $MATCH . $POSTMATCH; 2401 } else { 2402 $part1 = $$contref; 2403 $part2 = ""; 2404 } 2405 2406 # do scoring 2407 my %word_count = (); 2408 $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f! 2409 wordcount_sub($2, $1, \%word_count)!ge; 2410 wordcount_sub($part1, 1, \%word_count); 2411 2412 # Add them to whole index 2413 my $docid = $docid_count + $docid_base; 2414 for my $word (keys(%word_count)) { 2415 next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX); 2416 $KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word}; 2417 $KeyIndex{$word} .= pack("w2", 2418 $docid - $KeyIndexLast{$word}, 2419 $word_count{$word}); 2420 $KeyIndexLast{$word} = $docid; 2421 } 2422 } 2423 2424 # 2425 # Count words and do score weighting 2426 # 2427 sub wordcount_sub ($$\%) { 2428 my ($text, $weight, $word_count) = @_; 2429 2430 # Remove all symbols when -K option is specified. 2431 $text =~ tr/\xa1-\xfea-z0-9/ /c if $var::Opt{'nosymbol'}; 2432 2433 # Count frequencies of words in a current document. 2434 # Handle symbols as follows. 2435 # 2436 # tcp/ip -> tcp/ip, tcp, ip 2437 # (tcp/ip) -> (tcp/ip), tcp/ip, tcp, ip 2438 # ((tcpi/ip)) -> ((tcp/ip)), (tcp/ip), tcp 2439 # 2440 # Don't do processing for nested symbols. 2441 # NOTE: When -K is specified, all symbols are already removed. 2442 2443 my @words = split /\s+/, $text; 2444 for my $word (@words) { 2445 next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX); 2446 if ($var::Opt{'noedgesymbol'}) { 2447 # remove symbols at both ends 2448 $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g; 2449 } 2450 $word_count->{$word} = 0 unless defined($word_count->{$word}); 2451 $word_count->{$word} += $weight; 2452 unless ($var::Opt{'nosymbol'}) { 2453 if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) { 2454 $word_count->{$1} = 0 unless defined($word_count->{$1}); 2455 $word_count->{$1} += $weight; 2456 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/; 2457 } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) { 2458 $word_count->{$1} = 0 unless defined($word_count->{$1}); 2459 $word_count->{$1} += $weight; 2460 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/; 2461 } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) { 2462 $word_count->{$1} = 0 unless defined($word_count->{$1}); 2463 $word_count->{$1} += $weight; 2464 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/; 2465 } 2466 my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word) 2467 if $word =~ /[^\xa1-\xfea-z_0-9]/; 2468 for my $tmp (@words_) { 2469 next if $tmp eq ""; 2470 $word_count->{$tmp} = 0 unless defined($word_count->{$tmp}); 2471 $word_count->{$tmp} += $weight; 2472 } 2473 @words_ = (); 2474 } 2475 } 2476 return ""; 2477 } 2478 2479 # Construct NMZ.i and NMZ.ii file. this processing is rather complex. 2480 sub write_index () { 2481 my $key_count = write_index_sub(); 2482 util::Rename($var::NMZ{'__i'}, $var::NMZ{'_i'}); 2483 util::Rename($var::NMZ{'__w'}, $var::NMZ{'_w'}); 2484 2485 return $key_count; 2486 } 2487 2488 # readw: read one pack 'w' word. 2489 # This code was contributed by <furukawa@tcp-ip.or.jp>. 2490 sub readw ($) { 2491 my $fh = shift; 2492 my $ret = ''; 2493 my $c; 2494 2495 while (read($fh, $c, 1)){ 2496 $ret .= $c; 2497 last unless 0x80 & ord $c; 2498 } 2499 return unpack('w', $ret); 2500 } 2501 2502 sub get_last_docid ($$) { 2503 my ($record, $step) = @_; 2504 my (@data) = unpack 'w*', $record; 2505 2506 my $sum = 0; 2507 for (my $i = 0; $i < @data; $i += $step) { 2508 $sum += $data[$i]; 2509 } 2510 my $leng = @data / $step; 2511 return $sum; 2512 } 2513 2514 sub adjust_first_docid ($$) { 2515 my ($record, $last_docid) = @_; 2516 my (@data) = unpack 'w*', $record; 2517 2518 $data[0] = $data[0] - $last_docid; 2519 return undef if ($data[0] < 0); # namazu-bug-ja#31 2520 $record = pack 'w*', @data; 2521 return $record; 2522 } 2523 2524 sub write_index_sub () { 2525 my @words = sort keys(%KeyIndex); 2526 return 0 if $#words == -1; 2527 2528 my $cnt = 0; 2529 my $ptr_i = 0; 2530 my $ptr_w = 0; 2531 my $key_count = 0; 2532 my $baserecord = ""; 2533 2534 util::dprint(_("doing write_index() processing.\n")); 2535 my $fh_tmp_i = util::efopen(">$var::NMZ{'__i'}"); 2536 my $fh_tmp_w = util::efopen(">$var::NMZ{'__w'}"); 2537 my $fh_i = util::fopen($var::NMZ{'_i'}); 2538 my $fh_ii = util::efopen(">$var::NMZ{'_ii'}"); 2539 my $fh_w = util::fopen($var::NMZ{'_w'}); 2540 my $fh_wi = util::efopen(">$var::NMZ{'_wi'}"); 2541 2542 if ($fh_w) { 2543 FOO: 2544 while (defined(my $line = <$fh_w>)) { 2545 chop $line; 2546 my $current_word = $line; 2547 2548 my $baseleng = readw($fh_i); 2549 read($fh_i, $baserecord, $baseleng); 2550 2551 for (; $cnt < @words; $cnt++) { 2552 last unless $words[$cnt] le $current_word; 2553 my $record = $KeyIndex{$words[$cnt]}; 2554 my $leng = length($record); 2555 2556 if ($current_word eq $words[$cnt]) { 2557 my $last_docid = get_last_docid($baserecord, 2); 2558 my $adjrecord = adjust_first_docid($record, $last_docid); 2559 check_records(\$record, \$baserecord, 2) unless defined $adjrecord; # namazu-bugs-ja#31 2560 $record = $adjrecord; 2561 $leng = length($record); # re-measure 2562 my $tmp = pack("w", $leng + $baseleng); 2563 2564 my $data_i = "$tmp$baserecord$record"; 2565 my $data_w = "$current_word\n"; 2566 print $fh_tmp_i $data_i; 2567 print $fh_tmp_w $data_w; 2568 print $fh_ii pack("N", $ptr_i); 2569 print $fh_wi pack("N", $ptr_w); 2570 $ptr_i += length($data_i); 2571 $ptr_w += length($data_w); 2572 $key_count++; 2573 2574 $cnt++; 2575 next FOO; 2576 } else { 2577 my $tmp = pack("w", $leng); 2578 my $data_i = "$tmp$record"; 2579 my $data_w = "$words[$cnt]\n"; 2580 print $fh_tmp_i $data_i; 2581 print $fh_tmp_w $data_w; 2582 print $fh_ii pack("N", $ptr_i); 2583 print $fh_wi pack("N", $ptr_w); 2584 $ptr_i += length($data_i); 2585 $ptr_w += length($data_w); 2586 $key_count++; 2587 } 2588 } 2589 my $tmp = pack("w", $baseleng); 2590 my $data_i = "$tmp$baserecord"; 2591 my $data_w = "$current_word\n"; 2592 print $fh_tmp_i $data_i; 2593 print $fh_tmp_w $data_w; 2594 print $fh_ii pack("N", $ptr_i); 2595 print $fh_wi pack("N", $ptr_w); 2596 $ptr_i += length($data_i); 2597 $ptr_w += length($data_w); 2598 $key_count++; 2599 } 2600 } 2601 while ($cnt < @words) { 2602 my $leng = length($KeyIndex{$words[$cnt]}); 2603 my $tmp = pack("w", $leng); 2604 my $record = $KeyIndex{$words[$cnt]}; 2605 2606 my $data_i = "$tmp$record"; 2607 my $data_w = "$words[$cnt]\n"; 2608 print $fh_tmp_i $data_i; 2609 print $fh_tmp_w $data_w; 2610 print $fh_ii pack("N", $ptr_i); 2611 print $fh_wi pack("N", $ptr_w); 2612 $ptr_i += length($data_i); 2613 $ptr_w += length($data_w); 2614 $key_count++; 2615 $cnt++; 2616 } 2617 %KeyIndex = (); 2618 %KeyIndexLast = (); 2619 2620 util::fclose($fh_wi); 2621 util::fclose($fh_w) if (defined $fh_w); 2622 util::fclose($fh_ii); 2623 util::fclose($fh_i) if (defined $fh_i); 2624 util::fclose($fh_tmp_w); 2625 util::fclose($fh_tmp_i); 2626 2627 return $key_count; 2628 } 2629 2630 # 2631 # Decide the media type. 2632 # FIXME: Very ad hoc. It's just a compromise. -- satoru 2633 # 2634 sub decide_type ($$) { 2635 my ($name, $cont) = @_; 2636 return $name if (!defined $cont || $name eq $cont); 2637 2638 util::dprint("decide_type: name: $name, cont: $cont\n"); 2639 if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) { 2640 return $name; 2641 } elsif ($cont =~ m!^application/octet-stream! && 2642 $name !~ m!^text/!) { 2643 return $name; 2644 } elsif ($cont =~ m!^application/(excel|powerpoint|msword)! && 2645 $name !~ m!^application/octet-stream!) { 2646 # FIXME: Currently File::MMagic 1.02's checktype_data() 2647 # is unreliable for them. 2648 return $name; 2649 } elsif ($cont =~ m!^application/x-zip! && 2650 $name =~ m!^application/!) { 2651 # zip format is used other applications e.g. OpenOffice. 2652 # It is necessary to add to check extention. 2653 return $name; 2654 } 2655 2656 return $cont; 2657 } 2658 2659 # 2660 # Debugging code for the "negative numbers" problem. 2661 # 2662 sub check_records ($$$) { 2663 my ($recref, $baserecref, $step) = @_; 2664 dump_record($baserecref, $step); 2665 dump_record($recref, $step); 2666 print STDERR "The \x22negative number\x22 problem occurred.\n"; 2667 exit(1); 2668 } 2669 2670 sub dump_record($$) { 2671 my ($recref, $step) = @_; 2672 my (@data) = unpack 'w*', $$recref; 2673 print STDERR "dump record data to NMZ.bug.info (step: $step)..."; 2674 my $fh_info = util::fopen(">> NMZ.bug.info"); 2675 print $fh_info "dumped record data (step: $step)..."; 2676 foreach (@data) { 2677 print $fh_info sprintf(" %08x", $_); 2678 } 2679 print $fh_info "\n"; 2680 util::fclose($fh_info); 2681 return; 2682 } 2683 2684 sub trapintr { 2685 my ($signame) = @_; 2686 print STDERR "Warning: signal $signame occured.\n"; 2687 } 2688 2689 # 2690 # For avoiding "used only once: possible typo at ..." warnings. 2691 # 2692 muda($conf::ON_MEMORY_MAX, 2693 $conf::WORD_LENG_MAX, $conf::TEXT_SIZE_MAX, 2694 $conf::DENY_FILE, $var::INTSIZE, 2695 $conf::CHASEN_NOUN, $conf::CHASEN, 2696 $conf::KAKASI, $var::Opt{'okurigana'}, 2697 $var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX, 2698 $usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO, 2699 $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX, 2700 $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE, 2701 $conf::ADDRESS, $var::MAILING_ADDRESS, 2702 $conf::FILE_SIZE_MAX, 2703 $conf::MECAB, 2704 $conf::DENY_DDN, 2705 $var::TRAC_URI, 2706 ); 2707 2708 sub muda {} 2709
まりんきょ学問所> 全文検索システムNamazu > mknmz のソースコードを読む