#!/usr/local/bin/perl # converter from my diary.shtml to blosxom # Copyright (C) 2004 Kengo Ichiki # $Id: diary2blosxom.pl,v 1.14 2004/04/05 20:56:39 ichiki Exp $ use HTML::Parser; use Jcode; $topurl = "http://kichiki.com/cgi-bin/blosxom.cgi/diary"; $flavour = "html"; @list0 = (); # old blosxom entry files # globals for check_category_in_link $check_link_tag = ''; $check_link_li = 0; $check_link_text = ''; # check for command-line argument die "Usage: diary2blosxom.pl (HTML file)\n" unless @ARGV == 1; # get the command-line argument my $file = shift; my $path, $file0; if ($file =~ /^diary\d{4}\_\d{2}.shtml$/) { $path = ''; } elsif ($file =~ /(.*\/)(diary\d{4}\_\d{2}.shtml)$/) { $path = $1; $file0 = $2; } else { print "Illeagal diary file!\n"; exit; } #print "PATH = $path\n"; # Create HTML Parser object my $p = HTML::Parser->new (api_version => 3, start_h => [\&start, "tagname,attr"], text_h => [\&text, "dtext"], end_h => [\&end, "tagname"], ); die "File $file does't exist.\n" unless -e $file; open (HTML, "< $file") or die "Cannot open $file"; $h1flag = 0; # h1 tag $h2flag = 0; # h2 tag $preflag = 0; # pre tag $nsaflag = 0; # not-shown anchor tag flag -- local copy, no-href tags $ullevel = 0; # ul level $year = 0; $month = 0; $day = 0; $num = 1; $title = ""; $body = ""; $subn = 0; $subday = 0; $subname = ""; @subfile; @subtitle; @subbody; @subullevel; while () { # convert input-line into utf8 Jcode::convert ( \$_, "utf8"); $p->parse($_); } close (HTML); foreach $file (@list0) { print "removed: $file\n"; unlink ($file); } exit; sub start { my ($tagname, $attr) = @_; my $i; my $a, $c, $tmptag; my $link_category; # h1 tag if ($tagname eq "h1") { $h1flag = 1; } # h2 tag elsif ($tagname eq "h2") { $h2flag = 1; } # a tag elsif ($tagname eq "a") { if ($ullevel == 0 && $h2flag == 1) { $_ = $attr->{id}; if (/d(\d+)/) { $day = $1; $num = 1; # reset counter } } if ($ullevel >= 1) { if ($attr->{href} eq "") { # do nothing $nsaflag = 1; } else { $_ = $attr->{href}; $_ =~ s/^images\/AMAZON/http:\/\/images.amazon.com\/images\/P/; $_ =~ s/^theorems.shtml/http:\/\/kichiki.hp.infoseek.co.jp\/diary\/theorems.shtml/; if (/^\#(.+)/) { $_ = sprintf ("diary%4d_%02d.shtml\#%s", $year, $month, $1); } if (/^diary(\d{4})\_(\d{2}).shtml\#d(\d+)-(.+)/) { $link_category = check_category_in_link ($_); if ($link_category eq "") { $_ = sprintf ("$topurl/%4d%02d%02d-%s.$flavour", $1, $2, $3, $4); } else { $_ = sprintf ("$topurl/$link_category/%4d%02d%02d-%s.$flavour", $1, $2, $3, $4); } } elsif (/^diary(\d{4})\_(\d{2}).shtml\#d(\d+)/) { $_ = "$topurl/$1/$2/$3"; } elsif (/^diary(\d{4})\_(\d{2}).shtml/) { $_ = "$topurl/$1/$2"; } if ($_ eq "") { # do nothing $nsaflag = 1; } elsif (/^LOCAL\//) { # do nothing $nsaflag = 1; } else { $nsaflag = 0; # to show (for sure) if ($subn == 0) { if ($ullevel == 1) { $title .= ""; } else { $body .= ""; } } else { if ($subullevel[$subn-1] == $ullevel) { $subtitle[$subn-1] .= ""; } else { $subbody[$subn-1] .= ""; } } } } if ($attr->{id} ne "") { $_ = $attr->{id}; if (/d(\d+)-(.+)/) { $subn ++; $subday = $1; $subname = $2; if ($subday != $day) { print "WRONG!!\n"; } $subfile [$subn-1] = sprintf ("%4d%02d%02d-%s", $year, $month, $day, $subname); $subullevel [$subn-1] = $ullevel; $subtitle [$subn-1] = ""; $subbody [$subn-1] = ""; } } } } # ul tag elsif ($tagname eq "ul") { if ($ullevel >= 1) { if ($subn == 0) { $body .= "\n"; for ($i = 1; $i < $ullevel; $i ++) { $body .= " "; } $body .= "\n"; } } # a tag elsif ($tagname eq "a") { if ($ullevel >= 1 && $nsaflag == 0) { if ($subn == 0) { if ($ullevel == 1) { $title .= ""; } elsif ($ullevel > 1) { $body .= ""; } } else { if ($ullevel == $subullevel[$subn-1]) { $subtitle[$subn-1] .= ""; } elsif ($ullevel > $subullevel[$subn-1]) { $subbody[$subn-1] .= ""; } } } else { $nsaflag = 0; } } # li tag elsif ($tagname eq "li") { if ($subn == 0) { if ($ullevel == 1) { $blosxomfile = sprintf ("%4d%02d%02d%02d", $year, $month, $day, $num); # end of the entry print_entry ($blosxomfile, $title, $body, $year, $month, $day, $num); # for sure $num ++; $title = ""; $body = ""; } elsif ($ullevel > 1) { $body .= "\n"; for ($i = 1; $i < $ullevel; $i ++) { $body .= " "; } $body .= "\n"; } } else { if ($ullevel == $subullevel[$subn-1]) { # end of the entry print_entry ($subfile[$subn-1], $subtitle[$subn-1], $subbody[$subn-1], $year, $month, $day, $num); # make a link # categorize #$category = categorize ($subtitle[$subn-1]); ($category, $subtitle[$subn-1]) = categorize ($subtitle[$subn-1]); # remove anchor tag in the tiltke $localtitle = $subtitle[$subn-1]; $localtitle =~ s///g; $localtitle =~ s/<\/a>//g; if ($subn == 1) { if ($category eq "") { $body .= "$localtitle"; } else { $body .= "$localtitle"; } } else { if ($category eq "") { $subbody [$subn - 2] .= "$localtitle"; } else { $subbody [$subn - 2] .= "$localtitle"; } } # for sure $num ++; $subfile [$subn-1] = ""; $subtitle [$subn-1] = ""; $subbody [$subn-1] = ""; $subullevel[$subn-1] = 0; $subn --; } else { $subbody[$subn-1] .= "\n"; for ($i = 1; $i < ($ullevel-$subullevel[$sun-1]); $i ++) { $subbody[$subn-1] .= " "; } $subbody[$subn-1] .= "\n"; } } } # pre tag elsif ($tagname eq "pre") { $preflag = 0; if ($subn == 0) { if ($ullevel == 1) { $title .= ""; } elsif ($ullevel > 1) { $body .= ""; } } else { if ($ullevel == $subullevel[$subn-1]) { $subtitle[$subn-1] .= ""; } elsif ($ullevel > $subullevel[$subn-1]) { $subbody[$subn-1] .= ""; } } } # other tags else { if ($ullevel >= 1) { if ($subn == 0) { if ($ullevel == 1) { $title .= ""; } else { $body .= ""; } } else { if ($ullevel == $subullevel[$subn-1]) { $subtitle[$subn-1] .= ""; } else { $subbody[$subn-1] .= ""; } } } } } sub print_entry { my ($f, # file $t, # title $b, # body $y, # year (4 digits) $m, # month (1 - 12) $d, # day (1 - 31) $n # number (1 - ) ) = @_; my $category; my @txtmon = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "13"); my $filename = ''; $t =~ s/\n//g; $t =~ s/^ +//g; $t =~ s/ +$//g; $t =~ s/ / /g; $t =~ s/\Q()\E//g; if ($t eq "") { $t = "-"; } $b =~ s/\Q()\E//g; # categorize #$category = categorize ($t); ($category, $t) = categorize ($t); if ($category ne "") { unless (-e $category) { mkdir "$category", 0755 or die "Cannot make dir $category\n"; } $filename = "./$category/$f.txt"; } else { $filename = "./$f.txt"; } open (FH, "> tmp.txt"); print (FH "$t\n"); printf (FH "meta-creation_date: %s %d, %d 00:%02d\n", $txtmon[int($m-1)], $d, $y, $n); print (FH "$b\n"); close (FH); my $flag_match = 0; my $count = 0; foreach my $file (@list0) { if ($file eq $filename) { $flag_match = 1; if (!compare_files ("tmp.txt", $file)) { # modified print ("updated: $filename\n"); rename ("tmp.txt", $filename); } else { # same unlink ("tmp.txt"); } # remove the file from @list0 splice (@list0, $count, 1); return; } $count ++; } if ($flag_match == 0) { # new entry print ("new: $filename\n"); rename ("tmp.txt", $filename); } } # "getfiles" is borrowed from # Blosxom Plugin: recentwritebacks_tree # Author(s): typester # Version: 1.0 # Blosxom Home/Docs/Licensing: http://www.blosxom.com/ sub getfiles { my $dir = $_[0]; my @files = (); my @ret = (); $dir .= "/" if ($dir =~ /[^\/]$/); if (opendir (DIR, $dir)) { @files = readdir (DIR); closedir (DIR); } foreach my $file (@files) { next if ($file eq '.' or $file eq '..'); if (-d "$dir$file" and $file ne '') { my @subdir = &getfiles("$dir$file"); @ret = (@ret, @subdir); } else { push (@ret, "$dir$file"); } } return @ret; } sub getentries { my ($dir, $year, $month) = @_; my @ret = (); my @files = getfiles ($dir); my $str_ym = sprintf ("%4d%02d", $year, $month); foreach my $file (@files) { if ($file =~ /$str_ym.+\.txt/) { push (@ret, "$file"); } } return @ret; } # return true when two files are same sub compare_files { my ($f1, $f2) = @_; my $l1 = ''; my $l2 = ''; my $ret = 1; # true my $s1 = -s $f1; my $s2 = -s $f2; if ($s1 != $s2) { $ret = 0; # false } else { open (F1, "< $f1"); open (F2, "< $f2"); COMPARE: while ($l1 = and $l2 = ) { if ($l1 ne $l2) { $ret = 0; # false last COMPARE; } } close (F1); close (F2); } return $ret; } # check link sub check_category_in_link { my $link = shift; my ($year, $month, $day); my $link_parser; my $category; # globals: $path, $check_link_tag, $check_link_li, $check_link_text. #print "CHECK : ".$link." => "; $category = ''; if ($link =~ /^diary(\d{4})\_(\d{2}).shtml\#d(\d+)-(.+)/) { $year = $1; $month = $2; $day = $3; $check_link_tag = "d$3-$4"; #print $check_link_tag." => "; $file = sprintf ("diary%4d_%02d.shtml", $1, $2); #print $file." => "; open (LINK, "< $path$file") or return $category; $link_parser = HTML::Parser->new (api_version => 3, start_h => [\&check_link_start, "tagname,attr"], text_h => [\&check_link_text, "dtext"], end_h => [\&check_link_end, "tagname"], ); $check_link_li = 0; $check_link_text = ''; while () { # convert input-line into utf8 Jcode::convert ( \$_, "utf8"); $link_parser->parse($_); } close (LINK); #$category = categorize ($check_link_text); ($category, $check_link_text) = categorize ($check_link_text); } #print $category."\n"; return $category; } sub check_link_start { my ($tagname, $attr) = @_; # $check_link_li and $check_link_tag are global. # a tag if ($tagname eq "a") { if ($attr->{id} eq $check_link_tag) { $check_link_li = 1; } } } sub check_link_text { my ($dtext) = shift; # $check_link_li and $check_link_text are global. $dtext =~ s/&/&/g; $dtext =~ s/ +/ /g; $dtext =~ s/^\n//g; if ($check_link_li == 1) { $check_link_text .= $dtext; } } sub check_link_end { my ($tagname) = shift; # $check_link_li is global. # li tag if ($tagname eq "li" and $check_link_li == 1) { $check_link_li = 0; } } # check category : news, phys sub categorize { my $t = shift; my $category = ''; #print "CATEGORY: ".$t." => "; $t =~ s/^ +//g; $t =~ s/^\n//g; if ($t =~ /^\Q[\Enews\Q]\E/) { $category = "news"; $t =~ s/\Q[\Enews\Q]\E//; } elsif ($t =~ /^\Q[\Ephys\Q]\E/) { $category = "phys"; $t =~ s/\Q[\Ephys\Q]\E//; } elsif ($t =~ /^\Q[\E.+\Q]\E/) { $category = "phys"; } #print $category."\n"; #return $category; return ($category, $t); }