diff options
Diffstat (limited to 'lib/sisu/v3/harvest_topics.rb')
| -rw-r--r-- | lib/sisu/v3/harvest_topics.rb | 559 | 
1 files changed, 559 insertions, 0 deletions
# coding: utf-8
=begin

 * Name: SiSU

 * Description: a framework for document structuring, publishing and search
   metadata harvest, extract topics and associated writings from document set
   (topics use topic_register header)

 * Author: Ralph Amissah

 * Copyright: (C) 1997 - 2010, Ralph Amissah, All Rights Reserved.

 * License: GPL 3 or later:

   SiSU, a framework for document structuring, publishing and search

   Copyright (C) Ralph Amissah

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along with
   this program. If not, see <http://www.gnu.org/licenses/>.

   If you have Internet connection, the latest version of the GPL should be
   available at these locations:
   <http://www.fsf.org/licensing/licenses/gpl.html>
   <http://www.gnu.org/licenses/gpl.html>

   <http://www.jus.uio.no/sisu/gpl.fsf/toc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/doc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/plain.txt>

 * SiSU uses:
   * Standard SiSU markup syntax,
   * Standard SiSU meta-markup syntax, and the
   * Standard SiSU object citation numbering and system

 * Homepages:
   <http://www.jus.uio.no/sisu>
   <http://www.sisudoc.org>

 * Download:
   <http://www.jus.uio.no/sisu/SiSU/download.html>

 * Ralph Amissah
   <ralph@amissah.com>
   <ralph.amissah@gmail.com>

 ** Description: metadata harvest, topic index (built from the topic_register
    header of each document) written out as html

=end
module HARVEST_topics
  require "#{SiSU_lib}/author_format"                      # author_format.rb
  # Entry point of the topic harvest: reads the header paragraphs of each
  # .sst/.ssm file given on the command line, collects one index entry per
  # topic (Harvest), arranges the entries into a nested topic tree (Index)
  # and writes the tree out as html (Output_index).
  class Songsheet
    # opt:: options object; this file reads opt.files (list of file names)
    #       and opt.cmd (command line flags)
    def initialize(opt)
      @opt=opt
      @file_list=opt.files
      @env=SiSU_Env::Info_env.new
    end
    # Runs the whole harvest and prints the file:// location(s) of the
    # resulting harvest_topics.html.
    def songsheet
      files,idx_array=[],[]
      @file_list.each do |f|
        if f =~/.+?\.ss[tm]$/
          files << f[/(.+?\.ss[tm])$/,1]
        else
          print "not .sst or .ssm ? << #{f} >> "
        end
      end
      files.each do |filename|
        file_array=header_paragraphs(filename)
        idx_array=HARVEST_topics::Harvest.new(@opt,file_array,filename,idx_array).extract_harvest
      end
      # The original passed @@the_idx_topics here, a class variable that is
      # never defined in Songsheet (only Index assigns one, in its own class),
      # which raises NameError. Each run starts from an empty topic tree.
      the_idx=HARVEST_topics::Index.new(idx_array,{}).construct_book_topic_index
      #HARVEST_topics::Output_index.new('',the_idx).screen_print.cycle if @opt.cmd.inspect =~/[VM]/
      HARVEST_topics::Output_index.new(@opt,the_idx).html_print.html_songsheet
      puts "file://#{@env.path.output_md_harvest}/harvest_topics.html"
      puts "file://#{@env.path.pwd}/harvest_topics.html" if @opt.cmd.inspect =~/M/
    end
    private
    # Returns the @title:, @creator: and @classify: paragraphs found at the
    # top of +filename+. Paragraphs are separated by a blank line. Other
    # "@name:" headers, blank paragraphs and "% " comment paragraphs are
    # skipped; reading stops at the first paragraph that is none of these.
    def header_paragraphs(filename)
      paragraphs=[]
      File.open(filename,'r') do |file|
        file.each_line("\n\n") do |para|
          if para =~/^@(?:title|creator|classify):(?:\s|$)/m
            paragraphs << para
          elsif para !~/^@\S+?:(?:\s|$)/m \
          && para !~/^(?:\s*\n|%+ )/
            break
          end
        end
      end
      paragraphs
    end
  end
  # Extracts title, author and topic register from the header paragraphs of
  # one document and appends the resulting index entries to a shared array.
  class Harvest
    # opt::       options object (opt.cmd is consulted for verbose reporting)
    # data::      array of header paragraphs (strings) of the document
    # filename::  name of the document the paragraphs were read from
    # idx_array:: array of index entries collected so far; appended to in place
    def initialize(opt,data,filename,idx_array)
      @opt,@data,@filename,@idx_array=opt,data,filename,idx_array
    end
    # Returns idx_array with one entry (a hash with the keys :filename, :file,
    # :rough_idx, :title, :author and :page) added per ";" separated topic of
    # the document's topic_register. Nothing is added when title, author or
    # topic register is missing; that is reported when cmd contains V or M.
    def extract_harvest
      data,filename,idx_array=@data,@filename,@idx_array
      @idx_list,@title,@subtitle,@fulltitle,@author_format=nil,nil,nil,nil,nil
      rgx={}
      rgx[:author]=/^@creator:(?:[ ]+|.+?:author:[ ]+)(.+?)(?:\||\n)/m
      rgx[:title]=/^@title:[ ]+(.+)/
      rgx[:subtitle]=/^@title:.+?:subtitle:[ ]+(.+?)\n/m
      rgx[:idx]=/^@classify:.+?:topic_register:[ ]+(.+?)\n/m
      data.each do |para|
        if (m=rgx[:idx].match(para))
          @idx_list=m[1]
        end
        if (m=rgx[:title].match(para))
          @title=m[1]
        end
        if (m=rgx[:subtitle].match(para))
          @subtitle=m[1]
        end
        if (m=rgx[:author].match(para))
          @author_format=m[1]
        end
        # the original tested @author and @idx_lst, neither of which is ever
        # assigned, so this early exit could never be taken
        break if @title && @subtitle && @author_format && @idx_list
      end
      @fulltitle=@subtitle ? (@title + ' - ' + @subtitle) : @title
      if @title \
      && @author_format \
      && @idx_list
        creator=FORMAT::Author.new(@author_format.strip).author_details
        @authors,@authorship=creator[:authors],creator[:authorship]
        # a "~xx" or "~xxx" suffix before the extension is taken as a language
        # code: it is removed from the output name and selects the manifest page
        if filename=~/~[a-z]{2,3}\.ss[mt]$/
          lang='.' + /~([a-z]{2,3})\.ss[mt]$/.match(filename)[1]
          file=filename.sub(/~[a-z]{2,3}\.ss[mt]$/,'')
        else
          lang=''
          file=filename.sub(/\.ss[mt]$/,'')
        end
        page="sisu_manifest#{lang}.html"
        topics=if @idx_list =~/;/
          @idx_list.scan(/[^;]+/).map { |topic| topic.strip }
        else [@idx_list]
        end
        topics.each do |topic|
          idx_array << { :filename =>filename,:file =>file,:rough_idx =>topic,:title =>@fulltitle,:author =>creator,:page =>page}
        end
      else
        p "missing required field in #{@filename} - [title]: <<#{@title}>>; [author]: <<#{@author_format}>>; [idx]: <<#{@idx_list}>>" if @opt.cmd.inspect =~/[VM]/
      end
      idx_array.flatten!
      idx_array
    end
  end
  # Arranges the flat list of index entries into a nested hash of topics.
  # Every topic node is a hash whose :md key holds the array of documents
  # filed directly under that topic and whose string keys are sub-topics.
  class Index
    # number of topic levels that are indexed; deeper levels are ignored
    MAX_LEVELS=5
    # idx_array:: index entries as produced by Harvest#extract_harvest
    # the_idx::   hash the topic tree is built into (modified in place)
    def initialize(idx_array,the_idx)
      @idx_array,@the_idx=idx_array,the_idx
      @@the_idx_topics=@the_idx # retained from the original; not read within this file
    end
    # Returns txt with its first character capitalized, the rest unchanged.
    def capital(txt)
      txt[0].chr.capitalize + txt[1,txt.length]
    end
    # Appends the document described by idx to hash, which (despite its name)
    # is the :md array of a topic node. The authors are rendered as links
    # into harvest_authors.html, each followed by ", ".
    def contents(hash,idx)
      names=''
      idx[:author][:last_first_format_a].each do |n|
        s=n.sub(/(.+?)(?:,.+|$)/,'\1').gsub(/\s+/,'_')
        names << %{<a href="harvest_authors.html##{s}">#{n}</a>, }
      end
      hash << { :filename =>idx[:filename],:file =>idx[:file],:author =>names,:title =>idx[:title],:page =>idx[:page]}
    end
    # Builds and returns the topic tree. In a topic register entry ":"
    # separates levels and "|" separates alternative topics at one level;
    # the document is filed under every alternative of the last level.
    def construct_book_topic_index
      @idx_array.each do |idx|
        unless idx[:rough_idx]
          puts "no topic register in: << #{idx[:filename]} >>"
          next
        end
        levels=idx[:rough_idx].scan(/[^:]+/).map { |lev| lev.scan(/[^|]+/) }
        last=levels.length - 1
        node=@the_idx # hash that the topics of the current level are added to
        levels.each_with_index do |alternatives,depth|
          break if depth >= MAX_LEVELS
          if node.nil?
            # the preceding level did not name exactly one topic, so there is
            # no single node to descend into (the original raised
            # NoMethodError on nil in this case)
            puts "alternative topics are only usable at the last level, rest ignored: << #{idx[:rough_idx]} >> in << #{idx[:filename]} >>"
            break
          end
          key=nil
          alternatives.each do |alt|
            key=capital(alt)
            node[key] ||= {:md => []}
            contents(node[key][:md],idx) if depth == last
          end
          node=(alternatives.length==1) ? node[key] : nil
        end
      end
      @the_idx
    end
  end
  # Writes the topic tree, either as html (html_print) or as indented text on
  # the screen (screen_print, screen_print_unsorted).
  #
  # NOTE: html_print, screen_print and screen_print_unsorted each (re)define
  # their helper methods on this class when called and then return self, e.g.
  # Output_index.new(opt,idx).html_print.html_songsheet. Helper names such as
  # do_string, do_array, do_hash and do_case are shared between the variants,
  # so the variant selected last determines which definitions are active.
  class Output_index
    # keys of a document entry; every other key of a node names a sub-topic
    META_KEYS=[:title,:author,:filename,:file,:rough_idx,:page]
    # opt::     options object (opt.cmd decides on the maintenance copy)
    # the_idx:: topic tree as returned by Index#construct_book_topic_index
    def initialize(opt,the_idx)
      @opt,@the_idx=opt,the_idx
      @env=SiSU_Env::Info_env.new
      @rc=SiSU_Env::Get_init.instance.sisu_yaml.rc
      @alph=%W[9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]
      @letter=@alph.shift # '9' is the starting point, @alph is left as A..Z
      @vz=SiSU_Env::Get_init.instance.skin
    end
    # Opens harvest_topics.html in the harvest output directory and, when cmd
    # contains -M, a second "maintenance" copy in the present directory.
    def html_file_open
      @output={}
      @output[:html]=File.new("#{@env.path.output_md_harvest}/harvest_topics.html",'w')
      if @opt.cmd.inspect =~/-M/
        @output[:html_mnt]=File.new("#{@env.path.pwd}/harvest_topics.html",'w')
      end
    end
    # Closes the files opened by html_file_open.
    def html_file_close
      @output[:html].close
      @output[:html_mnt].close if @output[:html_mnt]
    end
    # Selects html output: defines the html helpers and returns self.
    def html_print
      # Writes the complete html page(s).
      def html_songsheet
        html_file_open
        begin
          html_head
          html_alph
          html_body
          html_tail
        ensure
          html_file_close # also when rendering raised
        end
      end
      # Returns the html page head; type 'maintenance' selects a stylesheet
      # path relative to the present directory.
      def html_head_adjust(type='')
        css_path=(type !~/maintenance/) \
        ? '../_sisu/css/harvest.css' \
        : 'harvest.css'
        sv=SiSU_Env::Info_version.instance.get_version
        <<WOK
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>SiSU Metadata Harvest - Topics</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="dc.title" content= "SiSU metadata harvest, Topics - SiSU information Structuring Universe, Structured information Serialised Units" />
<meta name="dc.subject" content= "document structuring, ebook, publishing, PDF, LaTeX, XML, ODF, SQL, postgresql, sqlite, electronic book, electronic publishing, electronic document, electronic citation, data structure, citation systems, granular search, digital library" />
<meta name="generator" content="#{sv[:project]} #{sv[:version]} of #{sv[:date_stamp]} (n*x and Ruby!)" />
<link rel="generator" href="http://www.jus.uio.no/sisu/SiSU" />
<link rel="stylesheet" href="#{css_path}" type="text/css" />
<link rel="shortcut icon" href="../_sisu/image/rb7.ico" />
</head>
<body bgcolor="#ffffff" text="#000000" link="#003090" lang="en" xml:lang="en">
<a name="top" id="top"></a>
<a name="up" id="up"></a>
<a name="start" id="start"></a>
<h1>SiSU Metadata Harvest - Topics</h1>
<p>[<a href="../index.html"> HOME </a>] also see <a href="harvest_authors.html">SiSU Metadata Harvest - Authors</a></p>
<p>#{@env.widget_static.search_form}</p>
<hr />
WOK
      end
      # Writes the page head. The maintenance copy is written whenever its
      # file is open (the original tested cmd for M here but opened the file
      # on -M, failing on nil when only the former matched).
      def html_head
        @output[:html_mnt] << html_head_adjust('maintenance') if @output[:html_mnt]
        @output[:html] << html_head_adjust
      end
      # Writes the row of links to the letter anchors.
      def html_alph
        # the ternary needs its own parentheses: "a << (cond) ? x : y" is
        # parsed as "(a << (cond)) ? x : y" and never appended the links
        links=@alph.map do |x|
          (x =~/[0-9]/) ? '' : %{<a href="##{x}">#{x}</a>, }
        end
        html='<p>' + links.join
        @output[:html_mnt] << html if @output[:html_mnt]
        @output[:html] << html
      end
      # Writes the closing part of the page.
      def html_tail
        # a string, not an array: File#<< writes to_s of its argument, which
        # for an array is its inspect form
        tail=<<WOK
<hr />
<a name="bottom" id="bottom"></a>
<a name="down" id="down"></a>
<a name="end" id="end"></a>
<a name="finish" id="finish"></a>
<a name="stop" id="stop"></a>
<a name="credits"></a>
#{@vz.credits_sisu}
</body>
</html>
WOK
        @output[:html_mnt] << tail if @output[:html_mnt]
        @output[:html] << tail
      end
      # Appends html to the presentation file.
      def do_html(html)
        @output[:html] << html
      end
      # Appends html to the maintenance file, if there is one.
      def do_html_maintenance(html)
        @output[:html_mnt] << html if @output[:html_mnt]
      end
      # Writes string as a paragraph of class attrib to both files.
      def do_string(attrib,string)
        html=%{<p class="#{attrib}">#{string}</p>}
        do_html(html)
        do_html_maintenance(html)
      end
      # Writes string as a paragraph of class attrib to the presentation file.
      def do_string_default(attrib,string)
        html=%{<p class="#{attrib}">#{string}</p>}
        do_html(html)
      end
      # Writes string as a paragraph of class attrib to the maintenance file.
      def do_string_maintenance(attrib,string)
        html=%{<p class="#{attrib}">#{string}</p>}
        do_html_maintenance(html)
      end
      # Writes a top level topic heading with a name anchor, preceded by the
      # anchors of all letters from the current one up to the topic's initial.
      def do_string_name(attrib,string)
        initial=string[/^(\S)/,1]
        if initial
          while @letter < initial && !@alph.empty?
            @letter=@alph.shift
            letter_html=%{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>}
            do_html_maintenance(letter_html)
            do_html(letter_html)
          end
        end
        name=string.strip.gsub(/\s+/,'_')
        html=%{<p class="#{attrib}"><a name="#{name}">#{string}</a></p>}
        do_html(html)
        do_html_maintenance(html)
      end
      # Writes every element of array, one level deeper than lv.
      def do_array(lv,array)
        lv+=1
        array.each do |b|
          do_case(lv,b)
        end
      end
      # Writes a document entry (link to its manifest page, title and
      # authors) to the presentation file.
      def do_hash_md(attrib,hash)
        html=%{<a href="../#{hash[:file]}/#{hash[:page]}">#{hash[:title]}</a> - #{hash[:author]}}
        do_string_default(attrib,html)
      end
      # Writes a document entry with a link to its source and a file:// link
      # to its manifest page to the maintenance file, if there is one.
      def do_hash_md_maintenance(attrib,hash)
        if @output[:html_mnt] #should not be run for presentation output
          html=%{[<a href="#{hash[:file]}.sst">src</a>]  <a href="file://#{@env.path.output}/#{hash[:file]}/#{hash[:page]}">#{hash[:title]}</a> - #{hash[:author]}}
          do_string_maintenance(attrib,html)
        end
      end
      # Writes a node: its :md list and, for a document entry, the document
      # as its keys are met; then its sub-topics in sorted order.
      def do_hash(lv,hash)
        lv+=1
        key=[]
        hash.each_key do |m|
          if m == :md
            do_case(lv,hash[m])
          elsif m == :title
            do_hash_md('work',hash)
            do_hash_md_maintenance('work',hash)
          elsif !META_KEYS.include?(m)
            key << m
          end
        end
        key.sort.each do |m|
          attrib="lev#{lv}"
          lv==0 ? do_string_name(attrib,m) : do_string(attrib,m)
          do_case(lv,hash[m])
        end
      end
      # Dispatches on the class of a (String, Array or Hash); anything else
      # is ignored.
      def do_case(lv,a)
        case a
        when String
          attrib="lev#{lv}"
          lv==0 ? do_string_name(attrib,a) : do_string(attrib,a)
        when Array
          do_array(lv,a)
        when Hash
          do_hash(lv,a)
        end
      end
      # Writes the whole topic tree, top level topics in sorted order.
      def html_body
        @the_idx.sort.each do |a|
          do_case(-1,a)
        end
      end
      self
    end
    # Selects plain text output with sorted sub-topics: defines the screen
    # helpers and returns self; cycle then prints the tree.
    def screen_print
      # Prints string indented by four spaces per level.
      def do_string(lv,string)
        s=' '*4
        puts s*lv + string
      end
      # Prints every element of array, one level deeper than lv.
      def do_array(lv,array)
        lv+=1
        array.each do |b|
          do_case(lv,b)
        end
      end
      # Prints a document entry as "title - author".
      def do_hash_md(lv,hash)
        string=hash[:title] + ' - ' + hash[:author]
        do_string(lv,string)
      end
      # Prints a node: its :md list and, for a document entry, the document
      # as its keys are met; then its sub-topics in sorted order.
      def do_hash(lv,hash)
        lv+=1
        key=[]
        hash.each_key do |m|
          if m == :md
            do_case(lv,hash[m])
          elsif m == :title
            do_hash_md(lv,hash)
          elsif !META_KEYS.include?(m)
            key << m
          end
        end
        key.sort.each do |m|
          do_string(lv,m)
          do_case(lv,hash[m])
        end
      end
      # Dispatches on the class of a (String, Array or Hash).
      def do_case(lv,a)
        case a
        when String
          do_string(lv,a)
        when Array
          do_array(lv,a)
        when Hash
          do_hash(lv,a)
        end
      end
      # Prints the whole topic tree.
      def cycle
        @the_idx.each do |a|
          do_case(-1,a)
        end
      end
      self
    end
    # Selects plain text output with sub-topics in the order in which they
    # are stored: defines the screen helpers and returns self; cycle then
    # prints the tree.
    def screen_print_unsorted
      # Prints string indented by four spaces per level.
      def do_string(lv,string)
        s=' '*4
        puts s*lv + string
      end
      # Prints every element of array, one level deeper than lv.
      def do_array(lv,array)
        lv+=1
        array.each do |b|
          do_case(lv,b)
        end
      end
      # Prints a document entry as "title - author".
      def do_hash_md(lv,hash)
        string=hash[:title] + ' - ' + hash[:author]
        do_string(lv,string)
      end
      # Prints a node, handling each key as it is met: the :md list, the
      # document of a document entry, or a sub-topic with its content.
      def do_hash(lv,hash)
        lv+=1
        hash.each_key do |m|
          if m == :md
            do_case(lv,hash[m])
          elsif m == :title
            do_hash_md(lv,hash)
          elsif !META_KEYS.include?(m)
            do_string(lv,m)
            do_case(lv,hash[m])
          end
        end
      end
      # Dispatches on the class of a (String, Array or Hash).
      def do_case(lv,a)
        case a
        when String
          do_string(lv,a)
        when Array
          do_array(lv,a)
        when Hash
          do_hash(lv,a)
        end
      end
      # Prints the whole topic tree.
      def cycle
        @the_idx.each do |a|
          do_case(-1,a)
        end
      end
      self
    end
  end
end
__END__
terms -|_  t{tl1} -|_ {fa}[fa]{filenames and other details}
       |           |_ {tl2} -|_ {fa}[fa]{filenames and other details}
       |           |         |_{tl3} -|_ {fa}[fa]{filenames and other details}
       |           |         |        |_{tl4} - {fa}[fa]{filenames and other details}
       |           |         |        |
       |           |         |        |_{tl4a} - {fa}[fa]{filenames and other details}
       |           |         |        |
       |           |         |        |_{tl4b} - {fa}[fa]{filenames and other details}
       |           |         |        |
       |           |         |        |_ ...
       |           |         |
       |           |         |_{tl3a} - {fa}[fa]{filenames and other details}
       |           |
       |           |_{tl2a} - {fa}[fa]{filenames and other details}
       |
       |_ t{tl1a} -|_ {fa}[fa]{filenames and other details}
                   |_ ...
