view pdf.go @ 26:5abe36dc0697 default tip

...
author anatofuz <anatofuz@cr.ie.u-ryukyu.ac.jp>
date Mon, 21 Sep 2020 16:03:31 +0900
parents bac5eb544d4d
children
line wrap: on
line source

package lectable

import (
	"bytes"
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"unicode"

	"github.com/ledongthuc/pdf"
	"github.com/pkg/errors"
	"golang.org/x/text/unicode/norm"
)

type lectureFPDF struct {
	id       string `json:id`
	body     string `json:body`
	isSelect bool   `json:isSelect`
	day      string `json:day`
	place    string `json:place`
	grades   []int  `json:grades`
}

// pdfPagesStr holds the extracted text of a single PDF as a list of string chunks.
type pdfPagesStr []string

// convertStringFromPDF extracts the plain text of every PDF in pdfPaths.
//
// The result holds one pdfPagesStr per input path, in input order. The first
// failure aborts the whole conversion and is returned wrapped with context.
func convertStringFromPDF(pdfPaths []string) (*[]pdfPagesStr, error) {
	var ppsr []pdfPagesStr
	for _, apdf := range pdfPaths {
		// Each file is handled in its own call so its handle is released
		// before the next PDF is opened, not when the whole batch finishes.
		pages, err := readPDFPages(apdf)
		if err != nil {
			return nil, err
		}
		ppsr = append(ppsr, pages)
	}
	return &ppsr, nil
}

// readPDFPages opens one PDF, NFKC-normalizes its plain text and splits it into chunks.
func readPDFPages(path string) (pdfPagesStr, error) {
	file, reader, err := pdf.Open(path)
	if file != nil {
		// Close even when Open reports an error, in case it still hands
		// back an open handle.
		defer file.Close()
	}
	if err != nil {
		return nil, errors.Wrap(err, "failed open pdf")
	}

	preader, err := reader.GetPlainText()
	if err != nil {
		return nil, errors.Wrap(err, "failed convert pdf")
	}

	var buf bytes.Buffer
	if _, err := buf.ReadFrom(preader); err != nil {
		return nil, errors.Wrap(err, "failed read pdf text")
	}

	planString := string(norm.NFKC.Bytes(buf.Bytes()))
	// "   " marks the start of the next PDF page.
	// The first chunk may be junk; dropping it (planstrings[1:]) is currently disabled.
	return strings.Split(planString, "   "), nil
}

// isIE reports whether the lecture text *s is accepted by the keyword rules
// below. The rules are evaluated in order and the first match decides.
func isIE(s *string) bool {
	text := *s

	// containsAny reports whether text includes at least one of words.
	containsAny := func(words ...string) bool {
		for _, w := range words {
			if strings.Contains(text, w) {
				return true
			}
		}
		return false
	}

	// Entries starting with "情" (e.g. "情01"): one title is rejected outright,
	// and anything that is not "日本語表現" is accepted; "日本語表現" falls
	// through to the generic rules.
	if strings.HasPrefix(text, "情") {
		switch {
		case containsAny("情報科学演習"):
			return false
		case !containsAny("日本語表現"):
			return true
		}
	}

	switch {
	case strings.HasPrefix(text, "知能"), // e.g. "知能02"
		containsAny("情報工学", "知能情報"):
		return true
	case containsAny("電子情報通信", "エネルギー環境", "機械工学", "社会基盤デザイン", "電気システム"):
		return false
	}

	return containsAny("GEプログラム", "キャリアデザイン入門", "情報科教育法", "産業社会学")
}

// days lists the weekday kanji (Monday through Friday) that are searched for in lecture text.
var days = []string{"月", "火", "水", "木", "金"}
// kanjiDayLen is the UTF-8 byte length of one weekday kanji; it is used to step past a day character when slicing.
var kanjiDayLen = len("月")

// getPlace cuts the place (room) text out of a lecture body.
//
// The place is taken to end at the first "半年", or at the first "第" when
// "半年" is absent; if neither marker exists an error is returned. The place
// starts after a weekday kanji: the weekdays in days are tried in order, and
// the first one whose last occurrence lies before the end marker is used.
// Text between that kanji and the first Han character after it is skipped.
//
// It returns the place, the byte index of the chosen weekday kanji, and an
// error. When no weekday kanji precedes the end marker the index is -1 and
// the place is empty, so callers must check for -1 before using the result.
func getPlace(s string) (string, int, error) {
	lastIndex := strings.Index(s, "半年")
	if lastIndex == -1 {
		lastIndex = strings.Index(s, "第")
	}
	if lastIndex == -1 {
		return "", 0, errors.New("failed found place")
	}

	startDay := -1
	for _, day := range days {
		// The last occurrence is used on purpose: the earlier code always
		// ended up there as well.
		if idx := strings.LastIndex(s, day); idx != -1 && idx < lastIndex {
			startDay = idx
			break
		}
	}
	if startDay == -1 {
		// Nothing usable before the marker; slicing here would be out of range.
		return "", -1, nil
	}

	// Skip the day kanji itself, then anything up to the first Han character.
	rest := s[startDay+kanjiDayLen : lastIndex]
	for i, w := range rest {
		if unicode.Is(unicode.Han, w) {
			return rest[i:], startDay, nil
		}
	}
	return rest, startDay, nil
}

// commaLen is the UTF-8 byte length of the "・" separator between two grades.
var commaLen = len("・")

// getGrades reads the grade digit(s) written immediately before the weekday
// kanji located at byte offset dayKanjiIndex in s.
//
// The byte just before the kanji is parsed as a grade. If that digit is
// preceded by "・", the byte before the separator is parsed as an additional,
// earlier grade (e.g. "1・2月" yields [1 2]).
//
// An error is returned when dayKanjiIndex leaves no room for a grade digit,
// when "・" has nothing in front of it, or when a grade byte is not a digit.
func getGrades(s string, dayKanjiIndex int) ([]int, error) {
	if dayKanjiIndex < 1 || dayKanjiIndex > len(s) {
		return nil, errors.New("failed found grade")
	}

	var grade []int
	sepEnd := dayKanjiIndex - 1
	// Only look behind for the separator when the string is long enough;
	// slicing with a negative offset would panic.
	if sepStart := sepEnd - commaLen; sepStart >= 0 && s[sepStart:sepEnd] == "・" {
		if sepStart < 1 {
			return nil, errors.New("failed found grade")
		}
		pg, err := strconv.Atoi(s[sepStart-1 : sepStart])
		if err != nil {
			return nil, err
		}
		grade = append(grade, pg)
	}

	gr, err := strconv.Atoi(s[dayKanjiIndex-1 : dayKanjiIndex])
	if err != nil {
		return nil, err
	}
	grade = append(grade, gr)
	return grade, nil
}

// idRgex matches the nine-digit code that starts each lecture entry.
var idRgex = regexp.MustCompile(`\d{9}`)

// str2lectureFPDF splits s at every nine-digit id and turns the accepted
// entries into lectureFPDF values.
//
// An entry's body is the text between its id and the next id (or the end of
// s). Entries are dropped when the body mentions one of the skip words or is
// rejected by isIE. "大学英語" entries (but not "大学英語を") are delegated to
// parseUniversityEnglish. For every other entry the place and grades are
// parsed from the body; the day field is not populated on this path.
//
// It returns (nil, nil) when s contains no id at all, and stops at the first
// parse error.
func str2lectureFPDF(s string) ([]*lectureFPDF, error) {
	idSpans := idRgex.FindAllStringIndex(s, -1)
	if idSpans == nil {
		return nil, nil
	}

	// A body containing any of these words is skipped wholesale.
	skipWords := []string{"集中", "通年", "セミナー", "卒業研究"}

	var result []*lectureFPDF
	for n, span := range idSpans {
		// The body runs up to the start of the next id, or to the end of s.
		bodyEnd := len(s)
		if n+1 < len(idSpans) {
			bodyEnd = idSpans[n+1][0]
		}
		id := s[span[0]:span[1]]
		body := s[span[1]:bodyEnd]

		skip := false
		for _, w := range skipWords {
			if strings.Contains(body, w) {
				skip = true
				break
			}
		}
		if skip || !isIE(&body) {
			continue
		}

		if strings.Contains(body, "大学英語") && !strings.Contains(body, "大学英語を") {
			english, err := parseUniversityEnglish(body, id)
			if err != nil {
				return nil, errors.Wrap(err, "failed parse univ english")
			}
			result = append(result, english...)
			continue
		}

		place, dayIndex, err := getPlace(body)
		if err != nil {
			return nil, err
		}
		if dayIndex == -1 {
			// getPlace found no weekday kanji; report and move on.
			fmt.Printf("wan!!! %s not parsed ", body)
			continue
		}

		grades, err := getGrades(body, dayIndex)
		if err != nil {
			return nil, err
		}

		result = append(result, &lectureFPDF{
			id:       id,
			body:     body,
			isSelect: strings.Contains(body, "選"),
			place:    place,
			grades:   grades,
		})
	}
	return result, nil
}

// parseUniversityEnglish turns one "大学英語" entry, which is held on two
// weekdays in two different rooms, into two lectureFPDF values.
//
// Example bodies:
//
//	外101大学英語42―2非富里 明美8421~4月木2共3-205(月)共3-403(木)半年知能情報(42)
//	外101大学英語42―2非宮城 和文9401~4月木2共3-403(月)共3-201(木)半年知能情報(20)
//
// The two weekdays are the first two entries of days that occur in s. The
// single byte after the second weekday kanji is used as the period for both
// lectures. The first room runs from the first "共" to the first "("; the
// second room runs from the last "共" to the last "(" followed by the second
// weekday kanji. Both results share id and body, are not elective, and have
// grade 1.
//
// An error is returned when s does not contain two weekdays, a period, or
// the room delimiters, instead of panicking on an out-of-range slice.
func parseUniversityEnglish(s, id string) ([]*lectureFPDF, error) {
	type dayHit struct {
		index int
		kanji string
	}

	var hits []dayHit
	for _, day := range days {
		if idx := strings.Index(s, day); idx != -1 {
			hits = append(hits, dayHit{index: idx, kanji: day})
		}
	}
	if len(hits) < 2 {
		return nil, errors.New("failed found two lecture days")
	}
	first, second := hits[0], hits[1]

	// The period digit follows the second day kanji (e.g. "月木2").
	periodStart := second.index + kanjiDayLen
	if periodStart >= len(s) {
		return nil, errors.New("failed found lecture period")
	}
	period := s[periodStart : periodStart+1]

	const (
		kyouKanji = "共"
		leftParen = "("
	)

	// cut returns s[from:to] after checking that both delimiters were found
	// and appear in the expected order.
	cut := func(from, to int) (string, error) {
		if from < 0 || to < from {
			return "", errors.New("failed found place")
		}
		return s[from:to], nil
	}

	firstPlace, err := cut(strings.Index(s, kyouKanji), strings.Index(s, leftParen))
	if err != nil {
		return nil, err
	}
	secondPlace, err := cut(strings.LastIndex(s, kyouKanji), strings.LastIndex(s, leftParen+second.kanji))
	if err != nil {
		return nil, err
	}

	return []*lectureFPDF{
		{
			id:       id,
			isSelect: false,
			body:     s,
			place:    firstPlace,
			grades:   []int{1},
			day:      first.kanji + period,
		},
		{
			id:       id,
			isSelect: false,
			body:     s,
			place:    secondPlace,
			grades:   []int{1},
			day:      second.kanji + period,
		},
	}, nil
}