Mercurial > hg > Members > anatofuz > lectable
view pdf.go @ 26:5abe36dc0697 default tip
...
author | anatofuz <anatofuz@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 21 Sep 2020 16:03:31 +0900 |
parents | bac5eb544d4d |
children |
line wrap: on
line source
package lectable import ( "bytes" "fmt" "regexp" "strconv" "strings" "unicode" "github.com/ledongthuc/pdf" "github.com/pkg/errors" "golang.org/x/text/unicode/norm" ) type lectureFPDF struct { id string `json:id` body string `json:body` isSelect bool `json:isSelect` day string `json:day` place string `json:place` grades []int `json:grades` } type pdfPagesStr []string func convertStringFromPDF(pdfPaths []string) (*[]pdfPagesStr, error) { var ppsr []pdfPagesStr for _, apdf := range pdfPaths { file, reader, err := pdf.Open(apdf) defer file.Close() if err != nil { return nil, errors.Wrap(err, "failed open pdf") } var buf bytes.Buffer preader, err := reader.GetPlainText() if err != nil { return nil, errors.Wrap(err, "failed convert pdf") } buf.ReadFrom(preader) planString := string(norm.NFKC.Bytes(buf.Bytes())) planstrings := strings.Split(planString, " ") // " " is next pdf page //planstrings = planstrings[1:] // 最初はゴミ ppsr = append(ppsr, planstrings) } return &ppsr, nil } func isIE(s *string) bool { if strings.HasPrefix(*s, "情") { // === '^情' ( 情01) if strings.Contains(*s, "情報科学演習") { return false } if !strings.Contains(*s, "日本語表現") { return true } } if strings.HasPrefix(*s, "知能") { // === '^知能' (知能02) return true } if strings.Contains(*s, "情報工学") { return true } if strings.Contains(*s, "知能情報") { return true } if strings.Contains(*s, "電子情報通信") || strings.Contains(*s, "エネルギー環境") || strings.Contains(*s, "機械工学") || strings.Contains(*s, "社会基盤デザイン") || strings.Contains(*s, "電気システム") { return false } if strings.Contains(*s, "GEプログラム") { return true } if strings.Contains(*s, "キャリアデザイン入門") { return true } if strings.Contains(*s, "情報科教育法") { return true } if strings.Contains(*s, "産業社会学") { return true } return false } var days = []string{"月", "火", "水", "木", "金"} var kanjiDayLen = len("月") func getPlace(s string) (string, int, error) { var lastIndex int if lastIndex = strings.Index(s, "半年"); lastIndex == -1 { if lastIndex = strings.Index(s, "第"); lastIndex == -1 { return "", 0, errors.New("failed found place") } } var startDay int for _, day := range days { if startDay = strings.Index(s, day); startDay != -1 { if _, err := strconv.Atoi(s[startDay : startDay+kanjiDayLen]); err != nil { startDay = strings.LastIndex(s, day) } if startDay < lastIndex { break } } } distanceDayToPlace := 0 for i, w := range s[startDay+kanjiDayLen : lastIndex] { // +1 is skip day kanji chara (ex. "月") if unicode.Is(unicode.Han, w) { distanceDayToPlace = i break } } return s[startDay+kanjiDayLen+distanceDayToPlace : lastIndex], startDay, nil } var commaLen = len("・") func getGrades(s string, dayKanjiIndex int) ([]int, error) { comma := s[dayKanjiIndex-commaLen-1 : dayKanjiIndex-1] var grade []int if comma == "・" { prevgrade := s[dayKanjiIndex-commaLen-2 : dayKanjiIndex-commaLen-1] pg, err := strconv.Atoi(prevgrade) if err != nil { return nil, err } grade = append(grade, pg) } gr, err := strconv.Atoi(s[dayKanjiIndex-1 : dayKanjiIndex]) if err != nil { return nil, err } grade = append(grade, gr) return grade, nil } var idRgex = regexp.MustCompile(`\d{9}`) func str2lectureFPDF(s string) ([]*lectureFPDF, error) { is := idRgex.FindAllStringIndex(s, -1) if is == nil { return nil, nil } maxIndex := len(is) - 1 var lecturefpdfs []*lectureFPDF for i, h := range is { var lecturefpd lectureFPDF lecturefpd.id = s[h[0]:h[1]] if i != maxIndex { lecturefpd.body = s[h[1]:is[i+1][0]] } else { lecturefpd.body = s[h[1]:] } lecturefpd.isSelect = strings.Contains(lecturefpd.body, "選") if strings.Contains(lecturefpd.body, "集中") || strings.Contains(lecturefpd.body, "通年") { continue } if strings.Contains(lecturefpd.body, "セミナー") || strings.Contains(lecturefpd.body, "卒業研究") { continue } if !isIE(&lecturefpd.body) { continue } if strings.Contains(lecturefpd.body, "大学英語") && !strings.Contains(lecturefpd.body, "大学英語を") { lecs, err := parseUniversityEnglish(lecturefpd.body, lecturefpd.id) if err != nil { return nil, errors.Wrap(err, "failed parse univ english") } lecturefpdfs = append(lecturefpdfs, lecs...) continue } lplace, indexG, err := getPlace(lecturefpd.body) if err != nil { return nil, err } if indexG == -1 { fmt.Printf("wan!!! %s not parsed ", lecturefpd.body) continue } lecturefpd.place = lplace grades, err := getGrades(lecturefpd.body, indexG) if err != nil { return nil, err } lecturefpd.grades = grades lecturefpdfs = append(lecturefpdfs, &lecturefpd) } return lecturefpdfs, nil } func parseUniversityEnglish(s, id string) ([]*lectureFPDF, error) { /* 外101大学英語42―2非富里 明美8421~4月木2共3-205(月)共3-403(木)半年知能情報(42) 外101大学英語42―2非宮城 和文9401~4月木2共3-403(月)共3-201(木)半年知能情報(20) */ type uestruct struct { index int kanji string kanjiIndex int } var uess []uestruct for i, day := range days { if startDay := strings.Index(s, day); startDay != -1 { ues := uestruct{ index: startDay, kanji: day, kanjiIndex: i, } uess = append(uess, ues) } } var lecs []*lectureFPDF kyouKaji := "共" leftParen := "(" kindex := strings.Index(s, kyouKaji) leftParenIndex := strings.Index(s, leftParen) var strBuilder strings.Builder strBuilder.WriteString(uess[0].kanji) strBuilder.WriteString(s[uess[1].index+kanjiDayLen : uess[1].index+kanjiDayLen+1]) lecs = append(lecs, &lectureFPDF{ id: id, isSelect: false, body: s, place: s[kindex:leftParenIndex], grades: []int{1}, day: strBuilder.String(), }) strBuilder.Reset() kindex = strings.LastIndex(s, kyouKaji) strBuilder.WriteString(leftParen) strBuilder.WriteString(uess[1].kanji) leftParenIndex = strings.LastIndex(s, strBuilder.String()) strBuilder.Reset() strBuilder.WriteString(uess[1].kanji) strBuilder.WriteString(s[uess[1].index+kanjiDayLen : uess[1].index+kanjiDayLen+1]) lecs = append(lecs, &lectureFPDF{ id: id, isSelect: false, body: s, place: s[kindex:leftParenIndex], grades: []int{1}, day: strBuilder.String(), }) return lecs, nil }