diff --git a/apilink.go b/apilink.go index f3d44e3..cc522fa 100644 --- a/apilink.go +++ b/apilink.go @@ -34,7 +34,7 @@ func (p *Paragraph) AddLink(text string, link string) *Hyperlink { }, } - p.Data = append(p.Data, ParagraphChild{Link: hyperlink}) + p.Children = append(p.Children, ParagraphChild{Link: hyperlink}) return hyperlink } diff --git a/apipara.go b/apipara.go index 28a2746..12bee30 100644 --- a/apipara.go +++ b/apipara.go @@ -3,19 +3,11 @@ package docxlib // AddParagraph adds a new paragraph func (f *Docx) AddParagraph() *Paragraph { p := &Paragraph{ - Data: make([]ParagraphChild, 0, 64), - file: f, + Children: make([]ParagraphChild, 0, 64), + file: f, } f.Document.Body.Paragraphs = append(f.Document.Body.Paragraphs, p) return p } - -func (f *Docx) Paragraphs() []*Paragraph { - return f.Document.Body.Paragraphs -} - -func (p *Paragraph) Children() (ret []ParagraphChild) { - return p.Data -} diff --git a/apirun.go b/apirun.go index 54b0298..f7af38c 100644 --- a/apirun.go +++ b/apirun.go @@ -29,7 +29,7 @@ func (p *Paragraph) AddText(text string) *Run { RunProperties: &RunProperties{}, } - p.Data = append(p.Data, ParagraphChild{Run: run}) + p.Children = append(p.Children, ParagraphChild{Run: run}) return run } diff --git a/cmd/getstructure/main.go b/cmd/getstructure/main.go index c4beef7..a0c7c63 100644 --- a/cmd/getstructure/main.go +++ b/cmd/getstructure/main.go @@ -6,7 +6,6 @@ import ( "os" "github.com/fumiama/docxlib" - "github.com/golang/glog" ) var fileLocation *string @@ -31,11 +30,16 @@ func main() { if err != nil { panic(err) } - for _, para := range doc.Paragraphs() { - glog.Infoln("There is a new paragraph", para) - for _, child := range para.Children() { - if child.Run != nil && child.Run.Text != nil { - fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text) + for _, para := range doc.Document.Body.Paragraphs { + fmt.Println("New paragraph") + for _, child := range para.Children { + if child.Run != nil { + if child.Run.Text != nil { + fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text) + } + if child.Run.Drawing != nil { + fmt.Printf("\tWe've found a new run with the drawing ->%s\n", child.Run.Drawing.Inline.DistT) // TODO: replace to refid + } } if child.Link != nil { id := child.Link.ID @@ -49,6 +53,7 @@ func main() { } } + fmt.Print("End of paragraph\n\n") } fmt.Println("End of main") } diff --git a/cmd/main/main.go b/cmd/main/main.go index 9698a06..538e425 100644 --- a/cmd/main/main.go +++ b/cmd/main/main.go @@ -52,10 +52,15 @@ func main() { if err != nil { panic(err) } - for _, para := range doc.Paragraphs() { - for _, child := range para.Children() { + for _, para := range doc.Document.Body.Paragraphs { + for _, child := range para.Children { if child.Run != nil { - fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text) + if child.Run.Text != nil { + fmt.Printf("\tWe've found a new run with the text ->%s\n", child.Run.Text.Text) + } + if child.Run.Drawing != nil { + fmt.Printf("\tWe've found a new run with the drawing ->%s\n", child.Run.Drawing.Inline.DistT) // TODO: replace to refid + } } if child.Link != nil { id := child.Link.ID diff --git a/go.mod b/go.mod index b3b2de1..8257b64 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,3 @@ module github.com/fumiama/docxlib go 1.16 - -require github.com/golang/glog v0.0.0-20210429001901-424d2337a529 diff --git a/go.sum b/go.sum index 41b530d..e69de29 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +0,0 @@ -github.com/golang/glog v0.0.0-20210429001901-424d2337a529 h1:2voWjNECnrZRbfwXxHB1/j8wa6xdKn85B5NzgVL/pTU= -github.com/golang/glog v0.0.0-20210429001901-424d2337a529/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= diff --git a/pack.go b/pack.go index d5e7359..e48a4d6 100644 --- a/pack.go +++ b/pack.go @@ -4,8 +4,6 @@ import ( "archive/zip" "encoding/xml" "strings" - - "github.com/golang/glog" ) // This receives a zip file writer (word documents are a zip with multiple xml inside) @@ -49,7 +47,6 @@ func marshal(data interface{}) (out string, err error) { sb.WriteString(xml.Header) err = xml.NewEncoder(&sb).Encode(data) if err != nil { - glog.Errorln("Error marshalling", err) return } out = sb.String() diff --git a/structdoc_test.go b/structdoc_test.go index 869952b..194b5a0 100644 --- a/structdoc_test.go +++ b/structdoc_test.go @@ -7,9 +7,8 @@ import ( const decoded_doc_1 = `testtest font sizetest colorNew style 1New style 2test font size and colorgoogle` const decoded_doc_2 = `Table of Contents TOC \h \z \t "Heading 1,2,S6,1,S0,1,S1,1,S2,1,S3,1,S4,1,S5,1" Holy Grail [xref:bRJduW6hNR] PAGEREF _Toc420414504 \h 21.What is your name? [xref:TH7u7QDqhD] PAGEREF _Toc420414505 \h 22.What is your quest? [xref:bC62HkFATC] PAGEREF _Toc420414506 \h 23.What is your favourite colour? [xref:I3TphuHX6N] PAGEREF _Toc420414507 \h 2Holy Grail [ FORMTEXT xref:bRJduW6hNR]What is your name? [ FORMTEXT xref:TH7u7QDqhD]My name is Sir Launcelot of Camelot.What is your quest? [ FORMTEXT xref:bC62HkFATC]To seek the Holy Grail[or a grail shaped beacon]. What is your favourite colour? [ FORMTEXT xref:I3TphuHX6N]Blue.How many paragraphs here then?` -const NUM_PARAGRAPHS = 5 -func TestStructure(t *testing.T) { +func TestPlainStructure(t *testing.T) { doc := Document{ XMLW: XMLNS_W, XMLR: XMLNS_R, @@ -22,29 +21,538 @@ func TestStructure(t *testing.T) { {decoded_doc_2, 19}, } for _, tc := range testCases { - err := xml.Unmarshal([]byte(tc.content), &doc) + err := xml.Unmarshal(StringToBytes(tc.content), &doc) if err != nil { - t.Errorf("We expected to be able to decode %s but we didn't", - tc.content) + t.Fatal(err) } if len(doc.Body.Paragraphs) != tc.numParagraphs { - t.Errorf("We expected %d paragraphs, we got %d", - NUM_PARAGRAPHS, len(doc.Body.Paragraphs)) + t.Fatalf("We expected %d paragraphs, we got %d", tc.numParagraphs, len(doc.Body.Paragraphs)) } - for _, p := range doc.Body.Paragraphs { - if len(p.Children()) == 0 { - t.Errorf("We were not able to parse paragraph %v", - p) + for i, p := range doc.Body.Paragraphs { + if len(p.Children) == 0 { + t.Fatalf("We were not able to parse paragraph %d", i) } - for _, child := range p.Children() { + for _, child := range p.Children { if child.Link == nil && child.Properties == nil && child.Run == nil { - t.Errorf("There are Paragraph children with all fields nil") + t.Fatalf("There are Paragraph children with all fields nil") } if child.Run != nil && child.Run.Text == nil && child.Run.InstrText == "" { - t.Errorf("We have a run with no text") + t.Fatalf("We have a run with no text") } if child.Link != nil && child.Link.ID == "" { - t.Errorf("We have a link without ID") + t.Fatalf("We have a link without ID") + } + } + } + } +} + +const drawing_doc = ` + + + + + + + + + + + + + 直接粘贴 + + + + + + + + + inline + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 一行2个 + + + inline + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 一行2个组合 + + + inline + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 一个 浮于上方 + + + + + + 右侧对齐 + + + + + + + + + 11.32cm + + + + + + + + + 23.73cm + + + + + + + + + + + + + + + + + 2935605 + + + 97790 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 + + + 0 + + + + + + + + + + + + +` + +func TestDrawingStructure(t *testing.T) { + doc := Document{ + XMLW: XMLNS_W, + XMLR: XMLNS_R, + XMLName: xml.Name{Space: XMLNS_W, Local: "document"}} + err := xml.Unmarshal(StringToBytes(drawing_doc), &doc) + if err != nil { + t.Fatal(err) + } + if len(doc.Body.Paragraphs) != 8 { + t.Fatalf("We expected %d paragraphs, we got %d", 8, len(doc.Body.Paragraphs)) + } + for i, p := range doc.Body.Paragraphs { + if len(p.Children) == 0 { + t.Fatalf("We were not able to parse paragraph %d", i) + } + for j, child := range p.Children { + if child.Link == nil && child.Properties == nil && child.Run == nil { + t.Fatalf("There are Paragraph children with all fields nil") + } + if child.Run != nil && child.Run.Text == nil && child.Run.InstrText == "" && child.Run.Drawing == nil { + t.Fatalf("We have a run with no text and drawing") + } + if child.Link != nil && child.Link.ID == "" { + t.Fatalf("We have a link without ID") + } + if child.Run != nil && child.Run.Drawing != nil { + t.Log("fild drawing at aragraph", i, ", child", j) + if child.Run.Drawing.Inline != nil { + tail := "-mock-inline-p" + string(rune('0'+i)) + "-c" + string(rune('0'+j)) + if "T"+tail != child.Run.Drawing.Inline.DistT { + t.Fatal("expect", "T"+tail, "but got", child.Run.Drawing.Inline.DistT) + } + if "B"+tail != child.Run.Drawing.Inline.DistB { + t.Fatal("expect", "B"+tail, "but got", child.Run.Drawing.Inline.DistB) + } + if "L"+tail != child.Run.Drawing.Inline.DistL { + t.Fatal("expect", "L"+tail, "but got", child.Run.Drawing.Inline.DistL) + } + if "R"+tail != child.Run.Drawing.Inline.DistR { + t.Fatal("expect", "R"+tail, "but got", child.Run.Drawing.Inline.DistR) + } } } } diff --git a/structnodes.go b/structnodes.go index 8aaab31..4b5a19b 100644 --- a/structnodes.go +++ b/structnodes.go @@ -3,8 +3,6 @@ package docxlib import ( "encoding/xml" "io" - - "github.com/golang/glog" ) type ParagraphChild struct { @@ -14,8 +12,8 @@ type ParagraphChild struct { } type Paragraph struct { - XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` - Data []ParagraphChild + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` + Children []ParagraphChild file *Docx } @@ -47,8 +45,7 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { var value Run d.DecodeElement(&value, &start) elem.Run = &value - if value.InstrText == "" && value.Text == nil { - glog.V(0).Infof("Empty run, we ignore") + if value.InstrText == "" && value.Text == nil && value.Drawing == nil { continue } case "rPr": @@ -62,7 +59,7 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { } } - *p = Paragraph{Data: children} + p.Children = children return nil } diff --git a/structrun.go b/structrun.go index fdb7096..ca10f40 100644 --- a/structrun.go +++ b/structrun.go @@ -9,29 +9,45 @@ const ( HYPERLINK_STYLE = "a1" ) -// A Run is part of a paragraph that has its own style. It could be +// Run is part of a paragraph that has its own style. It could be // a piece of text in bold, or a link type Run struct { XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main r,omitempty"` RunProperties *RunProperties `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"` InstrText string `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main instrText,omitempty"` Text *Text + Drawing *Drawing } -// The Text object contains the actual text +// Text object contains the actual text type Text struct { XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main t"` XMLSpace string `xml:"xml:space,attr,omitempty"` Text string `xml:",chardata"` } -// The hyperlink element contains links +// Hyperlink element contains links type Hyperlink struct { XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main hyperlink,omitempty"` ID string `xml:"http://schemas.openxmlformats.org/officeDocument/2006/relationships id,attr"` Run Run } +// Drawing element contains photos +type Drawing struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main drawing,omitempty"` + Inline *WPInline +} + +// WPInline wp:inline +type WPInline struct { + XMLName xml.Name `xml:"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing inline,omitempty"` + DistT string `xml:"wp:distT,attr"` + DistB string `xml:"wp:distB,attr"` + DistL string `xml:"wp:distL,attr"` + DistR string `xml:"wp:distR,attr"` +} + // RunProperties encapsulates visual properties of a run type RunProperties struct { XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main rPr,omitempty"` @@ -67,7 +83,6 @@ type Size struct { } func (r *Run) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { - var elem Run for { t, err := d.Token() if err == io.EOF { @@ -80,28 +95,30 @@ func (r *Run) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { case "rPr": var value RunProperties d.DecodeElement(&value, &start) - elem.RunProperties = &value + r.RunProperties = &value case "instrText": var value string d.DecodeElement(&value, &start) - elem.InstrText = value + r.InstrText = value case "t": var value Text d.DecodeElement(&value, &start) - elem.Text = &value + r.Text = &value + case "drawing": + var value Drawing + d.DecodeElement(&value, &start) + r.Drawing = &value default: continue } } } - *r = elem return nil } func (r *Text) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { - var elem Text for { t, err := d.Token() if err == io.EOF { @@ -110,16 +127,14 @@ func (r *Text) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { switch tt := t.(type) { case xml.CharData: - elem.Text = string(tt) // implicitly copy + r.Text = string(tt) // implicitly copy } } - *r = elem return nil } func (r *Hyperlink) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { - var elem Hyperlink for { t, err := d.Token() if err == io.EOF { @@ -129,19 +144,17 @@ func (r *Hyperlink) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { switch tt := t.(type) { case xml.StartElement: if tt.Name.Local == "r" { - d.DecodeElement(&elem.Run, &start) + d.DecodeElement(&r.Run, &start) } else { continue } } } - *r = elem return nil } -func (r *RunStyle) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { - var elem RunStyle +func (r *Drawing) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { for { t, err := d.Token() if err == io.EOF { @@ -150,11 +163,57 @@ func (r *RunStyle) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { switch tt := t.(type) { case xml.StartElement: - elem.Val = getAtt(tt.Attr, "val") + switch tt.Name.Local { + case "inline": + r.Inline = new(WPInline) + r.Inline.DistT = getAtt(tt.Attr, "distT") + r.Inline.DistB = getAtt(tt.Attr, "distB") + r.Inline.DistL = getAtt(tt.Attr, "distL") + r.Inline.DistR = getAtt(tt.Attr, "distR") + d.DecodeElement(r.Inline, &start) + default: + continue + } + } + + } + return nil + +} +func (r *WPInline) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { + for { + t, err := d.Token() + if err == io.EOF { + break + } + + switch tt := t.(type) { + case xml.StartElement: + switch tt.Name.Local { + case "inline": + + default: + continue + } + } + + } + return nil + +} +func (r *RunStyle) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { + for { + t, err := d.Token() + if err == io.EOF { + break + } + + switch tt := t.(type) { + case xml.StartElement: + r.Val = getAtt(tt.Attr, "val") } } - *r = elem return nil } diff --git a/unpack.go b/unpack.go index d595ad7..2616fae 100644 --- a/unpack.go +++ b/unpack.go @@ -5,8 +5,6 @@ import ( "archive/zip" "encoding/xml" "io" - - "github.com/golang/glog" ) // This receives a zip file (word documents are a zip with multiple xml inside) @@ -36,10 +34,8 @@ func unpack(zipReader *zip.Reader) (docx *Docx, err error) { func processDoc(file *zip.File, doc *Document) error { filebytes, err := readZipFile(file) if err != nil { - glog.Errorln("Error reading from internal zip file") return err } - glog.V(0).Infoln("Doc:", string(filebytes)) doc.XMLW = XMLNS_W doc.XMLR = XMLNS_R @@ -47,10 +43,8 @@ func processDoc(file *zip.File, doc *Document) error { doc.XMLName.Local = "document" err = xml.Unmarshal(filebytes, doc) if err != nil { - glog.Errorln("Error unmarshalling doc", string(filebytes)) return err } - glog.V(0).Infoln("Paragraph", doc.Body.Paragraphs) return nil } @@ -58,15 +52,12 @@ func processDoc(file *zip.File, doc *Document) error { func processRelations(file *zip.File, rels *Relationships) error { filebytes, err := readZipFile(file) if err != nil { - glog.Errorln("Error reading from internal zip file") return err } - glog.V(0).Infoln("Relations:", string(filebytes)) rels.Xmlns = XMLNS_R err = xml.Unmarshal(filebytes, rels) if err != nil { - glog.Errorln("Error unmarshalling relationships") return err } return nil