From 69a899d4a57ea1471ea0e34348d3868a13fb9046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com> Date: Thu, 9 Mar 2023 17:20:17 +0800 Subject: [PATCH] add SplitByParagraph --- .github/workflows/pull.yml | 2 +- .github/workflows/push.yml | 2 +- .golangci.yml | 2 +- apitable.go | 10 +-- cmd/main/main.go | 2 +- docx.go | 19 +----- empty.go | 2 - go.mod | 2 +- structdoc.go | 130 +++++++++++++++++++++++++++++++++++- structdrawing.go | 133 +++++++++++++++++++++++++++++++++++++ structtable.go | 8 +-- unpack.go | 7 +- 12 files changed, 282 insertions(+), 37 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 68fe2d8..c3debcb 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -8,7 +8,7 @@ jobs: - name: Set up Go uses: actions/setup-go@master with: - go-version: 1.19 + go-version: '1.20' - name: Check out code into the Go module directory uses: actions/checkout@master diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index e65e50a..e30c986 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -8,7 +8,7 @@ jobs: - name: Set up Go uses: actions/setup-go@master with: - go-version: 1.19 + go-version: '1.20' - name: Check out code into the Go module directory uses: actions/checkout@master diff --git a/.golangci.yml b/.golangci.yml index 5a34144..4c33d4f 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -59,7 +59,7 @@ run: tests: false skip-dirs: - order - go: '1.19' + go: '1.20' # output configuration options output: diff --git a/apitable.go b/apitable.go index eaeca58..eede711 100644 --- a/apitable.go +++ b/apitable.go @@ -23,7 +23,7 @@ package docx // AddTable add a new table to body by col*row // // unit: twips (1/20 point) -func (f *Docx) AddTable(row int, col int) *WTable { +func (f *Docx) AddTable(row int, col int) *Table { trs := make([]*WTableRow, row) for i := 0; i < row; i++ { cells := make([]*WTableCell, 
col) @@ -40,7 +40,7 @@ func (f *Docx) AddTable(row int, col int) *WTable { TableCells: cells, } } - tbl := &WTable{ + tbl := &Table{ TableProperties: &WTableProperties{ Width: &WTableWidth{Type: "auto"}, TableBorders: &WTableBorders{ @@ -65,7 +65,7 @@ func (f *Docx) AddTable(row int, col int) *WTable { // AddTableTwips add a new table to body by height and width // // unit: twips (1/20 point) -func (f *Docx) AddTableTwips(rowHeights []int64, colWidths []int64) *WTable { +func (f *Docx) AddTableTwips(rowHeights []int64, colWidths []int64) *Table { grids := make([]*WGridCol, len(colWidths)) trs := make([]*WTableRow, len(rowHeights)) for i, w := range colWidths { @@ -95,7 +95,7 @@ func (f *Docx) AddTableTwips(rowHeights []int64, colWidths []int64) *WTable { } } } - tbl := &WTable{ + tbl := &Table{ TableProperties: &WTableProperties{ Width: &WTableWidth{Type: "auto"}, TableBorders: &WTableBorders{ @@ -127,7 +127,7 @@ func (f *Docx) AddTableTwips(rowHeights []int64, colWidths []int64) *WTable { // end:右对齐。 // both:两端对齐。 // distribute:分散对齐。 -func (t *WTable) Justification(val string) *WTable { +func (t *Table) Justification(val string) *Table { if t.TableProperties.Justification == nil { t.TableProperties.Justification = &Justification{Val: val} return t diff --git a/cmd/main/main.go b/cmd/main/main.go index a4e8088..5a83c5b 100644 --- a/cmd/main/main.go +++ b/cmd/main/main.go @@ -192,7 +192,7 @@ func main() { fmt.Println("Plain text:") for _, it := range doc.Document.Body.Items { switch it.(type) { - case *docx.Paragraph, *docx.WTable: // printable + case *docx.Paragraph, *docx.Table: // printable fmt.Println(it) } } diff --git a/docx.go b/docx.go index 43501cd..0319dc3 100644 --- a/docx.go +++ b/docx.go @@ -24,7 +24,6 @@ package docx import ( "archive/zip" - "bytes" "io" "io/fs" "sync" @@ -50,9 +49,6 @@ type Docx struct { tmplfs fs.FS tmpfslst []string - buf *bytes.Buffer - isbufempty bool - io.Reader io.WriterTo } @@ -109,20 +105,9 @@ func (f *Docx) WriteTo(writer 
io.Writer) (_ int64, err error) { return 0, f.pack(zipWriter) } -// Read allows to save a docx to buf +// Read is a fake function and cannot be used func (f *Docx) Read(p []byte) (n int, err error) { - if !f.isbufempty { - n, err = f.buf.Read(p) - if err == io.EOF { - f.buf.Reset() - f.isbufempty = true - return - } - } - zipWriter := zip.NewWriter(f.buf) - defer zipWriter.Close() - f.isbufempty = false - return f.buf.Read(p) + panic("fake stub!") } // UseTemplate will replace template files diff --git a/empty.go b/empty.go index b025a39..5688ebe 100644 --- a/empty.go +++ b/empty.go @@ -21,7 +21,6 @@ package docx import ( - "bytes" "encoding/xml" ) @@ -71,7 +70,6 @@ func newEmptyA4File() *Docx { slowIDs: make(map[string]uintptr, 64), template: "a4", tmpfslst: A4TemplateFilesList, - buf: bytes.NewBuffer(make([]byte, 0, 1024*1024)), } docx.Document.Body.file = docx return docx diff --git a/go.mod b/go.mod index abedea5..fc3f48f 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,5 @@ module github.com/fumiama/go-docx -go 1.18 +go 1.20 require github.com/fumiama/imgsz v0.0.2 diff --git a/structdoc.go b/structdoc.go index 642ac8f..7a5d72c 100644 --- a/structdoc.go +++ b/structdoc.go @@ -81,7 +81,7 @@ func (b *Body) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { value.file = b.file b.Items = append(b.Items, &value) case "tbl": - var value WTable + var value Table err = d.DecodeElement(&value, &tt) if err != nil && !strings.HasPrefix(err.Error(), "expected") { return err @@ -146,3 +146,131 @@ func (doc *Document) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error } return nil } + +// ParagraphSplitRule check whether the paragraph is a separator or not +type ParagraphSplitRule func(*Paragraph) bool + +// SplitByParagraph splits a doc to many docs by using a matched paragraph +// as the separator. 
+// +// The separator will be placed to the first doc item +func (doc *Docx) SplitByParagraph(separator ParagraphSplitRule) (docs []*Docx) { + items := doc.Document.Body.Items +newdoclop: + for len(items) > 0 { + ndoc := new(Docx) + + // migrate base data + ndoc.mediaNameIdx = make(map[string]int, 64) + ndoc.slowIDs = make(map[string]uintptr, 64) + ndoc.template = doc.template + ndoc.tmplfs = doc.tmplfs + ndoc.tmpfslst = doc.tmpfslst + + ndoc.Document.XMLW = XMLNS_W + ndoc.Document.XMLR = XMLNS_R + ndoc.Document.XMLWP = XMLNS_WP + // ndoc.Document.XMLMC = XMLNS_MC + // ndoc.Document.XMLO = XMLNS_O + // ndoc.Document.XMLV = XMLNS_V + ndoc.Document.XMLWPS = XMLNS_WPS + ndoc.Document.XMLWPC = XMLNS_WPC + ndoc.Document.XMLWPG = XMLNS_WPG + // ndoc.Document.XMLWP14 = XMLNS_WP14 + ndoc.Document.XMLName.Space = XMLNS_W + ndoc.Document.XMLName.Local = "document" + ndoc.Document.Body.file = ndoc + + ndoc.docRelation = Relationships{ + Xmlns: XMLNS_REL, + Relationship: []Relationship{ + { + ID: "rId1", + Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`, + Target: "styles.xml", + }, + { + ID: "rId2", + Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`, + Target: "theme/theme1.xml", + }, + { + ID: "rId3", + Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`, + Target: "fontTable.xml", + }, + }, + } + + ndoc.rID = 3 + + for i, item := range items { + switch o := item.(type) { + case *Paragraph: + if separator(o) && len(ndoc.Document.Body.Items) > 0 { + items = items[i:] + docs = append(docs, ndoc) + continue newdoclop + } + np := o.copymedia(ndoc) + ndoc.Document.Body.Items = append(ndoc.Document.Body.Items, &np) + case *Table: + nt := o.copymedia(ndoc) + ndoc.Document.Body.Items = append(ndoc.Document.Body.Items, &nt) + default: + ndoc.Document.Body.Items = append(ndoc.Document.Body.Items, o) + } + } + + if len(ndoc.Document.Body.Items) > 0 { + docs = append(docs, ndoc) + } + 
+		break
+	}
+	return
+}
+
+// copymedia returns a copy of p bound to the doc to; each run is rebuilt so its drawings go through Drawing.copymedia.
+func (p *Paragraph) copymedia(to *Docx) (np Paragraph) {
+	np = *p
+	np.Children = make([]interface{}, 0, len(p.Children))
+	np.file = to
+	for _, pc := range p.Children {
+		if r, ok := pc.(*Run); ok {
+			nr := *r // shallow copy of the run; only Children and file are replaced below
+			nr.Children = make([]interface{}, 0, len(r.Children))
+			nr.file = to
+			for _, rc := range r.Children {
+				if d, ok := rc.(*Drawing); ok {
+					rc = d.copymedia(to) // swap the drawing for its copy bound to the target doc
+				}
+				nr.Children = append(nr.Children, rc)
+			}
+			pc = &nr // append the rebuilt run below, so runs are kept in the copied paragraph
+		}
+		np.Children = append(np.Children, pc)
+	}
+	return
+}
+
+func (t *Table) copymedia(to *Docx) (nt Table) {
+	nt = *t
+	nt.TableRows = make([]*WTableRow, 0, len(t.TableRows))
+	nt.file = to
+	for _, tr := range t.TableRows {
+		ntr := *tr
+		ntr.TableCells = make([]*WTableCell, 0, len(tr.TableCells))
+		ntr.file = to
+		for _, tc := range tr.TableCells {
+			ntc := *tc
+			ntc.Paragraphs = make([]Paragraph, 0, len(tc.Paragraphs))
+			ntc.file = to
+			for _, p := range tc.Paragraphs {
+				ntc.Paragraphs = append(ntc.Paragraphs, p.copymedia(to))
+			}
+			ntr.TableCells = append(ntr.TableCells, &ntc)
+		}
+		nt.TableRows = append(nt.TableRows, &ntr)
+	}
+	return
+}
diff --git a/structdrawing.go b/structdrawing.go
index d23e145..3f37c13 100644
--- a/structdrawing.go
+++ b/structdrawing.go
@@ -27,6 +27,7 @@ import (
 	"io"
 	"strconv"
 	"strings"
+	"sync/atomic"
 )
 
 //nolint:revive,stylecheck
@@ -89,6 +90,22 @@ func (r *Drawing) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
 	return nil
 }
 
+func (r *Drawing) copymedia(to *Docx) *Drawing {
+	if r.Inline != nil {
+		return &Drawing{
+			Inline: r.Inline.copymedia(to),
+			file:   to,
+		}
+	}
+	if r.Anchor != nil {
+		return &Drawing{
+			Anchor: r.Anchor.copymedia(to),
+			file:   to,
+		}
+	}
+	return &Drawing{file: to}
+}
+
 // WPInline is an element that represents an inline image within a text paragraph.
// // It contains information about the image's size and position, @@ -275,6 +292,64 @@ func (r *WPInline) String() string { return "![inln?](unknown)" } +func (r *WPInline) copymedia(to *Docx) *WPInline { + if r.Graphic.GraphicData.Pic != nil { + if r.Graphic.GraphicData.Pic.BlipFill != nil { + tgt, err := r.file.ReferTarget(r.Graphic.GraphicData.Pic.BlipFill.Blip.Embed) + if err != nil { + return nil + } + format := tgt[strings.LastIndex(tgt, ".")+1:] + idn := int(atomic.AddUintptr(&to.docID, 1)) + id := int(to.IncreaseID("图片")) + ids := strconv.Itoa(id) + m := r.file.Media(tgt[6:]) + if m == nil { + return nil + } + rid := to.addImage(format, m.Data) + inln := *r + grph := *r.Graphic + inln.Graphic = &grph + grphdata := *r.Graphic.GraphicData + grph.GraphicData = &grphdata + pic := *r.Graphic.GraphicData.Pic + grphdata.Pic = &pic + grphdata.file = to + grph.file = to + inln.file = to + + inln.DocPr = &WPDocPr{ + ID: idn, + Name: "图片 " + ids, + } + pic.NonVisualPicProperties = &PICNonVisualPicProperties{ + NonVisualDrawingProperties: NonVisualProperties{ + ID: id, + Name: "图片 " + ids, + }, + CNvPicPr: r.Graphic.GraphicData.Pic.NonVisualPicProperties.CNvPicPr, + } + pic.BlipFill = &PICBlipFill{ + Blip: ABlip{ + Embed: rid, + Cstate: r.Graphic.GraphicData.Pic.BlipFill.Blip.Cstate, + }, + Stretch: r.Graphic.GraphicData.Pic.BlipFill.Stretch, + } + return &inln + } + return nil + } + if r.Graphic.GraphicData.Shape != nil { // shape has no media + return r + } + if r.Graphic.GraphicData.Canvas != nil { //TODO: copy canvas media + return r + } + return nil +} + // WPExtent represents the extent of a drawing in a Word document. 
// // CX CY 's unit is English Metric Units, which is 1/914400 inch @@ -1350,6 +1425,64 @@ func (r *WPAnchor) String() string { return "![anch?](unknown)" } +func (r *WPAnchor) copymedia(to *Docx) *WPAnchor { + if r.Graphic.GraphicData.Pic != nil { + if r.Graphic.GraphicData.Pic.BlipFill != nil { + tgt, err := r.file.ReferTarget(r.Graphic.GraphicData.Pic.BlipFill.Blip.Embed) + if err != nil { + return nil + } + format := tgt[strings.LastIndex(tgt, ".")+1:] + idn := int(atomic.AddUintptr(&to.docID, 1)) + id := int(to.IncreaseID("图片")) + ids := strconv.Itoa(id) + m := r.file.Media(tgt[6:]) + if m == nil { + return nil + } + rid := to.addImage(format, m.Data) + anch := *r + grph := *r.Graphic + anch.Graphic = &grph + grphdata := *r.Graphic.GraphicData + grph.GraphicData = &grphdata + pic := *r.Graphic.GraphicData.Pic + grphdata.Pic = &pic + grphdata.file = to + grph.file = to + anch.file = to + + anch.DocPr = &WPDocPr{ + ID: idn, + Name: "图片 " + ids, + } + pic.NonVisualPicProperties = &PICNonVisualPicProperties{ + NonVisualDrawingProperties: NonVisualProperties{ + ID: id, + Name: "图片 " + ids, + }, + CNvPicPr: r.Graphic.GraphicData.Pic.NonVisualPicProperties.CNvPicPr, + } + pic.BlipFill = &PICBlipFill{ + Blip: ABlip{ + Embed: rid, + Cstate: r.Graphic.GraphicData.Pic.BlipFill.Blip.Cstate, + }, + Stretch: r.Graphic.GraphicData.Pic.BlipFill.Stretch, + } + return &anch + } + return nil + } + if r.Graphic.GraphicData.Shape != nil { // shape has no media + return r + } + if r.Graphic.GraphicData.Canvas != nil { //TODO: copy canvas media + return r + } + return nil +} + // WPSimplePos represents the position of an object in a Word document. type WPSimplePos struct { XMLName xml.Name `xml:"wp:simplePos,omitempty"` diff --git a/structtable.go b/structtable.go index 63333b4..f8351cb 100644 --- a/structtable.go +++ b/structtable.go @@ -27,8 +27,8 @@ import ( "strings" ) -// WTable represents a table within a Word document. 
-type WTable struct { +// Table represents a table within a Word document. +type Table struct { XMLName xml.Name `xml:"w:tbl,omitempty"` TableProperties *WTableProperties TableGrid *WTableGrid @@ -37,7 +37,7 @@ type WTable struct { file *Docx } -func (t *WTable) String() string { +func (t *Table) String() string { if len(t.TableRows) == 0 || len(t.TableRows[0].TableCells) == 0 { return "" } @@ -62,7 +62,7 @@ func (t *WTable) String() string { } // UnmarshalXML implements the xml.Unmarshaler interface. -func (t *WTable) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { +func (t *Table) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { for { token, err := d.Token() if err == io.EOF { diff --git a/unpack.go b/unpack.go index da17c81..97672ff 100644 --- a/unpack.go +++ b/unpack.go @@ -22,7 +22,6 @@ package docx import ( "archive/zip" - "bytes" "encoding/xml" "io" "strings" @@ -67,7 +66,8 @@ func unpack(zipReader *zip.Reader) (docx *Docx, err error) { // fill remaining files into tmpfslst docx.tmpfslst = append(docx.tmpfslst, f.Name) } - docx.buf = bytes.NewBuffer(make([]byte, 0, 1024*1024)) + //TODO: find last imageID + docx.imageID = 100000 return } @@ -94,6 +94,7 @@ func (f *Docx) parseDocument(file *zip.File) error { f.Document.Body.file = f //TODO: find last docID + f.docID = 100000 err = xml.NewDecoder(zf).Decode(&f.Document) return err } @@ -108,12 +109,12 @@ func (f *Docx) parseDocRelation(file *zip.File) error { f.docRelation.Xmlns = XMLNS_R //TODO: find last rID + f.rID = 100000 return xml.NewDecoder(zf).Decode(&f.docRelation) } // parseMedia add the media into Docx struct func (f *Docx) parseMedia(file *zip.File) error { - //TODO: find last imageID name := file.Name[len(MEDIA_FOLDER):] zf, err := file.Open() if err != nil {