diff --git a/apipara.go b/apipara.go
index 1ad272a..0e123ce 100644
--- a/apipara.go
+++ b/apipara.go
@@ -32,12 +32,12 @@ func (f *Docx) AddParagraph() *Paragraph {
 
 // AddParagraph adds a new paragraph
 func (c *WTableCell) AddParagraph() *Paragraph {
-	c.Paragraphs = append(c.Paragraphs, Paragraph{
+	c.Paragraphs = append(c.Paragraphs, &Paragraph{
 		Children: make([]interface{}, 0, 64),
 		file:     c.file,
 	})
 
-	return &c.Paragraphs[len(c.Paragraphs)-1]
+	return c.Paragraphs[len(c.Paragraphs)-1]
 }
 
 // Justification allows to set para's horizonal alignment
diff --git a/cmd/main/main.go b/cmd/main/main.go
index 5a83c5b..10b040a 100644
--- a/cmd/main/main.go
+++ b/cmd/main/main.go
@@ -33,6 +33,7 @@ import (
 func main() {
 	fileLocation := flag.String("f", "new-file.docx", "file location")
 	analyzeOnly := flag.Bool("a", false, "analyze file only")
+	clean := flag.Bool("c", false, "clean mode (keep text and picture only)")
 	unm := flag.Bool("u", false, "lease unmarshalled file")
 	flag.Parse()
 	var w *docx.Docx
@@ -173,6 +174,9 @@ func main() {
 	if err != nil {
 		panic(err)
 	}
+	if *clean {
+		doc.Document.Body.DropDrawingOf("NilPicture")
+	}
 	if *unm {
 		i := strings.LastIndex(*fileLocation, "/")
 		name := (*fileLocation)[:i+1] + "unmarshal_" + (*fileLocation)[i+1:]
@@ -191,9 +195,11 @@ func main() {
 	}
 	fmt.Println("Plain text:")
 	for _, it := range doc.Document.Body.Items {
-		switch it.(type) {
-		case *docx.Paragraph, *docx.Table: // printable
-			fmt.Println(it)
+		switch o := it.(type) {
+		case *docx.Paragraph: // printable
+			fmt.Println(o.String())
+		case *docx.Table: // printable
+			fmt.Println(o.String())
 		}
 	}
 	fmt.Println("End of main")
diff --git a/structdoc.go b/structdoc.go
index 983318a..7da7491 100644
--- a/structdoc.go
+++ b/structdoc.go
@@ -23,7 +23,9 @@ package docx
 import (
 	"encoding/xml"
 	"io"
+	"reflect"
+	"regexp"
 	"strings"
 )
 
 //nolint:revive,stylecheck
@@ -74,19 +76,19 @@ func (b *Body) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
 			switch tt.Name.Local {
 			case "p":
 				var value Paragraph
+				value.file = b.file
 				err = d.DecodeElement(&value, &tt)
 				if err != nil && !strings.HasPrefix(err.Error(), "expected") {
 					return err
 				}
-				value.file = b.file
 				b.Items = append(b.Items, &value)
 			case "tbl":
 				var value Table
+				value.file = b.file
 				err = d.DecodeElement(&value, &tt)
 				if err != nil && !strings.HasPrefix(err.Error(), "expected") {
 					return err
 				}
-				value.file = b.file
 				b.Items = append(b.Items, &value)
 			default:
 				err = d.Skip() // skip unsupported tags
@@ -99,6 +101,55 @@ func (b *Body) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
 	return nil
 }
 
+// KeepElements keeps the named elements and removes all others
+//
+// names: *docx.Paragraph *docx.Table
+func (b *Body) KeepElements(name ...string) {
+	items := make([]interface{}, 0, len(b.Items))
+	namemap := make(map[string]struct{}, len(name)*2)
+	for _, n := range name {
+		namemap[n] = struct{}{}
+	}
+	for _, item := range b.Items {
+		_, ok := namemap[reflect.ValueOf(item).Type().String()]
+		if ok {
+			items = append(items, item)
+		}
+	}
+	b.Items = items
+}
+
+// DropDrawingOf drops all matching drawings in body by calling the
+// Paragraph method named "Drop"+name on every paragraph, including those
+// inside table cells. A name without a matching method is ignored.
+//
+// name: Canvas, Shape, Group, ShapeAndCanvas, ShapeAndCanvasAndGroup, NilPicture
+func (b *Body) DropDrawingOf(name string) {
+	method := "Drop" + name
+	drop := func(p *Paragraph) {
+		// MethodByName returns the zero Value when p has no such method.
+		f := reflect.ValueOf(p).MethodByName(method)
+		if !f.IsValid() || f.Type().NumIn() != 0 {
+			return
+		}
+		_ = f.Call(nil)
+	}
+	for _, item := range b.Items {
+		switch o := item.(type) {
+		case *Paragraph:
+			drop(o)
+		case *Table:
+			for _, tr := range o.TableRows {
+				for _, tc := range tr.TableCells {
+					for _, p := range tc.Paragraphs {
+						drop(p)
+					}
+				}
+			}
+		}
+	}
+}
+
 // Document
 type Document struct {
 	XMLName xml.Name `xml:"w:document"`
@@ -150,6 +201,13 @@ func (doc *Document) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error
 // ParagraphSplitRule check whether the paragraph is a separator or not
 type ParagraphSplitRule func(*Paragraph) bool
 
+// SplitDocxByPlainTextRegex returns a rule that matches re against p.String()
+func SplitDocxByPlainTextRegex(re *regexp.Regexp) ParagraphSplitRule {
+	return func(p *Paragraph) bool {
+		return re.MatchString(p.String())
+	}
+}
+
 // SplitByParagraph splits a doc to many docs by using a matched paragraph
 // as the separator.
 //
@@ -263,10 +321,11 @@ func (t *Table) copymedia(to *Docx) (nt Table) {
 		ntr.file = to
 		for _, tc := range tr.TableCells {
 			ntc := *tc
-			ntc.Paragraphs = make([]Paragraph, 0, len(tc.Paragraphs))
+			ntc.Paragraphs = make([]*Paragraph, 0, len(tc.Paragraphs))
 			ntc.file = to
 			for _, p := range tc.Paragraphs {
-				ntc.Paragraphs = append(ntc.Paragraphs, p.copymedia(to))
+				np := p.copymedia(to)
+				ntc.Paragraphs = append(ntc.Paragraphs, &np)
 			}
 			ntr.TableCells = append(ntr.TableCells, &ntc)
 		}
diff --git a/structpara.go b/structpara.go
index 8544726..a7f2dc8 100644
--- a/structpara.go
+++ b/structpara.go
@@ -23,6 +23,7 @@ package docx
 import (
 	"encoding/xml"
 	"io"
+	"reflect"
 	"strconv"
 	"strings"
 )
@@ -296,3 +297,183 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
 	p.Children = children
 	return nil
 }
+
+// KeepElements keeps the named elements and removes all others
+//
+// names: *docx.Hyperlink *docx.Run *docx.RunProperties
+func (p *Paragraph) KeepElements(name ...string) {
+	items := make([]interface{}, 0, len(p.Children))
+	namemap := make(map[string]struct{}, len(name)*2)
+	for _, n := range name {
+		namemap[n] = struct{}{}
+	}
+	for _, item := range p.Children {
+		_, ok := namemap[reflect.ValueOf(item).Type().String()]
+		if ok {
+			items = append(items, item)
+		}
+	}
+	p.Children = items
+}
+
+// DropCanvas drops all canvases in paragraph
+func (p *Paragraph) DropCanvas() {
+	for _, pc := range p.Children {
+		if r, ok := pc.(*Run); ok {
+			nrc := make([]interface{}, 0, len(r.Children))
+			for _, rc := range r.Children {
+				if d, ok := rc.(*Drawing); ok {
+					if d.Inline != nil && d.Inline.Graphic != nil && d.Inline.Graphic.GraphicData != nil {
+						if d.Inline.Graphic.GraphicData.Canvas != nil {
+							continue
+						}
+					}
+					if d.Anchor != nil && d.Anchor.Graphic != nil && d.Anchor.Graphic.GraphicData != nil {
+						if d.Anchor.Graphic.GraphicData.Canvas != nil {
+							continue
+						}
+					}
+				}
+				nrc = append(nrc, rc)
+			}
+			r.Children = nrc
+		}
+	}
+}
+
+// DropShape drops all shapes in paragraph
+func (p *Paragraph) DropShape() {
+	for _, pc := range p.Children {
+		if r, ok := pc.(*Run); ok {
+			nrc := make([]interface{}, 0, len(r.Children))
+			for _, rc := range r.Children {
+				if d, ok := rc.(*Drawing); ok {
+					if d.Inline != nil && d.Inline.Graphic != nil && d.Inline.Graphic.GraphicData != nil {
+						if d.Inline.Graphic.GraphicData.Shape != nil {
+							continue
+						}
+					}
+					if d.Anchor != nil && d.Anchor.Graphic != nil && d.Anchor.Graphic.GraphicData != nil {
+						if d.Anchor.Graphic.GraphicData.Shape != nil {
+							continue
+						}
+					}
+				}
+				nrc = append(nrc, rc)
+			}
+			r.Children = nrc
+		}
+	}
+}
+
+// DropGroup drops all groups in paragraph
+func (p *Paragraph) DropGroup() {
+	for _, pc := range p.Children {
+		if r, ok := pc.(*Run); ok {
+			nrc := make([]interface{}, 0, len(r.Children))
+			for _, rc := range r.Children {
+				if d, ok := rc.(*Drawing); ok {
+					if d.Inline != nil && d.Inline.Graphic != nil && d.Inline.Graphic.GraphicData != nil {
+						if d.Inline.Graphic.GraphicData.Group != nil {
+							continue
+						}
+					}
+					if d.Anchor != nil && d.Anchor.Graphic != nil && d.Anchor.Graphic.GraphicData != nil {
+						if d.Anchor.Graphic.GraphicData.Group != nil {
+							continue
+						}
+					}
+				}
+				nrc = append(nrc, rc)
+			}
+			r.Children = nrc
+		}
+	}
+}
+
+// DropShapeAndCanvas drops all shapes and canvases in paragraph
+func (p *Paragraph) DropShapeAndCanvas() {
+	for _, pc := range p.Children {
+		if r, ok := pc.(*Run); ok {
+			nrc := make([]interface{}, 0, len(r.Children))
+			for _, rc := range r.Children {
+				if d, ok := rc.(*Drawing); ok {
+					if d.Inline != nil && d.Inline.Graphic != nil && d.Inline.Graphic.GraphicData != nil {
+						if d.Inline.Graphic.GraphicData.Shape != nil || d.Inline.Graphic.GraphicData.Canvas != nil {
+							continue
+						}
+					}
+					if d.Anchor != nil && d.Anchor.Graphic != nil && d.Anchor.Graphic.GraphicData != nil {
+						if d.Anchor.Graphic.GraphicData.Shape != nil || d.Anchor.Graphic.GraphicData.Canvas != nil {
+							continue
+						}
+					}
+				}
+				nrc = append(nrc, rc)
+			}
+			r.Children = nrc
+		}
+	}
+}
+
+// DropShapeAndCanvasAndGroup drops all shapes, canvases and groups in paragraph
+func (p *Paragraph) DropShapeAndCanvasAndGroup() {
+	for _, pc := range p.Children {
+		if r, ok := pc.(*Run); ok {
+			nrc := make([]interface{}, 0, len(r.Children))
+			for _, rc := range r.Children {
+				if d, ok := rc.(*Drawing); ok {
+					if d.Inline != nil && d.Inline.Graphic != nil && d.Inline.Graphic.GraphicData != nil {
+						if d.Inline.Graphic.GraphicData.Shape != nil || d.Inline.Graphic.GraphicData.Canvas != nil || d.Inline.Graphic.GraphicData.Group != nil {
+							continue
+						}
+					}
+					if d.Anchor != nil && d.Anchor.Graphic != nil && d.Anchor.Graphic.GraphicData != nil {
+						if d.Anchor.Graphic.GraphicData.Shape != nil || d.Anchor.Graphic.GraphicData.Canvas != nil || d.Anchor.Graphic.GraphicData.Group != nil {
+							continue
+						}
+					}
+				}
+				nrc = append(nrc, rc)
+			}
+			r.Children = nrc
+		}
+	}
+}
+
+// DropNilPicture drops all drawings with nil picture in paragraph
+func (p *Paragraph) DropNilPicture() {
+	for _, pc := range p.Children {
+		if r, ok := pc.(*Run); ok {
+			nrc := make([]interface{}, 0, len(r.Children))
+			for _, rc := range r.Children {
+				if d, ok := rc.(*Drawing); ok {
+					if d.Inline == nil && d.Anchor == nil {
+						continue
+					}
+					if (d.Inline != nil && d.Inline.Graphic == nil) || (d.Anchor != nil && d.Anchor.Graphic == nil) {
+						continue
+					}
+					if d.Inline != nil && d.Inline.Graphic != nil && d.Inline.Graphic.GraphicData == nil {
+						continue
+					}
+					if d.Anchor != nil && d.Anchor.Graphic != nil && d.Anchor.Graphic.GraphicData == nil {
+						continue
+					}
+					if d.Inline != nil && d.Inline.Graphic != nil && d.Inline.Graphic.GraphicData != nil {
+						if d.Inline.Graphic.GraphicData.Pic == nil {
+							continue
+						}
+					}
+					if d.Anchor != nil && d.Anchor.Graphic != nil && d.Anchor.Graphic.GraphicData != nil {
+						if d.Anchor.Graphic.GraphicData.Pic == nil {
+							continue
+						}
+					}
+				}
+				nrc = append(nrc, rc)
+			}
+			r.Children = nrc
+		}
+	}
+}
diff --git a/structrun.go b/structrun.go
index 68464b3..8f68fe0 100644
--- a/structrun.go
+++ b/structrun.go
@@ -23,6 +23,7 @@ package docx
 import (
 	"encoding/xml"
 	"io"
+	"reflect"
 	"strconv"
 	"strings"
 )
@@ -175,6 +176,24 @@ func (r *Run) parse(d *xml.Decoder, tt xml.StartElement) (child interface{}, err
 	return
 }
 
+// KeepElements keeps the named elements and removes all others
+//
+// names: *docx.Text *docx.Drawing *docx.Tab *docx.BarterRabbet
+func (r *Run) KeepElements(name ...string) {
+	items := make([]interface{}, 0, len(r.Children))
+	namemap := make(map[string]struct{}, len(name)*2)
+	for _, n := range name {
+		namemap[n] = struct{}{}
+	}
+	for _, item := range r.Children {
+		_, ok := namemap[reflect.ValueOf(item).Type().String()]
+		if ok {
+			items = append(items, item)
+		}
+	}
+	r.Children = items
+}
+
 // RunProperties encapsulates visual properties of a run
 type RunProperties struct {
 	XMLName xml.Name `xml:"w:rPr,omitempty"`
diff --git a/structshape.go b/structshape.go
index a53bdd2..3c3b229 100644
--- a/structshape.go
+++ b/structshape.go
@@ -600,11 +600,11 @@ func (c *WTextBoxContent) UnmarshalXML(d *xml.Decoder, start xml.StartElement) e
 			switch tt.Name.Local {
 			case "p":
 				var value Paragraph
+				value.file = c.file
 				err = d.DecodeElement(&value, &tt)
 				if err != nil && !strings.HasPrefix(err.Error(), "expected") {
 					return err
 				}
-				value.file = c.file
 				c.Paragraphs = append(c.Paragraphs, value)
 			default:
 				err = d.Skip() // skip unsupported tags
diff --git a/structtable.go b/structtable.go
index f8351cb..cdec32c 100644
--- a/structtable.go
+++ b/structtable.go
@@ -536,7 +536,7 @@ type WTableRowHeight struct {
 type WTableCell struct {
 	XMLName             xml.Name `xml:"w:tc,omitempty"`
 	TableCellProperties *WTableCellProperties
-	Paragraphs          []Paragraph `xml:"w:p,omitempty"`
+	Paragraphs          []*Paragraph `xml:"w:p,omitempty"`
 
 	file *Docx
 }
@@ -556,12 +556,12 @@ func (c *WTableCell) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error
 			switch tt.Name.Local {
 			case "p":
 				var value Paragraph
+				value.file = c.file
 				err = d.DecodeElement(&value, &tt)
 				if err != nil && !strings.HasPrefix(err.Error(), "expected") {
 					return err
 				}
-				value.file = c.file
-				c.Paragraphs = append(c.Paragraphs, value)
+				c.Paragraphs = append(c.Paragraphs, &value)
 			case "tcPr":
 				var value WTableCellProperties
 				err = d.DecodeElement(&value, &tt)