1
0
mirror of https://github.com/fumiama/go-docx.git synced 2026-06-23 20:16:38 +08:00

优化代码结构

This commit is contained in:
源文雨
2023-02-08 16:19:09 +08:00
parent d8f39cecf1
commit 7ff4850504
16 changed files with 153 additions and 114 deletions

View File

@@ -3,6 +3,7 @@ MIT License
Copyright (c) 2020 gingfrederik Copyright (c) 2020 gingfrederik
Copyright (c) 2021 Gonzalo Fernandez-Victorio Copyright (c) 2021 Gonzalo Fernandez-Victorio
Copyright (c) 2021 Basement Crowd Ltd (https://www.basementcrowd.com) Copyright (c) 2021 Basement Crowd Ltd (https://www.basementcrowd.com)
Copyright (c) 2023 Fumiama Minamoto (源文雨)
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View File

@@ -2,6 +2,8 @@
Yet another library to read and write .docx (a.k.a. Microsoft Word documents or ECMA-376 Office Open XML) files in Go. Yet another library to read and write .docx (a.k.a. Microsoft Word documents or ECMA-376 Office Open XML) files in Go.
This is a variant optimized and expanded by fumiama. The original repo is [gonfva/docxlib](https://github.com/gonfva/docxlib).
## Introduction ## Introduction
As part of my work for [Basement Crowd](https://www.basementcrowd.com) and [FromCounsel](https://www.fromcounsel.com), we were in need of a basic library to manipulate (both read and write) Microsoft Word documents. As part of my work for [Basement Crowd](https://www.basementcrowd.com) and [FromCounsel](https://www.fromcounsel.com), we were in need of a basic library to manipulate (both read and write) Microsoft Word documents.
@@ -31,7 +33,7 @@ In the mean time, shared as an example in case somebody finds it useful.
Go modules supported Go modules supported
```sh ```sh
go get github.com/gonfva/docxlib go get github.com/fumiama/docxlib
``` ```
### Usage ### Usage
@@ -39,8 +41,7 @@ go get github.com/gonfva/docxlib
See [main](main/main.go) for an example See [main](main/main.go) for an example
``` ```
$ go build -o docxlib ./main $ go run ./cmd/main
$ ./docxlib
Preparing new document to write at /tmp/new-file.docx Preparing new document to write at /tmp/new-file.docx
Document writen. Document writen.
Now trying to read it Now trying to read it
@@ -53,7 +54,7 @@ End of main
``` ```
You can also increase the log level (-logtostderr=true -v=0) and just dump a specific file(-file /tmp/new-file.docx). See [getstructure/main](getstructure/main.go) You can also increase the log level (-logtostderr=true -v=0) and just dump a specific file(-file /tmp/new-file.docx). See [getstructure/main](getstructure/main.go)
``` ```
$ go build -o docxlib ./getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx $ go build -o docxlib ./cmd/getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx
I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...] I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...]
I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...] I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...]
I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340] I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340]

View File

@@ -1,18 +1,19 @@
package docxlib package docxlib
import "strconv" import (
"strconv"
"sync/atomic"
)
// when adding a hyperlink we need to store a reference in the relationship field // when adding a hyperlink we need to store a reference in the relationship field
func (f *DocxLib) addLinkRelation(link string) string { func (f *Docx) addLinkRelation(link string) string {
rel := &Relationship{ rel := &Relationship{
ID: "rId" + strconv.Itoa(f.rId), ID: "rId" + strconv.Itoa(int(atomic.AddUintptr(&f.rId, 1))),
Type: REL_HYPERLINK, Type: REL_HYPERLINK,
Target: link, Target: link,
TargetMode: REL_TARGETMODE, TargetMode: REL_TARGETMODE,
} }
f.rId += 1
f.DocRelation.Relationships = append(f.DocRelation.Relationships, rel) f.DocRelation.Relationships = append(f.DocRelation.Relationships, rel)
return rel.ID return rel.ID

View File

@@ -1,17 +1,18 @@
package docxlib package docxlib
// AddParagraph adds a new paragraph // AddParagraph adds a new paragraph
func (f *DocxLib) AddParagraph() *Paragraph { func (f *Docx) AddParagraph() *Paragraph {
p := &Paragraph{ p := &Paragraph{
Data: make([]ParagraphChild, 0), Data: make([]ParagraphChild, 0, 64),
file: f, file: f,
} }
f.Document.Body.Paragraphs = append(f.Document.Body.Paragraphs, p) f.Document.Body.Paragraphs = append(f.Document.Body.Paragraphs, p)
return p return p
} }
func (f *DocxLib) Paragraphs() []*Paragraph { func (f *Docx) Paragraphs() []*Paragraph {
return f.Document.Body.Paragraphs return f.Document.Body.Paragraphs
} }

View File

@@ -14,6 +14,7 @@ func (r *Run) Size(size int) *Run {
r.RunProperties.Size = &Size{ r.RunProperties.Size = &Size{
Val: size * 2, Val: size * 2,
} }
return r return r
} }

View File

@@ -5,8 +5,8 @@ import (
"fmt" "fmt"
"os" "os"
"github.com/fumiama/docxlib"
"github.com/golang/glog" "github.com/golang/glog"
"github.com/gonfva/docxlib"
) )
var fileLocation *string var fileLocation *string
@@ -40,7 +40,7 @@ func main() {
if child.Link != nil { if child.Link != nil {
id := child.Link.ID id := child.Link.ID
text := child.Link.Run.InstrText text := child.Link.Run.InstrText
link, err := doc.References(id) link, err := doc.Refer(id)
if err != nil { if err != nil {
fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text) fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text)
} else { } else {

View File

@@ -5,7 +5,7 @@ import (
"fmt" "fmt"
"os" "os"
"github.com/gonfva/docxlib" "github.com/fumiama/docxlib"
) )
var fileLocation *string var fileLocation *string
@@ -60,7 +60,7 @@ func main() {
if child.Link != nil { if child.Link != nil {
id := child.Link.ID id := child.Link.ID
text := child.Link.Run.InstrText text := child.Link.Run.InstrText
link, err := doc.References(id) link, err := doc.Refer(id)
if err != nil { if err != nil {
fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text) fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text)
} else { } else {

View File

@@ -6,33 +6,40 @@ import (
"io" "io"
) )
// DocxLib is the structure that allow to access the internal represntation var (
// ErrRefIDNotFound cannot find such reference
ErrRefIDNotFound = errors.New("ref id not found")
)
// Docx is the structure that allows access to the internal representation
// in memory of the doc (either read or about to be written) // in memory of the doc (either read or about to be written)
type DocxLib struct { type Docx struct {
Document Document Document Document
DocRelation Relationships DocRelation Relationships
rId int rId uintptr
} }
// New generates a new empty docx file that we can manipulate and // New generates a new empty docx file that we can manipulate and
// later on, save // later on, save
func New() *DocxLib { func New() *Docx {
return emptyFile() return newEmptyFile()
} }
// Parse generates a new docx file in memory from a reader // Parse generates a new docx file in memory from a reader
// You can invoke it from a file // You can invoke it from a file
// readFile, err := os.Open(FILE_PATH) //
// if err != nil { // readFile, err := os.Open(FILE_PATH)
// panic(err) // if err != nil {
// } // panic(err)
// fileinfo, err := readFile.Stat() // }
// if err != nil { // fileinfo, err := readFile.Stat()
// panic(err) // if err != nil {
// } // panic(err)
// size := fileinfo.Size() // }
// doc, err := docxlib.Parse(readFile, int64(size)) // size := fileinfo.Size()
// doc, err := docxlib.Parse(readFile, int64(size))
//
// but also you can invoke from a webform (BEWARE of trusting users data!!!) // but also you can invoke from a webform (BEWARE of trusting users data!!!)
// //
// func uploadFile(w http.ResponseWriter, r *http.Request) { // func uploadFile(w http.ResponseWriter, r *http.Request) {
@@ -48,7 +55,7 @@ func New() *DocxLib {
// defer file.Close() // defer file.Close()
// docxlib.Parse(file, handler.Size) // docxlib.Parse(file, handler.Size)
// } // }
func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) { func Parse(reader io.ReaderAt, size int64) (doc *Docx, err error) {
zipReader, err := zip.NewReader(reader, size) zipReader, err := zip.NewReader(reader, size)
if err != nil { if err != nil {
return nil, err return nil, err
@@ -58,21 +65,21 @@ func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) {
} }
// Write allows to save a docx to a writer // Write allows to save a docx to a writer
func (f *DocxLib) Write(writer io.Writer) (err error) { func (f *Docx) Write(writer io.Writer) (err error) {
zipWriter := zip.NewWriter(writer) zipWriter := zip.NewWriter(writer)
defer zipWriter.Close() defer zipWriter.Close()
return f.pack(zipWriter) return f.pack(zipWriter)
} }
// References gets the url for a reference // Refer gets the url for a reference
func (f *DocxLib) References(id string) (href string, err error) { func (f *Docx) Refer(id string) (href string, err error) {
for _, a := range f.DocRelation.Relationships { for _, a := range f.DocRelation.Relationships {
if a.ID == id { if a.ID == id {
href = a.Target href = a.Target
return return
} }
} }
err = errors.New("id not found") err = ErrRefIDNotFound
return return
} }

View File

@@ -2,29 +2,8 @@ package docxlib
import "encoding/xml" import "encoding/xml"
func emptyRelationships() []*Relationship { func newEmptyFile() *Docx {
defaultRel := []*Relationship{ return &Docx{
{
ID: "rId1",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`,
Target: "styles.xml",
},
{
ID: "rId2",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`,
Target: "theme/theme1.xml",
},
{
ID: "rId3",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`,
Target: "fontTable.xml",
},
}
return defaultRel
}
func emptyFile() *DocxLib {
docx := &DocxLib{
Document: Document{ Document: Document{
XMLName: xml.Name{ XMLName: xml.Name{
Space: "w", Space: "w",
@@ -35,14 +14,29 @@ func emptyFile() *DocxLib {
XMLName: xml.Name{ XMLName: xml.Name{
Space: "w", Space: "w",
}, },
Paragraphs: make([]*Paragraph, 0), Paragraphs: make([]*Paragraph, 0, 64),
}, },
}, },
DocRelation: Relationships{ DocRelation: Relationships{
Xmlns: XMLNS, Xmlns: XMLNS,
Relationships: emptyRelationships(), Relationships: []*Relationship{
{
ID: "rId1",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`,
Target: "styles.xml",
},
{
ID: "rId2",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`,
Target: "theme/theme1.xml",
},
{
ID: "rId3",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`,
Target: "fontTable.xml",
},
},
}, },
rId: 4, rId: 3,
} }
return docx
} }

2
go.mod
View File

@@ -1,4 +1,4 @@
module github.com/gonfva/docxlib module github.com/fumiama/docxlib
go 1.16 go 1.16

20
helper.go Normal file
View File

@@ -0,0 +1,20 @@
package docxlib
import (
"unsafe"
)
// BytesToString 没有内存开销的转换
func BytesToString(b []byte) string {
return *(*string)(unsafe.Pointer(&b))
}
// StringToBytes 没有内存开销的转换
func StringToBytes(s string) (b []byte) {
bh := (*slice)(unsafe.Pointer(&b))
sh := (*slice)(unsafe.Pointer(&s))
bh.data = sh.data
bh.len = sh.len
bh.cap = sh.len
return b
}

12
pack.go
View File

@@ -3,6 +3,7 @@ package docxlib
import ( import (
"archive/zip" "archive/zip"
"encoding/xml" "encoding/xml"
"strings"
"github.com/golang/glog" "github.com/golang/glog"
) )
@@ -10,7 +11,7 @@ import (
// This receives a zip file writer (word documents are a zip with multiple xml inside) // This receives a zip file writer (word documents are a zip with multiple xml inside)
// and writes the relevant files. Some of them come from the empty_constants file, // and writes the relevant files. Some of them come from the empty_constants file,
// others from the actual in-memory structure // others from the actual in-memory structure
func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) { func (f *Docx) pack(zipWriter *zip.Writer) (err error) {
files := map[string]string{} files := map[string]string{}
files["_rels/.rels"] = TEMP_REL files["_rels/.rels"] = TEMP_REL
@@ -34,7 +35,7 @@ func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) {
return err return err
} }
_, err = w.Write([]byte(data)) _, err = w.Write(StringToBytes(data))
if err != nil { if err != nil {
return err return err
} }
@@ -44,12 +45,13 @@ func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) {
} }
func marshal(data interface{}) (out string, err error) { func marshal(data interface{}) (out string, err error) {
body, err := xml.Marshal(data) sb := strings.Builder{}
sb.WriteString(xml.Header)
err = xml.NewEncoder(&sb).Encode(data)
if err != nil { if err != nil {
glog.Errorln("Error marshalling", err) glog.Errorln("Error marshalling", err)
return return
} }
out = sb.String()
out = xml.Header + string(body)
return return
} }

15
slice.go Normal file
View File

@@ -0,0 +1,15 @@
package docxlib
import "unsafe"
// slice is the runtime representation of a slice.
// It cannot be used safely or portably and its representation may
// change in a later release.
//
// Unlike reflect.SliceHeader, its Data field is sufficient to guarantee the
// data it references will not be garbage collected.
//
// The helpers in this package overlay this struct on slice and string
// values through unsafe.Pointer in order to copy the data pointer and
// length without allocating, so the field order must not be changed.
type slice struct {
// data is the pointer to the underlying bytes; being an unsafe.Pointer
// (rather than a uintptr) it keeps the referenced memory alive.
data unsafe.Pointer
// len is the number of elements in use.
len int
// cap is the number of elements available from data.
cap int
}

View File

@@ -17,11 +17,11 @@ type Paragraph struct {
XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"`
Data []ParagraphChild Data []ParagraphChild
file *DocxLib file *Docx
} }
func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
children := make([]ParagraphChild, 0) children := make([]ParagraphChild, 0, 64)
for { for {
t, err := d.Token() t, err := d.Token()
if err == io.EOF { if err == io.EOF {
@@ -30,7 +30,8 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
switch tt := t.(type) { switch tt := t.(type) {
case xml.StartElement: case xml.StartElement:
var elem ParagraphChild var elem ParagraphChild
if tt.Name.Local == "hyperlink" { switch tt.Name.Local {
case "hyperlink":
var value Hyperlink var value Hyperlink
d.DecodeElement(&value, &start) d.DecodeElement(&value, &start)
id := getAtt(tt.Attr, "id") id := getAtt(tt.Attr, "id")
@@ -41,20 +42,20 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
if anchor != "" { if anchor != "" {
value.ID = anchor value.ID = anchor
} }
elem = ParagraphChild{Link: &value} elem.Link = &value
} else if tt.Name.Local == "r" { case "r":
var value Run var value Run
d.DecodeElement(&value, &start) d.DecodeElement(&value, &start)
elem = ParagraphChild{Run: &value} elem.Run = &value
if value.InstrText == "" && value.Text == nil { if value.InstrText == "" && value.Text == nil {
glog.V(0).Infof("Empty run, we ignore") glog.V(0).Infof("Empty run, we ignore")
continue continue
} }
} else if tt.Name.Local == "rPr" { case "rPr":
var value RunProperties var value RunProperties
d.DecodeElement(&value, &start) d.DecodeElement(&value, &start)
elem = ParagraphChild{Properties: &value} elem.Properties = &value
} else { default:
continue continue
} }
children = append(children, elem) children = append(children, elem)

View File

@@ -76,19 +76,20 @@ func (r *Run) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
switch tt := t.(type) { switch tt := t.(type) {
case xml.StartElement: case xml.StartElement:
if tt.Name.Local == "rPr" { switch tt.Name.Local {
case "rPr":
var value RunProperties var value RunProperties
d.DecodeElement(&value, &start) d.DecodeElement(&value, &start)
elem.RunProperties = &value elem.RunProperties = &value
} else if tt.Name.Local == "instrText" { case "instrText":
var value string var value string
d.DecodeElement(&value, &start) d.DecodeElement(&value, &start)
elem.InstrText = value elem.InstrText = value
} else if tt.Name.Local == "t" { case "t":
var value Text var value Text
d.DecodeElement(&value, &start) d.DecodeElement(&value, &start)
elem.Text = &value elem.Text = &value
} else { default:
continue continue
} }
} }
@@ -109,8 +110,7 @@ func (r *Text) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
switch tt := t.(type) { switch tt := t.(type) {
case xml.CharData: case xml.CharData:
cd := tt.Copy() elem.Text = string(tt) // implicitly copy
elem.Text = string(cd)
} }
} }

View File

@@ -4,7 +4,7 @@ package docxlib
import ( import (
"archive/zip" "archive/zip"
"encoding/xml" "encoding/xml"
"io/ioutil" "io"
"github.com/golang/glog" "github.com/golang/glog"
) )
@@ -13,68 +13,63 @@ import (
// and parses the files that are relevant for us: // and parses the files that are relevant for us:
// 1.-Document // 1.-Document
// 2.-Relationships // 2.-Relationships
func unpack(zipReader *zip.Reader) (docx *DocxLib, err error) { func unpack(zipReader *zip.Reader) (docx *Docx, err error) {
var doc *Document docx = new(Docx)
var relations *Relationships
for _, f := range zipReader.File { for _, f := range zipReader.File {
if f.Name == "word/_rels/document.xml.rels" { if f.Name == "word/_rels/document.xml.rels" {
relations, err = processRelations(f) err = processRelations(f, &docx.DocRelation)
if err != nil { if err != nil {
return nil, err return
} }
} }
if f.Name == "word/document.xml" { if f.Name == "word/document.xml" {
doc, err = processDoc(f) err = processDoc(f, &docx.Document)
if err != nil { if err != nil {
return nil, err return
} }
} }
} }
docx = &DocxLib{ return
Document: *doc,
DocRelation: *relations,
}
return docx, nil
} }
// Processes one of the relevant files, the one with the actual document // Processes one of the relevant files, the one with the actual document
func processDoc(file *zip.File) (*Document, error) { func processDoc(file *zip.File, doc *Document) error {
filebytes, err := readZipFile(file) filebytes, err := readZipFile(file)
if err != nil { if err != nil {
glog.Errorln("Error reading from internal zip file") glog.Errorln("Error reading from internal zip file")
return nil, err return err
} }
glog.V(0).Infoln("Doc:", string(filebytes)) glog.V(0).Infoln("Doc:", string(filebytes))
doc := Document{ doc.XMLW = XMLNS_W
XMLW: XMLNS_W, doc.XMLR = XMLNS_R
XMLR: XMLNS_R, doc.XMLName.Space = XMLNS_W
XMLName: xml.Name{Space: XMLNS_W, Local: "document"}} doc.XMLName.Local = "document"
err = xml.Unmarshal(filebytes, &doc) err = xml.Unmarshal(filebytes, doc)
if err != nil { if err != nil {
glog.Errorln("Error unmarshalling doc", string(filebytes)) glog.Errorln("Error unmarshalling doc", string(filebytes))
return nil, err return err
} }
glog.V(0).Infoln("Paragraph", doc.Body.Paragraphs) glog.V(0).Infoln("Paragraph", doc.Body.Paragraphs)
return &doc, nil return nil
} }
// Processes one of the relevant files, the one with the relationships // Processes one of the relevant files, the one with the relationships
func processRelations(file *zip.File) (*Relationships, error) { func processRelations(file *zip.File, rels *Relationships) error {
filebytes, err := readZipFile(file) filebytes, err := readZipFile(file)
if err != nil { if err != nil {
glog.Errorln("Error reading from internal zip file") glog.Errorln("Error reading from internal zip file")
return nil, err return err
} }
glog.V(0).Infoln("Relations:", string(filebytes)) glog.V(0).Infoln("Relations:", string(filebytes))
rels := Relationships{Xmlns: XMLNS_R} rels.Xmlns = XMLNS_R
err = xml.Unmarshal(filebytes, &rels) err = xml.Unmarshal(filebytes, rels)
if err != nil { if err != nil {
glog.Errorln("Error unmarshalling relationships") glog.Errorln("Error unmarshalling relationships")
return nil, err return err
} }
return &rels, nil return nil
} }
// From a zip file structure, we return a byte array // From a zip file structure, we return a byte array
@@ -84,5 +79,5 @@ func readZipFile(zf *zip.File) ([]byte, error) {
return nil, err return nil, err
} }
defer f.Close() defer f.Close()
return ioutil.ReadAll(f) return io.ReadAll(f)
} }