diff --git a/LICENSE b/LICENSE index f4b2c27..6e1d169 100644 --- a/LICENSE +++ b/LICENSE @@ -3,6 +3,7 @@ MIT License Copyright (c) 2020 gingfrederik Copyright (c) 2021 Gonzalo Fernandez-Victorio Copyright (c) 2021 Basement Crowd Ltd (https://www.basementcrowd.com) +Copyright (c) 2023 Fumiama Minamoto (源文雨) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 02cf46e..4473a3b 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ Yet another library to read and write .docx (a.k.a. Microsoft Word documents or ECMA-376 Office Open XML) files in Go. +This is a variant optimized and expanded by fumiama. The original repo is [gonfva/docxlib](https://github.com/gonfva/docxlib). + ## Introduction As part of my work for [Basement Crowd](https://www.basementcrowd.com) and [FromCounsel](https://www.fromcounsel.com), we were in need of a basic library to manipulate (both read and write) Microsoft Word documents. @@ -31,7 +33,7 @@ In the mean time, shared as an example in case somebody finds it useful. Go modules supported ```sh -go get github.com/gonfva/docxlib +go get github.com/fumiama/docxlib ``` ### Usage @@ -39,8 +41,7 @@ go get github.com/gonfva/docxlib See [main](main/main.go) for an example ``` -$ go build -o docxlib ./main -$ ./docxlib +$ go run ./cmd/main Preparing new document to write at /tmp/new-file.docx Document writen. Now trying to read it @@ -53,7 +54,7 @@ End of main ``` You can also increase the log level (-logtostderr=true -v=0) and just dump a specific file(-file /tmp/new-file.docx). See [getstructure/main](getstructure/main.go) ``` -$ go build -o docxlib ./getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx +$ go build -o docxlib ./cmd/getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...] 
I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...] I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340] diff --git a/apilink.go b/apilink.go index 80260ae..f3d44e3 100644 --- a/apilink.go +++ b/apilink.go @@ -1,18 +1,19 @@ package docxlib -import "strconv" +import ( + "strconv" + "sync/atomic" +) // when adding an hyperlink we need to store a reference in the relationship field -func (f *DocxLib) addLinkRelation(link string) string { +func (f *Docx) addLinkRelation(link string) string { rel := &Relationship{ - ID: "rId" + strconv.Itoa(f.rId), + ID: "rId" + strconv.Itoa(int(atomic.AddUintptr(&f.rId, 1))), Type: REL_HYPERLINK, Target: link, TargetMode: REL_TARGETMODE, } - f.rId += 1 - f.DocRelation.Relationships = append(f.DocRelation.Relationships, rel) return rel.ID diff --git a/apipara.go b/apipara.go index 6eff353..28a2746 100644 --- a/apipara.go +++ b/apipara.go @@ -1,17 +1,18 @@ package docxlib // AddParagraph adds a new paragraph -func (f *DocxLib) AddParagraph() *Paragraph { +func (f *Docx) AddParagraph() *Paragraph { p := &Paragraph{ - Data: make([]ParagraphChild, 0), + Data: make([]ParagraphChild, 0, 64), file: f, } f.Document.Body.Paragraphs = append(f.Document.Body.Paragraphs, p) + return p } -func (f *DocxLib) Paragraphs() []*Paragraph { +func (f *Docx) Paragraphs() []*Paragraph { return f.Document.Body.Paragraphs } diff --git a/apirun.go b/apirun.go index 25a9110..54b0298 100644 --- a/apirun.go +++ b/apirun.go @@ -14,6 +14,7 @@ func (r *Run) Size(size int) *Run { r.RunProperties.Size = &Size{ Val: size * 2, } + return r } diff --git a/getstructure/main.go b/cmd/getstructure/main.go similarity index 94% rename from getstructure/main.go rename to cmd/getstructure/main.go index db70522..c4beef7 100644 --- a/getstructure/main.go +++ b/cmd/getstructure/main.go @@ -5,8 +5,8 @@ import ( "fmt" "os" + "github.com/fumiama/docxlib" "github.com/golang/glog" - "github.com/gonfva/docxlib" ) var fileLocation *string @@ -40,7 
+40,7 @@ func main() { if child.Link != nil { id := child.Link.ID text := child.Link.Run.InstrText - link, err := doc.References(id) + link, err := doc.Refer(id) if err != nil { fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text) } else { diff --git a/main/main.go b/cmd/main/main.go similarity index 96% rename from main/main.go rename to cmd/main/main.go index f129a9c..9698a06 100644 --- a/main/main.go +++ b/cmd/main/main.go @@ -5,7 +5,7 @@ import ( "fmt" "os" - "github.com/gonfva/docxlib" + "github.com/fumiama/docxlib" ) var fileLocation *string @@ -60,7 +60,7 @@ func main() { if child.Link != nil { id := child.Link.ID text := child.Link.Run.InstrText - link, err := doc.References(id) + link, err := doc.Refer(id) if err != nil { fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text) } else { diff --git a/docxlib.go b/docxlib.go index db93c23..b2c2e57 100644 --- a/docxlib.go +++ b/docxlib.go @@ -6,33 +6,40 @@ import ( "io" ) -// DocxLib is the structure that allow to access the internal represntation +var ( + // ErrRefIDNotFound is returned when no reference with the given id is found + ErrRefIDNotFound = errors.New("ref id not found") +) + +// Docx is the structure that allows access to the internal representation // in memory of the doc (either read or about to be written) -type DocxLib struct { +type Docx struct { Document Document DocRelation Relationships - rId int + rId uintptr } // New generates a new empty docx file that we can manipulate and // later on, save -func New() *DocxLib { - return emptyFile() +func New() *Docx { + return newEmptyFile() } // Parse generates a new docx file in memory from a reader // You can it invoke from a file -// readFile, err := os.Open(FILE_PATH) -// if err != nil { -// panic(err) -// } -// fileinfo, err := readFile.Stat() -// if err != nil { -// panic(err) -// } -// size := fileinfo.Size() -// doc, err := docxlib.Parse(readFile, int64(size)) +// +// readFile, err := os.Open(FILE_PATH) +// if err != 
nil { +// panic(err) +// } +// fileinfo, err := readFile.Stat() +// if err != nil { +// panic(err) +// } +// size := fileinfo.Size() +// doc, err := docxlib.Parse(readFile, int64(size)) +// // but also you can invoke from a webform (BEWARE of trusting users data!!!) // // func uploadFile(w http.ResponseWriter, r *http.Request) { @@ -48,7 +55,7 @@ func New() *DocxLib { // defer file.Close() // docxlib.Parse(file, handler.Size) // } -func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) { +func Parse(reader io.ReaderAt, size int64) (doc *Docx, err error) { zipReader, err := zip.NewReader(reader, size) if err != nil { return nil, err @@ -58,21 +65,21 @@ func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) { } // Write allows to save a docx to a writer -func (f *DocxLib) Write(writer io.Writer) (err error) { +func (f *Docx) Write(writer io.Writer) (err error) { zipWriter := zip.NewWriter(writer) defer zipWriter.Close() return f.pack(zipWriter) } -// References gets the url for a reference -func (f *DocxLib) References(id string) (href string, err error) { +// Refer gets the url for a reference +func (f *Docx) Refer(id string) (href string, err error) { for _, a := range f.DocRelation.Relationships { if a.ID == id { href = a.Target return } } - err = errors.New("id not found") + err = ErrRefIDNotFound return } diff --git a/empty.go b/empty.go index 929e3e9..d0f47c0 100644 --- a/empty.go +++ b/empty.go @@ -2,29 +2,8 @@ package docxlib import "encoding/xml" -func emptyRelationships() []*Relationship { - defaultRel := []*Relationship{ - { - ID: "rId1", - Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`, - Target: "styles.xml", - }, - { - ID: "rId2", - Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`, - Target: "theme/theme1.xml", - }, - { - ID: "rId3", - Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`, - Target: "fontTable.xml", - }, - } - 
return defaultRel -} - -func emptyFile() *DocxLib { - docx := &DocxLib{ +func newEmptyFile() *Docx { + return &Docx{ Document: Document{ XMLName: xml.Name{ Space: "w", @@ -35,14 +14,29 @@ func emptyFile() *DocxLib { XMLName: xml.Name{ Space: "w", }, - Paragraphs: make([]*Paragraph, 0), + Paragraphs: make([]*Paragraph, 0, 64), }, }, DocRelation: Relationships{ - Xmlns: XMLNS, - Relationships: emptyRelationships(), + Xmlns: XMLNS, + Relationships: []*Relationship{ + { + ID: "rId1", + Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`, + Target: "styles.xml", + }, + { + ID: "rId2", + Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`, + Target: "theme/theme1.xml", + }, + { + ID: "rId3", + Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`, + Target: "fontTable.xml", + }, + }, }, - rId: 4, + rId: 3, } - return docx } diff --git a/go.mod b/go.mod index fe15a75..b3b2de1 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module github.com/gonfva/docxlib +module github.com/fumiama/docxlib go 1.16 diff --git a/helper.go b/helper.go new file mode 100644 index 0000000..a2bfaf2 --- /dev/null +++ b/helper.go @@ -0,0 +1,20 @@ +package docxlib + +import ( + "unsafe" +) + +// BytesToString converts b to a string without copying (no memory overhead); the result shares b's memory +func BytesToString(b []byte) string { + return *(*string)(unsafe.Pointer(&b)) +} + +// StringToBytes converts s to a byte slice without copying (no memory overhead); the result shares s's memory and must not be modified +func StringToBytes(s string) (b []byte) { + bh := (*slice)(unsafe.Pointer(&b)) + sh := (*slice)(unsafe.Pointer(&s)) + bh.data = sh.data + bh.len = sh.len + bh.cap = sh.len + return b +} diff --git a/pack.go b/pack.go index 5569a86..d5e7359 100644 --- a/pack.go +++ b/pack.go @@ -3,6 +3,7 @@ package docxlib import ( "archive/zip" "encoding/xml" + "strings" "github.com/golang/glog" ) @@ -10,7 +11,7 @@ import ( // This receives a zip file writer (word documents are a zip with multiple xml inside) // and writes the relevant files. 
Some of them come from the empty_constants file, // others from the actual in-memory structure -func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) { +func (f *Docx) pack(zipWriter *zip.Writer) (err error) { files := map[string]string{} files["_rels/.rels"] = TEMP_REL @@ -34,7 +35,7 @@ func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) { return err } - _, err = w.Write([]byte(data)) + _, err = w.Write(StringToBytes(data)) if err != nil { return err } @@ -44,12 +45,13 @@ func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) { } func marshal(data interface{}) (out string, err error) { - body, err := xml.Marshal(data) + sb := strings.Builder{} + sb.WriteString(xml.Header) + err = xml.NewEncoder(&sb).Encode(data) if err != nil { glog.Errorln("Error marshalling", err) return } - - out = xml.Header + string(body) + out = sb.String() return } diff --git a/slice.go b/slice.go new file mode 100644 index 0000000..0be4613 --- /dev/null +++ b/slice.go @@ -0,0 +1,15 @@ +package docxlib + +import "unsafe" + +// slice is the runtime representation of a slice. +// It cannot be used safely or portably and its representation may +// change in a later release. +// +// Unlike reflect.SliceHeader, its Data field is sufficient to guarantee the +// data it references will not be garbage collected. 
+type slice struct { + data unsafe.Pointer + len int + cap int +} diff --git a/structnodes.go b/structnodes.go index a08dbe0..8aaab31 100644 --- a/structnodes.go +++ b/structnodes.go @@ -17,11 +17,11 @@ type Paragraph struct { XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"` Data []ParagraphChild - file *DocxLib + file *Docx } func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { - children := make([]ParagraphChild, 0) + children := make([]ParagraphChild, 0, 64) for { t, err := d.Token() if err == io.EOF { @@ -30,7 +30,8 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { switch tt := t.(type) { case xml.StartElement: var elem ParagraphChild - if tt.Name.Local == "hyperlink" { + switch tt.Name.Local { + case "hyperlink": var value Hyperlink d.DecodeElement(&value, &start) id := getAtt(tt.Attr, "id") @@ -41,20 +42,20 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { if anchor != "" { value.ID = anchor } - elem = ParagraphChild{Link: &value} - } else if tt.Name.Local == "r" { + elem.Link = &value + case "r": var value Run d.DecodeElement(&value, &start) - elem = ParagraphChild{Run: &value} + elem.Run = &value if value.InstrText == "" && value.Text == nil { glog.V(0).Infof("Empty run, we ignore") continue } - } else if tt.Name.Local == "rPr" { + case "rPr": var value RunProperties d.DecodeElement(&value, &start) - elem = ParagraphChild{Properties: &value} - } else { + elem.Properties = &value + default: continue } children = append(children, elem) diff --git a/structrun.go b/structrun.go index 0805627..fdb7096 100644 --- a/structrun.go +++ b/structrun.go @@ -76,19 +76,20 @@ func (r *Run) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { switch tt := t.(type) { case xml.StartElement: - if tt.Name.Local == "rPr" { + switch tt.Name.Local { + case "rPr": var value RunProperties d.DecodeElement(&value, &start) elem.RunProperties = 
&value - } else if tt.Name.Local == "instrText" { + case "instrText": var value string d.DecodeElement(&value, &start) elem.InstrText = value - } else if tt.Name.Local == "t" { + case "t": var value Text d.DecodeElement(&value, &start) elem.Text = &value - } else { + default: continue } } @@ -109,8 +110,7 @@ func (r *Text) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { switch tt := t.(type) { case xml.CharData: - cd := tt.Copy() - elem.Text = string(cd) + elem.Text = string(tt) // implicitly copy } } diff --git a/unpack.go b/unpack.go index ba04311..d595ad7 100644 --- a/unpack.go +++ b/unpack.go @@ -4,7 +4,7 @@ package docxlib import ( "archive/zip" "encoding/xml" - "io/ioutil" + "io" "github.com/golang/glog" ) @@ -13,68 +13,63 @@ import ( // and parses the files that are relevant for us: // 1.-Document // 2.-Relationships -func unpack(zipReader *zip.Reader) (docx *DocxLib, err error) { - var doc *Document - var relations *Relationships +func unpack(zipReader *zip.Reader) (docx *Docx, err error) { + docx = new(Docx) for _, f := range zipReader.File { if f.Name == "word/_rels/document.xml.rels" { - relations, err = processRelations(f) + err = processRelations(f, &docx.DocRelation) if err != nil { - return nil, err + return } } if f.Name == "word/document.xml" { - doc, err = processDoc(f) + err = processDoc(f, &docx.Document) if err != nil { - return nil, err + return } } } - docx = &DocxLib{ - Document: *doc, - DocRelation: *relations, - } - return docx, nil + return } // Processes one of the relevant files, the one with the actual document -func processDoc(file *zip.File) (*Document, error) { +func processDoc(file *zip.File, doc *Document) error { filebytes, err := readZipFile(file) if err != nil { glog.Errorln("Error reading from internal zip file") - return nil, err + return err } glog.V(0).Infoln("Doc:", string(filebytes)) - doc := Document{ - XMLW: XMLNS_W, - XMLR: XMLNS_R, - XMLName: xml.Name{Space: XMLNS_W, Local: "document"}} - err = 
xml.Unmarshal(filebytes, &doc) + doc.XMLW = XMLNS_W + doc.XMLR = XMLNS_R + doc.XMLName.Space = XMLNS_W + doc.XMLName.Local = "document" + err = xml.Unmarshal(filebytes, doc) if err != nil { glog.Errorln("Error unmarshalling doc", string(filebytes)) - return nil, err + return err } glog.V(0).Infoln("Paragraph", doc.Body.Paragraphs) - return &doc, nil + return nil } // Processes one of the relevant files, the one with the relationships -func processRelations(file *zip.File) (*Relationships, error) { +func processRelations(file *zip.File, rels *Relationships) error { filebytes, err := readZipFile(file) if err != nil { glog.Errorln("Error reading from internal zip file") - return nil, err + return err } glog.V(0).Infoln("Relations:", string(filebytes)) - rels := Relationships{Xmlns: XMLNS_R} - err = xml.Unmarshal(filebytes, &rels) + rels.Xmlns = XMLNS_R + err = xml.Unmarshal(filebytes, rels) if err != nil { glog.Errorln("Error unmarshalling relationships") - return nil, err + return err } - return &rels, nil + return nil } // From a zip file structure, we return a byte array @@ -84,5 +79,5 @@ func readZipFile(zf *zip.File) ([]byte, error) { return nil, err } defer f.Close() - return ioutil.ReadAll(f) + return io.ReadAll(f) }