1
0
mirror of https://github.com/fumiama/go-docx.git synced 2026-06-27 14:40:24 +08:00

优化代码结构

This commit is contained in:
源文雨
2023-02-08 16:19:09 +08:00
parent d8f39cecf1
commit 7ff4850504
16 changed files with 153 additions and 114 deletions

View File

@@ -3,6 +3,7 @@ MIT License
Copyright (c) 2020 gingfrederik
Copyright (c) 2021 Gonzalo Fernandez-Victorio
Copyright (c) 2021 Basement Crowd Ltd (https://www.basementcrowd.com)
Copyright (c) 2023 Fumiama Minamoto (源文雨)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@@ -2,6 +2,8 @@
Yet another library to read and write .docx (a.k.a. Microsoft Word documents or ECMA-376 Office Open XML) files in Go.
This is a variant optimized and expanded by fumiama. The original repo is [gonfva/docxlib](https://github.com/gonfva/docxlib).
## Introduction
As part of my work for [Basement Crowd](https://www.basementcrowd.com) and [FromCounsel](https://www.fromcounsel.com), we were in need of a basic library to manipulate (both read and write) Microsoft Word documents.
@@ -31,7 +33,7 @@ In the mean time, shared as an example in case somebody finds it useful.
Go modules supported
```sh
go get github.com/gonfva/docxlib
go get github.com/fumiama/docxlib
```
### Usage
@@ -39,8 +41,7 @@ go get github.com/gonfva/docxlib
See [main](main/main.go) for an example
```
$ go build -o docxlib ./main
$ ./docxlib
$ go run ./cmd/main
Preparing new document to write at /tmp/new-file.docx
Document writen.
Now trying to read it
@@ -53,7 +54,7 @@ End of main
```
You can also increase the log level (-logtostderr=true -v=0) and just dump a specific file (-file /tmp/new-file.docx). See [getstructure/main](getstructure/main.go)
```
$ go build -o docxlib ./getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx
$ go build -o docxlib ./cmd/getstructure/ && ./docxlib -logtostderr=true -v=0 -file /tmp/new-file.docx
I0511 12:37:40.898493 18466 unpack.go:69] Relations: [...]
I0511 12:37:40.898787 18466 unpack.go:47] Doc: [...]
I0511 12:37:40.899330 18466 unpack.go:58] Paragraph [0xc000026d40 0xc000027d00 0xc000172340]

View File

@@ -1,18 +1,19 @@
package docxlib
import "strconv"
import (
"strconv"
"sync/atomic"
)
// when adding a hyperlink we need to store a reference in the relationship field
func (f *DocxLib) addLinkRelation(link string) string {
func (f *Docx) addLinkRelation(link string) string {
rel := &Relationship{
ID: "rId" + strconv.Itoa(f.rId),
ID: "rId" + strconv.Itoa(int(atomic.AddUintptr(&f.rId, 1))),
Type: REL_HYPERLINK,
Target: link,
TargetMode: REL_TARGETMODE,
}
f.rId += 1
f.DocRelation.Relationships = append(f.DocRelation.Relationships, rel)
return rel.ID

View File

@@ -1,17 +1,18 @@
package docxlib
// AddParagraph adds a new paragraph
func (f *DocxLib) AddParagraph() *Paragraph {
func (f *Docx) AddParagraph() *Paragraph {
p := &Paragraph{
Data: make([]ParagraphChild, 0),
Data: make([]ParagraphChild, 0, 64),
file: f,
}
f.Document.Body.Paragraphs = append(f.Document.Body.Paragraphs, p)
return p
}
func (f *DocxLib) Paragraphs() []*Paragraph {
func (f *Docx) Paragraphs() []*Paragraph {
return f.Document.Body.Paragraphs
}

View File

@@ -14,6 +14,7 @@ func (r *Run) Size(size int) *Run {
r.RunProperties.Size = &Size{
Val: size * 2,
}
return r
}

View File

@@ -5,8 +5,8 @@ import (
"fmt"
"os"
"github.com/fumiama/docxlib"
"github.com/golang/glog"
"github.com/gonfva/docxlib"
)
var fileLocation *string
@@ -40,7 +40,7 @@ func main() {
if child.Link != nil {
id := child.Link.ID
text := child.Link.Run.InstrText
link, err := doc.References(id)
link, err := doc.Refer(id)
if err != nil {
fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text)
} else {

View File

@@ -5,7 +5,7 @@ import (
"fmt"
"os"
"github.com/gonfva/docxlib"
"github.com/fumiama/docxlib"
)
var fileLocation *string
@@ -60,7 +60,7 @@ func main() {
if child.Link != nil {
id := child.Link.ID
text := child.Link.Run.InstrText
link, err := doc.References(id)
link, err := doc.Refer(id)
if err != nil {
fmt.Printf("\tWe found a link with id %s and text %s without target\n", id, text)
} else {

View File

@@ -6,33 +6,40 @@ import (
"io"
)
// DocxLib is the structure that allow to access the internal represntation
var (
// ErrRefIDNotFound cannot find such reference
ErrRefIDNotFound = errors.New("ref id not found")
)
// Docx is the structure that allows access to the internal representation
// in memory of the doc (either read or about to be written)
type DocxLib struct {
type Docx struct {
Document Document
DocRelation Relationships
rId int
rId uintptr
}
// New generates a new empty docx file that we can manipulate and
// later on, save
func New() *DocxLib {
return emptyFile()
func New() *Docx {
return newEmptyFile()
}
// Parse generates a new docx file in memory from a reader
// You can invoke it from a file
// readFile, err := os.Open(FILE_PATH)
// if err != nil {
// panic(err)
// }
// fileinfo, err := readFile.Stat()
// if err != nil {
// panic(err)
// }
// size := fileinfo.Size()
// doc, err := docxlib.Parse(readFile, int64(size))
//
// readFile, err := os.Open(FILE_PATH)
// if err != nil {
// panic(err)
// }
// fileinfo, err := readFile.Stat()
// if err != nil {
// panic(err)
// }
// size := fileinfo.Size()
// doc, err := docxlib.Parse(readFile, int64(size))
//
// but also you can invoke from a webform (BEWARE of trusting users data!!!)
//
// func uploadFile(w http.ResponseWriter, r *http.Request) {
@@ -48,7 +55,7 @@ func New() *DocxLib {
// defer file.Close()
// docxlib.Parse(file, handler.Size)
// }
func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) {
func Parse(reader io.ReaderAt, size int64) (doc *Docx, err error) {
zipReader, err := zip.NewReader(reader, size)
if err != nil {
return nil, err
@@ -58,21 +65,21 @@ func Parse(reader io.ReaderAt, size int64) (doc *DocxLib, err error) {
}
// Write saves a docx to a writer
func (f *DocxLib) Write(writer io.Writer) (err error) {
func (f *Docx) Write(writer io.Writer) (err error) {
zipWriter := zip.NewWriter(writer)
defer zipWriter.Close()
return f.pack(zipWriter)
}
// References gets the url for a reference
func (f *DocxLib) References(id string) (href string, err error) {
// Refer gets the url for a reference
func (f *Docx) Refer(id string) (href string, err error) {
for _, a := range f.DocRelation.Relationships {
if a.ID == id {
href = a.Target
return
}
}
err = errors.New("id not found")
err = ErrRefIDNotFound
return
}

View File

@@ -2,29 +2,8 @@ package docxlib
import "encoding/xml"
func emptyRelationships() []*Relationship {
defaultRel := []*Relationship{
{
ID: "rId1",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`,
Target: "styles.xml",
},
{
ID: "rId2",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`,
Target: "theme/theme1.xml",
},
{
ID: "rId3",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`,
Target: "fontTable.xml",
},
}
return defaultRel
}
func emptyFile() *DocxLib {
docx := &DocxLib{
func newEmptyFile() *Docx {
return &Docx{
Document: Document{
XMLName: xml.Name{
Space: "w",
@@ -35,14 +14,29 @@ func emptyFile() *DocxLib {
XMLName: xml.Name{
Space: "w",
},
Paragraphs: make([]*Paragraph, 0),
Paragraphs: make([]*Paragraph, 0, 64),
},
},
DocRelation: Relationships{
Xmlns: XMLNS,
Relationships: emptyRelationships(),
Xmlns: XMLNS,
Relationships: []*Relationship{
{
ID: "rId1",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles`,
Target: "styles.xml",
},
{
ID: "rId2",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme`,
Target: "theme/theme1.xml",
},
{
ID: "rId3",
Type: `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable`,
Target: "fontTable.xml",
},
},
},
rId: 4,
rId: 3,
}
return docx
}

2
go.mod
View File

@@ -1,4 +1,4 @@
module github.com/gonfva/docxlib
module github.com/fumiama/docxlib
go 1.16

20
helper.go Normal file
View File

@@ -0,0 +1,20 @@
package docxlib
import (
"unsafe"
)
// BytesToString converts b to a string without allocating or copying.
//
// The slice header of b is reinterpreted in place as a string header, so the
// returned string shares b's backing array: any later write to b is visible
// through the string. Callers should stop mutating b once it has been
// converted.
func BytesToString(b []byte) string {
	hdr := unsafe.Pointer(&b)
	return *(*string)(hdr)
}
// StringToBytes converts s to a byte slice without allocating or copying.
//
// The result's data pointer and length are taken straight from the string
// header, and its capacity is set to the string length so that an append
// always reallocates instead of writing past the string. The returned slice
// aliases the bytes of s; since Go strings are immutable, callers must treat
// it as read-only.
func StringToBytes(s string) (b []byte) {
	src := (*slice)(unsafe.Pointer(&s))
	dst := (*slice)(unsafe.Pointer(&b))
	dst.data, dst.len, dst.cap = src.data, src.len, src.len
	return
}

12
pack.go
View File

@@ -3,6 +3,7 @@ package docxlib
import (
"archive/zip"
"encoding/xml"
"strings"
"github.com/golang/glog"
)
@@ -10,7 +11,7 @@ import (
// This receives a zip file writer (word documents are a zip with multiple xml inside)
// and writes the relevant files. Some of them come from the empty_constants file,
// others from the actual in-memory structure
func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) {
func (f *Docx) pack(zipWriter *zip.Writer) (err error) {
files := map[string]string{}
files["_rels/.rels"] = TEMP_REL
@@ -34,7 +35,7 @@ func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) {
return err
}
_, err = w.Write([]byte(data))
_, err = w.Write(StringToBytes(data))
if err != nil {
return err
}
@@ -44,12 +45,13 @@ func (f *DocxLib) pack(zipWriter *zip.Writer) (err error) {
}
func marshal(data interface{}) (out string, err error) {
body, err := xml.Marshal(data)
sb := strings.Builder{}
sb.WriteString(xml.Header)
err = xml.NewEncoder(&sb).Encode(data)
if err != nil {
glog.Errorln("Error marshalling", err)
return
}
out = xml.Header + string(body)
out = sb.String()
return
}

15
slice.go Normal file
View File

@@ -0,0 +1,15 @@
package docxlib
import "unsafe"
// slice is the runtime representation of a slice.
// It cannot be used safely or portably and its representation may
// change in a later release.
//
// Unlike reflect.SliceHeader, its Data field is sufficient to guarantee the
// data it references will not be garbage collected.
//
// StringToBytes overlays this struct on string and []byte values through
// unsafe.Pointer, so the field order and types must not be changed.
type slice struct {
// data points to the first element of the backing array.
data unsafe.Pointer
// len is the number of elements in use.
len int
// cap is the number of elements the backing array can hold.
cap int
}

View File

@@ -17,11 +17,11 @@ type Paragraph struct {
XMLName xml.Name `xml:"http://schemas.openxmlformats.org/wordprocessingml/2006/main p"`
Data []ParagraphChild
file *DocxLib
file *Docx
}
func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
children := make([]ParagraphChild, 0)
children := make([]ParagraphChild, 0, 64)
for {
t, err := d.Token()
if err == io.EOF {
@@ -30,7 +30,8 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
switch tt := t.(type) {
case xml.StartElement:
var elem ParagraphChild
if tt.Name.Local == "hyperlink" {
switch tt.Name.Local {
case "hyperlink":
var value Hyperlink
d.DecodeElement(&value, &start)
id := getAtt(tt.Attr, "id")
@@ -41,20 +42,20 @@ func (p *Paragraph) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
if anchor != "" {
value.ID = anchor
}
elem = ParagraphChild{Link: &value}
} else if tt.Name.Local == "r" {
elem.Link = &value
case "r":
var value Run
d.DecodeElement(&value, &start)
elem = ParagraphChild{Run: &value}
elem.Run = &value
if value.InstrText == "" && value.Text == nil {
glog.V(0).Infof("Empty run, we ignore")
continue
}
} else if tt.Name.Local == "rPr" {
case "rPr":
var value RunProperties
d.DecodeElement(&value, &start)
elem = ParagraphChild{Properties: &value}
} else {
elem.Properties = &value
default:
continue
}
children = append(children, elem)

View File

@@ -76,19 +76,20 @@ func (r *Run) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
switch tt := t.(type) {
case xml.StartElement:
if tt.Name.Local == "rPr" {
switch tt.Name.Local {
case "rPr":
var value RunProperties
d.DecodeElement(&value, &start)
elem.RunProperties = &value
} else if tt.Name.Local == "instrText" {
case "instrText":
var value string
d.DecodeElement(&value, &start)
elem.InstrText = value
} else if tt.Name.Local == "t" {
case "t":
var value Text
d.DecodeElement(&value, &start)
elem.Text = &value
} else {
default:
continue
}
}
@@ -109,8 +110,7 @@ func (r *Text) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
switch tt := t.(type) {
case xml.CharData:
cd := tt.Copy()
elem.Text = string(cd)
elem.Text = string(tt) // implicitly copy
}
}

View File

@@ -4,7 +4,7 @@ package docxlib
import (
"archive/zip"
"encoding/xml"
"io/ioutil"
"io"
"github.com/golang/glog"
)
@@ -13,68 +13,63 @@ import (
// and parses the files that are relevant for us:
// 1.-Document
// 2.-Relationships
func unpack(zipReader *zip.Reader) (docx *DocxLib, err error) {
var doc *Document
var relations *Relationships
func unpack(zipReader *zip.Reader) (docx *Docx, err error) {
docx = new(Docx)
for _, f := range zipReader.File {
if f.Name == "word/_rels/document.xml.rels" {
relations, err = processRelations(f)
err = processRelations(f, &docx.DocRelation)
if err != nil {
return nil, err
return
}
}
if f.Name == "word/document.xml" {
doc, err = processDoc(f)
err = processDoc(f, &docx.Document)
if err != nil {
return nil, err
return
}
}
}
docx = &DocxLib{
Document: *doc,
DocRelation: *relations,
}
return docx, nil
return
}
// Processes one of the relevant files, the one with the actual document
func processDoc(file *zip.File) (*Document, error) {
func processDoc(file *zip.File, doc *Document) error {
filebytes, err := readZipFile(file)
if err != nil {
glog.Errorln("Error reading from internal zip file")
return nil, err
return err
}
glog.V(0).Infoln("Doc:", string(filebytes))
doc := Document{
XMLW: XMLNS_W,
XMLR: XMLNS_R,
XMLName: xml.Name{Space: XMLNS_W, Local: "document"}}
err = xml.Unmarshal(filebytes, &doc)
doc.XMLW = XMLNS_W
doc.XMLR = XMLNS_R
doc.XMLName.Space = XMLNS_W
doc.XMLName.Local = "document"
err = xml.Unmarshal(filebytes, doc)
if err != nil {
glog.Errorln("Error unmarshalling doc", string(filebytes))
return nil, err
return err
}
glog.V(0).Infoln("Paragraph", doc.Body.Paragraphs)
return &doc, nil
return nil
}
// Processes one of the relevant files, the one with the relationships
func processRelations(file *zip.File) (*Relationships, error) {
func processRelations(file *zip.File, rels *Relationships) error {
filebytes, err := readZipFile(file)
if err != nil {
glog.Errorln("Error reading from internal zip file")
return nil, err
return err
}
glog.V(0).Infoln("Relations:", string(filebytes))
rels := Relationships{Xmlns: XMLNS_R}
err = xml.Unmarshal(filebytes, &rels)
rels.Xmlns = XMLNS_R
err = xml.Unmarshal(filebytes, rels)
if err != nil {
glog.Errorln("Error unmarshalling relationships")
return nil, err
return err
}
return &rels, nil
return nil
}
// From a zip file structure, we return a byte array
@@ -84,5 +79,5 @@ func readZipFile(zf *zip.File) ([]byte, error) {
return nil, err
}
defer f.Close()
return ioutil.ReadAll(f)
return io.ReadAll(f)
}