First working release.

Support for other file types (anything other than .docx) is not guaranteed.
This commit is contained in:
BarsTiger
2023-02-06 21:40:47 +02:00
parent 2a189cc689
commit 13b48be802
21 changed files with 1378 additions and 143 deletions

View File

View File

@@ -0,0 +1,30 @@
import os.path
import tempfile
import zipfile
import shutil
class ExtractedDocument:
    """A zip-based document unpacked into a temporary directory.

    Attributes:
        documentroot: Directory the archive was extracted into, or ``None``
            when ``path`` was not a zip file.
        app: Path of ``docProps/app.xml`` inside ``documentroot``, or ``None``.
        core: Path of ``docProps/core.xml`` inside ``documentroot``, or ``None``.

    The ``app`` and ``core`` paths are computed, not checked: the files are
    not guaranteed to exist in the extracted tree.
    """

    def __init__(self, path: str):
        """Extract the archive at ``path`` if it is a zip file.

        When ``path`` is not a zip file nothing is extracted and all three
        attributes stay ``None``.
        """
        self.documentroot = None
        self.app = None
        self.core = None
        if zipfile.is_zipfile(path):
            self.documentroot = tempfile.mkdtemp()
            # Close the archive as soon as extraction finishes instead of
            # leaving the handle open until garbage collection.
            with zipfile.ZipFile(path) as archive:
                archive.extractall(self.documentroot)
            self.app = os.path.join(self.documentroot, 'docProps', 'app.xml')
            self.core = os.path.join(self.documentroot, 'docProps', 'core.xml')

    def pack(self, path):
        """Write every file under ``documentroot`` into a new zip at ``path``.

        Entries are stored with paths relative to ``documentroot`` and
        deflated at the maximum compression level. Only files are written;
        directories that contain no files do not appear in the archive.
        """
        with zipfile.ZipFile(path, "w", compresslevel=9, compression=zipfile.ZIP_DEFLATED) as z:
            for root, _dirs, files in os.walk(self.documentroot):
                for file in files:
                    full_path = os.path.join(root, file)
                    z.write(full_path, os.path.relpath(full_path, self.documentroot))

    def remove(self):
        """Delete the extraction directory, best-effort.

        Does nothing when no archive was extracted. ``rmtree`` is called with
        ``ignore_errors=True``, so removal failures are suppressed by
        ``rmtree`` itself; the ``except`` branch only reports anything that
        still escapes it.
        """
        if self.documentroot is None:
            # Nothing was extracted, so there is no directory to delete.
            return
        try:
            shutil.rmtree(self.documentroot, True)
        except Exception as e:
            print(f'Error while removing {self.documentroot}: {e}, remove it manually if you want')

View File

@@ -0,0 +1,36 @@
import os.path
import lxml
from lxml import etree
from modules.document.document_file import ExtractedDocument
from modules.helpers import xml
from modules.helpers.convert import Int
from datetime import datetime
class DocumentProps:
    """Property values read from an extracted document's docProps XML files.

    ``parsed_`` is ``False`` when the given document has no extraction
    directory on disk; in that case no other attribute is set. Otherwise the
    values are looked up in ``app.xml`` and ``core.xml`` through the
    project's ``xml.get_value`` helper, and both parsed trees are kept on the
    instance as ``app_xml`` and ``core_xml``.
    """

    def __init__(self, extracted_document: ExtractedDocument):
        """Parse both property files of ``extracted_document`` and cache their values."""
        root_dir = extracted_document.documentroot
        self.parsed_ = bool(root_dir) and os.path.isdir(root_dir)
        if not self.parsed_:
            return

        core_tree = etree.parse(extracted_document.core)
        app_tree = etree.parse(extracted_document.app)
        self.extracted_document = extracted_document

        stamp_format = '%Y-%m-%dT%H:%M:%SZ'

        def from_app(query):
            # Look up one value in app.xml.
            return xml.get_value(app_tree, query)

        def from_core(query):
            # Look up one value in core.xml.
            return xml.get_value(core_tree, query)

        def core_timestamp(query):
            # Timestamps are parsed with the fixed format above.
            return datetime.strptime(from_core(query), stamp_format)

        # Values from app.xml; the counters go through the project's Int helper.
        self.application = from_app('//Application')
        self.paragraphs = Int(from_app('//Paragraphs'))
        self.lines = Int(from_app('//Lines'))
        self.characters = Int(from_app('//Characters'))
        self.words = Int(from_app('//Words'))
        self.pages = Int(from_app('//Pages'))
        self.total_time = Int(from_app('//TotalTime'))
        self.template = from_app('//Template')

        # Values from core.xml.
        self.modified = core_timestamp('dcterms:modified')
        self.created = core_timestamp('dcterms:created')
        self.revision = from_core('cp:revision')
        self.last_modified_by = from_core('cp:lastModifiedBy')
        self.creator = from_core('dc:creator')

        self.core_xml = core_tree
        self.app_xml = app_tree