1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
|
# Metadata
PDF files can have two types of metadata: "Regular" and XMP ones. They can both exist at the same time.
## Reading metadata
```{testsetup}
pypdf_test_setup("user/metadata", {
"example.pdf": "../resources/example.pdf",
"commented-xmp.pdf": "../resources/commented-xmp.pdf",
})
```
```{testcode}
from pypdf import PdfReader
reader = PdfReader("example.pdf")
meta = reader.metadata
# All the following could be None!
print(meta.title)
print(meta.author)
print(meta.subject)
print(meta.creator)
print(meta.producer)
print(meta.creation_date)
print(meta.modification_date)
```
% Two last rows masked to allow to change example.pdf
```{testoutput}
:hide:
PDF Example Document
None
None
None
Skia/PDF m103 Google Docs Renderer
...
...
```
## Writing metadata
```{testcode}
from datetime import datetime
from pypdf import PdfReader, PdfWriter
reader = PdfReader("example.pdf")
writer = PdfWriter()
# Add all pages to the writer
for page in reader.pages:
writer.add_page(page)
# If you want to add the old metadata, include these two lines
if reader.metadata is not None:
writer.add_metadata(reader.metadata)
# Format the current date and time for the metadata
utc_time = "-05'00'" # UTC time optional
time = datetime.now().strftime(f"D\072%Y%m%d%H%M%S{utc_time}")
# Add the new metadata
writer.add_metadata(
{
"/Author": "Martin",
"/Producer": "Libre Writer",
"/Title": "Title",
"/Subject": "Subject",
"/Keywords": "Keywords",
"/CreationDate": time,
"/ModDate": time,
"/Creator": "Creator",
"/CustomField": "CustomField",
}
)
# Save the new PDF to a file
writer.write("out-meta-create.pdf")
```
## Updating metadata
```{testcode}
from pypdf import PdfWriter
writer = PdfWriter(clone_from="example.pdf")
# Change some values
writer.add_metadata(
{
"/Author": "Martin",
"/Producer": "Libre Writer",
"/Title": "Title",
}
)
# Clear all data but keep the entry in PDF
writer.metadata = {}
# Replace all entries with new set of entries
writer.metadata = {
"/Author": "Martin",
"/Producer": "Libre Writer",
}
# Save the new PDF to a file
writer.write("out-meta-update.pdf")
```
## Removing metadata entry
```{testcode}
from pypdf import PdfWriter
writer = PdfWriter("example.pdf")
# Remove Metadata (/Info entry)
writer.metadata = None
# Save the new PDF to a file
writer.write("out-meta-remove.pdf")
```
## Reading XMP metadata
```{testcode}
from pypdf import PdfReader
reader = PdfReader("example.pdf")
meta = reader.xmp_metadata
if meta:
print(meta.dc_title)
print(meta.dc_description)
print(meta.xmp_create_date)
```
```{testoutput}
:hide:
{'x-default': 'PDF Example Document'}
{}
2025-10-30 09:29:55
```
## Creating XMP metadata
You can create XMP metadata easily using the `XmpInformation.create()` method:
```{testcode}
from pypdf import PdfWriter
from pypdf.xmp import XmpInformation
# Create a new XMP metadata object
xmp = XmpInformation.create()
# Set metadata fields
xmp.dc_title = {"x-default": "My Document Title"}
xmp.dc_creator = ["Author One", "Author Two"]
xmp.dc_description = {"x-default": "Document description"}
xmp.dc_subject = ["keyword1", "keyword2", "keyword3"]
xmp.pdf_producer = "pypdf"
# Create a writer and add the metadata
writer = PdfWriter()
writer.add_blank_page(612, 792) # Add a page
writer.xmp_metadata = xmp
writer.write("out-xmp-create.pdf")
```
## Setting XMP metadata fields
The `XmpInformation` class provides property-based access for all supported metadata fields:
### Dublin Core fields
```{testcode}
from datetime import datetime
from pypdf.xmp import XmpInformation
xmp = XmpInformation.create()
# Single value fields
xmp.dc_coverage = "Global coverage"
xmp.dc_format = "application/pdf"
xmp.dc_identifier = "unique-id-123"
xmp.dc_source = "Original Source"
# Array fields (bags - unordered)
xmp.dc_contributor = ["Contributor One", "Contributor Two"]
xmp.dc_language = ["en", "fr", "de"]
xmp.dc_publisher = ["Publisher One"]
xmp.dc_relation = ["Related Doc 1", "Related Doc 2"]
xmp.dc_subject = ["keyword1", "keyword2"]
xmp.dc_type = ["Document", "Text"]
# Sequence fields (ordered arrays)
xmp.dc_creator = ["Primary Author", "Secondary Author"]
xmp.dc_date = [datetime.now()]
# Language alternative fields
xmp.dc_title = {"x-default": "Title", "en": "English Title", "fr": "Titre français"}
xmp.dc_description = {"x-default": "Description", "en": "English Description"}
xmp.dc_rights = {"x-default": "All rights reserved"}
```
### XMP fields
```{testcode}
from datetime import datetime
# Date fields accept both datetime objects and strings
xmp.xmp_create_date = datetime.now()
xmp.xmp_modify_date = datetime.fromisoformat("2023-12-25T10:30:45Z")
xmp.xmp_metadata_date = datetime.now()
# Text field
xmp.xmp_creator_tool = "pypdf"
```
### PDF fields
```{testcode}
xmp.pdf_keywords = "keyword1, keyword2, keyword3"
xmp.pdf_pdfversion = "1.4"
xmp.pdf_producer = "pypdf"
```
### XMP Media Management fields
```{testcode}
xmp.xmpmm_document_id = "uuid:12345678-1234-1234-1234-123456789abc"
xmp.xmpmm_instance_id = "uuid:87654321-4321-4321-4321-cba987654321"
```
### PDF/A fields
```{testcode}
xmp.pdfaid_part = "1"
xmp.pdfaid_conformance = "B"
```
### Clearing metadata fields
You can clear any field by assigning `None`:
```{testcode}
xmp.dc_title = None
xmp.dc_creator = None
xmp.pdf_producer = None
```
### Incrementally updating XMP metadata fields
When modifying existing XMP metadata, it is often necessary to add or update individual entries while preserving existing values. The XMP properties return standard Python data structures that can be manipulated directly:
```{testcode}
from pypdf.xmp import XmpInformation
xmp = XmpInformation.create()
# Language alternative fields return dictionaries
title = xmp.dc_title or {}
title["en"] = "English Title"
title["fr"] = "Titre français"
xmp.dc_title = title
# Bag fields (unordered collections) return lists
subjects = xmp.dc_subject or []
subjects.append("new_keyword")
xmp.dc_subject = subjects
# Sequence fields (ordered collections) return lists
creators = xmp.dc_creator or []
creators.append("New Author")
xmp.dc_creator = creators
```
This approach provides direct control over the data structures while maintaining the property-based interface.
## Modifying XMP metadata
Modifying XMP metadata is a bit more complicated.
As an example, we want to add the following PDF/UA identifier section to the XMP metadata:
```xml
<rdf:Description rdf:about="" xmlns:pdfuaid="http://www.aiim.org/pdfua/ns/id/">
<pdfuaid:part>1</pdfuaid:part>
</rdf:Description>
```
This could be written like this:
```{testcode}
from pypdf import PdfWriter
writer = PdfWriter(clone_from="commented-xmp.pdf")
metadata = writer.xmp_metadata
assert metadata # Ensure that it is not `None`.
rdf_root = metadata.rdf_root
xmp_meta = rdf_root.parentNode
xmp_document = xmp_meta.parentNode
# Please note that without a text node, the corresponding elements might
# be omitted completely.
pdfuaid_description = xmp_document.createElement("rdf:Description")
pdfuaid_description.setAttribute("rdf:about", "")
pdfuaid_description.setAttribute("xmlns:pdfuaid", "http://www.aiim.org/pdfua/ns/id/")
pdfuaid_part = xmp_document.createElement("pdfuaid:part")
pdfuaid_part_text = xmp_document.createTextNode("1")
pdfuaid_part.appendChild(pdfuaid_part_text)
pdfuaid_description.appendChild(pdfuaid_part)
rdf_root.appendChild(pdfuaid_description)
metadata.stream.set_data(xmp_document.toxml().encode("utf-8"))
writer.write("out-xmp-update.pdf")
```
For further details on modifying the structure, please refer to {py:mod}`xml.dom.minidom`.
|