1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 """
39 Provides filesystem-related objects.
40 @sort: FilesystemList, BackupFileList, PurgeItemList
41 @author: Kenneth J. Pronovici <pronovic@ieee.org>
42 """
43
44
45
46
47
48
49
50 import os
51 import re
52 import math
53 import logging
54 import tarfile
55 import hashlib
56
57
58 from CedarBackup3.knapsack import firstFit, bestFit, worstFit, alternateFit
59 from CedarBackup3.util import AbsolutePathList, UnorderedList, RegexList
60 from CedarBackup3.util import removeKeys, displayBytes, calculateFileAge, encodePath, dereferenceLink
61
62
63
64
65
66
67 logger = logging.getLogger("CedarBackup3.log.filesystem")
75
76
77
78
79
80 """
81 Represents a list of filesystem items.
82
83 This is a generic class that represents a list of filesystem items. Callers
84 can add individual files or directories to the list, or can recursively add
85 the contents of a directory. The class also allows for up-front exclusions
86 in several forms (all files, all directories, all items matching a pattern,
87 all items whose basename matches a pattern, or all directories containing a
88 specific "ignore file"). Symbolic links are typically backed up
89 non-recursively, i.e. the link to a directory is backed up, but not the
90 contents of that link (we don't want to deal with recursive loops, etc.).
91
92 The custom methods such as L{addFile} will only add items if they exist on
93 the filesystem and do not match any exclusions that are already in place.
94 However, since a FilesystemList is a subclass of Python's standard list
95 class, callers can also add items to the list in the usual way, using
96 methods like C{append()} or C{insert()}. No validations apply to items
97 added to the list in this way; however, many list-manipulation methods deal
98 "gracefully" with items that don't exist in the filesystem, often by
99 ignoring them.
100
101 Once a list has been created, callers can remove individual items from the
102 list using standard methods like C{pop()} or C{remove()} or they can use
103 custom methods to remove specific types of entries or entries which match a
104 particular pattern.
105
106 @note: Regular expression patterns that apply to paths are assumed to be
107 bounded at front and back by the beginning and end of the string, i.e. they
108 are treated as if they begin with C{^} and end with C{$}. This is true
109 whether we are matching a complete path or a basename.
110
111 @sort: __init__, addFile, addDir, addDirContents, removeFiles, removeDirs,
112 removeLinks, removeMatch, removeInvalid, normalize,
113 excludeFiles, excludeDirs, excludeLinks, excludePaths,
114 excludePatterns, excludeBasenamePatterns, ignoreFile
115 """
116
117
118
119
120
121
139
140
141
142
143
144
146 """
147 Property target used to set the exclude files flag.
148 No validations, but we normalize the value to C{True} or C{False}.
149 """
150 if value:
151 self._excludeFiles = True
152 else:
153 self._excludeFiles = False
154
156 """
157 Property target used to get the exclude files flag.
158 """
159 return self._excludeFiles
160
162 """
163 Property target used to set the exclude directories flag.
164 No validations, but we normalize the value to C{True} or C{False}.
165 """
166 if value:
167 self._excludeDirs = True
168 else:
169 self._excludeDirs = False
170
172 """
173 Property target used to get the exclude directories flag.
174 """
175 return self._excludeDirs
176
178 """
179 Property target used to set the exclude soft links flag.
180 No validations, but we normalize the value to C{True} or C{False}.
181 """
182 if value:
183 self._excludeLinks = True
184 else:
185 self._excludeLinks = False
186
188 """
189 Property target used to get the exclude soft links flag.
190 """
191 return self._excludeLinks
192
194 """
195 Property target used to set the exclude paths list.
196 A C{None} value is converted to an empty list.
197 Elements do not have to exist on disk at the time of assignment.
198 @raise ValueError: If any list element is not an absolute path.
199 """
200 self._excludePaths = AbsolutePathList()
201 if value is not None:
202 self._excludePaths.extend(value)
203
205 """
206 Property target used to get the absolute exclude paths list.
207 """
208 return self._excludePaths
209
211 """
212 Property target used to set the exclude patterns list.
213 A C{None} value is converted to an empty list.
214 """
215 self._excludePatterns = RegexList()
216 if value is not None:
217 self._excludePatterns.extend(value)
218
220 """
221 Property target used to get the exclude patterns list.
222 """
223 return self._excludePatterns
224
226 """
227 Property target used to set the exclude basename patterns list.
228 A C{None} value is converted to an empty list.
229 """
230 self._excludeBasenamePatterns = RegexList()
231 if value is not None:
232 self._excludeBasenamePatterns.extend(value)
233
235 """
236 Property target used to get the exclude basename patterns list.
237 """
238 return self._excludeBasenamePatterns
239
241 """
242 Property target used to set the ignore file.
243 The value must be a non-empty string if it is not C{None}.
244 @raise ValueError: If the value is an empty string.
245 """
246 if value is not None:
247 if len(value) < 1:
248 raise ValueError("The ignore file must be a non-empty string.")
249 self._ignoreFile = value
250
252 """
253 Property target used to get the ignore file.
254 """
255 return self._ignoreFile
256
257 excludeFiles = property(_getExcludeFiles, _setExcludeFiles, None, "Boolean indicating whether files should be excluded.")
258 excludeDirs = property(_getExcludeDirs, _setExcludeDirs, None, "Boolean indicating whether directories should be excluded.")
259 excludeLinks = property(_getExcludeLinks, _setExcludeLinks, None, "Boolean indicating whether soft links should be excluded.")
260 excludePaths = property(_getExcludePaths, _setExcludePaths, None, "List of absolute paths to be excluded.")
261 excludePatterns = property(_getExcludePatterns, _setExcludePatterns, None,
262 "List of regular expression patterns (matching complete path) to be excluded.")
263 excludeBasenamePatterns = property(_getExcludeBasenamePatterns, _setExcludeBasenamePatterns,
264 None, "List of regular expression patterns (matching basename) to be excluded.")
265 ignoreFile = property(_getIgnoreFile, _setIgnoreFile, None, "Name of file which will cause directory contents to be ignored.")
266
267
268
269
270
271
273 """
274 Adds a file to the list.
275
276 The path must exist and must be a file or a link to an existing file. It
277 will be added to the list subject to any exclusions that are in place.
278
279 @param path: File path to be added to the list
280 @type path: String representing a path on disk
281
282 @return: Number of items added to the list.
283
284 @raise ValueError: If path is not a file or does not exist.
285 @raise ValueError: If the path could not be encoded properly.
286 """
287 path = encodePath(path)
288 if not os.path.exists(path) or not os.path.isfile(path):
289 logger.debug("Path [%s] is not a file or does not exist on disk.", path)
290 raise ValueError("Path is not a file or does not exist on disk.")
291 if self.excludeLinks and os.path.islink(path):
292 logger.debug("Path [%s] is excluded based on excludeLinks.", path)
293 return 0
294 if self.excludeFiles:
295 logger.debug("Path [%s] is excluded based on excludeFiles.", path)
296 return 0
297 if path in self.excludePaths:
298 logger.debug("Path [%s] is excluded based on excludePaths.", path)
299 return 0
300 for pattern in self.excludePatterns:
301 pattern = encodePath(pattern)
302 if re.compile(r"^%s$" % pattern).match(path):
303 logger.debug("Path [%s] is excluded based on pattern [%s].", path, pattern)
304 return 0
305 for pattern in self.excludeBasenamePatterns:
306 pattern = encodePath(pattern)
307 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
308 logger.debug("Path [%s] is excluded based on basename pattern [%s].", path, pattern)
309 return 0
310 self.append(path)
311 logger.debug("Added file to list: [%s]", path)
312 return 1
313
315 """
316 Adds a directory to the list.
317
318 The path must exist and must be a directory or a link to an existing
319 directory. It will be added to the list subject to any exclusions that
320 are in place. The L{ignoreFile} does not apply to this method, only to
321 L{addDirContents}.
322
323 @param path: Directory path to be added to the list
324 @type path: String representing a path on disk
325
326 @return: Number of items added to the list.
327
328 @raise ValueError: If path is not a directory or does not exist.
329 @raise ValueError: If the path could not be encoded properly.
330 """
331 path = encodePath(path)
332 path = normalizeDir(path)
333 if not os.path.exists(path) or not os.path.isdir(path):
334 logger.debug("Path [%s] is not a directory or does not exist on disk.", path)
335 raise ValueError("Path is not a directory or does not exist on disk.")
336 if self.excludeLinks and os.path.islink(path):
337 logger.debug("Path [%s] is excluded based on excludeLinks.", path)
338 return 0
339 if self.excludeDirs:
340 logger.debug("Path [%s] is excluded based on excludeDirs.", path)
341 return 0
342 if path in self.excludePaths:
343 logger.debug("Path [%s] is excluded based on excludePaths.", path)
344 return 0
345 for pattern in self.excludePatterns:
346 pattern = encodePath(pattern)
347 if re.compile(r"^%s$" % pattern).match(path):
348 logger.debug("Path [%s] is excluded based on pattern [%s].", path, pattern)
349 return 0
350 for pattern in self.excludeBasenamePatterns:
351 pattern = encodePath(pattern)
352 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
353 logger.debug("Path [%s] is excluded based on basename pattern [%s].", path, pattern)
354 return 0
355 self.append(path)
356 logger.debug("Added directory to list: [%s]", path)
357 return 1
358
359 - def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0, dereference=False):
360 """
361 Adds the contents of a directory to the list.
362
363 The path must exist and must be a directory or a link to a directory.
364 The contents of the directory (as well as the directory path itself) will
365 be recursively added to the list, subject to any exclusions that are in
366 place. If you only want the directory and its immediate contents to be
367 added, then pass in C{recursive=False}.
368
369 @note: If a directory's absolute path matches an exclude pattern or path,
370 or if the directory contains the configured ignore file, then the
371 directory and all of its contents will be recursively excluded from the
372 list.
373
374 @note: If the passed-in directory happens to be a soft link, it will be
375 recursed. However, the linkDepth parameter controls whether any soft
376 links I{within} the directory will be recursed. The link depth is
377 maximum depth of the tree at which soft links should be followed. So, a
378 depth of 0 does not follow any soft links, a depth of 1 follows only
379 links within the passed-in directory, a depth of 2 follows the links at
380 the next level down, etc.
381
382 @note: Any invalid soft links (i.e. soft links that point to
383 non-existent items) will be silently ignored.
384
385 @note: The L{excludeDirs} flag only controls whether any given directory
386 path itself is added to the list once it has been discovered. It does
387 I{not} modify any behavior related to directory recursion.
388
389 @note: If you call this method I{on a link to a directory} that link will
390 never be dereferenced (it may, however, be followed).
391
392 @param path: Directory path whose contents should be added to the list
393 @type path: String representing a path on disk
394
395 @param recursive: Indicates whether directory contents should be added recursively.
396 @type recursive: Boolean value
397
398 @param addSelf: Indicates whether the directory itself should be added to the list.
399 @type addSelf: Boolean value
400
401 @param linkDepth: Maximum depth of the tree at which soft links should be followed
402 @type linkDepth: Integer value, where zero means not to follow any soft links
403
404 @param dereference: Indicates whether soft links, if followed, should be dereferenced
405 @type dereference: Boolean value
406
407 @return: Number of items recursively added to the list
408
409 @raise ValueError: If path is not a directory or does not exist.
410 @raise ValueError: If the path could not be encoded properly.
411 """
412 path = encodePath(path)
413 path = normalizeDir(path)
414 return self._addDirContentsInternal(path, addSelf, recursive, linkDepth, dereference)
415
416 - def _addDirContentsInternal(self, path, includePath=True, recursive=True, linkDepth=0, dereference=False):
417 """
418 Internal implementation of C{addDirContents}.
419
420 This internal implementation exists due to some refactoring. Basically,
421 some subclasses have a need to add the contents of a directory, but not
422 the directory itself. This is different than the standard C{FilesystemList}
423 behavior and actually ends up making a special case out of the first
424 call in the recursive chain. Since I don't want to expose the modified
425 interface, C{addDirContents} ends up being wholly implemented in terms
426 of this method.
427
428 The linkDepth parameter controls whether soft links are followed when we
429 are adding the contents recursively. Any recursive calls reduce the
430 value by one. If the value zero or less, then soft links will just be
431 added as directories, but will not be followed. This means that links
432 are followed to a I{constant depth} starting from the top-most directory.
433
434 There is one difference between soft links and directories: soft links
435 that are added recursively are not placed into the list explicitly. This
436 is because if we do add the links recursively, the resulting tar file
437 gets a little confused (it has a link and a directory with the same
438 name).
439
440 @note: If you call this method I{on a link to a directory} that link will
441 never be dereferenced (it may, however, be followed).
442
443 @param path: Directory path whose contents should be added to the list.
444 @param includePath: Indicates whether to include the path as well as contents.
445 @param recursive: Indicates whether directory contents should be added recursively.
446 @param linkDepth: Depth of soft links that should be followed
447 @param dereference: Indicates whether soft links, if followed, should be dereferenced
448
449 @return: Number of items recursively added to the list
450
451 @raise ValueError: If path is not a directory or does not exist.
452 """
453 added = 0
454 if not os.path.exists(path) or not os.path.isdir(path):
455 logger.debug("Path [%s] is not a directory or does not exist on disk.", path)
456 raise ValueError("Path is not a directory or does not exist on disk.")
457 if path in self.excludePaths:
458 logger.debug("Path [%s] is excluded based on excludePaths.", path)
459 return added
460 for pattern in self.excludePatterns:
461 pattern = encodePath(pattern)
462 if re.compile(r"^%s$" % pattern).match(path):
463 logger.debug("Path [%s] is excluded based on pattern [%s].", path, pattern)
464 return added
465 for pattern in self.excludeBasenamePatterns:
466 pattern = encodePath(pattern)
467 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
468 logger.debug("Path [%s] is excluded based on basename pattern [%s].", path, pattern)
469 return added
470 if self.ignoreFile is not None and os.path.exists(os.path.join(path, self.ignoreFile)):
471 logger.debug("Path [%s] is excluded based on ignore file.", path)
472 return added
473 if includePath:
474 added += self.addDir(path)
475 for entry in os.listdir(path):
476 entrypath = os.path.join(path, entry)
477 if os.path.isfile(entrypath):
478 if linkDepth > 0 and dereference:
479 derefpath = dereferenceLink(entrypath)
480 if derefpath != entrypath:
481 added += self.addFile(derefpath)
482 added += self.addFile(entrypath)
483 elif os.path.isdir(entrypath):
484 if os.path.islink(entrypath):
485 if recursive:
486 if linkDepth > 0:
487 newDepth = linkDepth - 1
488 if dereference:
489 derefpath = dereferenceLink(entrypath)
490 if derefpath != entrypath:
491 added += self._addDirContentsInternal(derefpath, True, recursive, newDepth, dereference)
492 added += self.addDir(entrypath)
493 else:
494 added += self._addDirContentsInternal(entrypath, False, recursive, newDepth, dereference)
495 else:
496 added += self.addDir(entrypath)
497 else:
498 added += self.addDir(entrypath)
499 else:
500 if recursive:
501 newDepth = linkDepth - 1
502 added += self._addDirContentsInternal(entrypath, True, recursive, newDepth, dereference)
503 else:
504 added += self.addDir(entrypath)
505 return added
506
507
508
509
510
511
513 """
514 Removes file entries from the list.
515
516 If C{pattern} is not passed in or is C{None}, then all file entries will
517 be removed from the list. Otherwise, only those file entries matching
518 the pattern will be removed. Any entry which does not exist on disk
519 will be ignored (use L{removeInvalid} to purge those entries).
520
521 This method might be fairly slow for large lists, since it must check the
522 type of each item in the list. If you know ahead of time that you want
523 to exclude all files, then you will be better off setting L{excludeFiles}
524 to C{True} before adding items to the list.
525
526 @param pattern: Regular expression pattern representing entries to remove
527
528 @return: Number of entries removed
529 @raise ValueError: If the passed-in pattern is not a valid regular expression.
530 """
531 removed = 0
532 if pattern is None:
533 for entry in self[:]:
534 if os.path.exists(entry) and os.path.isfile(entry):
535 self.remove(entry)
536 logger.debug("Removed path [%s] from list.", entry)
537 removed += 1
538 else:
539 try:
540 pattern = encodePath(pattern)
541 compiled = re.compile(pattern)
542 except re.error:
543 raise ValueError("Pattern is not a valid regular expression.")
544 for entry in self[:]:
545 if os.path.exists(entry) and os.path.isfile(entry):
546 if compiled.match(entry):
547 self.remove(entry)
548 logger.debug("Removed path [%s] from list.", entry)
549 removed += 1
550 logger.debug("Removed a total of %d entries.", removed)
551 return removed
552
554 """
555 Removes directory entries from the list.
556
557 If C{pattern} is not passed in or is C{None}, then all directory entries
558 will be removed from the list. Otherwise, only those directory entries
559 matching the pattern will be removed. Any entry which does not exist on
560 disk will be ignored (use L{removeInvalid} to purge those entries).
561
562 This method might be fairly slow for large lists, since it must check the
563 type of each item in the list. If you know ahead of time that you want
564 to exclude all directories, then you will be better off setting
565 L{excludeDirs} to C{True} before adding items to the list (note that this
566 will not prevent you from recursively adding the I{contents} of
567 directories).
568
569 @param pattern: Regular expression pattern representing entries to remove
570
571 @return: Number of entries removed
572 @raise ValueError: If the passed-in pattern is not a valid regular expression.
573 """
574 removed = 0
575 if pattern is None:
576 for entry in self[:]:
577 if os.path.exists(entry) and os.path.isdir(entry):
578 self.remove(entry)
579 logger.debug("Removed path [%s] from list.", entry)
580 removed += 1
581 else:
582 try:
583 pattern = encodePath(pattern)
584 compiled = re.compile(pattern)
585 except re.error:
586 raise ValueError("Pattern is not a valid regular expression.")
587 for entry in self[:]:
588 if os.path.exists(entry) and os.path.isdir(entry):
589 if compiled.match(entry):
590 self.remove(entry)
591 logger.debug("Removed path [%s] from list based on pattern [%s].", entry, pattern)
592 removed += 1
593 logger.debug("Removed a total of %d entries.", removed)
594 return removed
595
597 """
598 Removes soft link entries from the list.
599
600 If C{pattern} is not passed in or is C{None}, then all soft link entries
601 will be removed from the list. Otherwise, only those soft link entries
602 matching the pattern will be removed. Any entry which does not exist on
603 disk will be ignored (use L{removeInvalid} to purge those entries).
604
605 This method might be fairly slow for large lists, since it must check the
606 type of each item in the list. If you know ahead of time that you want
607 to exclude all soft links, then you will be better off setting
608 L{excludeLinks} to C{True} before adding items to the list.
609
610 @param pattern: Regular expression pattern representing entries to remove
611
612 @return: Number of entries removed
613 @raise ValueError: If the passed-in pattern is not a valid regular expression.
614 """
615 removed = 0
616 if pattern is None:
617 for entry in self[:]:
618 if os.path.exists(entry) and os.path.islink(entry):
619 self.remove(entry)
620 logger.debug("Removed path [%s] from list.", entry)
621 removed += 1
622 else:
623 try:
624 pattern = encodePath(pattern)
625 compiled = re.compile(pattern)
626 except re.error:
627 raise ValueError("Pattern is not a valid regular expression.")
628 for entry in self[:]:
629 if os.path.exists(entry) and os.path.islink(entry):
630 if compiled.match(entry):
631 self.remove(entry)
632 logger.debug("Removed path [%s] from list based on pattern [%s].", entry, pattern)
633 removed += 1
634 logger.debug("Removed a total of %d entries.", removed)
635 return removed
636
638 """
639 Removes from the list all entries matching a pattern.
640
641 This method removes from the list all entries which match the passed in
642 C{pattern}. Since there is no need to check the type of each entry, it
643 is faster to call this method than to call the L{removeFiles},
644 L{removeDirs} or L{removeLinks} methods individually. If you know which
645 patterns you will want to remove ahead of time, you may be better off
646 setting L{excludePatterns} or L{excludeBasenamePatterns} before adding
647 items to the list.
648
649 @note: Unlike when using the exclude lists, the pattern here is I{not}
650 bounded at the front and the back of the string. You can use any pattern
651 you want.
652
653 @param pattern: Regular expression pattern representing entries to remove
654
655 @return: Number of entries removed.
656 @raise ValueError: If the passed-in pattern is not a valid regular expression.
657 """
658 try:
659 pattern = encodePath(pattern)
660 compiled = re.compile(pattern)
661 except re.error:
662 raise ValueError("Pattern is not a valid regular expression.")
663 removed = 0
664 for entry in self[:]:
665 if compiled.match(entry):
666 self.remove(entry)
667 logger.debug("Removed path [%s] from list based on pattern [%s].", entry, pattern)
668 removed += 1
669 logger.debug("Removed a total of %d entries.", removed)
670 return removed
671
673 """
674 Removes from the list all entries that do not exist on disk.
675
676 This method removes from the list all entries which do not currently
677 exist on disk in some form. No attention is paid to whether the entries
678 are files or directories.
679
680 @return: Number of entries removed.
681 """
682 removed = 0
683 for entry in self[:]:
684 if not os.path.exists(entry):
685 self.remove(entry)
686 logger.debug("Removed path [%s] from list.", entry)
687 removed += 1
688 logger.debug("Removed a total of %d entries.", removed)
689 return removed
690
691
692
693
694
695
697 """Normalizes the list, ensuring that each entry is unique."""
698 orig = len(self)
699 self.sort()
700 dups = list(filter(lambda x, self=self: self[x] == self[x+1], list(range(0, len(self) - 1))))
701 items = list(map(lambda x, self=self: self[x], dups))
702 list(map(self.remove, items))
703 new = len(self)
704 logger.debug("Completed normalizing list; removed %d items (%d originally, %d now).", new-orig, orig, new)
705
707 """
708 Verifies that all entries in the list exist on disk.
709 @return: C{True} if all entries exist, C{False} otherwise.
710 """
711 for entry in self:
712 if not os.path.exists(entry):
713 logger.debug("Path [%s] is invalid; list is not valid.", entry)
714 return False
715 logger.debug("All entries in list are valid.")
716 return True
717
718
719
720
721
722
723 -class SpanItem(object):
724 """
725 Item returned by L{BackupFileList.generateSpan}.
726 """
727 - def __init__(self, fileList, size, capacity, utilization):
728 """
729 Create object.
730 @param fileList: List of files
731 @param size: Size (in bytes) of files
732 @param utilization: Utilization, as a percentage (0-100)
733 """
734 self.fileList = fileList
735 self.size = size
736 self.capacity = capacity
737 self.utilization = utilization
738
745
746
747
748
749
750 """
751 List of files to be backed up.
752
753 A BackupFileList is a L{FilesystemList} containing a list of files to be
754 backed up. It only contains files, not directories (soft links are treated
755 like files). On top of the generic functionality provided by
756 L{FilesystemList}, this class adds functionality to keep a hash (checksum)
757 for each file in the list, and it also provides a method to calculate the
758 total size of the files in the list and a way to export the list into tar
759 form.
760
761 @sort: __init__, addDir, totalSize, generateSizeMap, generateDigestMap,
762 generateFitted, generateTarfile, removeUnchanged
763 """
764
765
766
767
768
772
773
774
775
776
777
779 """
780 Adds a directory to the list.
781
782 Note that this class does not allow directories to be added by themselves
783 (a backup list contains only files). However, since links to directories
784 are technically files, we allow them to be added.
785
786 This method is implemented in terms of the superclass method, with one
787 additional validation: the superclass method is only called if the
788 passed-in path is both a directory and a link. All of the superclass's
789 existing validations and restrictions apply.
790
791 @param path: Directory path to be added to the list
792 @type path: String representing a path on disk
793
794 @return: Number of items added to the list.
795
796 @raise ValueError: If path is not a directory or does not exist.
797 @raise ValueError: If the path could not be encoded properly.
798 """
799 path = encodePath(path)
800 path = normalizeDir(path)
801 if os.path.isdir(path) and not os.path.islink(path):
802 return 0
803 else:
804 return FilesystemList.addDir(self, path)
805
806
807
808
809
810
812 """
813 Returns the total size among all files in the list.
814 Only files are counted.
815 Soft links that point at files are ignored.
816 Entries which do not exist on disk are ignored.
817 @return: Total size, in bytes
818 """
819 total = 0.0
820 for entry in self:
821 if os.path.isfile(entry) and not os.path.islink(entry):
822 total += float(os.stat(entry).st_size)
823 return total
824
826 """
827 Generates a mapping from file to file size in bytes.
828 The mapping does include soft links, which are listed with size zero.
829 Entries which do not exist on disk are ignored.
830 @return: Dictionary mapping file to file size
831 """
832 table = { }
833 for entry in self:
834 if os.path.islink(entry):
835 table[entry] = 0.0
836 elif os.path.isfile(entry):
837 table[entry] = float(os.stat(entry).st_size)
838 return table
839
841 """
842 Generates a mapping from file to file digest.
843
844 Currently, the digest is an SHA hash, which should be pretty secure. In
845 the future, this might be a different kind of hash, but we guarantee that
846 the type of the hash will not change unless the library major version
847 number is bumped.
848
849 Entries which do not exist on disk are ignored.
850
851 Soft links are ignored. We would end up generating a digest for the file
852 that the soft link points at, which doesn't make any sense.
853
854 If C{stripPrefix} is passed in, then that prefix will be stripped from
855 each key when the map is generated. This can be useful in generating two
856 "relative" digest maps to be compared to one another.
857
858 @param stripPrefix: Common prefix to be stripped from paths
859 @type stripPrefix: String with any contents
860
861 @return: Dictionary mapping file to digest value
862 @see: L{removeUnchanged}
863 """
864 table = { }
865 if stripPrefix is not None:
866 for entry in self:
867 if os.path.isfile(entry) and not os.path.islink(entry):
868 table[entry.replace(stripPrefix, "", 1)] = BackupFileList._generateDigest(entry)
869 else:
870 for entry in self:
871 if os.path.isfile(entry) and not os.path.islink(entry):
872 table[entry] = BackupFileList._generateDigest(entry)
873 return table
874
875 @staticmethod
877 """
878 Generates an SHA digest for a given file on disk.
879
880 The original code for this function used this simplistic implementation,
881 which requires reading the entire file into memory at once in order to
882 generate a digest value::
883
884 sha.new(open(path).read()).hexdigest()
885
886 Not surprisingly, this isn't an optimal solution. The U{Simple file
887 hashing <http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259109>}
888 Python Cookbook recipe describes how to incrementally generate a hash
889 value by reading in chunks of data rather than reading the file all at
890 once. The recipe relies on the the C{update()} method of the various
891 Python hashing algorithms.
892
893 In my tests using a 110 MB file on CD, the original implementation
894 requires 111 seconds. This implementation requires only 40-45 seconds,
895 which is a pretty substantial speed-up.
896
897 Experience shows that reading in around 4kB (4096 bytes) at a time yields
898 the best performance. Smaller reads are quite a bit slower, and larger
899 reads don't make much of a difference. The 4kB number makes me a little
900 suspicious, and I think it might be related to the size of a filesystem
901 read at the hardware level. However, I've decided to just hardcode 4096
902 until I have evidence that shows it's worthwhile making the read size
903 configurable.
904
905 @param path: Path to generate digest for.
906
907 @return: ASCII-safe SHA digest for the file.
908 @raise OSError: If the file cannot be opened.
909 """
910
911 s = hashlib.sha1()
912 with open(path, mode="rb") as f:
913 readBytes = 4096
914 while readBytes > 0:
915 readString = f.read(readBytes)
916 s.update(readString)
917 readBytes = len(readString)
918 digest = s.hexdigest()
919 logger.debug("Generated digest [%s] for file [%s].", digest, path)
920 return digest
921
923 """
924 Generates a list of items that fit in the indicated capacity.
925
926 Sometimes, callers would like to include every item in a list, but are
927 unable to because not all of the items fit in the space available. This
928 method returns a copy of the list, containing only the items that fit in
929 a given capacity. A copy is returned so that we don't lose any
930 information if for some reason the fitted list is unsatisfactory.
931
932 The fitting is done using the functions in the knapsack module. By
933 default, the first fit algorithm is used, but you can also choose
934 from best fit, worst fit and alternate fit.
935
936 @param capacity: Maximum capacity among the files in the new list
937 @type capacity: Integer, in bytes
938
939 @param algorithm: Knapsack (fit) algorithm to use
940 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
941
942 @return: Copy of list with total size no larger than indicated capacity
943 @raise ValueError: If the algorithm is invalid.
944 """
945 table = self._getKnapsackTable()
946 function = BackupFileList._getKnapsackFunction(algorithm)
947 return function(table, capacity)[0]
948
950 """
951 Splits the list of items into sub-lists that fit in a given capacity.
952
953 Sometimes, callers need split to a backup file list into a set of smaller
954 lists. For instance, you could use this to "span" the files across a set
955 of discs.
956
957 The fitting is done using the functions in the knapsack module. By
958 default, the first fit algorithm is used, but you can also choose
959 from best fit, worst fit and alternate fit.
960
961 @note: If any of your items are larger than the capacity, then it won't
962 be possible to find a solution. In this case, a value error will be
963 raised.
964
965 @param capacity: Maximum capacity among the files in the new list
966 @type capacity: Integer, in bytes
967
968 @param algorithm: Knapsack (fit) algorithm to use
969 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
970
971 @return: List of L{SpanItem} objects.
972
973 @raise ValueError: If the algorithm is invalid.
974 @raise ValueError: If it's not possible to fit some items
975 """
976 spanItems = []
977 function = BackupFileList._getKnapsackFunction(algorithm)
978 table = self._getKnapsackTable(capacity)
979 iteration = 0
980 while len(table) > 0:
981 iteration += 1
982 fit = function(table, capacity)
983 if len(fit[0]) == 0:
984
985 raise ValueError("After iteration %d, unable to add any new items." % iteration)
986 removeKeys(table, fit[0])
987 utilization = (float(fit[1])/float(capacity))*100.0
988 item = SpanItem(fit[0], fit[1], capacity, utilization)
989 spanItems.append(item)
990 return spanItems
991
993 """
994 Converts the list into the form needed by the knapsack algorithms.
995 @return: Dictionary mapping file name to tuple of (file path, file size).
996 """
997 table = { }
998 for entry in self:
999 if os.path.islink(entry):
1000 table[entry] = (entry, 0.0)
1001 elif os.path.isfile(entry):
1002 size = float(os.stat(entry).st_size)
1003 if capacity is not None:
1004 if size > capacity:
1005 raise ValueError("File [%s] cannot fit in capacity %s." % (entry, displayBytes(capacity)))
1006 table[entry] = (entry, size)
1007 return table
1008
1009 @staticmethod
1011 """
1012 Returns a reference to the function associated with an algorithm name.
1013 Algorithm name must be one of "first_fit", "best_fit", "worst_fit", "alternate_fit"
1014 @param algorithm: Name of the algorithm
1015 @return: Reference to knapsack function
1016 @raise ValueError: If the algorithm name is unknown.
1017 """
1018 if algorithm == "first_fit":
1019 return firstFit
1020 elif algorithm == "best_fit":
1021 return bestFit
1022 elif algorithm == "worst_fit":
1023 return worstFit
1024 elif algorithm == "alternate_fit":
1025 return alternateFit
1026 else:
1027 raise ValueError("Algorithm [%s] is invalid." % algorithm)
1028
1030 """
1031 Creates a tar file containing the files in the list.
1032
1033 By default, this method will create uncompressed tar files. If you pass
1034 in mode C{'targz'}, then it will create gzipped tar files, and if you
1035 pass in mode C{'tarbz2'}, then it will create bzipped tar files.
1036
1037 The tar file will be created as a GNU tar archive, which enables extended
1038 file name lengths, etc. Since GNU tar is so prevalent, I've decided that
1039 the extra functionality out-weighs the disadvantage of not being
1040 "standard".
1041
1042 If you pass in C{flat=True}, then a "flat" archive will be created, and
1043 all of the files will be added to the root of the archive. So, the file
1044 C{/tmp/something/whatever.txt} would be added as just C{whatever.txt}.
1045
1046 By default, the whole method call fails if there are problems adding any
1047 of the files to the archive, resulting in an exception. Under these
1048 circumstances, callers are advised that they might want to call
1049 L{removeInvalid()} and then attempt to extract the tar file a second
1050 time, since the most common cause of failures is a missing file (a file
1051 that existed when the list was built, but is gone again by the time the
1052 tar file is built).
1053
1054 If you want to, you can pass in C{ignore=True}, and the method will
1055 ignore errors encountered when adding individual files to the archive
1056 (but not errors opening and closing the archive itself).
1057
1058 We'll always attempt to remove the tarfile from disk if an exception will
1059 be thrown.
1060
1061 @note: No validation is done as to whether the entries in the list are
1062 files, since only files or soft links should be in an object like this.
1063 However, to be safe, everything is explicitly added to the tar archive
1064 non-recursively so it's safe to include soft links to directories.
1065
1066 @note: The Python C{tarfile} module, which is used internally here, is
1067 supposed to deal properly with long filenames and links. In my testing,
1068 I have found that it appears to be able to add long really long filenames
1069 to archives, but doesn't do a good job reading them back out, even out of
1070 an archive it created. Fortunately, all Cedar Backup does is add files
1071 to archives.
1072
1073 @param path: Path of tar file to create on disk
1074 @type path: String representing a path on disk
1075
1076 @param mode: Tar creation mode
1077 @type mode: One of either C{'tar'}, C{'targz'} or C{'tarbz2'}
1078
1079 @param ignore: Indicates whether to ignore certain errors.
1080 @type ignore: Boolean
1081
1082 @param flat: Creates "flat" archive by putting all items in root
1083 @type flat: Boolean
1084
1085 @raise ValueError: If mode is not valid
1086 @raise ValueError: If list is empty
1087 @raise ValueError: If the path could not be encoded properly.
1088 @raise TarError: If there is a problem creating the tar file
1089 """
1090
1091 path = encodePath(path)
1092 if len(self) == 0: raise ValueError("Empty list cannot be used to generate tarfile.")
1093 if mode == 'tar': tarmode = "w:"
1094 elif mode == 'targz': tarmode = "w:gz"
1095 elif mode == 'tarbz2': tarmode = "w:bz2"
1096 else: raise ValueError("Mode [%s] is not valid." % mode)
1097 try:
1098 tar = tarfile.open(path, tarmode)
1099 try:
1100 tar.format = tarfile.GNU_FORMAT
1101 except AttributeError:
1102 tar.posix = False
1103 for entry in self:
1104 try:
1105 if flat:
1106 tar.add(entry, arcname=os.path.basename(entry), recursive=False)
1107 else:
1108 tar.add(entry, recursive=False)
1109 except tarfile.TarError as e:
1110 if not ignore:
1111 raise e
1112 logger.info("Unable to add file [%s]; going on anyway.", entry)
1113 except OSError as e:
1114 if not ignore:
1115 raise tarfile.TarError(e)
1116 logger.info("Unable to add file [%s]; going on anyway.", entry)
1117 tar.close()
1118 except tarfile.ReadError as e:
1119 try: tar.close()
1120 except: pass
1121 if os.path.exists(path):
1122 try: os.remove(path)
1123 except: pass
1124 raise tarfile.ReadError("Unable to open [%s]; maybe directory doesn't exist?" % path)
1125 except tarfile.TarError as e:
1126 try: tar.close()
1127 except: pass
1128 if os.path.exists(path):
1129 try: os.remove(path)
1130 except: pass
1131 raise e
1132
1134 """
1135 Removes unchanged entries from the list.
1136
1137 This method relies on a digest map as returned from L{generateDigestMap}.
1138 For each entry in C{digestMap}, if the entry also exists in the current
1139 list I{and} the entry in the current list has the same digest value as in
1140 the map, the entry in the current list will be removed.
1141
1142 This method offers a convenient way for callers to filter unneeded
1143 entries from a list. The idea is that a caller will capture a digest map
1144 from C{generateDigestMap} at some point in time (perhaps the beginning of
1145 the week), and will save off that map using C{pickle} or some other
1146 method. Then, the caller could use this method sometime in the future to
1147 filter out any unchanged files based on the saved-off map.
1148
1149 If C{captureDigest} is passed-in as C{True}, then digest information will
1150 be captured for the entire list before the removal step occurs using the
1151 same rules as in L{generateDigestMap}. The check will involve a lookup
1152 into the complete digest map.
1153
1154 If C{captureDigest} is passed in as C{False}, we will only generate a
1155 digest value for files we actually need to check, and we'll ignore any
1156 entry in the list which isn't a file that currently exists on disk.
1157
1158 The return value varies depending on C{captureDigest}, as well. To
1159 preserve backwards compatibility, if C{captureDigest} is C{False}, then
1160 we'll just return a single value representing the number of entries
1161 removed. Otherwise, we'll return a tuple of C{(entries removed, digest
1162 map)}. The returned digest map will be in exactly the form returned by
1163 L{generateDigestMap}.
1164
1165 @note: For performance reasons, this method actually ends up rebuilding
1166 the list from scratch. First, we build a temporary dictionary containing
1167 all of the items from the original list. Then, we remove items as needed
1168 from the dictionary (which is faster than the equivalent operation on a
1169 list). Finally, we replace the contents of the current list based on the
1170 keys left in the dictionary. This should be transparent to the caller.
1171
1172 @param digestMap: Dictionary mapping file name to digest value.
1173 @type digestMap: Map as returned from L{generateDigestMap}.
1174
1175 @param captureDigest: Indicates that digest information should be captured.
1176 @type captureDigest: Boolean
1177
1178 @return: Results as discussed above (format varies based on arguments)
1179 """
1180 if captureDigest:
1181 removed = 0
1182 table = {}
1183 captured = {}
1184 for entry in self:
1185 if os.path.isfile(entry) and not os.path.islink(entry):
1186 table[entry] = BackupFileList._generateDigest(entry)
1187 captured[entry] = table[entry]
1188 else:
1189 table[entry] = None
1190 for entry in list(digestMap.keys()):
1191 if entry in table:
1192 if table[entry] is not None:
1193 digest = table[entry]
1194 if digest == digestMap[entry]:
1195 removed += 1
1196 del table[entry]
1197 logger.debug("Discarded unchanged file [%s].", entry)
1198 self[:] = list(table.keys())
1199 return (removed, captured)
1200 else:
1201 removed = 0
1202 table = {}
1203 for entry in self:
1204 table[entry] = None
1205 for entry in list(digestMap.keys()):
1206 if entry in table:
1207 if os.path.isfile(entry) and not os.path.islink(entry):
1208 digest = BackupFileList._generateDigest(entry)
1209 if digest == digestMap[entry]:
1210 removed += 1
1211 del table[entry]
1212 logger.debug("Discarded unchanged file [%s].", entry)
1213 self[:] = list(table.keys())
1214 return removed
1215
1222
1223
1224
1225
1226
1227 """
1228 List of files and directories to be purged.
1229
1230 A PurgeItemList is a L{FilesystemList} containing a list of files and
1231 directories to be purged. On top of the generic functionality provided by
1232 L{FilesystemList}, this class adds functionality to remove items that are
1233 too young to be purged, and to actually remove each item in the list from
1234 the filesystem.
1235
1236 The other main difference is that when you add a directory's contents to a
1237 purge item list, the directory itself is not added to the list. This way,
1238 if someone asks to purge within in C{/opt/backup/collect}, that directory
1239 doesn't get removed once all of the files within it is gone.
1240 """
1241
1242
1243
1244
1245
1249
1250
1251
1252
1253
1254
1255 - def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0, dereference=False):
1256 """
1257 Adds the contents of a directory to the list.
1258
1259 The path must exist and must be a directory or a link to a directory.
1260 The contents of the directory (but I{not} the directory path itself) will
1261 be recursively added to the list, subject to any exclusions that are in
1262 place. If you only want the directory and its contents to be added, then
1263 pass in C{recursive=False}.
1264
1265 @note: If a directory's absolute path matches an exclude pattern or path,
1266 or if the directory contains the configured ignore file, then the
1267 directory and all of its contents will be recursively excluded from the
1268 list.
1269
1270 @note: If the passed-in directory happens to be a soft link, it will be
1271 recursed. However, the linkDepth parameter controls whether any soft
1272 links I{within} the directory will be recursed. The link depth is
1273 maximum depth of the tree at which soft links should be followed. So, a
1274 depth of 0 does not follow any soft links, a depth of 1 follows only
1275 links within the passed-in directory, a depth of 2 follows the links at
1276 the next level down, etc.
1277
1278 @note: Any invalid soft links (i.e. soft links that point to
1279 non-existent items) will be silently ignored.
1280
1281 @note: The L{excludeDirs} flag only controls whether any given soft link
1282 path itself is added to the list once it has been discovered. It does
1283 I{not} modify any behavior related to directory recursion.
1284
1285 @note: The L{excludeDirs} flag only controls whether any given directory
1286 path itself is added to the list once it has been discovered. It does
1287 I{not} modify any behavior related to directory recursion.
1288
1289 @note: If you call this method I{on a link to a directory} that link will
1290 never be dereferenced (it may, however, be followed).
1291
1292 @param path: Directory path whose contents should be added to the list
1293 @type path: String representing a path on disk
1294
1295 @param recursive: Indicates whether directory contents should be added recursively.
1296 @type recursive: Boolean value
1297
1298 @param addSelf: Ignored in this subclass.
1299
1300 @param linkDepth: Depth of soft links that should be followed
1301 @type linkDepth: Integer value, where zero means not to follow any soft links
1302
1303 @param dereference: Indicates whether soft links, if followed, should be dereferenced
1304 @type dereference: Boolean value
1305
1306 @return: Number of items recursively added to the list
1307
1308 @raise ValueError: If path is not a directory or does not exist.
1309 @raise ValueError: If the path could not be encoded properly.
1310 """
1311 path = encodePath(path)
1312 path = normalizeDir(path)
1313 return super(PurgeItemList, self)._addDirContentsInternal(path, False, recursive, linkDepth, dereference)
1314
1315
1316
1317
1318
1319
1321 """
1322 Removes from the list files younger than a certain age (in days).
1323
1324 Any file whose "age" in days is less than (C{<}) the value of the
1325 C{daysOld} parameter will be removed from the list so that it will not be
1326 purged later when L{purgeItems} is called. Directories and soft links
1327 will be ignored.
1328
1329 The "age" of a file is the amount of time since the file was last used,
1330 per the most recent of the file's C{st_atime} and C{st_mtime} values.
1331
1332 @note: Some people find the "sense" of this method confusing or
1333 "backwards". Keep in mind that this method is used to remove items
1334 I{from the list}, not from the filesystem! It removes from the list
1335 those items that you would I{not} want to purge because they are too
1336 young. As an example, passing in C{daysOld} of zero (0) would remove
1337 from the list no files, which would result in purging all of the files
1338 later. I would be happy to make a synonym of this method with an
1339 easier-to-understand "sense", if someone can suggest one.
1340
1341 @param daysOld: Minimum age of files that are to be kept in the list.
1342 @type daysOld: Integer value >= 0.
1343
1344 @return: Number of entries removed
1345 """
1346 removed = 0
1347 daysOld = int(daysOld)
1348 if daysOld < 0:
1349 raise ValueError("Days old value must be an integer >= 0.")
1350 for entry in self[:]:
1351 if os.path.isfile(entry) and not os.path.islink(entry):
1352 try:
1353 ageInDays = calculateFileAge(entry)
1354 ageInWholeDays = math.floor(ageInDays)
1355 if ageInWholeDays < 0: ageInWholeDays = 0
1356 if ageInWholeDays < daysOld:
1357 removed += 1
1358 self.remove(entry)
1359 except OSError:
1360 pass
1361 return removed
1362
1364 """
1365 Purges all items in the list.
1366
1367 Every item in the list will be purged. Directories in the list will
1368 I{not} be purged recursively, and hence will only be removed if they are
1369 empty. Errors will be ignored.
1370
1371 To faciliate easy removal of directories that will end up being empty,
1372 the delete process happens in two passes: files first (including soft
1373 links), then directories.
1374
1375 @return: Tuple containing count of (files, dirs) removed
1376 """
1377 files = 0
1378 dirs = 0
1379 for entry in self:
1380 if os.path.exists(entry) and (os.path.isfile(entry) or os.path.islink(entry)):
1381 try:
1382 os.remove(entry)
1383 files += 1
1384 logger.debug("Purged file [%s].", entry)
1385 except OSError:
1386 pass
1387 for entry in self:
1388 if os.path.exists(entry) and os.path.isdir(entry) and not os.path.islink(entry):
1389 try:
1390 os.rmdir(entry)
1391 dirs += 1
1392 logger.debug("Purged empty directory [%s].", entry)
1393 except OSError:
1394 pass
1395 return (files, dirs)
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406 -def normalizeDir(path):
1407 """
1408 Normalizes a directory name.
1409
1410 For our purposes, a directory name is normalized by removing the trailing
1411 path separator, if any. This is important because we want directories to
1412 appear within lists in a consistent way, although from the user's
1413 perspective passing in C{/path/to/dir/} and C{/path/to/dir} are equivalent.
1414
1415 @param path: Path to be normalized.
1416 @type path: String representing a path on disk
1417
1418 @return: Normalized path, which should be equivalent to the original.
1419 """
1420 if path != os.sep and path[-1:] == os.sep:
1421 return path[:-1]
1422 return path
1423
1424
1425
1426
1427
1428
1429 -def compareContents(path1, path2, verbose=False):
1430 """
1431 Compares the contents of two directories to see if they are equivalent.
1432
1433 The two directories are recursively compared. First, we check whether they
1434 contain exactly the same set of files. Then, we check to see every given
1435 file has exactly the same contents in both directories.
1436
1437 This is all relatively simple to implement through the magic of
1438 L{BackupFileList.generateDigestMap}, which knows how to strip a path prefix
1439 off the front of each entry in the mapping it generates. This makes our
1440 comparison as simple as creating a list for each path, then generating a
1441 digest map for each path and comparing the two.
1442
1443 If no exception is thrown, the two directories are considered identical.
1444
1445 If the C{verbose} flag is C{True}, then an alternate (but slower) method is
1446 used so that any thrown exception can indicate exactly which file caused the
1447 comparison to fail. The thrown C{ValueError} exception distinguishes
1448 between the directories containing different files, and containing the same
1449 files with differing content.
1450
1451 @note: Symlinks are I{not} followed for the purposes of this comparison.
1452
1453 @param path1: First path to compare.
1454 @type path1: String representing a path on disk
1455
1456 @param path2: First path to compare.
1457 @type path2: String representing a path on disk
1458
1459 @param verbose: Indicates whether a verbose response should be given.
1460 @type verbose: Boolean
1461
1462 @raise ValueError: If a directory doesn't exist or can't be read.
1463 @raise ValueError: If the two directories are not equivalent.
1464 @raise IOError: If there is an unusual problem reading the directories.
1465 """
1466 try:
1467 path1List = BackupFileList()
1468 path1List.addDirContents(path1)
1469 path1Digest = path1List.generateDigestMap(stripPrefix=normalizeDir(path1))
1470 path2List = BackupFileList()
1471 path2List.addDirContents(path2)
1472 path2Digest = path2List.generateDigestMap(stripPrefix=normalizeDir(path2))
1473 compareDigestMaps(path1Digest, path2Digest, verbose)
1474 except IOError as e:
1475 logger.error("I/O error encountered during consistency check.")
1476 raise e
1477
1479 """
1480 Compares two digest maps and throws an exception if they differ.
1481
1482 @param digest1: First digest to compare.
1483 @type digest1: Digest as returned from BackupFileList.generateDigestMap()
1484
1485 @param digest2: Second digest to compare.
1486 @type digest2: Digest as returned from BackupFileList.generateDigestMap()
1487
1488 @param verbose: Indicates whether a verbose response should be given.
1489 @type verbose: Boolean
1490
1491 @raise ValueError: If the two directories are not equivalent.
1492 """
1493 if not verbose:
1494 if digest1 != digest2:
1495 raise ValueError("Consistency check failed.")
1496 else:
1497 list1 = UnorderedList(list(digest1.keys()))
1498 list2 = UnorderedList(list(digest2.keys()))
1499 if list1 != list2:
1500 raise ValueError("Directories contain a different set of files.")
1501 for key in list1:
1502 if digest1[key] != digest2[key]:
1503 raise ValueError("File contents for [%s] vary between directories." % key)
1504