Python Backup Script

Posted on February 21, 2013 at 10:23 pm

Part one, take the first full backup. At the present time, this code is still under development and should not be used on a production machine. However, I am posting it here for reference.

Eventually, this code is going to be included in a backup client I am developing that will interface with glusterfs and Amazon S3 storage.

Currently, this code is tested to run on Python v2.7.4 on a Fedora 18 machine. With all three python files, and any number of properly defined job xml files in the jobs.d/ directory, these scripts are currently functional.

This is the controller file for taking the first backup.


# Create first full backup

import os, stat, time, seed_functions

def printHello():
  """Smoke-test helper: emit a fixed greeting on stdout."""
  print("hello")

#--- This part of the script does the heavy lifting.
#--- This part of the script does the heavy lifting.
def seedMain(myFindPath,myJobId,myFullTempPath,myTargetMetaPath,myTargetTarFilePath,myExcludeFiles):
  """Run the first full backup for one job.

  Currently a stub: the find/metadata/tar steps are not wired in yet,
  so it only reports completion.
  """
  print("Job completed successfully")

#--- If this is being run as a script, set temporary variables
if __name__ == '__main__':
  myFindPath = '/home/myuser/findfiles'
  myJobId = '106'
  myTempPath = '/tmp/'
  myTempFileList = 'files.tmp'
  myFullTempPath = os.path.join(myTempPath, myTempFileList)
  myTargetPath = '/home/target1/'
  myTargetMeta = 'job' + myJobId + '.meta'
  myTargetTarFile = 'job' + myJobId + '.tar'
  myTargetMetaPath = myTargetPath + myTargetMeta
  myTargetTarFilePath = myTargetPath + myTargetTarFile

  #---Path names below should be absolute path names (start with / ) and should not end with '/'
  #--- BUG FIX: was misspelled 'myExlcudeFiles', so the exclude list that
  #--- seedMain expects under the name 'myExcludeFiles' was never defined.
  myExcludeFiles = []

#myJdate = seed_functions.getJulianDate();
This is the file that does the actual real work ;)


#Shared Functions and Classes
import os, sys, stat, time, glob
from datetime import datetime

def getJulianDate():
#--- This function returns an interger value of today's Julian Date 
#--- preceded by two digit year.  IE: 01JAN2013 -> 13001
  nowtime =  str(;
  (year, month, day) = nowtime.split('-');
  day = int(day[:2]);
  month = int(month);
  year = int(year);
  t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0));
  jdate = (year % 2000) * 1000;
  jdate = jdate + time.gmtime(t)[7];
  return jdate;

import re
def findFiles(filepath,myFindPath,excludeFiles):
  """Write the list of backup candidates under myFindPath to 'filepath'.

  filepath     -- absolute path of the list file to create/overwrite
  myFindPath   -- absolute directory to walk
  excludeFiles -- list of absolute directory paths pruned from the walk

  Regular files and symlinks are listed; directories are listed only when
  empty (or when they are symlinks), so they still get archived.
  """
  with open(filepath, 'w') as ftemp:
    for dirname, dirnames, filenames in os.walk(myFindPath):
      #--- Remove excluded directories.  Iterate over a COPY: removing from
      #--- dirnames while looping over it would skip the following entry.
      #--- (The original loop body was lost; reconstructed as the remove.)
      for badfile in list(dirnames):
        #--- dirname yielded by os.walk already starts with myFindPath, so
        #--- joining dirname + entry gives the absolute path to compare.
        if os.path.join(dirname, badfile) in excludeFiles:
          dirnames.remove(badfile)

      #--- Record empty (or symlinked) directories so they are preserved.
      for subdirname in dirnames:
        subPath = os.path.join(dirname, subdirname)
        if os.path.islink(subPath) or not os.listdir(subPath):
          ftemp.write(subPath + '\n')

      #--- Record regular files and links.
      for filename in filenames:
        filePath = os.path.join(dirname, filename)
        if os.path.islink(filePath) or os.path.isfile(filePath):
          ftemp.write(filePath + '\n')


def storeMetaData(fileListPath, fileMetaPath):
  """Write per-file metadata for every path listed in fileListPath.

  Output file (fileMetaPath) format, one line per entry:
    /path/to/file ::: modified datetime ::: seconds since 1970 ::: md5 hash
  Directories get a '---None: directory' marker instead of a hash.
  """
  with open(fileMetaPath, 'w') as fmeta:
    with open(fileListPath, 'r') as myFileList:
      for filez in myFileList:
        thisFile = filez.strip()
        #--- array of metadata fields for this entry
        myFileMeta = list()
        #--- absolute path and file name is the first item
        myFileMeta.append(thisFile)
        myStat = os.stat(thisFile)
        #--- human readable date/time stamp
        myFileMeta.append(time.ctime(myStat.st_mtime))
        #--- unix timestamp for easy comparison in the future
        myFileMeta.append(myStat.st_mtime)
        if not os.path.isdir(thisFile):
          myFileMeta.append(md5(thisFile))
        else:
          #--- directories have no content to hash
          myFileMeta.append("---None: directory")
        metaDataString = str(myFileMeta[0]) + ":::" + str(myFileMeta[1]) + ":::" + str(myFileMeta[2]) + ":::" + str(myFileMeta[3])
        #--- BUG FIX: the assembled line was never written to the meta file
        fmeta.write(metaDataString + '\n')

import hashlib,os
def md5(filename):
    ''' function to get md5 of file '''
    d = hashlib.md5();
    except Exception,e:
        print e;
        return d.hexdigest();

import tarfile
def mkTarFile(fileList, tarOutPath):
    """Create an lzo-compressed tar archive at tarOutPath + '.lzo'.

    fileList is the path of a newline-separated file list (passed to tar -T).
    NOTE(review): relies on the external 'tar' binary; the '.lzo' suffix with
    the 'a' (auto-compress) flag additionally requires lzop — confirm both
    are installed on the target host.
    """
    import subprocess
    thisTarOut = tarOutPath + ".lzo"
    #--- List-form invocation avoids the shell-quoting/injection risk of the
    #--- previous os.system() string; output is discarded as before
    #--- (the old command redirected with '&> /dev/null').
    with open(os.devnull, 'wb') as devnull:
        subprocess.call(["tar", "cpvfa", thisTarOut, "-T", fileList],
                        stdout=devnull, stderr=devnull)

#--- Not yet impemented functions below:

def findFiles2(fileListPath,myFindPath,excludeFiles):
#--- This function is for testing purposes only
  with open(fileListPath, 'w') as ftemp:
   for dirname, dirnames, filenames in os.walk(myFindPath):
    for subdirname in dirnames:
	#--- we just want to add empty directories
	if not os.listdir(os.path.join(dirname, subdirname)):
		ftemp.write(os.path.join(dirname, subdirname)+'\n');
    for filename in filenames:
	#--- check to see if the file is a regular file or a link:
	if os.path.islink(os.path.join(dirname, filename)) or os.path.isfile(os.path.join(dirname, filename)):
		ftemp.write(os.path.join(dirname, filename)+'\n');

def removeExcludedFiles(excludeFiles=()):
    """Collect exclude entries containing wildcards (not fully implemented).

    excludeFiles -- iterable of exclude patterns; previously this was read
    from an undefined global, which raised NameError.  Kept backward
    compatible by defaulting to an empty sequence.

    Returns the list of entries that contain a '*' (leading or embedded).
    The original append body was lost in the source; reconstructed.
    """
    myWildList = list()
    wildMatch = re.compile(r"^\*")
    wildMatch2 = re.compile(r".*\*")
    print("Excluding the following")
    for badfile in excludeFiles:
        print(badfile)
        result = wildMatch.match(badfile)
        #--- BUG FIX: result2 was unbound when result matched
        result2 = None
        if not result:
            result2 = wildMatch2.match(badfile)
            print(result2)
        if result or result2:
            myWildList.append(badfile)
    print(myWildList)
    return myWildList
This is near completion; it parses the jobs in jobs.d/, verifies them, and runs them.

import os, re
import xml.etree.ElementTree as ET
import seed_files

def readConfFile():
  """Return the job-definitions directory.

  Placeholder for a future feature that reads this location from a
  config file; for now the default 'jobs.d' is hard-coded.
  """
  return 'jobs.d'

def findJobs(jobdir):
  """Return the paths of all *.xml job files found under jobdir.

  The original match-body was lost in the source; reconstructed as
  appending the matched file's path (dir + filename).
  """
  myJobList = list()
  confMatch = re.compile(r".*\.xml$")
  for dirname, dirnames, filenames in os.walk(jobdir):
    for jobid in filenames:
      if confMatch.match(jobid):
        myJobList.append(os.path.join(dirname, jobid))
  return myJobList

def checkPath(pathText):
  """Validate a directory path from a job file; exit() on any problem.

  Rules: must not be '/', must be an absolute path, and must be an
  existing directory or mount point.
  """
  confMatch2 = re.compile(r"^/$")
  if confMatch2.match(pathText):
    exit('Path cannot be / ')
  confMatch = re.compile(r"^/")
  if not confMatch.match(pathText):
    exit('Directory path must be absolute path: '+pathText)
  if os.path.isdir(pathText) or os.path.ismount(pathText):
    print('Path seems valid: ' + pathText)
  else:
    #--- BUG FIX: this exit() previously sat INSIDE the valid branch,
    #--- so every good path was rejected; it belongs in the else.
    exit('Invalid path: '+pathText)

def parseJobs(myFoundJobs):
  """Parse and validate each job XML file.

  Returns a list of jobs; each job is the list of its XML children's text
  values in document order, with the <exclude> element's children gathered
  into one sub-list.  performBackup expects the layout:
  [jobid, backup path, backup target, temp dir, excludes-list].
  Exits if no jobs were parsed.  Several loop bodies were lost in the
  original source; reconstructed to match performBackup's expectations.
  """
  print("Found the following config files: " + str(myFoundJobs))
  print("------------------------------------------------")
  myJobList = list()
  for myJob in myFoundJobs:
    mySubList = list()
    print("Parsing and testing: " + str(myJob))
    tree = ET.parse(myJob)
    root = tree.getroot()

    for child in root:
      #--- Validate backup paths; checkPath() exits on a bad path.
      #----future feature: master excludes in config file
      if child.tag == 'backupdir':
        print("Checking Backup Directory")
        checkPath(child.text)
      if child.tag == 'backuptarget':
        print("Checking Backup Target")
        checkPath(child.text)

      #--- Create another sublist for excluded directories.
      if child.tag == 'exclude':
        myExcludeList = list()
        for subchild in child:
          myExcludeList.append(subchild.text)
        mySubList.append(myExcludeList)
      #--- Since it's not a sublist, we append directly
      elif not child.tag == 'exclude':
        mySubList.append(child.text)
    myJobList.append(mySubList)
    print("------------------------------------------------")

  #--- Ensure some jobs were actually found.
  if len(myJobList) == 0:
    exit('Exit on Error: No Jobs Found!')
  #--- If we didn't exit above, we return the parsed job list
  return myJobList

def performBackup(myJobList):
  """Run the initial 'seed' backup for every parsed job.

  Each job item layout (see parseJobs):
    job[0]: JobID        job[1]: backup path   job[2]: backup target
    job[3]: temp dir     job[4]: excluded directories (list)
  Exits if the job's target directory already exists, to never overwrite
  an existing seed backup.
  """
  for job in myJobList:
    myJobId = job[0]
    myFindPath = job[1]
    myTarget = job[2]
    myTempPath = job[3]
    myExcludes = job[4]
    myTargetPath = os.path.join(myTarget, 'job' + myJobId, 'master')
    myTempFileList = 'backup_job' + myJobId + '.tmp'
    myFullTempPath = os.path.join(myTargetPath, myTempFileList)
    myTargetMeta = 'job' + myJobId + '_master.meta'
    myTargetTarFile = 'job' + myJobId + '_master_seed.tar'
    myTargetMetaPath = os.path.join(myTargetPath, myTargetMeta)
    myTargetTarFilePath = os.path.join(myTargetPath, myTargetTarFile)

    #--- Refuse to touch a job directory that already exists.
    if os.path.exists(os.path.join(myTarget, 'job' + myJobId)):
      print(os.path.join(myFindPath, 'job' + myJobId))
      exit('Critical Error on JobID: '+myJobId+'\n This job directory already exists!  Exiting to preserve data!')
    if os.path.exists(myTargetPath):
      exit('Critical Error on JobID: '+myJobId+'\n This job directory already exists!  Exiting to preserve data!')

    #--- Build the exclude list for this job (loop body was lost in the
    #--- original source; entries appear to be absolute paths from the
    #--- job XML — TODO confirm against a sample job file).
    myExcludeFiles = list()
    for excludes in myExcludes:
      myExcludeFiles.append(excludes)
    print("------------------------------------------------")
    print("Starting Job: " + str(myJobId))
    #--- Hand off to the seed backup worker (previously nothing was run).
    seed_files.seedMain(myFindPath, myJobId, myFullTempPath, myTargetMetaPath, myTargetTarFilePath, myExcludeFiles)

#--- Execute the script only when run directly, not when imported
#--- (previously these statements ran at import time).
if __name__ == '__main__':
  jobdir = readConfFile()
  myFoundJobs = findJobs(jobdir)
  myJobList = parseJobs(myFoundJobs)
  print("Number of jobs: " + str(len(myJobList)))
  #promptContinue() #--- Let user review backup jobs, prompt for continue.
  #--- BUG FIX: the parsed jobs were built but never executed.
  performBackup(myJobList)

Job file in jobs.d/ directory

<?xml version="1.0"?>
in Uncategorized

, , , ,

You can skip to the end and leave a response. Pinging is currently not allowed.