Python Backup Script

0
by on February 21, 2013 at 10:23 pm

Part one: taking the first full backup. This code is still under development and should not be used on a production machine; I am posting it here for reference.

Eventually, this code is going to be included in a backup client I am developing that will interface with glusterfs and Amazon S3 storage.

Currently, this code is tested on Python v2.7.4 on a Fedora 18 machine. With all three Python files in place, and any number of properly defined job XML files in the jobs.d/ directory, these scripts are functional.

seed_files.py

This is the controller file for taking the first backup.

#!/usr/bin/python

# Create first full backup

import os, stat, time, seed_functions

def printHello():
  """Write the greeting 'hello' to standard output."""
  print("hello")

#--- This part of the script does the heavy lifting.
def seedMain(myFindPath,myJobId,myFullTempPath,myTargetMetaPath,myTargetTarFilePath,myExcludeFiles):
  """Run the first (seed) backup for one job.

  Three seed_functions steps run in a fixed order, all sharing the file
  list stored at myFullTempPath:
    1. findFiles     - receives myFullTempPath, myFindPath and myExcludeFiles.
    2. storeMetaData - receives myFullTempPath and myTargetMetaPath.
    3. mkTarFile     - receives myFullTempPath and myTargetTarFilePath.

  myJobId is part of the signature but is not used inside this function.
  The completion message is printed once all three calls have returned.
  """
  #--- Step 1: build the list of paths to back up.
  seed_functions.findFiles(myFullTempPath, myFindPath, myExcludeFiles)
  #--- Step 2: record metadata for every listed path.
  seed_functions.storeMetaData(myFullTempPath, myTargetMetaPath)
  #--- Step 3: archive every listed path.
  seed_functions.mkTarFile(myFullTempPath, myTargetTarFilePath)
  print("Job completed successfully")

#--- If this is being run as a script, set temporary variables
if __name__ == '__main__':
  #--- Stand-alone run: seed a hard-coded sample job.
  myFindPath = '/home/myuser/findfiles'
  myJobId = '106'
  myTempPath = '/tmp/'
  myTempFileList = 'files.tmp'
  myFullTempPath = os.path.join(myTempPath, myTempFileList)
  myTargetPath = '/home/target1/'
  myTargetMeta = 'job' + myJobId + '.meta'
  myTargetTarFile = 'job' + myJobId + '.tar'
  myTargetMetaPath = myTargetPath + myTargetMeta
  myTargetTarFilePath = myTargetPath + myTargetTarFile

  #--- Directory entries should be absolute path names (start with / ) and
  #--- should not end with '/'.
  #--- The list used to be built under the misspelled name 'myExlcudeFiles'
  #--- while seedMain() was handed 'myExcludeFiles', which raised NameError.
  #--- NOTE(review): the two wildcard entries are passed through unchanged;
  #--- confirm that seed_functions.findFiles actually honours patterns.
  myExcludeFiles = [
    '/home/myuser/findfiles/dontbackup',
    '/home/myuser/findfiles/somebigfiles',
    '*.adf',
    'badfilez*',
  ]

  seedMain(myFindPath,myJobId,myFullTempPath,myTargetMetaPath,myTargetTarFilePath,myExcludeFiles)

#myJdate = seed_functions.getJulianDate();

seed_functions.py
This is the file that does the actual real work ;)

#!/usr/bin/python

#Shared Functions and Classes
import os, sys, stat, time, glob
from datetime import datetime

def getJulianDate(now=None):
  """Return the day-of-year ("Julian") date prefixed by the two-digit year.

  The result is an integer of the form YYDDD, e.g. 01JAN2013 -> 13001 and
  31DEC2013 -> 13365.

  now: optional datetime to convert; defaults to the current local time.
  """
  if now is None:
    now = datetime.now()
  #--- Take the day of the year straight from the calendar date.  The old
  #--- code converted local midnight with time.mktime() and read it back with
  #--- time.gmtime(), which yields the neighbouring day whenever the local
  #--- zone is ahead of UTC.
  return (now.year % 2000) * 1000 + now.timetuple().tm_yday


import re
def findFiles(filepath,myFindPath,excludeFiles):
  """Write the list of paths found under myFindPath to the file at filepath.

  filepath:     file to create or overwrite; receives one path per line.
  myFindPath:   directory tree to walk (designed to be an absolute path).
  excludeFiles: list of directory paths, spelled the way os.walk() reports
                them (myFindPath joined with the relative location), that
                must not be descended into or listed.

  What gets listed:
    - every regular file and every symlink found among the file names;
    - a directory only when it is a symlink or is empty (non-empty
      directories are represented by the files inside them).

  Only directories are compared against excludeFiles; file names are not
  filtered and wildcard entries are not interpreted.
  """
  with open(filepath, 'w') as ftemp:
    for dirname, dirnames, filenames in os.walk(myFindPath):
      #--- Prune excluded directories.  Slice assignment edits the very list
      #--- os.walk() recurses on, and unlike remove()-while-iterating it
      #--- cannot skip the entry that follows a removed one.
      dirnames[:] = [subdirname for subdirname in dirnames
                     if os.path.join(dirname, subdirname) not in excludeFiles]

      #--- Of the remaining directories, list only symlinks and empty ones.
      for subdirname in dirnames:
        subdirpath = os.path.join(dirname, subdirname)
        if os.path.islink(subdirpath) or not os.listdir(subdirpath):
          ftemp.write(subdirpath + '\n')

      #--- List regular files and symlinks.
      for filename in filenames:
        fullpath = os.path.join(dirname, filename)
        if os.path.islink(fullpath) or os.path.isfile(fullpath):
          ftemp.write(fullpath + '\n')

def storeMetaData(fileListPath, fileMetaPath):
  """Write one metadata line to fileMetaPath for every path in fileListPath.

  fileListPath: text file holding one path per line.
  fileMetaPath: output file, created or overwritten.

  Each output line has four fields joined by ':::' (no surrounding spaces):
    absolute path ::: time.ctime() of mtime ::: raw mtime ::: md5 hex digest
  For directories the last field is the literal '---None: directory'.

  Blank lines in the list are skipped.  A dangling symlink is described by
  the link's own mtime instead of aborting the whole run.
  """
  #--- Open the list first so that a missing list cannot truncate an
  #--- existing metadata file; both handles are closed by the with block.
  with open(fileListPath, 'r') as myFileList, open(fileMetaPath, 'w') as fmeta:
    for line in myFileList:
      #--- Drop only the line terminator: strip() would also eat leading or
      #--- trailing blanks that belong to the file name.
      path = line.rstrip('\r\n')
      if not path:
        continue

      try:
        myStat = os.stat(path)
      except OSError:
        #--- os.stat() follows symlinks and fails on a dangling one; fall
        #--- back to the link itself.  A vanished path still raises here.
        myStat = os.lstat(path)

      if os.path.isdir(path):
        myHash = "---None: directory"
      else:
        myHash = md5(path)

      myFileMeta = [
        os.path.abspath(path),
        time.ctime(myStat.st_mtime),   # human readable time stamp
        str(myStat.st_mtime),          # unix time stamp for easy comparison
        str(myHash),
      ]
      fmeta.write(":::".join(myFileMeta) + '\n')

import hashlib,os
def md5(filename):
    """Return the hex MD5 digest of a file's contents.

    filename: path of the file to hash.

    Returns the digest as a hex string, or None when the file cannot be
    opened or read; in that case the error is printed and the caller decides
    what to record.
    """
    d = hashlib.md5()
    try:
        #--- Binary mode hashes the exact bytes on disk; the with block
        #--- guarantees the handle is closed.
        with open(filename, 'rb') as fh:
            #--- Hash in 1 MiB chunks so large backups are never read into
            #--- memory in one piece.
            for chunk in iter(lambda: fh.read(1024 * 1024), b''):
                d.update(chunk)
    except (IOError, OSError) as e:
        print(e)
        return None
    return d.hexdigest()

import tarfile
def mkTarFile(fileList, tarOutPath):
    """Archive every path listed in fileList by running the external tar.

    fileList:   text file with one path per line, handed to tar via -T.
    tarOutPath: archive path; '.lzo' is appended before it is given to tar.

    tar runs with the option bundle 'cpvfa' and its output is discarded.
    NOTE(review): the 'a' flag presumably makes tar choose the compressor
    from the '.lzo' suffix - confirm against the installed tar.

    Returns tar's exit status (0 means success), or 127 when tar could not
    be started at all.  Earlier versions returned nothing, so callers that
    ignore the result are unaffected.
    """
    import subprocess

    thisTarOut = tarOutPath + ".lzo"
    #--- An argument list (no shell) keeps paths containing blanks or shell
    #--- metacharacters intact and avoids the bash-only '&>' redirection.
    command = ["tar", "cpvfa", thisTarOut, "-T", fileList]
    try:
        with open(os.devnull, 'w') as devnull:
            return subprocess.call(command, stdout=devnull, stderr=devnull)
    except OSError as e:
        #--- tar missing or not executable: report it instead of crashing.
        print(e)
        return 127

#--------
#--- Not yet implemented functions below:
#--------

def findFiles2(fileListPath,myFindPath,excludeFiles):
  """Testing-only variant of findFiles that performs no exclusion.

  fileListPath: file to create or overwrite; receives one path per line.
  myFindPath:   directory tree to walk.
  excludeFiles: accepted for signature parity with findFiles but ignored.

  Empty directories are listed, as is every regular file or symlink found
  among the file names.
  """
  #--- Indentation is spaces only; the earlier tab/space mix is rejected by
  #--- Python 3 and by 'python -tt'.
  with open(fileListPath, 'w') as ftemp:
    for dirname, dirnames, filenames in os.walk(myFindPath):
      #--- We just want to add empty directories.
      for subdirname in dirnames:
        subdirpath = os.path.join(dirname, subdirname)
        if not os.listdir(subdirpath):
          ftemp.write(subdirpath + '\n')
      #--- Add regular files and links.
      for filename in filenames:
        fullpath = os.path.join(dirname, filename)
        if os.path.islink(fullpath) or os.path.isfile(fullpath):
          ftemp.write(fullpath + '\n')

def removeExcludedFiles(excludeFiles=()):
    """Print the exclude entries and return those that contain a wildcard.

    Still a stub: nothing is removed anywhere.  The function only separates
    entries containing '*' from plain paths.

    excludeFiles: iterable of exclude strings.  It used to be read from an
                  undefined global, which raised NameError; it is now an
                  optional argument that defaults to no entries.

    Returns the list of entries containing '*', in their original order.
    """
    print("Excluding the following")
    myWildList = []
    for badfile in excludeFiles:
        print(badfile)
        #--- The two former regexes ("^\*" and ".*\*") together accepted any
        #--- entry containing an asterisk; a substring test says the same.
        if "*" in badfile:
            myWildList.append(badfile)
    print(myWildList)
    return myWildList

start_seeds.py
This is near completion; it parses the jobs in jobs.d/, verifies them, and runs them.

#!/usr/bin/python
import os, re
import xml.etree.ElementTree as ET
import seed_files



def readConfFile():
  """Return the name of the directory that holds the job definition files.

  The value is fixed for now; reading it from a configuration file is a
  planned feature.
  """
  return 'jobs.d'

def findJobs(jobdir):
  """Return the paths of all job files (names ending in '.xml') under jobdir.

  jobdir: directory to search; subdirectories are searched as well.

  Returns a sorted list of paths, each starting with jobdir.  The list is
  empty when nothing matches or jobdir does not exist.
  """
  myJobList = []
  for dirname, dirnames, filenames in os.walk(jobdir):
    for jobid in filenames:
      if jobid.endswith('.xml'):
        #--- Join with the directory being walked, not with jobdir: a file
        #--- found in a subdirectory otherwise gets a path that does not
        #--- exist (or that names a different file in jobdir itself).
        myJobList.append(os.path.join(dirname, jobid))
  #--- os.walk() order depends on the file system; sort for a stable job order.
  myJobList.sort()
  return myJobList

def checkPath(pathText):
  """Validate a backup directory path, terminating the program if it is bad.

  pathText: path text taken from a job file; may be None when the XML
            element is empty.

  A path is accepted when it is absolute, does not resolve to the root
  directory, and names an existing directory or mount point.  On success a
  confirmation line is printed and None is returned.

  Raises SystemExit with an explanatory message otherwise.
  """
  #--- An empty element yields None; treat it like any other non-absolute
  #--- path instead of failing with a TypeError.
  if not pathText or not pathText.startswith('/'):
    raise SystemExit('Directory path must be absolute path: ' + (pathText or ''))
  #--- normpath() collapses spellings such as '/.' or '/home/..' to '/';
  #--- POSIX keeps exactly two leading slashes, so '//' is checked as well.
  if os.path.normpath(pathText) in ('/', '//'):
    raise SystemExit('Path cannot be / ')
  if os.path.isdir(pathText) or os.path.ismount(pathText):
    print('Path seems valid:  ' + pathText)
  else:
    raise SystemExit('Invalid path: ' + pathText)

def parseJobs(myFoundJobs):
  print "Found the following config files: ",myFoundJobs;
  print "------------------------------------------------";
  myJobList = list();
  for myJob in myFoundJobs:
    mySubList = list();
    print "Parsing and testing: ",myJob;
    tree = ET.parse(myJob);
    root = tree.getroot();

    for child in root:
      #--- Validate backup path is valide.
      #----future feature: master excludes in config file
      if child.tag == 'backupdir':
        print "Checking Backup Directory";
        checkPath(child.text);
      if child.tag == 'backuptarget':
        print "Checking Backup Target"
        checkPath(child.text);

      #--- Create another sublist for excluded directories.
      if child.tag == 'exclude':
        myExcludeList = list();
        for subchild in child:
	  myExcludeList.append(subchild.text);
        mySubList.append(myExcludeList);

      #--- Since it's not a sublist, we append directly
      elif not child.tag == 'exclude':
        mySubList.append(child.text);
    print "------------------------------------------------";
    myJobList.append(mySubList);

  #--- Ensure some jobs were actually found.
  if myJobList.__len__() == 0:
    exit('Exit on Error: No Jobs Found!');
  #--- If we didn't exit above, we returned the parsed job list
  return myJobList;

def performBackup(myJobList):
  """Create the target layout for every parsed job and run its seed backup.

  myJobList: list of jobs; each job is a list read by position:
    job[0] job id, job[1] backup path, job[2] backup target,
    job[3] temp directory, job[4] list of excluded directories.

  For each job the directory <target>/job<id>/master is created and
  seed_files.seedMain() is called with the file list, metadata and tar
  paths inside it.  The backup target is added to the excludes that are
  passed on.

  Raises SystemExit when <target>/job<id> already exists, so an earlier
  backup is never overwritten.
  """
  for job in myJobList:
    myJobId = job[0]
    myFindPath = job[1]
    myTarget = job[2]
    #--- NOTE(review): job[3] (temp directory) is not used; the file list is
    #--- written into the master directory - confirm this is intended.
    myExcludes = job[4]

    jobName = 'job' + myJobId
    myJobDir = os.path.join(myTarget, jobName)
    myTargetPath = os.path.join(myJobDir, 'master')
    myFullTempPath = os.path.join(myTargetPath, 'backup_job' + myJobId + '.tmp')
    myTargetMetaPath = os.path.join(myTargetPath, jobName + '_master.meta')
    myTargetTarFilePath = os.path.join(myTargetPath, jobName + '_master_seed.tar')

    if os.path.exists(myJobDir):
      #--- Show the directory that really is in the way; this line used to
      #--- print a path built from the backup source instead.
      print(myJobDir)
      raise SystemExit('Critical Error on JobID: '+myJobId+'\n This job directory already exists!  Exiting to preserve data!')
    #--- Creates the job directory and 'master' in one go; makedirs() itself
    #--- raises if the path appeared in the meantime.
    os.makedirs(myTargetPath)

    #--- Never back up the backup target itself.
    myExcludeFiles = list(myExcludes)
    myExcludeFiles.append(myTarget)
    print("------------------------------------------------")
    print("Starting Job:  " + myJobId)
    seed_files.seedMain(myFindPath,myJobId,myFullTempPath,myTargetMetaPath,myTargetTarFilePath,myExcludeFiles)



#--- Execute the script.
myJobList = list();
jobdir = readConfFile();
myFoundJobs = findJobs(jobdir);
myJobList = parseJobs(myFoundJobs);
print "Number of jobs: ",myJobList.__len__();
#promptContinue() #--- Let user review backup jobs, prompt for continue.
performBackup(myJobList);

job103.xml
Job file in jobs.d/ directory

<?xml version="1.0"?>
<!--
  Sample backup job read by start_seeds.py.

  Keep the elements in this order: parseJobs() stores them in document order
  and performBackup() reads them by position
  (jobid, backupdir, backuptarget, temppath, exclude).

  jobid        - used to build the names job<jobid>, job<jobid>_master.meta, ...
  backupdir    - directory to back up; must be absolute, existing, and not '/'
  backuptarget - where the backup is written; same checks as backupdir
  temppath     - parsed, but performBackup() does not currently use it
  exclude      - each child's text is added to the list of excluded paths
-->
<data>
  <jobid>103</jobid>
  <backupdir>/home/myuser/files</backupdir>
  <backuptarget>/offsitenfs/client1/target2</backuptarget>
  <temppath>/tmp</temppath>
  <exclude>
    <directory>/home/myuser/files/badfolder1</directory>
    <directory>/home/myuser/files/music/badfolder2</directory>
  </exclude>
</data>
in Uncategorized

, , , ,

You can skip to the end and leave a response. Pinging is currently not allowed.

Categories