Extract specific folders from tar.gz archive

68 views Asked by At

Could you please advice, how I can extract specific folders (and files) and ignore the root-level of directory inside the archive?

<ROOT_FOLDER>/dir1/file1.txt
<ROOT_FOLDER>/dir1/file2.txt
<ROOT_FOLDER>/dir1/dir2/file3.txt

I wrote a function, but it still gives me all the files from the archive, despite the endswith() filter.

#EXAMPLE!!! => package_basename = <ROOT_FOLDER>

tar_archive = os.path.join(directory_path, package_name)
destination = os.path.join(directory_temp, package_renamed)

def get_archive_content(tar_archive, package_basename):
    with tarfile.open(tar_archive, 'r:gz') as tar_file:
        members = tar_file.getmembers()

    filtered_members = [
        member.name.replace(package_basename + '/', './') for member in members
    ]

    return filtered_members    

members = get_archive_content(tar_archive, package_basename)           

def extract_required_files() -> None:
    with tarfile.open(tar_archive, 'r:gz') as tar_file:
        for member in members:
            if any(member.endswith(path + filename) for path in folder_libs for filename in libs):
                tar_file.extract(member, destination)

            elif any(member.endswith(filename) for filename in config_files):
                tar_file.extract(member, destination)
                
            elif any(re.search(rf"{filename}*.*", member) for filename in config_default_files):
                tar_file.extract(member, destination)
                        
            elif any(member.endswith(filename) for filename in script_files):
                tar_file.extract(member, destination)
                
            elif any(member.endswith(filename) for filename in start_files):
                tar_file.extract(member, destination)                    

            sys.stdout.write(member + '\n')
            
extract_required_files()

I tried to change to use re.search instead of .endswith(), but it doesn't help.

2

There are 2 answers

0
Guy On

You need to extract the file/folder using the full name, not the filtered one

package_basename = 'ROOT_FOLDER'
folder_destination = r'path/to/folder/destination'
with tarfile.open(tar_archive, 'r:gz') as tar_file:
    names = tar_file.getnames()
    filtered_names = {name.replace(f'{package_basename}/', ''): name for name in names}
    for name, full_name in filtered_names.items():
        destination = os.path.join(folder_destination, name)

        # get full folder
        if name.startswith('dir1'):
            tar_file.extract(full_name, destination)

        # get a specific file
        elif name.endswith('file3.txt'):
            tar_file.extract(full_name, destination)
1
darked89 On

This works:

#!/usr/bin/env python3

import tarfile
import os

dir_list = ["openssh-9.6p1/contrib/suse",  "openssh-9.6p1/contrib/aix"]
destination = "/tmp/extracted/"
tar_fn = "openssh-9.6p1.tar.gz"

def filter_tar(tar_fn, dir_list):
    with tarfile.open(tar_fn, 'r:gz') as fh:
        members = fh.getmembers()
        for member in members[:200]:
            for dir in dir_list:
                print(f"Checking {member.name} against {dir}")
                if member.name.startswith(dir):
                    print(f"Matched {member.name} against {dir}")
                    out_dir = os.path.join(destination, os.path.dirname(member.name))
                    print(f"Extracting {member.name} to {out_dir}")
                    fh.extract(member, path=out_dir)


filter_tar(tar_fn, dir_list)

I have used openssh-9.6p1.tar.gz for testing. You will need a list of dirs of interest (above a trivial 2 elements one, in reality you may need to create a text file with dirs of interest then read it in a separate function. What it did not work in your script: you shall not modify names of members (member.name.replace) since the mod names are different from the ones in the tar file. One can play with:

out_dir = os.path.join(destination, os.path.dirname(member.name))

to rename extracted files.