Add .readinto(b) method to tarfile's ExFileObject?

1.2k views Asked by At

So I'm trying to iterate over a number of files in a tar, and then load that data into some ctypes structures I've defined. This was working fine with non-tar files, but then I found out that the ExFileObject returned by tarfile's extractfile(member) method doesn't support the .readinto(b) method.

So right now here's what I'm doing:

import os
import tarfile
import io
from ctypes import c_uint, c_char, c_ubyte, c_ushort, BigEndianStructure

class MyStructure(BigEndianStructure):
    """Big-endian, byte-packed record header.

    With 1-byte packing the fields occupy 4 + 32 + 4 + 2 = 42 bytes on
    platforms where c_uint is 4 bytes and c_ushort is 2 bytes.
    """
    # _pack_ is an alignment in bytes; spell it as the integer 1 instead of
    # relying on True being equal to 1.
    _pack_ = 1
    _fields_ = [
        ("id", c_uint),  # 4 bytes
        ("namefield", c_char * 32),  # 32 bytes
        ("timestamp", c_ubyte * 4),  # 4 bytes
        ("payload_length", c_ushort),  # 2 bytes
    ]


def process_tar(tar_files):
    """Yield a file object for every regular file inside each tar archive.

    tar_files is an iterable of archive paths.  Each archive is opened with
    mode 'r:*' and stays open only while its members are being yielded, so
    read each yielded object before the generator moves past that archive.
    """
    for archive_path in tar_files:
        with tarfile.open(os.path.abspath(archive_path), 'r:*') as archive:
            # Tried here without success: archive.fileobject = io.BufferedReader
            regular_files = (m for m in archive.getmembers() if m.isfile())
            for member in regular_files:
                yield archive.extractfile(member)

f = "somefiles.tar.gz"  # path of the tar archive handed to process_tar()
for tar_member_fileobj in process_tar([f]):
    mystruct = MyStructure()
    tar_member_fileobj.readinto(mystruct)  # this call raises the AttributeError shown below

And getting this:

---------------------------------------------------------------------------  AttributeError                            Traceback (most recent call last) <ipython-input-4-257ee4b46c31> in <module>()
     29 for tar_member_fileobj in process_tar([f]):
     30     mystruct = MyStructure()
---> 31     tar_member_fileobj.readinto(mystruct)

AttributeError: 'ExFileObject' object has no attribute 'readinto'

Is there a way that this method can be added to the ExFileObject? Or, is there another way to easily get my data loaded into my defined ctypes structures? I noticed that in the tarfile object it appears you can set the fileobject to be used for returned tarinfo files, but just swapping in io.BufferedReader didn't seem to work.

(I tried reading the ExFileObject into StringIO, but it doesn't seem to have readinto() implemented properly either... I'm thinking I could just extractall() to a filespace in memory and re-open the files as standard file objects, but I'd like to avoid that since I would then have additional configuration needed)

1

There is 1 answer

1
AudioBubble On BEST ANSWER

ExFileObject doesn't provide a readinto method, but you can still do it by reading the header from the file using read and copying the data into the structure using memmove:

# Read one structure's worth of bytes, then copy them over the structure.
# memmove() always copies `size` bytes, so a short read must be rejected
# first; otherwise it would copy from past the end of the data read.
size = sizeof(mystruct)
data = tar_member_fileobj.read(size)
if len(data) < size:
    raise ValueError("tar member is shorter than the structure")
memmove(byref(mystruct), data, size)

For example:

import os
import tarfile
import io
from ctypes import *

class MyStructure(BigEndianStructure):
    """Big-endian, byte-packed record header.

    With 1-byte packing the fields occupy 4 + 32 + 4 + 2 = 42 bytes on
    platforms where c_uint is 4 bytes and c_ushort is 2 bytes.
    """
    # _pack_ is an alignment in bytes; spell it as the integer 1 instead of
    # relying on True being equal to 1.
    _pack_ = 1
    _fields_ = [
        ("id", c_uint),  # 4 bytes
        ("namefield", c_char * 32),  # 32 bytes
        ("timestamp", c_ubyte * 4),  # 4 bytes
        ("payload_length", c_ushort),  # 2 bytes
    ]


def process_tar(tar_files):
    """Yield a file object for every regular file inside each tar archive.

    tar_files is an iterable of archive paths.  Each archive is opened with
    mode 'r:*' and stays open only while its members are being yielded, so
    read each yielded object before the generator moves past that archive.
    """
    for archive_path in tar_files:
        with tarfile.open(os.path.abspath(archive_path), 'r:*') as archive:
            regular_files = (m for m in archive.getmembers() if m.isfile())
            for member in regular_files:
                yield archive.extractfile(member)



# test tar.gz file
# Written through a context manager so the handle is flushed and closed
# before the archive is reopened for reading; the payload is a bytes literal
# because the file is opened in binary mode.
with open('somefiles.tar.gz', 'wb') as archive_out:
    archive_out.write(
        b'\x1f\x8b\x08\x08\xad\x5c\x34\x52\x02\x00\x66\x6f\x6f\x2e\x74'
        b'\x61\x72\x00\xed\xca\x49\x0a\x83\x40\x14\x04\xd0\x7f\x85\xdc'
        b'\xa0\x43\x92\x6d\xf8\xdd\xa6\xed\x6d\xe6\xc9\x79\x1e\x76\x82'
        b'\x0a\x82\x4b\xef\x8f\xf6\x42\xf0\x02\x6e\xe4\xbf\x4d\x41\x55'
        b'\xb5\x5d\xdf\x9c\xeb\x6a\xa8\x60\x3d\x1c\x51\x29\xc5\x80\x69'
        b'\xb8\xc8\x99\x14\x8c\x0b\xc1\x25\x4a\x53\x5c\x4c\x5d\x28\x9c'
        b'\xfe\x08\x64\x6d\xbb\xc9\xed\xfe\x78\xbe\xde\x9f\xef\xef\x6f'
        b'\xd9\x8e\xeb\xf9\x41\x18\xc5\x49\x9a\xe5\x45\xb9\xbf\x1e\x8e'
        b'\x27\xd0\xbb\x61\x00\x21\x84\x90\x0d\x19\x01\xd4\xe8\x88\xcf'
        b'\x00\x08\x00\x00'
    )

f = "somefiles.tar.gz"
for tar_member_fileobj in process_tar([f]):
    mystruct = MyStructure()

    # Read exactly one header's worth of bytes.  memmove() copies
    # header_size bytes unconditionally, so a short read is rejected first;
    # otherwise it would copy from past the end of the data read.
    header_size = sizeof(mystruct)
    header = tar_member_fileobj.read(header_size)
    if len(header) < header_size:
        raise ValueError(
            "tar member holds %d bytes, need %d for the header"
            % (len(header), header_size)
        )
    memmove(byref(mystruct), header, header_size)

    # Single-argument print(...) prints the same text under Python 2 as the
    # print statement did, and also parses under Python 3.
    print(hex(mystruct.id))
    print(mystruct.namefield)
    print(''.join(map(chr, mystruct.timestamp)))
    print(hex(mystruct.payload_length))

The string \x1f\x8b\x08\x08\xad ... is a tar.gz file that contains:

\x11\x11\x11\x11ABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%\x00ABCD33

After copying that data into mystruct, it should print:

0x11111111L
ABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%
ABCD
0x3333