Why does os.walk() (Python) ignore a OneDrive directory depending on the number of files in it?

698 views Asked by At

I am doing an os.walk() over a certain part of my OneDrive synced folder structure. It all worked fine until recently. Now ALL files from one specific directory are ignored. I tested several possible reasons and narrowed it down to this: The directory that is ignored is the one that holds the most files (897 at this point).

If I remove two of the files from said directory (it does not matter which two), it works and all files are recognized. When I add the files again, the result is the same: No files from that directory turn up in my os.walk() result list.

I did check Microsoft's Restrictions and limitations in OneDrive and SharePoint, but am far from any of the file size and number (1 ,2) limits mentioned.

My code looks like this

files = []
for root, dir, files in os.walk(mainDirectory):
    for f in files:
        if 'Common part' in root:
            files.append(os.path.join(root, f))

'Common part' is a text string, that all relevant folders in the mainDirectory have in common.

The directory itself is recognized all the times, just the files are not added to my list. So, I tried another approach featuring glob.glob(). Here, the results are a bit different but still not satisfactory:

folders = []
for root, dir, files in os.walk(mainDirectory):
    for d in dir:
        if d.startswith('Common part')
            folders.append(os.path.join(root, d))

files = [glob.glob(os.path.join(f,'*.xlsx')) for f in folders]

This does give me approximately half the files from the problematic folder. Again, when I remove two files, it gives me the full list.

When I copy/move the files to a local (not OneDrive synced) path, it works. So I guess it does have to do with OneDrive. Having the files outside of OneDrive is not an option.

The directory in question is not directly in my OneDrive but a "Sync"/"Shortcut" from SharePoint.

All files can be opened, they are downloaded, not on-demand. I have removed the sync and re-synced the folder. I have restarted OneDrive (and my machine) several times

I am really at a loss here. Any hints welcome!

Update: Thanks to the help of @GordonAitchJay, it could be established, that at the threshold of files (or sum of file sizes?) functions like os.listdir() and win32file.FindFilesW() stop returning their usual output and instead return OSError: [WinError 87] The parameter is incorrect

Also, in the meantime, we reproduced the same behaviour on another machine within the same organization. This was conducted after a full reset of my OneDrive did not result in any improvement.

1

There are 1 answers

0
GordonAitchJay On BEST ANSWER

Though I can't prove it, it seems that OneDrive is up to some sort of tomfoolery that causes win32's FindNextFileW to fail with a ERROR_INVALID_PARAMETER error, but apparently only when it is called by Python's os.walk, os.listdir, and win32file.FindFilesW, and when some files have been deleted from the OneDrive directory syncing a SharePoint folder. Utterly bizarre. I'm thinking maybe OneDrive hooks FindNextFileW which remains after ending the OneDrive process and services with Task Manager.

A workaround is to use ctypes to call the lower level NtQueryDirectoryFile function (which is ultimately what FindNextFileW calls anyway).

Eryk Sun's answer to another question has a working example. I have copied it below, and have only changed the last couple lines:

import os
import msvcrt
import ctypes

from ctypes import wintypes

ntdll = ctypes.WinDLL('ntdll')
kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)

def NtError(status):
    err = ntdll.RtlNtStatusToDosError(status)
    return ctypes.WinError(err)

NTSTATUS = wintypes.LONG
STATUS_BUFFER_OVERFLOW = NTSTATUS(0x80000005).value
STATUS_NO_MORE_FILES = NTSTATUS(0x80000006).value
STATUS_INFO_LENGTH_MISMATCH = NTSTATUS(0xC0000004).value

ERROR_DIRECTORY = 0x010B
INVALID_HANDLE_VALUE = wintypes.HANDLE(-1).value
GENERIC_READ = 0x80000000
FILE_SHARE_READ = 1
OPEN_EXISTING = 3
FILE_FLAG_BACKUP_SEMANTICS = 0x02000000
FILE_ATTRIBUTE_DIRECTORY = 0x0010

FILE_INFORMATION_CLASS = wintypes.ULONG
FileDirectoryInformation = 1
FileBasicInformation = 4

LPSECURITY_ATTRIBUTES = wintypes.LPVOID
PIO_APC_ROUTINE = wintypes.LPVOID
ULONG_PTR = wintypes.WPARAM

class UNICODE_STRING(ctypes.Structure):
    _fields_ = (('Length',        wintypes.USHORT),
                ('MaximumLength', wintypes.USHORT),
                ('Buffer',        wintypes.LPWSTR))

PUNICODE_STRING = ctypes.POINTER(UNICODE_STRING)

class IO_STATUS_BLOCK(ctypes.Structure):
    class _STATUS(ctypes.Union):
        _fields_ = (('Status',  NTSTATUS),
                    ('Pointer', wintypes.LPVOID))
    _anonymous_ = '_Status',
    _fields_ = (('_Status',     _STATUS),
                ('Information', ULONG_PTR))

PIO_STATUS_BLOCK = ctypes.POINTER(IO_STATUS_BLOCK)

ntdll.NtQueryInformationFile.restype = NTSTATUS
ntdll.NtQueryInformationFile.argtypes = (
    wintypes.HANDLE,        # In  FileHandle
    PIO_STATUS_BLOCK,       # Out IoStatusBlock
    wintypes.LPVOID,        # Out FileInformation
    wintypes.ULONG,         # In  Length
    FILE_INFORMATION_CLASS) # In  FileInformationClass

ntdll.NtQueryDirectoryFile.restype = NTSTATUS
ntdll.NtQueryDirectoryFile.argtypes = (
    wintypes.HANDLE,        # In     FileHandle
    wintypes.HANDLE,        # In_opt Event
    PIO_APC_ROUTINE,        # In_opt ApcRoutine
    wintypes.LPVOID,        # In_opt ApcContext
    PIO_STATUS_BLOCK,       # Out    IoStatusBlock
    wintypes.LPVOID,        # Out    FileInformation
    wintypes.ULONG,         # In     Length
    FILE_INFORMATION_CLASS, # In     FileInformationClass
    wintypes.BOOLEAN,       # In     ReturnSingleEntry
    PUNICODE_STRING,        # In_opt FileName
    wintypes.BOOLEAN)       # In     RestartScan

kernel32.CreateFileW.restype = wintypes.HANDLE
kernel32.CreateFileW.argtypes = (
    wintypes.LPCWSTR,      # In     lpFileName
    wintypes.DWORD,        # In     dwDesiredAccess
    wintypes.DWORD,        # In     dwShareMode
    LPSECURITY_ATTRIBUTES, # In_opt lpSecurityAttributes
    wintypes.DWORD,        # In     dwCreationDisposition
    wintypes.DWORD,        # In     dwFlagsAndAttributes
    wintypes.HANDLE)       # In_opt hTemplateFile

class FILE_BASIC_INFORMATION(ctypes.Structure):
    _fields_ = (('CreationTime',   wintypes.LARGE_INTEGER),
                ('LastAccessTime', wintypes.LARGE_INTEGER),
                ('LastWriteTime',  wintypes.LARGE_INTEGER),
                ('ChangeTime',     wintypes.LARGE_INTEGER),
                ('FileAttributes', wintypes.ULONG))

class FILE_DIRECTORY_INFORMATION(ctypes.Structure):
    _fields_ = (('_Next',          wintypes.ULONG),
                ('FileIndex',      wintypes.ULONG),
                ('CreationTime',   wintypes.LARGE_INTEGER),
                ('LastAccessTime', wintypes.LARGE_INTEGER),
                ('LastWriteTime',  wintypes.LARGE_INTEGER),
                ('ChangeTime',     wintypes.LARGE_INTEGER),
                ('EndOfFile',      wintypes.LARGE_INTEGER),
                ('AllocationSize', wintypes.LARGE_INTEGER),
                ('FileAttributes', wintypes.ULONG),
                ('FileNameLength', wintypes.ULONG),
                ('_FileName',      wintypes.WCHAR * 1))

    @property
    def FileName(self):
        addr = ctypes.addressof(self) + type(self)._FileName.offset
        size = self.FileNameLength // ctypes.sizeof(wintypes.WCHAR)
        return (wintypes.WCHAR * size).from_address(addr).value

class DirEntry(FILE_DIRECTORY_INFORMATION):
    def __repr__(self):
        return '<{} {!r}>'.format(self.__class__.__name__, self.FileName)

    @classmethod
    def listbuf(cls, buf):
        result = []
        base_size = ctypes.sizeof(cls) - ctypes.sizeof(wintypes.WCHAR)
        offset = 0
        while True:
            fdi = cls.from_buffer(buf, offset)
            if fdi.FileNameLength and fdi.FileName not in ('.', '..'):
                cfdi = cls()
                size = base_size + fdi.FileNameLength
                ctypes.resize(cfdi, size)
                ctypes.memmove(ctypes.byref(cfdi), ctypes.byref(fdi), size)
                result.append(cfdi)
            if fdi._Next:
                offset += fdi._Next
            else:
                break
        return result

def isdir(path):
    if not isinstance(path, int):
        return os.path.isdir(path)
    try:
        hFile = msvcrt.get_osfhandle(path)
    except IOError:
        return False
    iosb = IO_STATUS_BLOCK()
    info = FILE_BASIC_INFORMATION()
    status = ntdll.NtQueryInformationFile(hFile, ctypes.byref(iosb),
                ctypes.byref(info), ctypes.sizeof(info),
                FileBasicInformation)
    return bool(status >= 0 and info.FileAttributes & FILE_ATTRIBUTE_DIRECTORY)

def ntlistdir(path=None):
    result = []

    if path is None:
        path = os.getcwd()

    if isinstance(path, int):
        close = False
        fd = path
        hFile = msvcrt.get_osfhandle(fd)
    else:
        close = True
        hFile = kernel32.CreateFileW(path, GENERIC_READ, FILE_SHARE_READ,
                    None, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, None)
        if hFile == INVALID_HANDLE_VALUE:
            raise ctypes.WinError(ctypes.get_last_error())
        fd = msvcrt.open_osfhandle(hFile, os.O_RDONLY)

    try:
        if not isdir(fd):
            raise ctypes.WinError(ERROR_DIRECTORY)
        iosb = IO_STATUS_BLOCK()
        info = (ctypes.c_char * 4096)()
        while True:
            status = ntdll.NtQueryDirectoryFile(hFile, None, None, None,
                        ctypes.byref(iosb), ctypes.byref(info),
                        ctypes.sizeof(info), FileDirectoryInformation,
                        False, None, False)
            if (status == STATUS_BUFFER_OVERFLOW or
                iosb.Information == 0 and status >= 0):
                info = (ctypes.c_char * (ctypes.sizeof(info) * 2))()
            elif status == STATUS_NO_MORE_FILES:
                break
            elif status >= 0:
                sublist = DirEntry.listbuf(info)
                result.extend(sublist)
            else:
                raise NtError(status)
    finally:
        if close:
            os.close(fd)

    return result

for entry in ntlistdir(r"C:\Users\UserName\OneDriveFolder\BigFolder"):
    print(entry.FileName)