How to access my own fake bucket with S3FileSystem, Pytest and Moto

I'm trying to implement unit tests using Pytest, Moto (4.1.6), and s3fs (0.4.2) for my functions that interact with S3.

So far I am able to create a bucket and populate it with all the files that live in the data folder.

Unfortunately, one of my requirements is that I access the bucket through the s3fs.core.S3FileSystem class, since that's how our internal library works and I'm trying to stay as close as possible to the original environment.

That wouldn't be a problem if I didn't get an access-denied error when I try to access the fake bucket.

Here's the relevant code from conftest.py:

#!/usr/bin/env python3

from moto import mock_s3
from pathlib import Path

import boto3
import os
import pytest
import s3fs


@pytest.fixture(scope="session")
def test_data_folder():
    return os.path.join(os.path.dirname(__file__), "data")


@pytest.fixture(scope="session")
@mock_s3
def s3_filesystem(test_data_folder):
    connection = boto3.client("s3", region_name="us-east-1")
    connection.create_bucket(Bucket="bucket")

    for path in Path(test_data_folder).rglob("*"):
        if path.is_file():
            with open(path, "rb") as parquet:
                data = parquet.read()
                connection.put_object(Bucket="bucket", Key=str(path), Body=data)

    bucket = boto3.resource("s3").Bucket("bucket")
    for object in bucket.objects.all():
        print(object.key)

    filesystem = s3fs.S3FileSystem(anon=True)
    filesystem.ls(test_data_folder)
    return filesystem

After this code runs, the print output shows that several files like this one exist in there: /Users/campos/repos/project/tests/data/20221027/transactions_test_20221027.parquet

I want to return the s3fs.core.S3FileSystem object to my tests, but when I run filesystem.ls(test_data_folder) in my debugger I get PermissionError: All access to this object has been disabled

Going a little deeper, the objects returned from bucket.objects.all() look like this: s3.ObjectSummary(bucket_name='bucket', key='/Users/campos/repos/project/tests/data/20221027/orders_test_20221027.parquet')

I already tried adding a public access control list to the bucket creation, like this: s3.create_bucket(Bucket="bucket", ACL="public-read"), but it didn't change anything.

I also saw this in the error messages:

api_params = {'Bucket': 'Users', 'Delimiter': '/', 'EncodingType': 'url', 'Prefix': 'ykb595/repos/gfctr-card-lob-pyexetl/tests/data/'}

...

botocore.errorfactory.NoSuchBucket: An error occurred (NoSuchBucket) when calling the ListObjectsV2 operation: The specified bucket does not exist

It's clear that my files exist somewhere, in some sort of bucket, but it looks like something is unable to find that bucket.

What am I missing?

Thank you in advance!!

There are 2 answers

Bert Blommers (best answer):

Change filesystem.ls(test_data_folder) to filesystem.ls("bucket").

The test_data_folder resolves to home/username/.. (for me), which means that S3FS tries to find a bucket called home, while the bucket name that you're using everywhere else is bucket.
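
In other words, the end of the fixture from the question would look something like this (just a sketch, reusing the question's own names):

filesystem = s3fs.S3FileSystem(anon=True)
filesystem.ls("bucket")  # list the mocked bucket by its name, not by the local data folder path
return filesystem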


Currently, the bucket structure created in S3/Moto looks like this: / -> home/ -> folder1/ -> folder2/ -> etc

I don't think S3FS likes this very much, either because of the structure itself or because the first folder is named /. When I remove the mock_s3 decorator and test against AWS itself, the call to filesystem.ls(bucket_name) fails horribly, and filesystem.walk() returns the same strange result: [('bucket', [], [''])]

When I create a flat structure in S3, the filesystem.walk() does work as expected (both against AWS and against Moto):

# Create files using only the last section of the path:
with open(path, "rb") as parquet:
    data = parquet.read()
    connection.put_object(Bucket=bucket_name, Key=path.parts[-1], Body=data)

results in:

[('/bucket', [], ['file1.py', 'file2.py', 'file3.py'])]

On a general note, you asked how to better understand, study, and implement testing with AWS following best practices.

My recommendation is always to verify the behaviour against AWS first if you're unsure about the expected result. Open-source tools like S3FS and Moto are great for writing unit tests and verifying that known behaviour doesn't change. But if you don't know the expected result, you can never be sure whether the strange things you're seeing come from S3FS, Moto, or AWS.

A Campos:

I had to make many little corrections to get this really working.

Keep in mind that this expects a data folder, containing some files to be put into the mocked S3, living in the same directory as conftest.py.
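
For reference, the folder layout this assumes looks roughly like the following (using the file names from the question):

tests/
├── conftest.py
├── test_utils.py
└── data/
    └── 20221027/
        ├── orders_test_20221027.parquet
        └── transactions_test_20221027.parquet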

Note: I left the code for easy_log in as well, because it can raise an exception, so you can also see how to deal with those!

conftest.py

#!/usr/bin/env python3

from moto import mock_s3
from pathlib import Path

import boto3
import os
import pytest
import s3fs


@pytest.fixture(scope="session")
def test_data_folder():
    return os.path.join(os.path.dirname(__file__), "data")


@pytest.fixture(scope="session")
def aws_credentials():
    """Mocked AWS Credentials for moto."""
    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
    os.environ["AWS_SECURITY_TOKEN"] = "testing"
    os.environ["AWS_SESSION_TOKEN"] = "testing"
    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"


@pytest.fixture(scope="session")
def s3(aws_credentials):
    with mock_s3():
        yield boto3.client("s3", region_name="us-east-1")


@pytest.fixture(scope="session")
@mock_s3
def s3_filesystem(s3, test_data_folder):
    bucket_name = "bucket"
    s3.create_bucket(Bucket=bucket_name, ACL="public-read")

    for path in Path(test_data_folder).rglob("*"):
        if path.is_file():
            path_key = str(path).split("data")[1][1:]  # key relative to the data folder, e.g. "20221027/orders_test_20221027.parquet"
            with open(path, "rb") as parquet:
                data = parquet.read()
                s3.put_object(Bucket=bucket_name, Key=path_key, Body=data)

    for count in range(100):  # Hack to avoid a StopIteration error when tests call next() on this fixture multiple times
        yield s3fs.S3FileSystem(anon=True)

test_utils.py

#!/usr/bin/env python3

from moto import mock_s3

import os
import pandas as pd
import pytest

from project import utils


@mock_s3
def test_read_from_lake(mocker, s3_filesystem, test_data_folder):
    mocker.patch(
        "project.utils.create_session",
        return_value=next(s3_filesystem),
    )

    path = "bucket/orders/"

    df = utils.read_from_lake(path, "orders")
    assert df.equals(pd.read_parquet(path, filesystem=next(s3_filesystem)))

    with pytest.raises(SystemExit) as error:
        assert utils.read_from_lake("bad/path", "orders")

utils.py

#!/usr/bin/env python3

import fnmatch, os, sys
import pandas as pd

end_colored_output = "\033[0m"

color_level_map = {
    "error": "\033[91m",  # RED
    "warn": "\033[93m",  # YELLOW
    "info": "\033[92m",  # GREEN
}


def print_colored_output(message, level, type):
    color = color_level_map[level]
    print(f"{color}{type}{message}{end_colored_output}")


def easy_log(message, hint=None, level="error", exception=None, type=""):
    """Defaults to Error, in which case the program will exit"""
    print_colored_output(message, level, type=type)
    if hint:
        print_colored_output(hint, level="info", type="Hint: ")
    if level == "error":
        if exception:
            raise exception
        sys.exit()


def create_session(dataset, write_access=False):
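    # Not shown in the original post: in the real library this presumably returns an
    # s3fs.S3FileSystem-like session for the dataset (read_from_lake only needs .walk()
    # and something pandas accepts as filesystem=); the tests patch it out with
    # mocker.patch("project.utils.create_session", ...).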
    pass # I'm mocking this in test_utils.py


def read_from_lake(path, dataset):
    s3 = create_session(dataset)

    files = []
    for root, dirnames, filenames in s3.walk(path):
        for filename in fnmatch.filter(filenames, "*.parquet"):
            files.append(os.path.join(root, filename))

    if len(files) > 0:
        easy_log(f"Found {len(files)} files in path {path}", level="info")
        df = pd.concat(pd.read_parquet(file, filesystem=s3) for file in files)
        return df
    else:
        easy_log(message=f"No files for Dataset {dataset} and Path {path}")