vendredi 4 décembre 2020

How to efficiently filter a huge list by multiple rules?

I am writing an open-source PyPI package that should filter AWS EC2 instances.

In my function ec_compare__from_dict, I am filtering a list of 350+ elements that takes 364Kb on disk.

The following example of execution returns 1 filtered element:

>>> ec_compare__from_dict(_partial=_partial,InstanceType='z1d',FreeTierEligible=False,SupportedUsageClasses='spot',BareMetal=True)
[{'InstanceType': 'z1d.metal', 'CurrentGeneration': True, 'FreeTierEligible': False, 'SupportedUsageClasses': ['on-demand', 'spot'], 'SupportedRootDeviceTypes': ['ebs'], 'BareMetal': True, 'ProcessorInfo': {'SupportedArchitectures': ['x86_64'], 'SustainedClockSpeedInGhz': 4.0}, 'VCpuInfo': {'DefaultVCpus': 48}, 'MemoryInfo': {'SizeInMiB': 393216}, 'InstanceStorageSupported': True, 'InstanceStorageInfo': {'TotalSizeInGB': 1800, 'Disks': [{'SizeInGB': 900, 'Count': 2, 'Type': 'ssd'}]}, 'EbsInfo': {'EbsOptimizedSupport': 'default', 'EncryptionSupport': 'supported'}, 'NetworkInfo': {'NetworkPerformance': '25 Gigabit', 'MaximumNetworkInterfaces': 15, 'Ipv4AddressesPerInterface': 50, 'Ipv6AddressesPerInterface': 50, 'Ipv6Supported': True, 'EnaSupport': 'required'}, 'PlacementGroupInfo': {'SupportedStrategies': ['cluster', 'partition', 'spread']}, 'HibernationSupported': False, 'BurstablePerformanceSupported': False, 'DedicatedHostsSupported': True, 'AutoRecoverySupported': False}]

My problem is the following: I want to apply all of the filters, each of which follows a different rule, in one single list comprehension.

But I am losing readability and creating spaghetti code. Please point me to better design decisions.

from typing import List


def ec2keys(*arg) -> List:
    """Return the known EC2 attribute names for the requested categories.

    Each positional argument names a category ('str', 'bool', 'list',
    'dict' or 'other'). With no arguments, the names of every category are
    returned. Names come back in the table's own order, regardless of the
    order in which the categories were requested; unknown categories
    contribute nothing.
    """
    keys_by_category = {
        'str': ['InstanceType', 'Hypervisor'],
        'bool': [
            'FreeTierEligible',
            'HibernationSupported',
            'CurrentGeneration',
            'BurstablePerformanceSupported',
            'AutoRecoverySupported',
            'DedicatedHostsSupported',
            'InstanceStorageSupported',
            'BareMetal',
        ],
        'list': ['SupportedUsageClasses', 'SupportedRootDeviceTypes'],
        'dict': [
            'InstanceStorageInfo',
            'VCpuInfo',
            'EbsInfo',
            'FpgaInfo',
            'PlacementGroupInfo',
            'GpuInfo',
            'InferenceAcceleratorInfo',
            'MemoryInfo',
            'NetworkInfo',
            'ProcessorInfo',
        ],
        'other': [],
    }
    selected = []
    for category, names in keys_by_category.items():
        # No arguments at all means "every category".
        if not arg or category in arg:
            selected.extend(names)
    return selected

def _as_choice_set(value) -> set:
    """Return *value* as a set of acceptable alternatives.

    Lists, tuples, sets and frozensets are treated as collections of
    alternatives; any other value is treated as one single alternative.
    """
    if isinstance(value, (list, tuple, set, frozenset)):
        return set(value)
    return {value}


def _as_prefixes(value) -> tuple:
    """Return the ``InstanceType`` filter as a tuple of string prefixes.

    An empty tuple means "do not filter by instance type": the filter was
    missing, empty, or of an unsupported type.
    """
    if isinstance(value, str):
        return (value,) if value else ()
    if isinstance(value, (list, tuple, set, frozenset)):
        return tuple(value)
    return ()


def ec_compare__from_dict(_partial: List, **kwargs):
    """Return the instance records of *_partial* that satisfy every filter.

    Filters are passed as keyword arguments named after the attribute
    names returned by ``ec2keys()``; keyword arguments with any other name
    are ignored. All filters are combined with AND. The rule applied
    depends on the attribute's category:

    * ``InstanceType``: the record's value (converted to ``str``) must
      start with the given string, or with any one of the strings when a
      list, tuple or set is given. An empty or ``None`` value disables
      this rule.
    * other 'str' and all 'bool' attributes: the record's value must equal
      the given value.
    * 'list' attributes: the record's list must share at least one element
      with the given value(s). A single value, list, tuple or set is
      accepted; an empty collection matches nothing.
    * 'dict' attributes: only the presence of the key is checked; the
      given value is not compared.

    In every case, a record lacking any of the filtered keys is rejected.

    Args:
        _partial: Iterable of dict records to filter.
        **kwargs: Filters, keyed by attribute name.

    Returns:
        A new list with the matching records, in their original order.
    """
    requested = set(kwargs)
    # Every recognised filter key must exist on a record, whatever its rule.
    required_keys = requested.intersection(ec2keys())
    exact_filters = {
        key: kwargs[key]
        for key in requested.intersection(ec2keys('str', 'bool'))
        if key != 'InstanceType'  # InstanceType is matched by prefix below
    }
    choice_filters = {
        key: _as_choice_set(kwargs[key])
        for key in requested.intersection(ec2keys('list'))
    }
    prefixes = _as_prefixes(kwargs.get('InstanceType'))

    def matches(instance) -> bool:
        """Tell whether one record passes all of the prepared filters."""
        # Presence first: the checks below index the record directly.
        if not all(key in instance for key in required_keys):
            return False
        if not all(instance[key] == wanted
                   for key, wanted in exact_filters.items()):
            return False
        # str.startswith accepts a tuple and succeeds on any one prefix.
        if prefixes and not str(instance['InstanceType']).startswith(prefixes):
            return False
        # Each list filter must be satisfied (AND across keys); within one
        # key, a single shared element is enough.
        return all(not choices.isdisjoint(instance[key])
                   for key, choices in choice_filters.items())

    return [instance for instance in _partial if matches(instance)]

Example data

# Sample input for ec_compare__from_dict: a list holding a single
# instance record (the 'z1d.metal' entry shown in the example output above).
_partial = [{'InstanceType': 'z1d.metal', 'CurrentGeneration': True, 'FreeTierEligible': False, 'SupportedUsageClasses': ['on-demand', 'spot'], 'SupportedRootDeviceTypes': ['ebs'], 'BareMetal': True, 'ProcessorInfo': {'SupportedArchitectures': ['x86_64'], 'SustainedClockSpeedInGhz': 4.0}, 'VCpuInfo': {'DefaultVCpus': 48}, 'MemoryInfo': {'SizeInMiB': 393216}, 'InstanceStorageSupported': True, 'InstanceStorageInfo': {'TotalSizeInGB': 1800, 'Disks': [{'SizeInGB': 900, 'Count': 2, 'Type': 'ssd'}]}, 'EbsInfo': {'EbsOptimizedSupport': 'default', 'EncryptionSupport': 'supported'}, 'NetworkInfo': {'NetworkPerformance': '25 Gigabit', 'MaximumNetworkInterfaces': 15, 'Ipv4AddressesPerInterface': 50, 'Ipv6AddressesPerInterface': 50, 'Ipv6Supported': True, 'EnaSupport': 'required'}, 'PlacementGroupInfo': {'SupportedStrategies': ['cluster', 'partition', 'spread']}, 'HibernationSupported': False, 'BurstablePerformanceSupported': False, 'DedicatedHostsSupported': True, 'AutoRecoverySupported': False}]

Aucun commentaire:

Enregistrer un commentaire