Code Review Asked by Viktor V. on October 27, 2021
I’m new to Python and I have written the spider on the Scrapy framework which parses some advertisement websites and proceed data further. The spider works as I expect but I would like to show you the code.
I’m looking for tips, recommendations and critics where I can refactor to improve my code and knowledge further. I expect to see advice from code best practices, flexibility, style point of view and so on.
Advertisement pipelines classes:
logger = logging.getLogger(__name__)
class AdvertPipeline(object):
def __init__(self):
engine = db_connect()
init_db(engine)
self.Session = sessionmaker(bind=engine)
self.income_adverts = set()
self.old_adverts = set()
self.old_statistics_adverts = set()
self.actual_adverts = []
def open_spider(self, cls, statistics_cls, spider):
session = self.Session()
# Get latest adverts
self.old_adverts = set(session.query(cls).all())
# Get latest archived adverts
latest_archived_adverts = session.query(
statistics_cls.id,
func.max(statistics_cls.created_at).label('maxdate')
).group_by(statistics_cls.id).subquery('t2')
# Get actual archived adverts for further data proceeding
self.old_statistics_adverts = session.query(statistics_cls)
.join(latest_archived_adverts, and_(
statistics_cls.id == latest_archived_adverts.c.id,
statistics_cls.created_at == latest_archived_adverts.c.maxdate))
.all()
session.close()
def process_item(self, item, cls, object_link, spider):
data = cls(**item)
data.link = urljoin(object_link, str(data.id))
data.m2_price = data.price / data.m2 if data.m2 else None
self.income_adverts.add(data)
def close_spider(self, spider):
session = self.Session()
for adv in self.actual_adverts:
try:
# NB! If it's possible to make this function with less if-else statement, please show me how I can simplify this piece of code
# Commit all particular advert operations to db or rollback everything concerning this advert
if 'delete' in adv:
session.delete(adv['delete'])
if 'archive' in adv:
session.add(adv['archive'])
if 'new' in adv:
session.add(adv['new'])
session.commit()
if 'delete' in adv:
logger.info("[%s][DELETE] ID: %s, price: %s" % (
adv['delete'].__class__,
adv['delete'].id,
adv['delete'].price)
)
if 'archive' in adv:
logger.info("[%s][ARCHIVE] ID: %s, price: %s" % (
adv['archive'].__class__,
adv['archive'].id,
adv['archive'].price)
)
if 'new' in adv:
logger.info("[%s][NEW] ID: %s, price: %s" % (
adv['new'].__class__,
adv['new'].id,
adv['new'].price)
)
except Exception as e:
logger.error("Error occurred: n %s n %s" % (e, adv))
session.rollback()
pass
session.close()
def process_adverts(self, statistics_cls):
# Parser takes half or less adverts from advertisement websites some times. It happens very rarely.
if len(self.income_adverts) < MIN_PARSED_ADVERTS_COUNT:
raise ValueError('It seems not all adverts were parsed. Parsed adverts count: %d' % (len(self.income_adverts)))
logger.info("[%s] Start adverts proceeding" % self.__class__)
# Convert all sets to particular dict view
adverts_dict = {adv.id: adv for adv in self.income_adverts}
old_adverts_dict = {old_advert.id: old_advert for old_advert in self.old_adverts}
old_stat_adverts_dict = {
old_stat_advert.id: old_stat_advert for old_stat_advert in self.old_statistics_adverts
}
# NB! If it's possible to make this function with less if-else statement, please show me how I can simplify this piece of code
for advert in adverts_dict.keys():
if advert not in old_adverts_dict:
# Absolutely new advert or inactive advert becomes active with changed price or m2 in advert
self.actual_adverts.append({'new': adverts_dict[advert]})
if advert in old_stat_adverts_dict:
# Remove duplicate advert from current adverts and archived adverts
if advert in old_adverts_dict:
if old_adverts_dict[advert].price == old_stat_adverts_dict[advert].price:
self.actual_adverts.append({'delete': old_stat_adverts_dict[advert]})
else:
# Remove duplicate advert from income adverts and archived adverts
if adverts_dict[advert].price == old_stat_adverts_dict[advert].price:
self.actual_adverts.append({'delete': old_stat_adverts_dict[advert]})
# Add modified and archive old adverts
for advert in adverts_dict.keys():
if advert in old_adverts_dict:
if adverts_dict[advert].price != old_adverts_dict[advert].price:
# I have no idea how I can move particular attributes` values from one class object to different one
stat_advert = statistics_cls()
stat_advert.copy_properties(old_adverts_dict[advert])
# NB! If you know how is possible to simplify below code, take me know please
# I usually put particular advert operations with db in one "stack" - self.actual_adverts
# So then I can rollback all db transactions of particular advert[new, delete, archive]
# if something wrong happens
self.actual_adverts.append(
{
'delete': old_adverts_dict[advert],
'new': adverts_dict[advert],
'archive': stat_advert
}
)
inactive_adverts = [old_advert for old_advert in self.old_adverts if old_advert not in self.income_adverts]
# Archive inactive adverts
for inactive_advert in inactive_adverts:
stat_advert = statistics_cls()
stat_advert.copy_properties(inactive_advert)
stat_advert.created_at = datetime.now(pytz.timezone(EE_TIMEZONE))
stat_advert.status = ADVERT_STATUSES[ADVERT_SOLD]
self.actual_adverts.append({
'delete': inactive_advert,
'archive': stat_advert
})
logger.info("[%s] End adverts proceeding" % self.__class__)
# NB! If it's possible to simplify below classa, please take me know how I can do it
# This some Pipeline classes are necessary to invoke them in different parser scenarios for specific adverts
class FooSaleAdvertPipeline(AdvertPipeline):
def open_spider(self, spider, **kwargs):
super(FooSaleAdvertPipeline, self).open_spider(FooSaleAdvert, FooSaleAdvertStatistics, spider)
def process_item(self, item, spider, **kwargs):
super(FooSaleAdvertPipeline, self).process_item(item, FooSaleAdvert, FOO_OBJECT_LINK, spider)
def close_spider(self, spider):
super(FooSaleAdvertPipeline, self).process_adverts(FooSaleAdvertStatistics)
super(FooSaleAdvertPipeline, self).close_spider(spider)
class FooRentAdvertPipeline(AdvertPipeline):
def open_spider(self, spider, **kwargs):
super(FooRentAdvertPipeline, self).open_spider(FooRentAdvert, FooRentAdvertStatistics, spider)
def process_item(self, item, spider, **kwargs):
super(FooRentAdvertPipeline, self).process_item(item, FooRentAdvert, FOO_OBJECT_LINK, spider)
def close_spider(self, spider):
super(FooRentAdvertPipeline, self).process_adverts(FooRentAdvertStatistics)
super(FooRentAdvertPipeline, self).close_spider(spider)
class BarSaleAdvertPipeline(AdvertPipeline):
def open_spider(self, spider, **kwargs):
super(BarSaleAdvertPipeline, self).open_spider(BarSaleAdvert, BarSaleAdvertStatistics, spider)
def process_item(self, item, spider, **kwargs):
super(BarSaleAdvertPipeline, self).process_item(item, BarSaleAdvert, BAR_DOMAIN, spider)
def close_spider(self, spider):
super(BarSaleAdvertPipeline, self).process_adverts(BarSaleAdvertStatistics)
super(BarSaleAdvertPipeline, self).close_spider(spider)
class BarRentAdvertPipeline(AdvertPipeline):
def open_spider(self, spider, **kwargs):
super(BarRentAdvertPipeline, self).open_spider(BarRentAdvert, BarRentAdvertStatistics, spider)
def process_item(self, item, spider, **kwargs):
super(BarRentAdvertPipeline, self).process_item(item, BarRentAdvert, BAR_DOMAIN, spider)
def close_spider(self, spider):
super(BarRentAdvertPipeline, self).process_adverts(BarRentAdvertStatistics)
super(BarRentAdvertPipeline, self).close_spider(spider)
Advert classes:
class MixinProperties:
def __init__(self):
pass
def copy_properties(self, old_cls):
for attr in dir(old_cls):
if not callable(getattr(old_cls, attr)) and not attr.startswith("__") and not attr.startswith("_"):
setattr(self, attr, getattr(old_cls, attr))
class Advert(Base):
__abstract__ = True
link = Column(String(255), nullable=False)
rooms = Column(Numeric(precision=3, scale=1), nullable=True, index=True)
floor = Column(Integer, nullable=True)
floor_max = Column(Integer, nullable=True)
m2 = Column(Numeric(precision=7, scale=2), nullable=True)
m2_price = Column(Numeric(precision=8, scale=2), nullable=True)
price = Column(Numeric(precision=8), nullable=False, primary_key=True)
address = Column(String(50), nullable=True)
zone = Column(String(50), nullable=True)
city = Column(String(50), nullable=False, index=True)
state = Column(String(50), nullable=False)
def __eq__(self, other):
return self.id == other.id
def __hash__(self):
return hash(self.id)
class BarAdvert(Advert):
__abstract__ = True
id = Column(Integer, nullable=False, primary_key=True)
status = Column(Integer, nullable=False, default=ADVERT_STATUSES[ADVERT_ACTIVE])
built_in_year = Column(Integer, nullable=True)
__mapper_args__ = {
"confirm_deleted_rows": False
}
class BarSaleAdvert(BarAdvert):
__tablename__ = "bar_sale_adverts"
__table_args__ = {'extend_existing': True}
created_at = Column(DateTime, default=func.now())
class BarSaleAdvertStatistics(BarAdvert, MixinProperties):
__tablename__ = "bar_statistics_sale_adverts"
__table_args__ = {'extend_existing': True}
created_at = Column(DateTime, default=func.now(), primary_key=True)
class BarRentAdvert(BarAdvert):
__tablename__ = "bar_rent_adverts"
__table_args__ = {'extend_existing': True}
__mapper_args__ = {
"confirm_deleted_rows": False
}
created_at = Column(DateTime, default=func.now())
class BarRentAdvertStatistics(BarAdvert, MixinProperties):
__tablename__ = "bar_statistics_rent_adverts"
__table_args__ = {'extend_existing': True}
created_at = Column(DateTime, default=func.now(), primary_key=True)
class FooAdvert(Advert):
__abstract__ = True
id = Column(String(25), nullable=False, primary_key=True)
type = Column(Integer, nullable=False)
broker_company = Column(String(50), nullable=True)
broker_person_name = Column(String(50), nullable=True)
broker_person_image = Column(String(255), nullable=True)
broker_person_phone = Column(String(50), nullable=True)
status = Column(Integer, nullable=False, default=ADVERT_STATUSES[ADVERT_ACTIVE])
__mapper_args__ = {
"confirm_deleted_rows": False
}
class FooSaleAdvert(FooAdvert):
__tablename__ = "foo_sale_adverts"
__table_args__ = {'extend_existing': True}
created_at = Column(DateTime, default=func.now())
class FooSaleAdvertStatistics(FooAdvert, MixinProperties):
__tablename__ = "foo_statistics_sale_adverts"
__table_args__ = {'extend_existing': True}
created_at = Column(DateTime, default=func.now(), primary_key=True)
class FooRentAdvert(FooAdvert):
__tablename__ = "foo_rent_adverts"
__table_args__ = {'extend_existing': True}
created_at = Column(DateTime, default=func.now())
class FooRentAdvertStatistics(FooAdvert, MixinProperties):
__tablename__ = "foo_statistics_rent_adverts"
__table_args__ = {'extend_existing': True}
created_at = Column(DateTime, default=func.now(), primary_key=True)
While this answer is a long time after the question was asked, it seems like a solid question, so I'll give answering it a shot.
I'll start off with a few small points that aren't all that important, but are nice to get fixed.
cls
to cls_
(for example data = cls(**item)
). cls
is usually used to indicate the class passed into a classmethod. A trailing underscore basically states "I know this name is taken, but it is still the best name possible."def process_item(self, item, cls, object_link, spider)
doesn't use the last parameter spider
.pass
in the exception handling..keys()
in for advert in adverts_dict.keys():
is not needed, as you don't mutate the dictionary while iterating over it. for advert in adverts_dict:
should be enough.id
is also taken, change it to id_
(id = Column(Integer, nullable=False, primary_key=True)
).if 'delete' in adv:
session.delete(adv['delete'])
if 'archive' in adv:
session.add(adv['archive'])
if 'new' in adv:
session.add(adv['new'])
session.commit()
if 'delete' in adv:
logger.info("[%s][DELETE] ID: %s, price: %s" % (
adv['delete'].__class__,
adv['delete'].id,
adv['delete'].price)
)
if 'archive' in adv:
logger.info("[%s][ARCHIVE] ID: %s, price: %s" % (
adv['archive'].__class__,
adv['archive'].id,
adv['archive'].price)
)
if 'new' in adv:
logger.info("[%s][NEW] ID: %s, price: %s" % (
adv['new'].__class__,
adv['new'].id,
adv['new'].price)
)
Since the only thing that really changes between each block is the string, you can parameterise it. I've left the delete separate to highlight that it is different. You could change the list to a dictionary of {action_name: action_func} and gettattr if you want to combine all of them.
if 'delete' in adv:
session.delete(adv['delete'])
for action in ('archive', 'new'):
if action in adv:
session.add(adv[action])
session.commit()
for action in ('delete', 'archive', 'new'):
if action in adv:
logger.info("[%s][%s] ID: %s, price: %s" % (
adv[action].__class__,
action.upper(),
adv[action].id,
adv[action].price)
for advert in adverts_dict.keys():
if advert not in old_adverts_dict:
# Absolutely new advert or inactive advert becomes active with changed price or m2 in advert
self.actual_adverts.append({'new': adverts_dict[advert]})
if advert in old_stat_adverts_dict:
# Remove duplicate advert from current adverts and archived adverts
if advert in old_adverts_dict:
if old_adverts_dict[advert].price == old_stat_adverts_dict[advert].price:
self.actual_adverts.append({'delete': old_stat_adverts_dict[advert]})
else:
# Remove duplicate advert from income adverts and archived adverts
if adverts_dict[advert].price == old_stat_adverts_dict[advert].price:
self.actual_adverts.append({'delete': old_stat_adverts_dict[advert]})
I don't really see a way to simplify this. You might get some better readability with more clearly distinct names. Another trick I've seen is to have the comments tell the story of the advert as it goes.
for advert in adverts_dict:
# If an advert is new (to us)
if advert not in old_adverts_dict:
# Add it to the new pile.
self.actual_adverts.append({'new': adverts_dict[advert]})
# Otherwise we need to may need to mark it off
if advert in old_stat_adverts_dict:
old_advert = old_stat_adverts_dict[advert]
# if we've had an advert with the same name before
if advert in old_adverts_dict:
# and the same pirce
if old_advert.price == old_adverts_dict[advert].price:
self.actual_adverts.append({'delete': old_advert})
else:
# or it is the pile of current adverts, (again de-duplicating by price).
if old_advert.price == adverts_dict[advert].price:
self.actual_adverts.append({'delete': old_advert})
# I have no idea how I can move particular attributes` values from one class object to different one
stat_advert = statistics_cls()
stat_advert.copy_properties(old_adverts_dict[advert])
This honestly looks fine to me. If you want to make it part of statistics_cls, and you can add to statistics_cls, you could implement a classmethod that takes the old advert, and constructs the object from the values. The built-in dict has a great example of this API, from_keys.
stat_advert.status = ADVERT_STATUSES[ADVERT_SOLD]
This looks like a good candidate for an enum (assuming you either have >= Python 3.4 or access to a library that does this. Alternatively, could you just use the global constant you have directly?
stat_advert.status = ADVERT_SOLD
I can't really see into the classes and how they are used, so I don't really have much I can add here. It does look like you have an open/close pattern, which may be nicer as a context manager, especially if you need to guarantee the close method is executed.
Answered by spyr03 on October 27, 2021
Get help from others!
Recent Questions
Recent Answers
© 2024 TransWikia.com. All rights reserved. Sites we Love: PCI Database, UKBizDB, Menu Kuliner, Sharing RPP